# Reference
- https://www.kaggle.com/ranjeetshrivastav/tabular-playground-series-apr-2021

# Load data

In [None]:
import numpy as np
import pandas as pd

# Read the competition's train, test and sample-submission CSVs (in that order).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

First of all, let's check the data.

In [None]:
# Show the raw training frame.
train

In [None]:
# Show the raw test frame.
test

In [None]:
# Show the sample submission, which is reused later as the output template.
sub

In [None]:
# Tag every row with the frame it came from so the combined frame can be
# split back into train/test after the shared preprocessing.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin

# Stack train on top of test with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)
data

# Missing Values

In [None]:
# Number of missing values per column in the combined frame.
data.isna().sum()

In [None]:
# Drop the Name and Ticket columns; they are not used in the rest of the notebook.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: the original in-place drop raised KeyError
# the second time because the columns were already gone.
data = data.drop(columns=['Name', 'Ticket'], errors='ignore')

In [None]:
# Missing values per column after removing Name and Ticket.
data.isna().sum()

## Age

In [None]:
# Count the rows whose Age is missing.
data['Age'].isna().sum()

In [None]:
# Summary statistics of Age before imputation.
data['Age'].describe()

In [None]:
# Most frequent Age value(s); the first entry is used as the fill value below.
data['Age'].mode()

In [None]:
# Impute missing ages with the most frequent age.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on the `data['Age']` selection mutates a
# temporary object under pandas Copy-on-Write and leaves `data` unchanged.
data['Age'] = data['Age'].fillna(data['Age'].mode()[0])

In [None]:
# Ages below 1 are multiplied by 100 (presumably treated as mis-scaled
# values -- TODO confirm against the referenced notebook).
below_one = data['Age'] < 1
data.loc[below_one, 'Age'] = data.loc[below_one, 'Age'] * 100

# NOTE(review): this only displays an integer view of Age; the cast is not
# stored back into `data`. Confirm whether an assignment was intended.
data['Age'].astype(int)

In [None]:
# Re-count missing Age values after the imputation step.
data['Age'].isna().sum()

## Embarked

In [None]:
# Count the rows whose Embarked value is missing.
data['Embarked'].isna().sum()

In [None]:
# Summary of Embarked before imputation.
data['Embarked'].describe()

In [None]:
# Most frequent Embarked value(s); the first entry is used as the fill value below.
data['Embarked'].mode()

In [None]:
# Impute missing Embarked values with the most frequent value.
# Assigned back rather than replaced in place on the column selection,
# which does not update `data` under pandas Copy-on-Write.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

In [None]:
# Re-count missing Embarked values after the imputation step.
data['Embarked'].isna().sum()

## Fare

In [None]:
# Count the rows whose Fare is missing.
data['Fare'].isna().sum()

In [None]:
# Summary statistics of Fare before imputation.
data['Fare'].describe()

In [None]:
# Mean fare; used as the fill value for missing fares below.
data['Fare'].mean()

In [None]:
# Impute missing fares with the mean fare.
# Assigned back rather than filled in place on the column selection:
# `data['Fare'].fillna(..., inplace=True)` operates on a temporary object
# under pandas Copy-on-Write and leaves `data` unchanged.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [None]:
# Re-count missing Fare values after the imputation step.
data['Fare'].isna().sum()

### Check the distribution by RainCloud

In [None]:
!pip install ptitprince

In [None]:
from ptitprince import RainCloud

In [None]:
import matplotlib.pyplot as plt

# Raincloud plot of Fare on the original scale, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 10))
RainCloud(data=data, y='Fare', orient='h', ax=ax)
ax.grid()

In [None]:
def _log_positive(fare):
    """Return the natural log of a positive fare, and 0 for any other value."""
    return np.log(fare) if fare > 0 else 0


# log scale
# Fare is overwritten with its log, so re-running this cell applies the
# transform a second time.
data['Fare'] = data['Fare'].map(_log_positive)

# Raincloud plot of the log-scaled Fare.
fig, ax = plt.subplots(figsize=(10, 10))
RainCloud(data=data, y='Fare', orient='h', ax=ax)
ax.grid()

In [None]:
# Summary statistics of Fare after the log transform.
data['Fare'].describe()

## Cabin

In [None]:
# Count the rows whose Cabin is missing.
data['Cabin'].isna().sum()

In [None]:
# Summary of Cabin before imputation.
data['Cabin'].describe()

In [None]:
# Most frequent Cabin value(s); the first entry is used as the fill value below.
data['Cabin'].mode()

In [None]:
# Impute missing cabins with the most frequent cabin value.
# Assigned back rather than replaced in place on the column selection,
# which does not update `data` under pandas Copy-on-Write.
data['Cabin'] = data['Cabin'].fillna(data['Cabin'].mode()[0])

In [None]:
# Show the combined frame after the imputation steps.
data

In [None]:
# Missing values per column after all imputation steps.
data.isna().sum()

# Feature engineering

## Cabin

In [None]:
# Show the Cabin column before it is reduced to its first character.
data["Cabin"]

In [None]:
# Keep only the first character of each cabin code (presumably the deck
# letter -- confirm). The vectorised .str accessor replaces the per-row
# lambda and passes missing values through instead of raising TypeError.
data['Cabin'] = data['Cabin'].str[:1]

# Row count per remaining cabin character.
data['Cabin'].value_counts()

In [None]:
import seaborn as sns

# Row counts per Cabin value, split by the Survived label.
sns.countplot(data=data, x='Cabin', hue='Survived')
plt.show()

## Family size (SibSp + Parch + 1)

In [None]:
# family_size = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself -- confirm).
data['family_size'] = data['SibSp'].add(data['Parch']).add(1)

In [None]:
# Show the combined frame with the new family_size column.
data

In [None]:
# Split the preprocessed frame back into train and test rows using the
# 'source' tag. .copy() makes each part an independent frame, so later
# column drops on them are unambiguous and cannot trigger
# SettingWithCopyWarning.
train_modified = data.loc[data['source'] == 'train'].copy()
test_modified = data.loc[data['source'] == 'test'].copy()

In [None]:
# First rows of the preprocessed training part.
train_modified.head()

In [None]:
# First rows of the preprocessed test part.
test_modified.head()

In [None]:
# Remove the bookkeeping column from both parts, and the Survived column from
# the test part. Rebinding the names (instead of inplace=True on frames sliced
# out of `data`) avoids SettingWithCopyWarning, and errors='ignore' keeps the
# cell safe to re-run after the columns are already gone.
train_modified = train_modified.drop(columns='source', errors='ignore')
test_modified = test_modified.drop(columns=['source', 'Survived'], errors='ignore')

# Preprocess and Feature Engineering by PyCaret

In this notebook, I will use pycaret.  
[PyCaret](https://pycaret.org/) is an open source, low-code machine learning library in Python that allows you to go from preparing your data to deploying your model within minutes in your choice of notebook environment.

[This kaggle notebook](https://www.kaggle.com/frtgnn/pycaret-introduction-classification-regression) would be helpful.  

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import *

In [None]:
# Initialise the PyCaret classification experiment on the training rows.
# - SibSp, Parch and family_size are forced to numeric, Pclass to categorical.
# - session_id fixes the random seed so the split, cross-validation and tuning
#   results are reproducible across Restart & Run All.
# - silent=True skips the interactive data-type confirmation prompt.
clf1 = setup(data = train_modified,
             target = 'Survived',
             numeric_features=['SibSp', 'Parch', 'family_size'],
             categorical_features=['Pclass'],
             session_id = 42,
             silent = True)

# Compare models

In [None]:
# Compare the candidate classifiers on the experiment configured by setup().
compare_models()

Algorithms are arranged in descending order of "Accuracy".  
We will use **LightGBM**, which is relatively fast, accurate, and popular on Kaggle.

Next, create a model with lightGBM.

In [None]:
# Create a LightGBM classifier with PyCaret's create_model.
lgbm = create_model('lightgbm')

# Hyperparameter tuning

In [None]:
# Tune the LightGBM model's hyperparameters; the tuned model is used from here on.
tuned_lightgbm = tune_model(lgbm)

Let's plot some of the results.

In [None]:
# 'learning' plot for the tuned model.
plot_model(estimator = tuned_lightgbm, plot = 'learning')

In [None]:
# 'auc' plot for the tuned model.
plot_model(estimator = tuned_lightgbm, plot = 'auc')

In [None]:
# Confusion matrix for the tuned model.
plot_model(estimator = tuned_lightgbm, plot = 'confusion_matrix')

Feature Importance is below.

In [None]:
# Feature-importance plot for the tuned model.
plot_model(estimator=tuned_lightgbm, plot='feature')

You can see that the influence of fare and age is very high.

Below you can see various other results as well.

In [None]:
# Open PyCaret's evaluation view for the tuned model.
evaluate_model(tuned_lightgbm)

Let's interpret the model, and then actually make a prediction.

In [None]:
# Model-interpretation plot for the tuned LightGBM
# (presumably SHAP-based by default -- confirm for the installed PyCaret version).
interpret_model(tuned_lightgbm)

In [None]:
# Score the test rows with the tuned LightGBM; the predicted class is read
# from the 'Label' column in the next cell.
predictions = predict_model(tuned_lightgbm, data=test_modified)
predictions.head()

The predicted survival in the `Label` column has dtype object, so it should be converted to int before it is written to the submission.

In [None]:
# Convert the predicted labels to float and then to int. They are assigned as
# a plain list, so values are matched to `sub` by position, not by index.
survived_pred = predictions['Label'].astype(float).astype(int)
sub['Survived'] = survived_pred.tolist()
sub.to_csv('tuned_lightgbm.csv', index=False)

# Ensemble

Let's go back to the `compare_models()` cell and check it.  
After lightgbm, catboost and gbc are ranked high.  
We'll blend tuned_lightgbm with xgboost and catboost (note that the code below uses xgboost rather than gbc).

In [None]:
# Create the two additional models that are blended with the tuned LightGBM.
xgboost = create_model('xgboost')
catboost = create_model('catboost')

In [None]:
# Blend the tuned LightGBM with the xgboost and catboost models created above.
blend = blend_models(estimator_list=[tuned_lightgbm, xgboost, catboost])

As with tuned_lightgbm, check the predictions of this blended model.

In [None]:
# Score the test rows with the blended model (this overwrites the earlier
# `predictions` from the tuned LightGBM).
predictions = predict_model(blend, data=test_modified)
predictions.head()

In [None]:
# Convert the blended model's predicted labels to float and then to int. They
# are assigned as a plain list, so values are matched to `sub` by position.
blend_pred = predictions['Label'].astype(float).astype(int)
sub['Survived'] = blend_pred.tolist()
sub.to_csv('blend.csv', index=False)