In [None]:
# Install an 'ignore' filter so warnings raised by later cells are not shown.
import warnings
warnings.filterwarnings('ignore')
# Ask IPython to render inline matplotlib figures in the 'retina' format.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
# Global seaborn theme (palette, context, font, style) used by the plots below.
sns.set(palette='cubehelix',context='notebook',
       font='cambria',style='white')
import missingno as msn

import eli5
from eli5.sklearn import PermutationImportance
import shap
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## Data Load
------

In [None]:
# Read both splits with PassengerId as the row index.
train = pd.read_csv(
    '../input/tabular-playground-series-apr-2021/train.csv',
    index_col='PassengerId',
)
test = pd.read_csv(
    '../input/tabular-playground-series-apr-2021/test.csv',
    index_col='PassengerId',
)

# Stack train and test so feature engineering is applied to both at once.
combined = pd.concat([train, test])

# Missing-value matrix for the feature columns (target excluded).
features_only = combined.drop(columns='Survived')
msn.matrix(features_only, figsize=(8,5), fontsize=12)
plt.show()

## EDA & Data Pre-Processing
--------

In [None]:
fig, axes = plt.subplots(ncols=3, figsize=(12,4))

# One panel per categorical feature: density of Survived split by that feature.
for ax, hue_col in zip(axes, ['Pclass', 'Sex', 'Embarked']):
    sns.kdeplot(x=train.Survived, hue=train[hue_col], ax=ax, fill=True)
sns.despine()

#### Filling NaNs

In [None]:
def alt_age(df)->pd.DataFrame:
    """Fill missing ``Age`` values with the median age of the same (Sex, Pclass) group.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``Sex``, ``Pclass`` and ``Age`` columns. It is modified
        in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # One grouped pass yields, for every row, the median Age of its group;
    # rows whose group has no known Age get NaN and therefore stay missing.
    group_median = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median)

    return df

combined = alt_age(combined)

fig, axes = plt.subplots(ncols=2, figsize=(11,4))

# NOTE(review): these panels draw from `train` / `test`, not from `combined`,
# so they show the Age distribution as loaded rather than the imputed one --
# confirm that this is the intended view.
for ax, (split_name, split_df) in zip(axes, [('Train', train), ('Test', test)]):
    sns.histplot(x=split_df.Age, ax=ax,
                 kde=True, bins=20, edgecolor='darkgreen')
    # histplot's `legend` argument is a boolean toggle, so the split name is
    # shown as the panel title instead.
    ax.set_title(split_name)
sns.despine()

In [None]:
def alt_fare(df)->pd.DataFrame:
    """Fill missing ``Fare`` values with the median fare of the same (Sex, Pclass) group.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``Sex``, ``Pclass`` and ``Fare`` columns. It is modified
        in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # One grouped pass yields, for every row, the median Fare of its group;
    # rows whose group has no known Fare get NaN and therefore stay missing.
    group_median = df.groupby(['Sex', 'Pclass'])['Fare'].transform('median')
    df['Fare'] = df['Fare'].fillna(group_median)

    return df

combined = alt_fare(combined)

fig, axes = plt.subplots(ncols=2, figsize=(11,4), sharey=True)

# NOTE(review): these panels draw from `train` / `test`, not from `combined`,
# so they show the Fare distribution as loaded rather than the imputed one --
# confirm that this is the intended view.
for ax, (split_name, split_df) in zip(axes, [('Train', train), ('Test', test)]):
    sns.kdeplot(x=split_df.Fare, ax=ax, label=split_name,
                fill=True, edgecolor='darkgreen')
    # No legend is drawn, so `label` alone is never visible; title the panel.
    ax.set_title(split_name)
sns.despine()

In [None]:
def fam_size(df)->pd.Series:
    """Return ``SibSp + Parch + 1`` for every row of ``df``."""
    companions = df['SibSp'] + df['Parch']
    return companions + 1

def is_mother(series)->pd.Series:
    """Return an integer array holding 1 where ``series`` is > 0 and 0 elsewhere.

    The result is a NumPy array, not a pandas Series.
    """
    positive = np.asarray(series > 0)
    return positive.astype(int)

def is_alone(series)->np.ndarray:
    """Flag entries whose companion count is zero.

    Parameters
    ----------
    series : pd.Series
        Per-passenger count of companions (the caller passes ``SibSp``).

    Returns
    -------
    np.ndarray
        Integer array with 1 where the count equals 0 (travelling alone with
        respect to that count) and 0 otherwise. Missing counts yield 0.
    """
    # The flag must be set when there are NO companions; testing `> 0` would
    # mark exactly the passengers who are not alone.
    mask = series == 0
    return np.where(mask, 1, 0)

def ticket(series)->pd.Series:
    """Extract the leading token of each multi-token ticket string.

    Every value is converted with ``str`` and split on whitespace. Values
    with more than one token contribute their first token; all others
    contribute ``'N'``. The result is a plain Python list.
    """
    prefixes = []
    for raw in series:
        tokens = str(raw).split()
        if len(tokens) > 1:
            prefixes.append(tokens[0])
        else:
            prefixes.append('N')
    return prefixes

def fill_cabin(series)->pd.Series:
    """Reduce each cabin string to its first character, using 'N' for missing values."""
    filled = series.fillna('N')
    # Keep only the leading character of each cabin string.
    return filled.map(lambda cabin: cabin[0])

# Derived features built from the raw passenger columns.
combined['Famsize'] = fam_size(combined)
combined['IsMother'] = is_mother(combined['Parch'])
combined['IsAlone'] = is_alone(combined['SibSp'])
combined['Ticket'] = ticket(combined['Ticket'])

# Forward-fill missing embarkation ports. `fillna('ffill')` would insert the
# literal string 'ffill' as a spurious category, so call the method instead;
# the trailing bfill covers missing values at the very start of the frame.
combined['Embarked'] = combined['Embarked'].ffill().bfill()
combined['Cabin'] = fill_cabin(combined['Cabin'])

# These raw columns are no longer needed once the derived features exist.
combined = combined.drop(columns=['Name', 'SibSp', 'Parch'])

## Model Selection
------

#### Label Encoding & MinMax Scaling

In [None]:
encoder = LabelEncoder()
hot_encoder = OneHotEncoder()

# Snapshot the text columns first: `combined` is reassigned inside the loop.
object_cols = list(combined.select_dtypes('object').columns)
for col in object_cols:
    if col != 'Cabin':
        # Integer-code every text column other than Cabin.
        combined[col] = encoder.fit_transform(combined[col])
        continue
    # Cabin is expanded into one indicator column per value.
    cabin_dummies = pd.get_dummies(combined[col])
    combined = combined.join(cabin_dummies)

scaler = MinMaxScaler()

# Rescale the listed numeric / integer-coded columns with MinMaxScaler.
scaled_cols = ['Fare','Age','Famsize','Ticket','Pclass','Embarked']
combined[scaled_cols] = scaler.fit_transform(combined[scaled_cols])

# The original Cabin column is redundant once its indicators are joined.
combined = combined.drop(columns='Cabin')

In [None]:
# Split the engineered frame back into the original train / test rows.
# Take explicit copies: `.pop` mutates the frame it is called on, and doing
# that on a boolean-indexed slice triggers pandas' SettingWithCopy behaviour
# (the warning itself is hidden by the global warnings filter).
a_train = combined.loc[combined.index.isin(train.index)].copy()
a_target = a_train.pop('Survived')
b_test = combined.loc[combined.index.isin(test.index)].copy()
# Popped only to remove the target column from the test features.
b_target = b_test.pop('Survived')

#### Defining KFold Parameters

In [None]:
# Shared cross-validation splitter: shuffled folds with a fixed seed.
n_folds = 10
kf = KFold(
    n_splits=n_folds,
    random_state=42,
    shuffle=True,
)

#### DecisionTree Classifier

In [None]:
scores = {}

mod = ('DecisionTreeClassifier',DecisionTreeClassifier(random_state=42,
            max_depth=10,min_samples_split=818, min_samples_leaf=35))

# Per-fold validation metrics and the running sum of test-set probabilities.
acc_mean, roc_auc_score_mean, f1_mean = [], [], []
y_pred = np.zeros(len(b_test))

for fold, (train_index, valid_index) in enumerate(kf.split(a_train, a_target)):

    X_train, X_valid = a_train.iloc[train_index], a_train.iloc[valid_index]
    y_train, y_valid = a_target.iloc[train_index], a_target.iloc[valid_index]

    model = mod[1]
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)
    # ROC AUC is a ranking metric: score it on the positive-class
    # probabilities, not on the already-thresholded labels.
    valid_proba = model.predict_proba(X_valid)[:, 1]

    acc = accuracy_score(y_valid, preds)
    roc = roc_auc_score(y_valid, valid_proba)
    f1 = f1_score(y_valid, preds)

    acc_mean.append(acc)
    roc_auc_score_mean.append(roc)
    f1_mean.append(f1)

    # Accumulate this fold's test-set probabilities for averaging below.
    y_pred += model.predict_proba(b_test)[:, 1]

# Average the test-set probabilities over all folds.
y_pred /= n_folds

scores['accuracy score'] = np.mean(acc_mean)
scores['roc auc score'] = np.mean(roc_auc_score_mean)
scores['f1 score'] = np.mean(f1_mean)

pd.DataFrame([scores], index=[mod[0]])

#### CatBoost Classifier

In [None]:
scores = {}

mod = ('CatBoostClassifier',CatBoostClassifier(random_state=42, verbose=False))

# Per-fold validation metrics and the running sum of test-set probabilities.
acc_mean, roc_auc_score_mean, f1_mean = [], [], []
y_pred = np.zeros(len(b_test))

for fold, (train_index, valid_index) in enumerate(kf.split(a_train, a_target)):

    X_train, X_valid = a_train.iloc[train_index], a_train.iloc[valid_index]
    y_train, y_valid = a_target.iloc[train_index], a_target.iloc[valid_index]

    model = mod[1]
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)
    # ROC AUC is a ranking metric: score it on the positive-class
    # probabilities, not on the already-thresholded labels.
    valid_proba = model.predict_proba(X_valid)[:, 1]

    acc = accuracy_score(y_valid, preds)
    roc = roc_auc_score(y_valid, valid_proba)
    f1 = f1_score(y_valid, preds)

    acc_mean.append(acc)
    roc_auc_score_mean.append(roc)
    f1_mean.append(f1)

    # Accumulate this fold's test-set probabilities for averaging below.
    y_pred += model.predict_proba(b_test)[:, 1]

# Average the test-set probabilities over all folds.
y_pred /= n_folds

scores['accuracy score'] = np.mean(acc_mean)
scores['roc auc score'] = np.mean(roc_auc_score_mean)
scores['f1 score'] = np.mean(f1_mean)

pd.DataFrame([scores], index=[mod[0]])

#### XGBoost Classifier

In [None]:
scores = {}

# `n_estimators` is the XGBClassifier name for the number of boosting rounds
# (the misspelt `n_estimator` is not a recognised option), and `cv` is not an
# XGBClassifier parameter -- cross-validation is handled by the loop below.
mod = ('XGBoost Classifier',XGBClassifier(random_state=42,
    n_estimators=40,verbosity=0,n_jobs=-1,learning_rate=.1))

# Per-fold validation metrics and the running sum of test-set probabilities.
acc_mean, roc_auc_score_mean, f1_mean = [], [], []
y_pred = np.zeros(len(b_test))

for fold, (train_index, valid_index) in enumerate(kf.split(a_train, a_target)):

    X_train, X_valid = a_train.iloc[train_index], a_train.iloc[valid_index]
    y_train, y_valid = a_target.iloc[train_index], a_target.iloc[valid_index]

    model = mod[1]
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)
    # ROC AUC is a ranking metric: score it on the positive-class
    # probabilities, not on the already-thresholded labels.
    valid_proba = model.predict_proba(X_valid)[:, 1]

    acc = accuracy_score(y_valid, preds)
    roc = roc_auc_score(y_valid, valid_proba)
    f1 = f1_score(y_valid, preds)

    acc_mean.append(acc)
    roc_auc_score_mean.append(roc)
    f1_mean.append(f1)

    # Accumulate this fold's test-set probabilities for averaging below.
    y_pred += model.predict_proba(b_test)[:, 1]

# Average the test-set probabilities over all folds.
y_pred /= n_folds

scores['accuracy score'] = np.mean(acc_mean)
scores['roc auc score'] = np.mean(roc_auc_score_mean)
scores['f1 score'] = np.mean(f1_mean)

pd.DataFrame([scores], index=[mod[0]])

#### LGBM Classifier

In [None]:
scores = {}

mod = ('LGBM Classifier',LGBMClassifier(random_state=42))

# Per-fold validation metrics and the running sum of test-set probabilities.
acc_mean, roc_auc_score_mean, f1_mean = [], [], []
y_pred = np.zeros(len(b_test))

for fold, (train_index, valid_index) in enumerate(kf.split(a_train, a_target)):

    X_train, X_valid = a_train.iloc[train_index], a_train.iloc[valid_index]
    y_train, y_valid = a_target.iloc[train_index], a_target.iloc[valid_index]

    model = mod[1]
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)
    # ROC AUC is a ranking metric: score it on the positive-class
    # probabilities, not on the already-thresholded labels.
    valid_proba = model.predict_proba(X_valid)[:, 1]

    acc = accuracy_score(y_valid, preds)
    roc = roc_auc_score(y_valid, valid_proba)
    f1 = f1_score(y_valid, preds)

    acc_mean.append(acc)
    roc_auc_score_mean.append(roc)
    f1_mean.append(f1)

    # Accumulate this fold's test-set probabilities for averaging below;
    # the final-prediction cell thresholds this averaged vector.
    y_pred += model.predict_proba(b_test)[:, 1]

# Average the test-set probabilities over all folds.
y_pred /= n_folds

scores['accuracy score'] = np.mean(acc_mean)
scores['roc auc score'] = np.mean(roc_auc_score_mean)
scores['f1 score'] = np.mean(f1_mean)

pd.DataFrame([scores], index=[mod[0]])

$\implies$ LGBMClassifier has the best score.

In [None]:
## final prediction

# Turn the averaged probabilities into hard 0/1 labels (strictly above 0.5 -> 1).
y_pred = (y_pred > .5).astype(int)

## Submission
------

In [None]:
## makes a submission

# Two-column frame: the test-set index as PassengerId plus the predicted label.
submission_columns = {
    'PassengerId': b_test.index,
    'Survived': y_pred,
}
submission = pd.DataFrame(submission_columns)

# Write without the positional index so only the two columns are saved.
submission.to_csv('submission.csv', index=False)