In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Location of the Estonia ferry disaster passenger list CSV.
DATA_PATH = '../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv'
data = pd.read_csv(DATA_PATH)

## Exploratory Data Analysis (EDA)

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.info()

In [None]:
data.describe().T

In [None]:
data.isnull().sum()

In [None]:
data.Survived.value_counts()

In [None]:
data.columns

In [None]:
# 2x2 overview: three count plots plus the age distribution split by outcome.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18, 14))
count_panels = [
    ("Sex", axes[0][0]),
    ("Category", axes[0][1]),
    ("Survived", axes[1][0]),
]
for column, panel in count_panels:
    sns.countplot(data=data, palette="Set3", x=column, ax=panel)
# Kept as the last expression so the cell output is unchanged.
sns.histplot(data=data, x="Age", hue="Survived", kde=True, ax=axes[1][1])

In [None]:
# Bar plot of Survived grouped by passenger category.
sns.catplot(
    data=data,
    x="Category",
    y="Survived",
    kind="bar",
)

In [None]:
# Bar plot of Survived grouped by sex.
sns.catplot(
    data=data,
    x="Sex",
    y="Survived",
    kind="bar",
)

In [None]:

# Bar plot of Survived grouped by country, drawn on an enlarged figure.
sns.catplot(x="Country", y="Survived", kind="bar", data=data, height=9, aspect=13/9)
# Must be called: a bare `plt.show` only references the function and does nothing.
plt.show()

## Preprocessing

In [None]:
# One-hot encode the categorical columns (first level of each is dropped).
# NOTE: `data` is rebound here, so this cell cannot be re-run on its own —
# the encoded columns no longer exist after the first execution.
categorical_cols = ['Category', 'Sex']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
data.head()

In [None]:
# Identifier and name/country columns are removed before modelling.
unused_cols = ['PassengerId', 'Country', 'Firstname', 'Lastname']
data = data.drop(columns=unused_cols)

In [None]:
data.head()

In [None]:
data.corr()

In [None]:
# Separate the target column from the feature matrix.
target_col = "Survived"
y = data[target_col]
X = data.drop(columns=[target_col])

In [None]:
# 70/30 split, seeded and stratified on the target.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Resampling pipeline: SMOTE oversampling followed by random undersampling.
# Both samplers are seeded (same seed as the train/test split) so that
# Restart & Run All reproduces the same resampled training set; unseeded
# samplers made every downstream score vary between runs.
# NOTE(review): with the default sampling strategies SMOTE presumably already
# balances the classes, leaving the undersampler nothing to remove — confirm,
# or set explicit `sampling_strategy` values on both steps.
over = SMOTE(random_state=42)
under = RandomUnderSampler(random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# Resample the training split only; X_test / y_test are left untouched.
# NOTE: X_train / y_train are rebound, so running this cell twice resamples
# data that has already been resampled.
resampled = pipeline.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [None]:
standard_sc = StandardScaler() 

In [None]:
# Fit the scaler on the training data only, then apply it to both splits.
standard_sc.fit(X_train)
X_train = standard_sc.transform(X_train)
X_test = standard_sc.transform(X_test)

In [None]:
def confusion(y_test,y_test_pred,X):
    """Plot an annotated confusion-matrix heatmap for the survival classifiers.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_test_pred : array-like
        Predicted labels, aligned with ``y_test``.
    X : str
        Text used as the figure title (the callers pass the model name).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # confusion_matrix orders rows/columns by label value, so index 0 is
    # label 0 and index 1 is label 1. Assumes Survived is coded 0 = dead,
    # 1 = survived, hence the names must be listed as Dead, Survived
    # (the previous order labelled every cell with the opposite class).
    names = ['Dead', 'Survived']
    # Pin the label set so the matrix is always 2x2 and lines up with `names`.
    cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    f, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(cm, annot=True, linewidth=.5, linecolor="r", fmt=".0f", ax=ax)
    ax.set_title(X, size=25)
    ax.set_xlabel("y_pred")
    ax.set_ylabel("y_true")
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)
    plt.show()

## Random Forest (RF)

In [None]:
# Baseline random forest. Seeded for reproducibility, consistent with the
# seeded train/test split and the MLP (an unseeded forest gives a different
# score on every run).
RF = RandomForestClassifier(random_state=42)
RF.fit(X_train, y_train)
pred = RF.predict(X_test)
# Score of the fitted forest on the held-out split.
score = RF.score(X_test, y_test)

In [None]:
score

In [None]:
confusion(y_test,pred,"RF")

In [None]:
print(classification_report(y_test, pred))

## XGBoost (XGB)

In [None]:
# XGBoost baseline (only verbosity is set), trained on the prepared training split.
gbm = XGBClassifier(verbosity=1)
gbm.fit(X_train, y_train)
# Held-out score and predictions; the two calls are independent of each other.
gbm_score = gbm.score(X_test, y_test)
gbm_pred = gbm.predict(X_test)

In [None]:
gbm_score

In [None]:
confusion(y_test,gbm_pred,"XGB")

In [None]:
print(classification_report(y_test, gbm_pred))

## Multi-layer Perceptron (MLP)

In [None]:
# Seeded multi-layer perceptron baseline.
mlp_seed = 42
clf = MLPClassifier(random_state=mlp_seed)
clf.fit(X_train, y_train)
# Held-out score and predictions of the fitted network.
clf_score = clf.score(X_test, y_test)
clf_pred = clf.predict(X_test)

In [None]:
clf_score

In [None]:
confusion(y_test,clf_pred,"MLP")

In [None]:
print(classification_report(y_test, clf_pred))

## Optimized XGB

In [None]:
# Hyper-parameter grid for the XGBoost search. Only n_estimators and
# learning_rate vary (4 x 4 = 16 candidates); the single-value entries apply
# the same GPU settings to every candidate.
# The literal used to list "updater" twice; Python silently keeps only the
# last duplicate key, so the redundant entry is removed (resulting dict is
# unchanged).
# NOTE(review): gpu_id / predictor / gpu_hist are presumably deprecated in
# newer XGBoost releases in favour of device="cuda" with tree_method="hist" —
# confirm against the installed version before changing them.
params_xgb = {
    "n_estimators": [250, 500, 1000, 1500],
    "learning_rate": [0.01, 0.1, 0.3, 0.6],
    "gpu_id": [0],
    "predictor": ["gpu_predictor"],
    "tree_method": ["gpu_hist"],
    "updater": ["grow_gpu_hist"],
    "sampling_method": ["gradient_based"],
}

In [None]:
# 3-fold grid search over params_xgb, using the baseline XGBoost estimator.
# NOTE(review): X_train was oversampled before this search, so synthetic rows
# can presumably land in both the fitting and validation folds — best_score_
# is likely optimistic; confirm against the untouched test split.
model_xgb = GridSearchCV(
    estimator=gbm,
    param_grid=params_xgb,
    cv=3,
    n_jobs=-1,
)
model_xgb.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated score.
print(f"Best params: {model_xgb.best_params_!s}")
print(f"Best Score: {model_xgb.best_score_!s}\n")

In [None]:
# Full cross-validation table, ordered by rank_test_score.
cv_table = model_xgb.cv_results_
scores = pd.DataFrame(cv_table)
scores.sort_values("rank_test_score")

In [None]:
# Predictions of the tuned model on both splits (train first, then test).
y_train_pred_xgb, y_test_pred_xgb = (
    model_xgb.predict(features) for features in (X_train, X_test)
)

In [None]:
confusion(y_test,y_test_pred_xgb,"Optimized XGB")

In [None]:
print(classification_report(y_test, y_test_pred_xgb))