In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import shap
import matplotlib.pyplot as plt
import missingno as msn
import seaborn as sns
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from tpot import TPOTClassifier

# EDA on the train data
-------

In [None]:
# Read the train and test CSVs from the spaceship-titanic input directory.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test= pd.read_csv('../input/spaceship-titanic/test.csv')
# Visualise the missing-value pattern of the training frame.
msn.matrix(train, figsize=(10,5), fontsize=8)
# Print dtypes and non-null counts, then display the first row as the cell output.
train.info()
train.head(1)

In [None]:
# Left panel: class balance of the target.
# Right panel: number of distinct values per column, restricted to columns
# that have more than four distinct values.
fig, axes = plt.subplots(ncols=2, figsize=(7, 3))

# Pass the column by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train['Transported'], ax=axes[0])
axes[0].set_title('Transported or not')

train_uniqs = train.nunique().sort_values()
train_uniqs[train_uniqs > 4].plot.barh(edgecolor='gray', ax=axes[1])
axes[1].grid(alpha=.1)
axes[1].set_title('Features with mostly unique values')

# despine() strips the top/right spines from every axes of the current figure,
# so no per-axes spine loop is needed.
sns.despine()
plt.tight_layout()

# Keep the first four characters of PassengerId as a 'Group' feature, then
# drop the identifier itself together with the passenger name.
train['Group'] = train.PassengerId.str[:4]
train = train.drop(['Name', 'PassengerId'], axis=1)

In [None]:
# Derive three single-character features from the Cabin string, then drop it.
# Missing cabins are stringified to 'nan', so they yield 'n' in all three.
train_cabin = train.Cabin.astype(str)
train['Cabin_f'] = train_cabin.str[:1]    # first character
# NOTE(review): only the character at index 2 is kept; if Cabin looks like
# 'deck/num/side' this is just the first digit of num - confirm that is intended.
train['Cabin_n'] = train_cabin.str[2]
train['Cabin_b'] = train_cabin.str[-1:]   # last character
train = train.drop(columns='Cabin')

# Age density split by each categorical feature, one panel per feature.
fig, axes = plt.subplots(ncols=3, nrows=2, sharex=True, figsize=(12, 7))

columns = ['HomePlanet', 'CryoSleep', 'Transported', 'Cabin_f', 'Cabin_n', 'Cabin_b']

for ax, column in zip(axes.ravel(), columns):
    levels = train[column].astype(str)
    # `palette` (not `color`) is what maps a colormap onto the hue levels.
    sns.kdeplot(x=train.Age, hue=levels, hue_order=sorted(levels.unique()),
                ax=ax, fill=True, palette='viridis')
    ax.set_title(f'{column}')

    # Keep the legend seaborn built so that labels stay matched to the drawn
    # densities; replacing it with a bare list of labels can mislabel them.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('')
        for text in legend.get_texts():
            text.set_fontsize(8)

sns.despine()

plt.tight_layout()

# Observing the test data
-------

In [None]:
# Re-read the test CSV so this cell starts from the untouched file contents.
test= pd.read_csv('../input/spaceship-titanic/test.csv')
# Visualise the missing-value pattern of the test frame and print its dtypes / non-null counts.
msn.matrix(test, figsize=(10,5), fontsize=8)
test.info()

In [None]:

# Keep the ids for the submission file before PassengerId is dropped below.
test_id = test['PassengerId']
test['Group'] = test.PassengerId.str[:4]

# Number of distinct values per test column (columns with more than four).
# Stored under its own name so the training counts are not overwritten.
fig, ax = plt.subplots(figsize=(3, 2))
test_uniqs = test.nunique().sort_values()
test_uniqs[test_uniqs > 4].plot.barh(edgecolor='gray', ax=ax)
ax.grid(alpha=.1)

for spine in ['right', 'top']:
    ax.spines[spine].set_visible(False)

ax.set_title('Test features with mostly unique value')

plt.show()

# Same single-character cabin features as built for the training frame;
# missing cabins are stringified to 'nan' and therefore yield 'n'.
test_cabin = test.Cabin.astype(str)
test['Cabin_f'] = test_cabin.str[:1]    # first character
test['Cabin_n'] = test_cabin.str[2]     # character at index 2
test['Cabin_b'] = test_cabin.str[-1:]   # last character

test = test.drop(columns=['Cabin', 'Name', 'PassengerId'])

# Preprocessing
-----

In [None]:
# Separate the target and encode it as 0/1 integers.
y_train = train.pop('Transported').astype(int)

scl = StandardScaler()

# Label-encode every object column. Each encoder is fitted on the union of
# the train and test values so that both frames share one integer mapping.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
cols = train.select_dtypes('object').columns
comb = pd.concat([train[cols], test[cols]])

for col in cols:
    le = LabelEncoder().fit(comb[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# The scaler is positional, so give test exactly the training column order.
test = test[train.columns]

# Fill the remaining gaps with medians learned from the training data only;
# the test frame reuses those statistics instead of computing its own.
train_medians = train.median()
train = train.fillna(train_medians)
test = test.fillna(train_medians)

X_train = train
X_test = test

# Standardise: fit on train, apply the same transformation to test.
X_train[X_train.columns] = scl.fit_transform(X_train)
X_test[X_test.columns] = scl.transform(X_test)

In [None]:
# Annotated correlation matrix of the preprocessed training features.
fig = plt.figure(figsize=(9, 7))

heatmap_style = dict(
    cmap='viridis_r',
    annot=True,
    linewidth=.75,
    annot_kws={'fontsize': 8},
)
sns.heatmap(X_train.corr(), **heatmap_style)
plt.show()

# Some additional code to show metrics
------

In [None]:
def confusion_plot(label, y_valid, y_pred, ax=None):
    """Draw an annotated confusion matrix for a binary classifier.

    Every cell shows its group name, the raw count and its share of all
    samples.

    Parameters
    ----------
    label : str
        Name inserted into the plot title.
    y_valid : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. The current axes are used when omitted.
    """
    # Resolve the target once so the heatmap, the title and the axis
    # switch-off all act on the same axes, not on pyplot's "current" one.
    if ax is None:
        ax = plt.gca()

    co_ma = confusion_matrix(y_valid, y_pred)
    flat = co_ma.flatten()

    # Flattening the 2x2 matrix row by row gives this group order.
    groups = ['True Neg','False Pos','False Neg','True Pos']
    counts = [int(value) for value in flat]
    shares = ['{0:.2%}'.format(value) for value in flat / np.sum(co_ma)]
    labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in
              zip(groups, counts, shares)]
    labels = np.asarray(labels).reshape(2,2)

    sns.heatmap(co_ma, annot=labels, cmap='binary', alpha=.55, ax=ax,
                cbar=False, fmt='', linewidth=1, linecolor='black')
    ax.axis('off')
    ax.set_title(f'Confusion Matrix for {label}')

def show_metrics(metrics): 
    """Return the collected scores as a DataFrame indexed by classifier name.

    Parameters
    ----------
    metrics : list of dict or dict
        Either a list of per-classifier score dicts or a single such dict.
        Every record must contain a 'classifier' key.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed by 'classifier'.

    Raises
    ------
    KeyError
        If the records have no 'classifier' entry.
    """
    try:
        frame = pd.DataFrame(metrics)
    except ValueError:
        # A single dict of scalar values cannot be turned into a frame
        # directly; wrap it so that it becomes a one-row frame.
        frame = pd.DataFrame([metrics])
    return frame.set_index('classifier')
    
def metrics(estimators, X_train, y_train):
    """Fit every estimator and report its diagnostics on the fitting data.

    For each estimator a three-panel figure is drawn (precision-recall
    curve, ROC curve, confusion matrix) and accuracy, ROC AUC and F1 are
    collected. All plots and scores are computed on the same data the
    estimator was fitted on, so they are in-sample figures.

    Parameters
    ----------
    estimators : dict
        Maps a display name to an unfitted estimator. Each estimator is
        fitted in place.
    X_train : array-like
        Feature matrix used for fitting and scoring.
    y_train : array-like
        Binary target used for fitting and scoring.

    Returns
    -------
    pandas.DataFrame
        One row per estimator, indexed by its display name.
    """
    records = []

    for name, model in estimators.items():

        fig, axes = plt.subplots(ncols=3, figsize=(12,4))

        mod = model.fit(X_train, y_train)
        y_pred = mod.predict(X_train)

        # ROC AUC needs a continuous score; hard labels collapse the curve
        # to a single operating point. Fall back to the labels only when
        # the estimator exposes no score at all.
        if hasattr(mod, 'predict_proba'):
            y_score = mod.predict_proba(X_train)[:, 1]
        elif hasattr(mod, 'decision_function'):
            y_score = mod.decision_function(X_train)
        else:
            y_score = y_pred

        plot_precision_recall_curve(mod, X_train, y_train,
                                    ax=axes[0], color='black')
        plot_roc_curve(mod, X_train, y_train, drop_intermediate=True,
                       ax=axes[1], color='black')

        axes[0].set_title(f'Precision-Recall Curve for {name}')
        # Chance-level diagonal for reference.
        axes[1].plot([1,0],[1,0], c='green',ls='--', lw=.9)
        axes[1].set_title(f'ROC Curve for {name}')
        confusion_plot(name, y_train, y_pred, axes[2])

        for ax in axes:
            for spine in ['top','right']:
                ax.spines[spine].set_visible(False)

        # Only the two curve panels carry labelled artists.
        for ax in axes[:2]:
            ax.legend(loc='lower center', frameon=False, fontsize=9)

        plt.tight_layout()

        records.append({
            'classifier': name,
            'accuracy_score': accuracy_score(y_train, y_pred),
            'roc_auc_score': roc_auc_score(y_train, y_score),
            'f1_score': f1_score(y_train, y_pred),
        })

    return show_metrics(records)

# Model & Prediction
-------

```python
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
# define search
mod = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=42, n_jobs=-1)
# perform the search
mod.fit(X_train, y_train)
# export the best model
mod.export('tpot_best_model.py')
```

In [None]:
# Candidate classifiers. Every model is seeded so that a re-run of the
# notebook reproduces the same scores.
mlp = MLPClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42, learning_rate=0.1, max_depth=5, max_features=0.7000000000000001,
                                 min_samples_leaf=18, min_samples_split=11, n_estimators=100, subsample=0.7500000000000001)
lgbm = LGBMClassifier(random_state=42)

# The keys are used verbatim in plot titles and as the index of the score table.
estimators = {'MLP': mlp, 'GradientBoosting': gbc, 'LGBM': lgbm}

# metrics() fits each estimator in place, so the models are trained after this call.
metrics(estimators, X_train, y_train)

$\implies$ LGBMClassifier will be used for prediction

In [None]:
# Predict the test set with the LightGBM model and cast the labels to booleans.
y_pred = lgbm.predict(X_test).astype('bool')

# Pair each saved passenger id with its prediction and write the submission file.
submission = pd.DataFrame({
    'PassengerId': test_id,
    'Transported': y_pred,
})
submission.to_csv('submission.csv', index=False)

In [None]:
# Explain a fixed random sample of 100 training rows with a SHAP tree explainer.
X_sampled = X_train.sample(100, random_state=10)
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_sampled)
# Load the JavaScript needed to render the interactive force plot.
shap.initjs()
cmap = ['#778899','#D3D3D3']
# Force plot for the first sampled row only.
# NOTE(review): indexing expected_value[0] / shap_values[0] assumes the explainer
# returns one entry per class and selects index 0 - presumably class 0; confirm
# against the installed shap version, whose return shape may differ.
shap.force_plot(explainer.expected_value[0], shap_values[0][:1,:], X_sampled.iloc[:1,:], plot_cmap=cmap)