In [None]:
>>> import matplotlib.pyplot as plt
>>> import pandas as pd
>>> from sklearn import (
...     ensemble,
...     preprocessing,
...     tree,
... )

In [None]:
>>> from sklearn.metrics import (
...     auc,
...     confusion_matrix,
...     roc_auc_score,
...     roc_curve,
... )

In [None]:
>>> from sklearn.model_selection import (
...     train_test_split,
...     StratifiedKFold,
... )

In [None]:
>>> from yellowbrick.classifier import (
...     ConfusionMatrix,
...     ROCAUC,
... )

In [None]:
>>> from yellowbrick.model_selection import (
...     LearningCurve,
... )

In [None]:
# Location of the Titanic passenger workbook (Excel format).
url = "".join(
    (
        "http://biostat.mc.vanderbilt.edu/",
        "wiki/pub/Main/DataSets/titanic3.xls",
    )
)
# Both names start out bound to the same freshly loaded frame.
df = orig_df = pd.read_excel(url)

In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# Data type pandas assigned to each column.
df.dtypes

In [None]:
>>> # import pandas_profiling
>>> # pandas_profiling.ProfileReport(df)

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Summary statistics, restricted to the first four columns describe() reports.
df.describe().iloc[:, 0:4]

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of missing fields in each row, shown up to and including index
# label 5 (.loc slices include the end label).
df.isna().sum(axis="columns").loc[:5]

In [None]:
# Category frequencies for the two categorical features. dropna=False is used
# for both so that missing entries appear as their own row instead of being
# silently left out of one of the tallies.
print(
    df["sex"].value_counts(dropna=False),
    df["embarked"].value_counts(dropna=False),
)


In [None]:
# Remove the columns that are not used as model features.
# The result is derived from `orig_df` (the untouched loaded frame) rather than
# from `df` itself, so the cell can be re-run without raising KeyError for
# columns that an earlier run already removed.
df = orig_df.drop(
    columns=[
        "name",
        "ticket",
        "home.dest",
        "boat",
        "body",
        "cabin",
    ]
)

In [None]:
# Preview the frame after the unused columns were removed.
df.head()


In [None]:
# Column dtypes and non-null counts before the categorical columns are encoded.
df.info()

In [None]:
# One-hot encode the categorical columns. The first level of every category is
# dropped so the indicator columns are not perfectly collinear.
df = df.pipe(pd.get_dummies, drop_first=True)


In [None]:
# Column dtypes and non-null counts after dummy encoding.
df.info()

In [None]:
# Separate the target column from the feature matrix.
y = df["survived"]
X = df.drop("survived", axis="columns")

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Missing values per training column before imputation.
X_train.isnull().sum()

In [None]:
# Using iterative imputer from SKlearn 
>>> from sklearn.experimental import (
...     enable_iterative_imputer,
... )
>>> from sklearn import impute
# Columns that are passed to the imputer.
num_cols = "pclass,age,sibsp,parch,fare,sex_male".split(",")

In [None]:
# Learn the imputation model from the training rows only and keep the
# filled-in training values.
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X=X_train[num_cols])

In [None]:
# The imputer was fitted on the training rows only. The test rows are filled in
# with transform() alone: fitting on them would leak test-set information into
# the preprocessing step.
#
# Explicit copies are taken first so the assignments below operate on
# independent frames and do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Whole-column replacement (frame[cols] = values) lets each column take the
# dtype of the imputed values instead of forcing floats into the existing
# integer / boolean columns.
X_train[num_cols] = imputed
# The test imputation is assigned directly instead of rebinding `imputed`, so
# re-running this cell never writes test-sized values into X_train.
X_test[num_cols] = imputer.transform(X_test[num_cols])

In [None]:
# Missing values per training column after imputation.
X_train.isnull().sum()

In [None]:

# Standardise every feature with statistics learned from the training rows
# only. The scaler returns plain arrays, so the original column names and row
# index are passed back in; without them the frames would carry integer column
# labels and a fresh positional index that no longer matches y_train / y_test.
sca = preprocessing.StandardScaler()
X_train = pd.DataFrame(
    sca.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    sca.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)


In [None]:
# Preview the scaled training features.
X_train.head()

In [None]:
# Helper 1 of 2: drop the unwanted columns and turn categorical columns into
# numeric indicator columns.
def red_titanic(df, drop_cols=None):
    """Return a reduced, dummy-encoded copy of the Titanic frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``drop_cols``.
    drop_cols : iterable of str, optional
        Columns to remove before encoding. When omitted, the columns
        ``name``, ``ticket``, ``home.dest``, ``boat``, ``body`` and ``cabin``
        are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Categorical columns are
        replaced by indicator columns with the first level of each category
        dropped (``drop_first=True``).

    Raises
    ------
    KeyError
        If a column listed in ``drop_cols`` is not present in ``df``.
    """
    if drop_cols is None:
        drop_cols = [
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    reduced = df.drop(columns=list(drop_cols))
    return reduced.pipe(pd.get_dummies, drop_first=True)

In [None]:
# Helper 2 of 2: train/test split, imputation of missing values and optional
# scaling of selected columns.
def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
    """Split ``df`` into train/test sets, impute and optionally standardise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column ``y_col`` plus the feature columns.
        It must contain ``pclass``, ``age``, ``sibsp``, ``parch`` and
        ``fare``, which are the columns that get imputed.
    y_col : str
        Name of the target column.
    size : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.
    std_cols : sequence of str, optional
        Columns to standardise. Scaling is skipped when this is ``None`` or
        empty.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Notes
    -----
    The split is seeded with ``random_state=42``. The imputer and the scaler
    are fitted on the training part only and then applied to the test part.
    """
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size, random_state=42
    )
    # Work on explicit copies so the column assignments below act on
    # independent frames and do not trigger SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    num_cols = ["pclass", "age", "sibsp", "parch", "fare"]
    fi = impute.IterativeImputer()
    # Whole-column replacement (frame[cols] = values) lets the columns become
    # float instead of forcing float values into existing integer columns.
    X_train[num_cols] = fi.fit_transform(X_train[num_cols])
    X_test[num_cols] = fi.transform(X_test[num_cols])

    # An explicit None / length check also works for array-like inputs whose
    # truth value is ambiguous.
    if std_cols is not None and len(std_cols) > 0:
        std_cols = list(std_cols)
        std = preprocessing.StandardScaler()
        X_train[std_cols] = std.fit_transform(X_train[std_cols])
        X_test[std_cols] = std.transform(X_test[std_cols])

    return X_train, X_test, y_train, y_test

In [None]:
# Rebuild the modelling data from the raw frame with the two helpers.
ti_df = red_titanic(orig_df)
# NOTE(review): "parch" is imputed by get_train_test_X_y but is not in this
# list, so it is left unscaled - confirm that this is intentional.
std_cols = ["pclass", "age", "sibsp", "fare"]
X_train, X_test, y_train, y_test = get_train_test_X_y(
    ti_df,
    "survived",
    std_cols=std_cols,
)

In [None]:
# Preview the training features produced by get_train_test_X_y.
X_train.head()

In [None]:
# Baseline model with dummy classifier 

from sklearn.dummy import DummyClassifier
# fit() returns the estimator itself, so construction and fitting are chained.
bm = DummyClassifier().fit(X_train, y_train)
# Mean accuracy of the baseline on the held-out rows.
bm.score(X_test, y_test)


In [None]:
>>> from sklearn import metrics
# Precision of the baseline predictions on the held-out rows.
metrics.precision_score(y_true=y_test, y_pred=bm.predict(X_test))

In [None]:
# Compare several classifier families by cross-validated AUC. A model with a
# slightly lower mean score but a tighter standard deviation can still be the
# better choice.
# The train and test parts are stacked back together for cross-validation.
X = pd.concat([X_train, X_test], axis="index")
y = pd.concat([y_train, y_test], axis="index")
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost

In [None]:
# One splitter shared by every model so that all of them are scored on the same
# folds. shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise ValueError when a seed is given without
# shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)

for model in [
    DummyClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier,
    xgboost.XGBClassifier,
]:
    cls = model()  # default hyper-parameters for every candidate
    s = model_selection.cross_val_score(
        cls, X, y, scoring="roc_auc", cv=kfold
    )
    print(f"{model.__name__:22}  AUC: {s.mean():.3f} STD: {s.std():.2f}")

In [None]:
# To improve the model we can use stacking: it takes other models and uses their outputs to predict the target class.
>>> from mlxtend.classifier import (
...     StackingClassifier,
... )
# Base learners, each with default hyper-parameters.
clfs = [
    x()
    for x in [
        LogisticRegression,
        DecisionTreeClassifier,
        KNeighborsClassifier,
        GaussianNB,
        SVC,
        RandomForestClassifier,
    ]
]
# A logistic regression combines the base learners' outputs.
stack = StackingClassifier(
    classifiers=clfs,
    meta_classifier=LogisticRegression(),
)
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise ValueError when a seed is given without
# shuffling.
kfold = model_selection.KFold(
    n_splits=10, shuffle=True, random_state=42
)
s = model_selection.cross_val_score(
    stack, X, y, scoring="roc_auc", cv=kfold
)
print(
    f"{stack.__class__.__name__}  "
    f"AUC: {s.mean():.3f}  STD: {s.std():.2f}"
)

In [None]:
# Train a random forest classifier (100 trees, fixed seed) on the training
# rows; fit() returns the estimator, so it is chained onto the constructor.
rf = ensemble.RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
# Mean accuracy on the held-out rows.
rf.score(X_test, y_test)

In [None]:
# Precision of the random forest on the held-out rows.
metrics.precision_score(y_true=y_test, y_pred=rf.predict(X_test))

In [None]:
# Print the five features with the largest importance, highest first.
for col, val in sorted(
    zip(X_train.columns, rf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)[:5]:
    print(f"{col:10}{val:10.3f}")

In [None]:
# Tune the random forest's hyper-parameters with an exhaustive grid search.
rf4 = ensemble.RandomForestClassifier()
params = {
    # "sqrt" is what the former "auto" setting meant for classifiers; "auto"
    # is rejected by recent scikit-learn releases.
    "max_features": [0.4, "sqrt"],
    "n_estimators": [15, 200],
    "min_samples_leaf": [1, 0.1],
    "random_state": [42],
}
# n_jobs=-1 runs the search on all available cores.
cv = model_selection.GridSearchCV(
    rf4, params, n_jobs=-1
).fit(X_train, y_train)
print(cv.best_params_)

In [None]:
# Refit a forest with the selected hyper-parameters and report its mean
# accuracy on the held-out rows.
rf5 = ensemble.RandomForestClassifier(
    max_features=0.4,
    min_samples_leaf=1,
    n_estimators=200,
    random_state=42,
)
rf5.fit(X_train, y_train)
rf5.score(X_test, y_test)

In [None]:
# Getting confusion matrix 
>>> from sklearn.metrics import confusion_matrix
# Hard class predictions of the tuned forest, tabulated against the truth.
y_pred = rf5.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Human-readable names for the two target values.
mapping = {0: "died", 1: "survived"}
fig, ax = plt.subplots(figsize=(6, 6))
cm_viz = ConfusionMatrix(
    rf5, classes=["died", "survived"], label_encoder=mapping
)
# score() evaluates the already fitted model on the held-out rows.
cm_viz.score(X_test, y_test)
# NOTE(review): newer yellowbrick releases expose this as show() - confirm
# against the installed version before renaming.
cm_viz.poof()


In [None]:
# ROC AUC of the tuned forest.
# The metric ranks samples by a continuous score, so it is computed from the
# predicted probability of class 1. Passing hard 0/1 predictions collapses the
# ROC curve to a single threshold and misstates the AUC.
y_score = rf5.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_score)



In [None]:
# Plot the learning curve to see whether more data would help: a cross-validation score that is still rising suggests it would.
>>> import numpy as np
fig, ax = plt.subplots(figsize=(6, 4))
# 12 stratified folds; ten training-set sizes from 30% to 100% of the data.
cv = StratifiedKFold(n_splits=12)
sizes = np.linspace(0.3, 1.0, num=10)
lc_viz = LearningCurve(
    rf5, cv=cv, train_sizes=sizes, scoring="f1_weighted", n_jobs=4, ax=ax
)
lc_viz.fit(X, y)
# NOTE(review): newer yellowbrick releases expose this as show() - confirm
# against the installed version before renaming.
lc_viz.poof()

In [None]:
# Deploying the model: use Python's pickle to persist the model and load it back.
>>> import pickle
# Serialise the tuned forest to bytes and restore it. Only unpickle data from a
# trusted source: loading a pickle can execute arbitrary code.
pic = pickle.dumps(rf5)
rf6 = pickle.loads(pic)
# Check the restored model by its ROC AUC, computed from the predicted
# probability of class 1 (hard 0/1 predictions would misstate the AUC).
y_score = rf6.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_score)