In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **I am pleased that this notebook has received such a large number of views. Broken code cells have been removed.**

**Import required libraries**

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import (
    ensemble, 
    preprocessing, 
    tree,
)
from sklearn.metrics import (
auc,
confusion_matrix,
roc_auc_score,
roc_curve
)
from sklearn.model_selection import (
train_test_split,
)
from yellowbrick.classifier import (
ConfusionMatrix,
ROCAUC,
)
from yellowbrick.model_selection import (
LearningCurve,
)

**Read the Titanic dataset**

In [None]:
df = pd.read_csv ('../input/titanic/train.csv')
orig_df = df

df.head (10)

**Cleaning the data**

In [None]:
df.dtypes

In [None]:
import pandas_profiling
pandas_profiling.ProfileReport (df)

In [None]:
df.shape

In [None]:
df.describe ().iloc[:, :2]

In [None]:
df.isnull ().sum ()

In [None]:
df.isnull ().sum (axis = 1).loc[:10]

In [None]:
mask = df.isnull ().any (axis = 1)
mask.head ()  # rows

In [None]:
df [mask].Embarked.head ()

In [None]:
df.Sex.value_counts (dropna = False)

In [None]:
df.Embarked.value_counts (dropna = False)

In [None]:
df = df.drop (
     columns = [
         "Name",
         "Ticket",
         "Cabin"
])
df = pd.get_dummies (df)
df.columns

**Sample data**

In [None]:
from sklearn.model_selection import train_test_split
y = df.Survived
X = df.drop (columns = "Survived")

**Impute data**

In [None]:
# Split first so that the imputer only ever learns from the training rows.
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3, random_state = 42)
# Work on independent copies: the frames come out of a split of X, and pandas
# may warn (SettingWithCopyWarning) when columns of such frames are overwritten.
X_train = X_train.copy ()
X_test = X_test.copy ()

# IterativeImputer is experimental in scikit-learn, so the enabling import
# must run before `impute.IterativeImputer` can be used.
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
num_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

# Fit on the training split only, then apply the SAME fitted imputer to the
# test split so both splits are filled in by one consistent model.
imputer = impute.IterativeImputer ()
X_train [num_cols] = imputer.fit_transform (X_train [num_cols])
X_test [num_cols] = imputer.transform (X_test [num_cols])

**Normalize data**

In [None]:
# Standardize every feature: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split.
# Column names are taken from the frame itself, so they cannot pick up stray
# whitespace the way a hand-split ", "-separated string does.
cols = list (X_train.columns)
sca = preprocessing.StandardScaler ()
# Keep the original row index so the scaled features stay aligned with
# y_train / y_test (the scaler itself returns a bare NumPy array).
X_train = pd.DataFrame (sca.fit_transform (X_train), columns = cols, index = X_train.index)
X_test = pd.DataFrame (sca.transform (X_test), columns = cols, index = X_test.index)

**Refactor**

In [None]:
from sklearn.model_selection import train_test_split
def tweak_titanic (df):
    """Return a model-ready version of the raw Titanic frame.

    The "PassengerId", "Name", "Ticket" and "Cabin" columns are removed and
    the result is passed through ``pd.get_dummies`` with ``drop_first=True``.
    The input frame is not modified.
    """
    unused = ["PassengerId", "Name", "Ticket", "Cabin"]
    trimmed = df.drop (columns = unused)
    return pd.get_dummies (trimmed, drop_first = True)

def get_train_test_X_y (
df, y_col, size = 0.3, std_cols = None
):
    """Split, impute and optionally standardize a prepared Titanic frame.

    Parameters
    ----------
    df : DataFrame holding the features and the target column.
    y_col : name of the target column in ``df``.
    size : fraction of rows placed in the test split (default 0.3).
    std_cols : optional list of column names to standardize; when it is
        None or empty no standardization is applied.

    Returns
    -------
    (X_train, X_test, y_train, y_test), always, whether or not ``std_cols``
    was given.

    The imputer and the scaler are fitted on the training split only and
    then applied to the test split.
    """
    y = df [y_col]
    X = df.drop (columns = y_col)
    X_train, X_test, y_train, y_test = train_test_split (
        X, y, test_size = size, random_state = 42
    )
    # Independent copies: columns are overwritten below, and pandas may warn
    # (SettingWithCopyWarning) when that is done on frames derived from X.
    X_train = X_train.copy ()
    X_test = X_test.copy ()

    # Columns produced by tweak_titanic that are fed to the imputer.
    num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
                'Sex_male', 'Embarked_Q', 'Embarked_S']
    fi = impute.IterativeImputer ()
    X_train [num_cols] = fi.fit_transform (X_train [num_cols])
    X_test [num_cols] = fi.transform (X_test [num_cols])

    if std_cols:
        std = preprocessing.StandardScaler ()
        X_train [std_cols] = std.fit_transform (X_train [std_cols])
        X_test [std_cols] = std.transform (X_test [std_cols])

    # The return sits outside the `if` so callers that skip standardization
    # still receive the four splits instead of None.
    return X_train, X_test, y_train, y_test
    
ti_df = tweak_titanic (orig_df)
std_cols = "Pclass".split (",")
X_train, X_test, y_train, y_test = get_train_test_X_y (ti_df, "Survived", std_cols = std_cols)

**Baseline model**

In [None]:
from sklearn.dummy import DummyClassifier
bm = DummyClassifier ()
bm.fit (X_train, y_train)
bm.score (X_test, y_test) # accuracy

**K-fold cross-validation**

In [None]:
X = pd.concat ([X_train, X_test])
y = pd.concat ([y_train, y_test])
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

for model in [
    DummyClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier, 
    GaussianNB,
    SVC,
    RandomForestClassifier,
    ]:
    cls = model ()
    kfold = model_selection.KFold (n_splits=10, random_state=None)
    s = model_selection.cross_val_score (
    cls, X, y, scoring = 'roc_auc', cv = kfold)

    print (f"{model.__name__:22}   AUC: "
          f"{s.mean (): .3f} STD: {s.std(): .2f}")

**Stacking**

In [None]:
from mlxtend.classifier import StackingClassifier
clfs = [
    x()
    for x in [
        LogisticRegression,
        DecisionTreeClassifier,
        GaussianNB,
        SVC,
        RandomForestClassifier
    ]
]
stack = StackingClassifier (classifiers=clfs,
                          meta_classifier = LogisticRegression ())
kfold = model_selection.KFold (n_splits = 10, random_state = None)

s = model_selection.cross_val_score (stack, X, y, scoring = 'roc_auc', cv = kfold)

print (f"{stack.__class__.__name__}  "
      f"AUC: {s.mean():.3f}  STD: {s.std():.2f}")

**Create and evaluate Model**

In [None]:
# create model
rf = ensemble.RandomForestClassifier (n_estimators = 100, random_state = 42)
rf.fit (X_train, y_train)

# evaluate model
rf.score (X_test, y_test)

In [None]:
# metrics precision
# `sklearn.metrics` is imported here because no earlier cell brings the
# `metrics` name into scope; without it a fresh "Run All" stops with NameError.
from sklearn import metrics
metrics.precision_score (y_test, rf.predict (X_test))

In [None]:
# inspect feature performance
for col, val in sorted (zip (X_train.columns, rf.feature_importances_),
                       key = lambda x: x[1], reverse = True,
                       )[:5]:
    print (f"{col:10}{val:10.3f}")

**Optimize model**

In [None]:
# Small grid search over a random forest; every combination in `params` is
# evaluated with GridSearchCV's default cross-validation on the training split.
rf4 = ensemble.RandomForestClassifier ()
params = {
    # "sqrt" is what "auto" meant for classifiers; "auto" itself has been
    # deprecated and then removed in newer scikit-learn releases.
    "max_features": [0.4, "sqrt"],
    "n_estimators": [15, 200],
    "min_samples_leaf": [1, 0.1],
    "random_state": [42],
}
cv = model_selection.GridSearchCV (rf4, params, n_jobs=-1
                                  ).fit(X_train, y_train)
print (cv.best_params_)

In [None]:
# Random forest configured with hand-picked hyperparameters.
# max_features="sqrt" is the same setting that "auto" used to select for
# classifiers, and it keeps working on scikit-learn versions that dropped "auto".
rf5 = ensemble.RandomForestClassifier (
    max_features = "sqrt",
    min_samples_leaf = 0.1,
    n_estimators = 200,
    random_state = 42,
)
rf5.fit (X_train, y_train)

# accuracy on the held-out test split
rf5.score (X_test, y_test)

**Confusion matrix**

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = rf5.predict (X_test)
confusion_matrix (y_test, y_pred)

In [None]:
mapping = {0: "died", 1: "survived"}
fig, ax = plt.subplots (figsize = (6, 6))
cm_viz = ConfusionMatrix (rf5, classes = ["died", "survived"],
                         label_encoder = mapping)
cm_viz.score (X_test, y_test)
cm_viz.poof ()

**ROC Curve**

In [None]:
# ROC AUC measures how well the model RANKS positives above negatives, so it
# needs a continuous score. Hard 0/1 predictions collapse the curve to a
# single threshold; use the predicted probability of the positive class.
y_score = rf5.predict_proba (X_test) [:, 1]
roc_auc_score (y_test, y_score)

**Learning curve**

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

fig, ax = plt.subplots (figsize = (6, 4))
cv = StratifiedKFold (12)
sizes = np.linspace (0.3, 1.0, 10)
lc_viz = LearningCurve (
rf5, 
cv = cv,
train_sizes = sizes,
scoring = "f1_weighted",
n_jobs = 4,
ax = ax)

lc_viz.fit (X, y)
lc_viz.poof ()

**Deploying model**

In [None]:
import pickle
# Serialize the fitted model to bytes and immediately load it back, to show
# that a pickled copy can be used for prediction.
pic = pickle.dumps (rf5)
# NOTE: pickle.loads can execute arbitrary code; only unpickle data you trust.
# Here the bytes were produced one line above, so this is safe.
rf6 = pickle.loads (pic)
# Score the restored model using its hard class predictions on the test split.
y_pred = rf6.predict (X_test)
roc_auc_score (y_test, y_pred)

# **The model did not give the best possible result, so let's start over and carefully work through each step with the Titanic dataset**

**Examining missing data**

In [None]:
df.isnull ().mean () * 100

**Using missingno library**

In [None]:
import missingno as msno
ax = msno.matrix (df.sample (500))

**Bar plot**

In [None]:
fig, ax = plt.subplots (figsize = (6, 4))
(df.isnull().mean().abs().plot.bar (ax=ax))

**Missingno library bar plot**

In [None]:
ax = msno.bar (df.sample (500))

**Create a dendrogram**

In [None]:
ax = msno.dendrogram (df)

**Dropping missing data**

In [None]:
df1 = orig_df.dropna ()
df1 = orig_df.drop (columns = "Cabin")
df1 = orig_df.dropna (axis = 1)

**Imputing data**

In [None]:
from sklearn.impute import SimpleImputer 
num_cols = df.select_dtypes (include = "number").columns
im = SimpleImputer ()  # mean
imputed = im.fit_transform (df[num_cols])

**Adding indicator columns**

In [None]:
def add_indicator (col):
    """Build a callable, suitable for ``DataFrame.assign``, that flags missing values.

    The returned callable takes a DataFrame and returns an integer Series
    that is 1 where column ``col`` is missing and 0 elsewhere.
    """
    return lambda frame: frame [col].isna ().astype (int)

# The indicator is built from the "Age" column, so name the new column after
# it (the "Cabin" column was already dropped from df earlier in the notebook).
df1 = df.assign (age_missing = add_indicator ("Age"))

**Exploring data**

In [None]:
df.shape 

In [None]:
df.describe ().iloc [:, [0, -1]]

In [None]:
df.iloc [[1, 4], -3:]

In [None]:
df.loc [[101, 417], "Embarked_C":]

In [None]:
df.head (5)

**Histogram**

In [None]:
fig, ax = plt.subplots (figsize = (6, 4))
df.Fare.plot (kind ="hist", ax=ax)

In [None]:
import seaborn as sns
fig, ax = plt.subplots (figsize = (12, 8))
mask = y_train == 1
ax = sns.histplot (X_train [mask].Parch, label = 'Survived')
ax = sns.histplot (X_train [~mask].Parch, label = 'Died')
ax.set_xlim(-1.5, 1.5)
ax.legend ()

**Scatter plot**

In [None]:
fig, ax = plt.subplots (figsize= (6, 4))
df.plot.scatter (x = "Age", y = "Fare", ax = ax, alpha = 0.3)

In [None]:
df.Parch.corr (df.Age)

**Joint plot**

In [None]:
from yellowbrick.features import JointPlotVisualizer

fig, ax = plt.subplots (figsize = (6, 6))
jpv = JointPlotVisualizer (feature = "Pclass", target = "SibSp")
jpv.fit (df["Pclass"], df["SibSp"])
jpv.poof ()

In [None]:
# use seaborn library to create a joint plot
from seaborn import jointplot
new_df = df.copy ()
new_df["target"] = y
# jointplot creates its own figure, so no plt.subplots call is needed here
# (one would only leave an empty extra figure in the output).
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
p = jointplot (x = "Pclass", y = "Parch", data = new_df, kind = "reg")

**Pair grid**

In [None]:
from seaborn import pairplot
fig, ax = plt.subplots(figsize = (6, 6))
new_df = df.copy ()
new_df ["target"] = y
vars = ["Pclass", "Fare", "Age"]
p = pairplot (new_df, vars = vars, hue = "target", kind = "reg")

**Box and violin plots**

In [None]:
from seaborn import boxplot
fig, ax = plt.subplots (figsize = (8, 6))
new_df = df.copy ()
new_df ["target"] = y
boxplot (x = "target", y = "Age", data= new_df)

In [None]:
from seaborn import violinplot
fig, ax= plt.subplots (figsize = (8, 6))
new_df = df.copy ()
new_df ["target"] = y
violinplot (x = "target", y = "Age", data = new_df)

**Comparing two ordinal values**

In [None]:
fig, ax = plt.subplots (figsize = (8, 6))
(df.assign (age_bin = pd.qcut (df.Age, q = 10, labels = False),
          class_bin = pd.cut (df.Pclass, bins = 3, labels = False),
          ).pipe (lambda df: pd.crosstab (df.age_bin, df.class_bin))
 .pipe (lambda df: df.div (df.sum (1), axis = 0))
 .plot.bar (stacked = True,
           width = 1,
           ax = ax,
           cmap = 'viridis',
           )
 .legend (bbox_to_anchor = (1, 1)))

**Correlation**

In [None]:
from yellowbrick.features import Rank2D
fig, ax = plt.subplots (figsize = (6, 6))
pcv = Rank2D (features = X.columns, algorithm = "pearson")
pcv.fit (X, y)
pcv.transform (X)
pcv.poof ()

In [None]:
from seaborn import heatmap
fig, ax = plt.subplots (figsize = (8, 8))
ax = heatmap (
X.corr (),
fmt = ".2f",
annot = True,
ax = ax,
cmap = "RdBu_r",
vmin = -1,
vmax = 1)

In [None]:
df.corr ().iloc [:, :2]

In [None]:
def correlated_columns (df, threshold = 0.95):
    """List pairs of columns whose absolute Pearson correlation exceeds ``threshold``.

    Parameters
    ----------
    df : DataFrame whose columns are compared with ``df.corr()``.
    threshold : minimum absolute correlation for a pair to be reported
        (default 0.95).

    Returns
    -------
    DataFrame with columns ``level_0``, ``level_1`` (the two column names)
    and ``pearson`` (their correlation). Each pair appears once, and rows
    whose ``level_0`` also occurs somewhere in ``level_1`` are filtered out.
    """
    corr = df.corr ()
    # Keep only the strict lower triangle so each pair is seen once and the
    # diagonal (always 1.0) is ignored. The labels come from the correlation
    # matrix itself, which stays correct even if corr() left out a column.
    lower = pd.DataFrame (np.tril (corr, k = -1),
                          columns = corr.columns,
                          index = corr.index)
    pairs = lower.stack ().rename ("pearson")
    strong = pairs [pairs.abs () > threshold].reset_index ()
    return strong.query ("level_0 not in level_1")

correlated_columns (df)

**RadViz**

In [None]:
from yellowbrick.features import RadViz
fig, ax = plt.subplots (figsize = (6, 6))
rv = RadViz (classes = ["died", "survived"],
            features = X.columns)
rv.fit (X, y)
_ = rv.transform (X)
rv.poof ()

In [None]:
# pandas library can plot RadViz
from pandas.plotting import radviz
fig, ax = plt.subplots (figsize = (6, 6))
new_df = X.copy ()
new_df ["target"] = y
radviz (new_df, "target", ax = ax, colormap = "PiYG")

**Parallel coordinates**

In [None]:
from yellowbrick.features import ParallelCoordinates
fig, ax = plt.subplots (figsize = (6, 4))
pc = ParallelCoordinates (classes = ["died", "survived"],
                         features = X.columns)
pc.fit (X, y)
pc.transform (X)
ax.set_xticklabels (ax.get_xticklabels (), rotation = 45)
pc.poof ()

In [None]:
# pandas library can plot parallel coordinates
from pandas.plotting import parallel_coordinates
fig, ax = plt.subplots (figsize = (6, 4))
new_df = X.copy ()
new_df ["target"] = y
parallel_coordinates (new_df, "target", ax = ax, colormap = "viridis", alpha = 0.5)
ax.set_xticklabels (ax.get_xticklabels (), rotation = 45)

**Standardize data**

In [None]:
from sklearn import preprocessing
X2 = df
std = preprocessing.StandardScaler ()
std.fit_transform (X)

In [None]:
std.scale_

In [None]:
std.mean_

In [None]:
std.var_

In [None]:
X_std = (X2 - X2.mean ()) / X2.std ()
X_std

In [None]:
X_std.mean ()

In [None]:
X_std.std ()

**Scale to range**

In [None]:
from sklearn import preprocessing
mms = preprocessing.MinMaxScaler ()
mms.fit (X2)
mms.transform (X2)

In [None]:
# pandas preprocessing
(X2 - X2.min ()) / (X2.max () - X2.min ())

**Dummy variables**

In [None]:
pd.get_dummies (df, drop_first = True)

**Label encoder**

In [None]:
df1 = pd.read_csv ('../input/titanic/train.csv')
df1.head (5)

In [None]:
from sklearn import preprocessing
lab = preprocessing.LabelEncoder ()
lab.fit_transform (df1.Name)

In [None]:
lab.inverse_transform ([1, 1, 0])

In [None]:
df1.Name.astype ("category").cat.as_ordered ().cat.codes + 1


**Frequency encoding**

In [None]:
from collections import Counter
c = Counter ()
def triples (val):
    """Add every 3-character substring of ``val`` to the module-level Counter ``c``.

    Only full-length windows are counted; the shorter slices at the end of
    the string are skipped, and strings shorter than three characters
    contribute nothing. Returns None.
    """
    for i in range (len (val) - 2):
        c[val[i : i + 3]] += 1
df1.Name.apply (triples)
c.most_common (10)

In [None]:
# Pull the word immediately before a literal "." out of each name.
# A raw string keeps the backslash for the regex engine instead of relying
# on Python's deprecated handling of the invalid "\." string escape.
df1.Name.str.extract (r"([A-Za-z]+)\.", expand = False).head ()

In [None]:
# Frequency of each word found immediately before a literal "." in the names.
# Raw string: the regex needs the backslash, and "\." is not a valid Python
# string escape.
df1.Name.str.extract (r"([A-Za-z]+)\.", expand = False).value_counts ()

**Other categorical encoding**

In [None]:
import category_encoders as ce
he = ce.HashingEncoder (verbose = 1)
he.fit_transform (df1)

**Feature selection - Collinear columns**

In [None]:
cols_to_remove = [
    "Pclass",
    "SibSp",
    "Parch",
    "Embarked_Q"
]
rf3 = RandomForestClassifier (random_state = 42)
rf3.fit (X_train [[c for c in X_train.columns
                 if c not in cols_to_remove]], y_train)
rf3.score (X_test [[c for c in X_train.columns
                   if c not in cols_to_remove]], y_test)

In [None]:
rf4 = RandomForestClassifier (random_state = 42)
rf4.fit (X_train, y_train)
rf4.score (X_test, y_test)

**Recursive feature elimination**

In [None]:
from yellowbrick.features import RFECV
fig, ax = plt.subplots (figsize = (6, 4))
rfe = RFECV (ensemble.RandomForestClassifier (n_estimators = 100), cv = 5)
rfe.fit (X, y)

rfe.rfe_estimator_.ranking_

rfe.rfe_estimator_.n_features_

rfe.rfe_estimator_.support_

rfe.poof ()

**Mutual information between feature and target**

In [None]:
from sklearn import feature_selection
mic = feature_selection.mutual_info_classif (X, y)
fig, ax = plt.subplots (figsize = (10, 8))
(pd.DataFrame ({"feature":X.columns, "vimp": mic}).set_index ("feature").plot.barh (ax = ax))

**Upsampling minority**

In [None]:
from sklearn.utils import resample
mask = df.Survived == 1
surv_df = df [mask]
death_df = df[~mask]
df_usample = resample (surv_df, replace = True, n_samples = len (death_df), random_state = 42)
df2 = pd.concat ([death_df, df_usample])
df2.Survived.value_counts ()

**Downsampling majority**

In [None]:
from sklearn.utils import resample
mask = df.Survived == 1
surv_df = df [mask]
death_df = df[~mask]
df_downsample = resample (death_df, replace = False, n_samples = len (surv_df), random_state = 42)
df3 = pd.concat ([surv_df, df_downsample])
df3.Survived.value_counts ()

**Logistic regression**

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression (random_state = 42)
lr.fit (X_train, y_train)
lr.score (X_test, y_test)

In [None]:
lr.predict_proba (X.iloc [[1]])

In [None]:
lr.predict_log_proba (X.iloc [[2]])

In [None]:
lr.decision_function (X.iloc [[3]])

In [None]:
lr.intercept_

In [None]:
def inv_logit (p):
    """Map log-odds ``p`` to a probability with the logistic function.

    Accepts a scalar or a NumPy array and returns a value of the same shape.
    Written as 1 / (1 + exp(-p)) so that large positive inputs give 1.0
    instead of the nan produced by evaluating exp(p) / (1 + exp(p)).
    """
    return 1 / (1 + np.exp (-p))

inv_logit (lr.intercept_)

In [None]:
# inspect the coefficients

cols = X.columns
for col, val in sorted (zip (cols, lr.coef_[0]), key = lambda x: x[1], reverse = True):
    print (f"{col:10}{val:10.3f} {inv_logit (val):10.3f}")

In [None]:
# inspect coefficients with yellowbrick

from yellowbrick.model_selection import FeatureImportances
fig, ax = plt.subplots (figsize = (6, 4))
fi_viz = FeatureImportances (lr)
fi_viz.fit (X, y)
fi_viz.poof ()

**Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB ()
nb.fit (X_train, y_train)
nb.score (X_test, y_test)

In [None]:
nb.predict_proba (X.iloc [[1]])

In [None]:
nb.predict_log_proba (X.iloc [[2]])

**Support vector machine (SVM)**

In [None]:
# scikit-learn's SVM implementation

from sklearn.svm import SVC
svc = SVC (random_state = 42, probability = True)
svc.fit (X_train, y_train)
svc.score (X_test, y_test)

In [None]:
svc.predict_proba (X.iloc [[2]])

In [None]:
svc.predict_log_proba (X.iloc [[3]])

**K-Nearest Neighbor**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier ()
knc.fit (X_train, y_train)
knc.score (X_test, y_test)

In [None]:
knc.predict_proba (X.iloc [[1]])

**Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier (random_state = 42, max_depth = 3)
dt.fit (X_train, y_train)
dt.score (X_test, y_test)

In [None]:
dt.predict_proba (X.iloc [[1]])

In [None]:
dt.predict_proba (X.iloc [[2]])

In [None]:
dt.predict_log_proba (X.iloc [[3]])

In [None]:
# feature importance

for col, val in sorted (
    zip (X.columns, dt.feature_importances_),
    key = lambda x: x[1],reverse = True) [:5]:
    print (f"{col:10}{val:10.3f}") 

In [None]:
# feature importance

from yellowbrick.model_selection import FeatureImportances
fig, ax = plt.subplots (figsize = (6, 4))
fi_viz = FeatureImportances (dt)
fi_viz.fit (X, y)
fi_viz.poof ()

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier (random_state = 42)
rf.fit (X_train, y_train)

rf.score (X_test, y_test)

In [None]:
rf.predict_proba (X.iloc [[1]])

In [None]:
rf.predict_log_proba (X.iloc [[1]])

In [None]:
# feature importance
for col, val in sorted (zip (X.columns, rf.feature_importances_),
                       key = lambda x: x[1], reverse = True) [:5]:
    print (f"{col:10}{val:10.3f}")

**XGBoost**

In [None]:
import xgboost as xgb
xgb_class = xgb.XGBClassifier (random_state = 42, use_label_encoder=False) 
xgb_class.fit (X_train, y_train, early_stopping_rounds = 10, eval_set = [(X_test, y_test)])

xgb_class.score (X_test, y_test)

In [None]:
xgb_class.predict_proba (X.iloc [[1]])

In [None]:
# feature importance
for col, val in sorted (zip (X.columns, xgb_class.feature_importances_), key = lambda x: x[1],
                       reverse = True)[:5]:
    print (f"{col:10}{val:10.3f}")

In [None]:
fig, ax = plt.subplots (figsize = (6, 4))
xgb.plot_importance (xgb_class, ax = ax)

In [None]:
fig, ax = plt.subplots (figsize = (6, 4))
fi_viz = FeatureImportances (xgb_class)
fi_viz.fit (X, y)
fi_viz.poof ()

In [None]:
# text representation of the trees

booster = xgb_class.get_booster ()
print (booster.get_dump () [0])

In [None]:
# score from first tree leaf 7
1 / (1 + np.exp (- 1 * 0.1535))

In [None]:
# graphical version tree model

fig, ax = plt.subplots (figsize = (16, 12))
xgb.plot_tree (xgb_class, ax = ax, num_trees = 0)

**Gradient Boosted with LightGBM**

In [None]:
import lightgbm as lgb
lgbm_class =lgb.LGBMClassifier (random_state = 42)
lgbm_class.fit (X_train, y_train)

lgbm_class.score (X_test, y_test)

In [None]:
lgbm_class.predict_proba (X.iloc [[1]])

In [None]:
lgbm_class.predict_proba (X.iloc[[2]])

In [None]:
# feature importance based on 'splits'

for col, val in sorted (zip (cols, lgbm_class.feature_importances_),
                       key = lambda x: x[1], reverse = True)[:5]:
    print (f"{col:10}{val:10.3f}")

In [None]:
fig, ax = plt.subplots (figsize = (6, 4))
lgb.plot_importance (lgbm_class, ax = ax)
fig.tight_layout ()

In [None]:
fig, ax = plt.subplots (figsize = (16, 12))
lgb.plot_tree (lgbm_class, tree_index = 0, ax = ax)

**TPOT**

In [None]:
from tpot import TPOTClassifier
tc = TPOTClassifier (generations = 2)
tc.fit (X_train, y_train)
tc.score (X_test, y_test)

In [None]:
tc.predict_proba (X.iloc [[1]])

**Learning Curve**

In [None]:
from yellowbrick.model_selection import LearningCurve
fig, ax = plt.subplots (figsize = (6, 4))
lc3_viz = LearningCurve (RandomForestClassifier (n_estimators = 100), cv = 10)
lc3_viz.fit (X, y)
lc3_viz.poof ()

**Validation Curve**

In [None]:
from yellowbrick.model_selection import ValidationCurve
fig, ax = plt.subplots (figsize = (6, 4))
vc_viz = ValidationCurve (RandomForestClassifier (n_estimators = 100),
                         param_name = "max_depth", param_range = np.arange (1, 11),
                          cv = 10, n_jobs = -1)
vc_viz.fit (X, y)
vc_viz.poof()

**Confusion matrix**

In [None]:
y_predict = dt.predict (X_test)
tp = ((y_test == 1) & (y_test == y_predict)).sum () # 123
tn = ((y_test == 0) & (y_test == y_predict)).sum () # 199
fp = ((y_test == 0) & (y_test != y_predict)).sum () # 25
fn = ((y_test == 1) & (y_test != y_predict)).sum () # 46

In [None]:
from sklearn.metrics import confusion_matrix
y_predict = dt.predict (X_test)
pd.DataFrame (confusion_matrix (y_test, y_predict), columns = ["Predict died", "Predict Survive"],
             index = ["True Death", "True Survive"])

In [None]:
import matplotlib.pyplot as plt
from yellowbrick.classifier import ConfusionMatrix
mapping = {0: "died", 1: "survived"}
fig, ax = plt.subplots (figsize = (6, 6))
cm_viz = ConfusionMatrix (dt, classes = ["died", "survived"],
                         label_encoder = mapping)
cm_viz.score (X_test, y_test)
cm_viz.poof ()

**Metrics**

In [None]:
# accuracy 
(tp + tn) / (tp + tn + fp + fn)

In [None]:
# accuracy from sklearn
from sklearn.metrics import accuracy_score
y_predict = dt.predict (X_test)
accuracy_score (y_test, y_predict)

In [None]:
# recall
tp / (tp + fn)

In [None]:
# recall from sklearn
from sklearn.metrics import recall_score
y_predict = dt.predict (X_test)
recall_score (y_test, y_predict)

In [None]:
# precision
tp / (tp + fp)

In [None]:
# precision from sklearn
from sklearn.metrics import precision_score
y_predict = dt.predict (X_test)
precision_score (y_test, y_predict)

In [None]:
# F1
pre = tp / (tp + fp)
rec = tp / (tp + fn)
(2 * pre * rec) / (pre + rec)

In [None]:
# F1 from sklearn 
from sklearn.metrics import f1_score
y_predict = dt.predict (X_test)
f1_score (y_test, y_predict)

**Classification report**

In [None]:
import matplotlib.pyplot as plt
from yellowbrick.classifier import ClassificationReport
fig, ax = plt.subplots (figsize = (6, 3))
cm_viz = ClassificationReport (dt, classes = ["Died", "Survived"],
                              label_encoder = mapping)
cm_viz.score (X_test, y_test)
cm_viz.poof ()

**Precision-Recall curve**

In [None]:
from sklearn.metrics import average_precision_score
y_predict = dt.predict (X_test)
average_precision_score (y_test, y_predict)

In [None]:
from yellowbrick.classifier import PrecisionRecallCurve
fig, ax = plt.subplots (figsize = (6, 4))
viz = PrecisionRecallCurve (DecisionTreeClassifier (max_depth = 3))
viz.fit (X_train, y_train)
print (viz.score (X_test, y_test))
viz.poof ()

**Cumulative Gains Plot**

In [None]:
import scikitplot as skplt
fig, ax = plt.subplots (figsize = (6, 6))
y_probas = dt.predict_proba (X_test)
skplt.metrics.plot_cumulative_gain (y_test, y_probas, ax = ax)

**Lift Curve**

In [None]:
fig, ax = plt.subplots (figsize = (6, 6))
y_probas = dt.predict_proba (X_test)
skplt.metrics.plot_lift_curve (y_test, y_probas, ax = ax)

**Class Balance**

In [None]:
from yellowbrick.classifier import ClassBalance
fig, ax = plt.subplots (figsize = (6, 6))
cb_viz = ClassBalance (labels = ["Died", "Survived"])
cb_viz.fit (y_test)
cb_viz.poof ()

**Class Prediction error**

In [None]:
from yellowbrick.classifier import ClassPredictionError
fig, ax = plt.subplots (figsize = (6, 3))
cpe_viz = ClassPredictionError (dt, classes = ["Died", "Survived"])
cpe_viz.score (X_test, y_test)
cpe_viz.poof ()

**Discrimination Threshold**

In [None]:
from yellowbrick.classifier import DiscriminationThreshold
fig, ax = plt.subplots (figsize = (6, 5))
dt_viz = DiscriminationThreshold (dt)
dt_viz.fit (X, y)
dt_viz.poof ()

**Regression Coefficients**

In [None]:
dt = DecisionTreeClassifier (random_state = 42, max_depth = 3)
dt.fit (X_train, y_train)

In [None]:
# LIME
from lime import lime_tabular
explainer = lime_tabular.LimeTabularExplainer (X_train.values, feature_names = X.columns,
                                              class_names = ["Died", "Survived"])
exp = explainer.explain_instance (X_train.iloc [-1].values, dt.predict_proba)
fig = exp.as_pyplot_figure ()
fig.tight_layout ()

**Partial dependence plots**

In [None]:
# Re-create the tuned random forest used for the partial dependence plots.
# max_features="sqrt" is the same setting that "auto" used to select for
# classifiers, and it keeps working on scikit-learn versions that dropped "auto".
rf5 = ensemble.RandomForestClassifier (max_features = "sqrt",
                                       min_samples_leaf = 0.1,
                                       n_estimators = 200,
                                       random_state = 42)
rf5.fit (X_train, y_train)

from pdpbox import pdp
feat_name = "Age"
p = pdp.pdp_isolate (rf5, X, X.columns, feat_name)
fig, _ = pdp.pdp_plot (p, feat_name, plot_lines = True)

In [None]:
features = ["Fare", "Sex_male"]
p = pdp.pdp_interact (rf5, X, X.columns, features)
fig, _ = pdp.pdp_interact_plot (p, features)

**Surrogate models**

In [None]:
from sklearn import svm
sv = svm.SVC ()
sv.fit (X_train, y_train)
sur_dt = tree.DecisionTreeClassifier ()
sur_dt.fit (X_test, sv.predict (X_test))
for col, val in sorted (zip (X_test.columns,
                            sur_dt.feature_importances_),
                       key = lambda x: x[1], reverse = True)[:7]:
    print (f"{col:10}{val:10.3f}")

**Regression**

In [None]:
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import train_test_split

In [None]:
# load the Boston housing dataset
import pandas as pd
import numpy as np

b = pd.read_csv ('../input/the-boston-houseprice-data/boston.csv')

# create a split version for training and testing data
# The target column must not also be one of the features: leaving CRIM in
# bos_X lets every model read the answer straight from its input.
bos_y = b['CRIM']
bos_X = b.drop (columns = 'CRIM')

bos_X_train, bos_X_test, bos_y_train, bos_y_test = train_test_split (bos_X, bos_y, test_size = 0.3, random_state = 42)

# Standardized copy of the features, split with the same seed and test size
# so its rows correspond to the unscaled split above.
bos_sX = preprocessing.StandardScaler ().fit_transform (bos_X)
bos_sX_train, bos_sX_test, bos_sy_train, bos_sy_test = train_test_split (
bos_sX, bos_y, test_size = 0.3, random_state = 42)

**Baseline model**

In [None]:
from sklearn.dummy import DummyRegressor
dr = DummyRegressor ()
dr.fit (bos_X_train, bos_y_train)
dr.score (bos_X_test, bos_y_test)

**Linear regression**

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression ()
lr.fit (bos_X_train, bos_y_train)
lr.score (bos_X_test, bos_y_test)

In [None]:
lr.coef_

In [None]:
lr2 = LinearRegression ()
lr2.fit (bos_sX_train, bos_sy_train)
lr2.score (bos_sX_test, bos_sy_test)

In [None]:
lr2.intercept_

In [None]:
lr2.coef_

**K-Nearest Neighbor**

In [None]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor ()
knr.fit (bos_sX_train, bos_sy_train)
knr.score (bos_sX_test, bos_sy_test)

**Decision tree**

In [None]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor (random_state = 42)
dtr.fit (bos_X_train, bos_y_train)
dtr.score (bos_X_test, bos_y_test)

In [None]:
# feature importance

for col, val in sorted (zip (bos_X.columns, dtr.feature_importances_),
                       key = lambda x: x[1], reverse = True)[:5]:
    print (f"{col:10}{val:10.3f}")

**Random forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor (random_state = 42, n_estimators = 100)
rfr.fit (bos_X_train, bos_y_train)
rfr.score (bos_X_test, bos_y_test)

In [None]:
# feature importance
for col, val in sorted (zip (bos_X.columns, rfr.feature_importances_),
                       key = lambda x: x[1], reverse = True)[:5]:
    print (f"{col:10}{val:10.3f}")

**LightGBM Regression**

In [None]:
import lightgbm as lgb
lgr = lgb.LGBMRegressor (random_state = 42)
lgr.fit (bos_X_train, bos_y_train)
lgr.score (bos_X_test, bos_y_test)

In [None]:
lgr.predict (bos_X.iloc [[0]])

In [None]:
# feature importances
for col, val in sorted (zip (bos_X.columns, lgr.feature_importances_),
                       key = lambda x: x[1], reverse = True)[:5]:
    print (f"{col:10}{val:10.3f}")

In [None]:
fig, ax= plt.subplots (figsize = (16, 14))
lgb.plot_importance (lgr, ax = ax)
fig.tight_layout ()

**Metrics**

In [None]:
rfr = RandomForestRegressor (random_state = 42, n_estimators = 100)
rfr.fit (bos_X_train, bos_y_train)

from sklearn import metrics
rfr.score (bos_X_test, bos_y_test)

**Residuals plot**

In [None]:
from yellowbrick.regressor import ResidualsPlot
fig, ax = plt.subplots (figsize = (12, 8))
rpv = ResidualsPlot (rfr)
rpv.fit (bos_X_train, bos_y_train)
rpv.score (bos_X_test, bos_y_test)
rpv.poof ()

**Normal Residuals**

In [None]:
fig, ax = plt.subplots (figsize = (6, 4))
resids = bos_y_test - rfr.predict (bos_X_test)
pd.Series(resids, name = "residuals").plot.hist (bins = 20, ax = ax, title = "Residual Histogram")

In [None]:
# show the probability plot
from scipy import stats
fig, ax = plt.subplots (figsize = (6, 4))
# probplot draws the quantiles of the residuals against the quantiles of a
# normal distribution on `ax`; the returned values are not needed here.
_ = stats.probplot (resids, plot = ax)

In [None]:
stats.kstest (resids, cdf = "norm")

**Prediction error plot**

In [None]:
from yellowbrick.regressor import PredictionError
fig, ax = plt.subplots (figsize = (6, 6))
pev = PredictionError (rfr)
pev.fit (bos_X_train, bos_y_train)
pev.score (bos_X_test, bos_y_test)
pev.poof ()

**Dimensionality reduction**

In [None]:
ti_df = tweak_titanic (orig_df)
std_cols = "Pclass".split (",")
X_train, X_test, y_train, y_test = get_train_test_X_y (ti_df, "Survived", std_cols = std_cols)
X = pd.concat ([X_train, X_test])
y = pd.concat ([y_train, y_test])

**Principal component analysis (PCA)**

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
pca = PCA (random_state = 42)
X_pca = pca.fit_transform (StandardScaler ().fit_transform (X))
pca.explained_variance_ratio_

In [None]:
pca.components_[0]

In [None]:
fig, ax = plt.subplots (figsize = (12, 8))
ax.plot (pca.explained_variance_ratio_)
ax.set (xlabel = "Component", ylabel = "Percent of Explained variance",
       title = "Scree Plot", ylim = (0, 1))