In [None]:
import numpy as np 
import pandas as pd 

import shap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler

Hi Kagglers
Finally we have some meaningful features to do feature engineering on, so in this notebook I just want to play with those features and have some fun.

pclass - Passenger Ticket class : Class 1, 2 and 3.

Name - Name of the passenger

sex - Sex of the Passenger

Age - Age in years of the Passenger

sibsp - Number of siblings / spouses aboard the Titanic

parch - Number of parents / children aboard the Titanic

Ticket - Ticket number

Fare - Passenger fare

Cabin - Cabin number

Embarked - Port of Embarkation shows the port from which the passenger boarded the titanic

       C - Cherbourg
       Q - Queenstown
       S - Southampton

In [None]:
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")
sample = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv")

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
import missingno as no

In [None]:
no.matrix(train_df)

In [None]:
no.matrix(test_df)

In [None]:
100 * train_df.isnull().sum()/ len(train_df)

In [None]:
100 * test_df.isnull().sum()/ len(test_df)

## Fill NaN values first

**Age column**

I am going to fill NaN values with the median age of the passenger's sex.

In [None]:
train_df.drop("PassengerId", axis=1).groupby("Sex").median()["Age"]

In [None]:
def fill_age(df, reference_df=None):
    """
    Impute missing ``Age`` values with the median age of the passenger's
    sex, then truncate every age to an integer.

    ``df`` is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Sex`` and ``Age`` columns.
    reference_df : pandas.DataFrame, optional
        Frame the per-sex medians are computed from. When omitted, the
        notebook-level ``train_df`` is used, so train and test are filled
        with the same statistics.
    """
    if reference_df is None:
        reference_df = train_df

    # Select "Age" before aggregating so that text columns (Name, Ticket,
    # Cabin, ...) never reach median().
    age_median_per_sex = reference_df.groupby("Sex")["Age"].median()

    # Look the median up by the row's own "Sex" label instead of relying on
    # the positional order of the groupby result.
    fill_values = df["Sex"].map(age_median_per_sex)
    # Rows whose sex is missing or absent from the reference frame fall back
    # to the overall median age.
    fill_values = fill_values.fillna(reference_df["Age"].median())

    df["Age"] = df["Age"].fillna(fill_values).astype(int)

fill_age(train_df)
fill_age(test_df)

In [None]:
# Explicit decade edges, closed on the left: [0, 10), [10, 20), ... so each
# label describes the ages it actually contains and train and test share the
# same bins. Asking pd.cut for "9 bins" would derive equal-width edges from
# each frame's own min/max instead.
# The last edge is open-ended so an age of 90 or more lands in the final bin
# rather than becoming NaN.
age_bin_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, np.inf]
age_bin_labels = ["0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89"]

train_df["Age_interval"] = pd.cut(train_df["Age"], bins=age_bin_edges,
                                  labels=age_bin_labels, right=False)
test_df["Age_interval"] = pd.cut(test_df["Age"], bins=age_bin_edges,
                                 labels=age_bin_labels, right=False)

In [None]:
def plot_barplot(df, col):
    """Show a plotly bar chart of how often each value of ``col`` occurs in ``df``."""
    counts = df[col].value_counts()

    bars = go.Bar(x=counts.index, y=counts.values)
    fig = go.Figure(data=[bars])
    fig.update_traces(
        marker_color=px.colors.sequential.Greens,
        marker_line_color="rgb(8, 48, 107)",
        marker_line_width=1.5,
        opacity=0.6,
    )
    fig.update_layout(title_text=f"{col}")
    fig.show()

In [None]:
plot_barplot(train_df, "Age")

**Cabin**

In [None]:
def cabin_feat(df, col):
    """
    Replace the cabin column ``col`` with two derived features.

    Missing cabins become the string "None". ``has_<col>`` is 1 when a cabin
    was recorded and 0 otherwise; ``Deck`` is the first character of the
    cabin string (so "N" for a missing cabin). ``col`` itself is dropped.
    ``df`` is modified in place and also returned.
    """
    placeholder = "None"
    cabins = df[col].fillna(placeholder)
    df[col] = cabins

    df[f"has_{col}"] = cabins.map(lambda cabin: 0 if cabin == placeholder else 1)
    df["Deck"] = cabins.map(lambda cabin: cabin[0])

    df.drop(columns=col, inplace=True)
    return df

In [None]:
train_df = cabin_feat(train_df, "Cabin")
test_df = cabin_feat(test_df, "Cabin")

In [None]:
plot_barplot(train_df, "has_Cabin")

**Ticket**

In [None]:
train_df.head()

In [None]:
def _ticket_prefix(raw_ticket):
    """Return the first token of a multi-token ticket, or "N" when there is only one token."""
    tokens = str(raw_ticket).split()
    return tokens[0] if len(tokens) > 1 else "N"


# Same transformation for both frames: missing tickets are treated as "N".
for frame in (train_df, test_df):
    frame["Ticket"] = frame["Ticket"].fillna("N").map(_ticket_prefix)

In [None]:
train_df['Ticket'].unique()

**Name**

In [None]:
train_df["Name"].apply(lambda x: x.split(",")[0]).value_counts()

I wonder which names appeared in the original Titanic dataset, and how often. Does anyone remember?

In [None]:
# The last name is everything before the first comma in "Name"; once it is
# extracted the full name is no longer needed.
for frame in (train_df, test_df):
    frame["last_name"] = frame["Name"].map(lambda full_name: full_name.partition(",")[0])
    frame.drop(columns="Name", inplace=True)

In [None]:
train_df.head()

In [None]:
#last_name = train_df["last_name"].value_counts()
#last_name = last_name[last_name.values > 100]

#train_df["last_name"] = train_df["last_name"].apply(lambda x: "others" if x not in last_name.index else x)
#test_df["last_name"] = test_df["last_name"].apply(lambda x: "others" if x not in last_name.index else x)

I am going to drop the "Name" column and keep the last_name column for label encoding. I want to check whether this feature engineering has any effect on our results by training a model with this feature and evaluating it.

In [None]:
from sklearn.preprocessing import LabelEncoder
col_names = ["last_name", "Ticket"]
for col in col_names:
    # Fit on the values of both frames so every label in the test set has a code.
    seen_values = list(train_df[col].values) + list(test_df[col].values)
    le = LabelEncoder().fit(seen_values)
    for frame in (train_df, test_df):
        frame[col] = le.transform(frame[col].values)

**SibSp and Parch**

We could create a family column divided into e.g. categories like (no family, small, medium, large)

In [None]:
plot_barplot(train_df, "SibSp")

In [None]:
plot_barplot(train_df, "Parch")

In [None]:
train_df["family"] = train_df["SibSp"] + train_df["Parch"]
test_df["family"] = test_df["SibSp"] + test_df["Parch"]

In [None]:
train_df["family"]

In [None]:
def family_size(x):
    """
    Bucket a relative count into a family-size label.

    0 -> "alone", 1..5 -> "small", 6..10 -> "medium"; every other value
    falls through to "large".
    """
    if x == 0:
        return "alone"
    for upper_bound, label in ((5, "small"), (10, "medium")):
        if 0 < x <= upper_bound:
            return label
    return "large"

In [None]:
train_df["family"] = train_df["family"].apply(family_size)
test_df["family"] = test_df["family"].apply(family_size)

In [None]:
plot_barplot(train_df, "family")

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
family_df = pd.DataFrame({"family": train_df["family"].values})
family_df_test = pd.DataFrame({"family": test_df["family"].values})

In [None]:
# Spell out the size order. Left to itself the encoder sorts the labels
# alphabetically (alone < large < medium < small), which is not an ordinal scale.
family_order = ["alone", "small", "medium", "large"]
ordinal_enc = OrdinalEncoder(categories=[family_order])
train_df["family"] = ordinal_enc.fit_transform(family_df).reshape(-1)
test_df["family"] = ordinal_enc.transform(family_df_test).reshape(-1)

In [None]:
train_df.head()

**Pclass**

In [None]:
plot_barplot(train_df, "Pclass")

**Fare**

In [None]:
print(f" Missing values in Fare column = {train_df.Fare.isna().sum()}")

In [None]:
train_df["Fare"] = train_df["Fare"].fillna(value=train_df["Fare"].median())
test_df["Fare"] = test_df["Fare"].fillna(value=train_df["Fare"].median())

**Embarked**

Fill missing values with column mode.

In [None]:
train_df['Embarked'].mode()[0]

In [None]:
train_df['Embarked'].value_counts(dropna=False)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to write through to the frame.
# Both frames use the training mode, matching how "Fare" is filled from the
# training median.
embarked_mode = train_df["Embarked"].mode()[0]
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_mode)
test_df["Embarked"] = test_df["Embarked"].fillna(embarked_mode)

**Dummy variables**

In [None]:
train2 = train_df.drop("PassengerId", axis=1).copy()

In [None]:
test2 = test_df.drop("PassengerId", axis=1).copy()

In [None]:
dummy_cols = ["Sex", "Age", "Embarked", "Deck","Age_interval"] 

In [None]:
def convert_columns(df, cols):
    """
    Return a copy of ``df`` in which ``cols`` are replaced by dummy variables.

    ``pd.get_dummies`` is applied to ``df[cols]`` with ``drop_first=True``;
    numeric columns listed in ``cols`` pass through unchanged. The
    ``PassengerId`` column is removed from the result when present.

    The input frame is left untouched, so the function can be called again
    on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    cols : list of str
        Columns to hand to ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the encoded columns.
    """
    dummies_df = pd.get_dummies(df[cols], drop_first=True)
    # drop() without inplace returns a new frame; the caller's df keeps `cols`.
    remaining_df = df.drop(columns=cols)
    new_df = pd.concat([remaining_df, dummies_df], axis=1)
    return new_df.drop(columns="PassengerId", errors="ignore")

In [None]:
new_train = convert_columns(train_df, dummy_cols)
new_test = convert_columns(test_df, dummy_cols)

# The two frames are encoded independently, so a category that occurs in only
# one of them would produce a column the other lacks. Give the test frame
# exactly the training feature columns, in the same order; a dummy column that
# is absent from the test data is all zeros.
feature_cols = new_train.columns.drop("Survived")
new_test = new_test.reindex(columns=feature_cols, fill_value=0)

In [None]:
new_train.head()

**Heatmap**

In [None]:
corr_map = new_train.corr(method="spearman")

# Non-zero entries are hidden: mask the upper triangle (diagonal included) so
# each pair of columns is shown once.
mask = np.triu(np.ones(corr_map.shape))

heatmap_options = dict(
    mask=mask,
    annot=True,
    linewidth=1,
    linecolor="w",
    cbar=False,
    cmap="coolwarm",
)
plt.figure(figsize=(20, 10))
sns.heatmap(corr_map, **heatmap_options)

We can see that newly created column has_Cabin is perfectly correlated with Deck_N column, therefore I will drop has_Cabin column. 

In [None]:
new_train = new_train.drop("has_Cabin", axis=1)
new_test = new_test.drop("has_Cabin", axis=1)

**Columns distribution**

In [None]:
cols = new_train.drop("Survived", axis=1).columns

In [None]:
# Size the grid from the number of feature columns: zip() stops at the shorter
# sequence, so a fixed grid silently leaves out every column beyond its size.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(16, 20 * n_grid_rows / 6), squeeze=False)
flat_axes = axes.flatten()

# Train distribution in the default colour, test distribution overlaid in red.
for col, ax in zip(cols, flat_axes):
    sns.histplot(x=new_train[col], ax=ax)
    sns.histplot(x=new_test[col], ax=ax, color="red", alpha=0.4)

# Hide the panels left over in the last row.
for ax in flat_axes[len(cols):]:
    ax.set_visible(False)

# One layout pass after all panels are drawn.
plt.tight_layout()

## Creating folds with StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)
new_train["kfold"] = -1

# Fixed seed so the shuffle, and therefore the fold assignment, is the same on
# every run. Frames of equal length shuffled with the same seed end up in the
# same row order.
new_train = new_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Record, for every row, the fold in which it serves as validation data.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X=new_train, y=new_train["Survived"])):
    new_train.loc[valid_idx, "kfold"] = fold

In [None]:
def run_training(algo, df, test_df, fold, oof):
    """
    Fit ``algo`` on every fold except ``fold`` and score it on ``fold``.

    Features are standardised with a scaler fitted on the training part of
    the split; the same scaler is applied to the validation rows and to
    ``test_df``.

    Parameters
    ----------
    algo : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``.
    df : pandas.DataFrame
        Training frame with ``Survived`` and ``kfold`` columns; all other
        columns are used as features.
    test_df : pandas.DataFrame
        Feature frame to predict on. The caller's frame is not modified.
    fold : int
        Value of ``kfold`` that marks the validation rows.
    oof : numpy.ndarray
        Array with one entry per row of ``df``. The entries of the
        validation rows are set to this fold's accuracy, so the mean of the
        array after all folds is the overall out-of-fold accuracy.

    Returns
    -------
    tuple
        ``(oof, sub_proba, algo, train_proba)``: the updated ``oof`` array,
        class-1 probabilities for ``test_df``, the fitted estimator, and
        class-1 probabilities for the validation rows in their row order
        within ``df``.
    """
    # Imported here so the function does not depend on a later cell having run.
    from sklearn.metrics import accuracy_score

    # Derive the validation rows from `df` itself rather than from a
    # module-level index left behind by the fold-assignment loop.
    valid_mask = (df["kfold"] == fold).to_numpy()
    fold_train = df[~valid_mask].reset_index(drop=True)
    fold_valid = df[valid_mask].reset_index(drop=True)

    xtrain = fold_train.drop(["Survived", "kfold"], axis=1)
    xvalid = fold_valid.drop(["Survived", "kfold"], axis=1)

    sc = StandardScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)
    test_scaled = sc.transform(test_df)

    ytrain = fold_train["Survived"].values
    yvalid = fold_valid["Survived"].values

    algo.fit(xtrain, ytrain)
    preds = algo.predict(xvalid)
    sub_proba = algo.predict_proba(test_scaled)[:, 1]
    train_proba = algo.predict_proba(xvalid)[:, 1]

    fold_acc = accuracy_score(yvalid, preds)

    print(f"fold={fold+1}, accuracy={fold_acc}")
    oof[valid_mask] = fold_acc

    return oof, sub_proba, algo, train_proba

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

rfc = RandomForestClassifier(n_estimators=150)

# level2_df collects out-of-fold training predictions, df_proba the averaged
# test predictions; both get one column per model.
level2_df = pd.DataFrame()
df_proba = pd.DataFrame()

test_proba = np.zeros(len(new_test))
oof = np.zeros(len(new_train))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of new_train.
oof_proba = np.zeros(len(new_train))
for fold in range(10):
    oof, proba, rfc_model, tt_pred = run_training(rfc, new_train, new_test, fold, oof)
    test_proba += proba
    oof_proba[(new_train["kfold"] == fold).to_numpy()] = tt_pred

level2_df["randomforest"] = oof_proba
df_proba["randomforest"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

In [None]:
#from sklearn.neighbors import KNeighborsClassifier
#knn = KNeighborsClassifier(n_jobs=-1)

#test_proba = np.zeros(len(new_test))
#oof = np.zeros(len(new_train))
#for fold in range(5):
    #oof, proba = run_training(knn,new_train, new_test, fold, oof)
    #test_proba += proba
    
#df_proba["knn"] = test_proba / 5
#print(f"Mean accuracy after 5 folds {np.mean(oof)}")

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False)

test_proba = np.zeros(len(new_test))
oof = np.zeros(len(new_train))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of new_train.
oof_proba = np.zeros(len(new_train))
for fold in range(10):
    oof, proba, xgb_model, tt_pred = run_training(xgb, new_train, new_test, fold, oof)
    test_proba += proba
    oof_proba[(new_train["kfold"] == fold).to_numpy()] = tt_pred

level2_df["xgboost"] = oof_proba
df_proba["xgboost"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

In [None]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()

test_proba = np.zeros(len(new_test))
oof = np.zeros(len(new_train))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of new_train.
oof_proba = np.zeros(len(new_train))
for fold in range(10):
    oof, proba, lgbm_model, tt_pred = run_training(lgbm, new_train, new_test, fold, oof)
    test_proba += proba
    oof_proba[(new_train["kfold"] == fold).to_numpy()] = tt_pred

level2_df["lgbm"] = oof_proba
df_proba["lgbm"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

In [None]:
#df_proba["sum"] = df_proba.sum(axis=1) / 4
#df_proba["binary"] = np.where(df_proba["sum"] > 0.5, 1, 0)

In [None]:
# The weights add up to 0.9, so the raw weighted sum tops out at 0.9 and a 0.5
# cut-off on it is stricter than intended. Dividing by the total weight turns
# the blend into a proper weighted average on the probability scale.
blend_weights = {"randomforest": 0.2, "xgboost": 0.2, "lgbm": 0.5}
weighted_sum = sum(weight * df_proba[model] for model, weight in blend_weights.items())
df_proba["wavg"] = weighted_sum / sum(blend_weights.values())
df_proba["binary_wavg"] = np.where(df_proba["wavg"] > 0.5, 1, 0)

In [None]:
df_proba.head()

## Ensemble submission

In [None]:
submission = sample.copy()

submission["Survived"] = df_proba["binary_wavg"].values
submission.to_csv("ensemble_sub_avg.csv",index=False)

## LightGBM hyperparameter optimization with Optuna

In [None]:
import optuna
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from optuna.pruners import SuccessiveHalvingPruner

In [None]:
X = new_train.drop(["Survived", "kfold"], axis=1)
y = new_train["Survived"].values

In [None]:
def objective(trial):
    """
    Optuna objective: hold-out accuracy of a LightGBM model.

    Splits the notebook-level ``X`` / ``y`` 80/20 with a fixed seed,
    standardises the features, trains LightGBM with the hyperparameters
    suggested by ``trial`` and returns the accuracy of the rounded
    predictions on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the held-out 20 %.
    """
    # Imported locally under its own alias: elsewhere in the notebook the name
    # `lgbm` is rebound to LGBMClassifier instances.
    import lightgbm as lgb

    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=45)
    sc = StandardScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)

    dtrain = lgb.Dataset(xtrain, label=ytrain)

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        # Sampled under its own name; reusing "lambda_l1" would tie both
        # regularisation strengths to a single sampled value.
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100)
    }

    gbm = lgb.train(params, dtrain)
    preds = gbm.predict(xvalid)
    # predict() yields probabilities; round them to 0/1 labels.
    pred_labels = np.rint(preds)
    return accuracy_score(yvalid, pred_labels)

study = optuna.create_study(direction="maximize", pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=100)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
study.best_value

In [None]:
lgbm_final = LGBMClassifier(**study.best_params)

test_proba = np.zeros(len(new_test))
oof = np.zeros(len(new_train))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of new_train.
oof_proba = np.zeros(len(new_train))
for fold in range(10):
    oof, proba, lgbm_optuna, tt_pred = run_training(lgbm_final, new_train, new_test, fold, oof)
    test_proba += proba
    oof_proba[(new_train["kfold"] == fold).to_numpy()] = tt_pred

level2_df["lgbm_optuna_10fold"] = oof_proba
# Ten folds were summed, so average over ten; dividing by 5 would double
# every probability.
df_proba["lgbm_optuna_10fold"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

## Submission

In [None]:
submission = sample.copy()

sc = StandardScaler()
Xtrain_full = sc.fit_transform(X)
Xtest = sc.transform(new_test)

lgbm_model = LGBMClassifier(**study.best_params)
lgbm_model.fit(Xtrain_full, y)

sub_preds = lgbm_model.predict(Xtest)
submission["Survived"] = sub_preds
submission.to_csv("lgbm_optuna10fold.csv",index=False)

## Different approach

In [None]:
train2.head()

In [None]:
# Select "Fare" before aggregating so that text and categorical columns never
# reach mean().
price_per_deck = train2.groupby("Deck")["Fare"].mean()
train2["fare_mean_per_deck"] = train2["Deck"].map(price_per_deck.to_dict())
# A deck that never occurs in the training data has no mean to look up; use
# the overall training mean fare for it instead of leaving NaN.
test2["fare_mean_per_deck"] = (
    test2["Deck"].map(price_per_deck.to_dict()).fillna(train2["Fare"].mean())
)

In [None]:
cols = ["Sex", "Embarked", "Deck", "Age_interval"]
le = LabelEncoder()
for col in cols:
    le.fit(train2[col].values.tolist() + test2[col].values.tolist())
    train2[col] = le.transform(train2[col].values)
    test2[col] = le.transform(test2[col].values)

In [None]:
feature_names = train2.drop("Survived", axis=1).columns

fig, axes = plt.subplots(5, 3, figsize=(16, 15))
flat_axes = axes.flatten()

# Train distribution in the default colour, test distribution overlaid in red.
for col, ax in zip(feature_names, flat_axes):
    sns.histplot(x=train2[col], ax=ax)
    sns.histplot(x=test2[col], ax=ax, color="red", alpha=0.4)

# Hide any panel that received no column.
for ax in flat_axes[len(feature_names):]:
    ax.set_visible(False)

# One layout pass after all panels are drawn.
plt.tight_layout()

In [None]:
skf = StratifiedKFold(n_splits=10)
train2["kfold"] = -1

# Fixed seed so the shuffle, and therefore the fold assignment, is the same on
# every run. Frames of equal length shuffled with the same seed end up in the
# same row order.
train2 = train2.sample(frac=1, random_state=42).reset_index(drop=True)

# Record, for every row, the fold in which it serves as validation data.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X=train2, y=train2["Survived"])):
    train2.loc[valid_idx, "kfold"] = fold

In [None]:
log_cols = ["Fare"]
for col in log_cols:
    train2[col] = np.log1p(train2[col])
    test2[col] = np.log1p(test2[col])

In [None]:
plt.figure(figsize=(12,8))
sns.histplot(x=train2["Fare"])

In [None]:
clf = RandomForestClassifier(n_estimators=150)

test_proba = np.zeros(len(test2))
oof = np.zeros(len(train2))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of train2.
oof_proba = np.zeros(len(train2))
for fold in range(10):
    # Train the classifier created in this cell (`clf`), not the `rfc`
    # instance left over from the earlier feature set.
    oof, proba, rfc_model2, tt_pred = run_training(clf, train2, test2, fold, oof)
    test_proba += proba
    oof_proba[(train2["kfold"] == fold).to_numpy()] = tt_pred

level2_df["randomforest2"] = oof_proba
df_proba["randomforest2"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

Mean accuracy after 5 folds 0.7647400000000004

In [None]:
xgb = XGBClassifier(use_label_encoder=False)

test_proba = np.zeros(len(test2))
oof = np.zeros(len(train2))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of train2.
oof_proba = np.zeros(len(train2))
for fold in range(10):
    oof, proba, xgb_model2, tt_pred = run_training(xgb, train2, test2, fold, oof)
    test_proba += proba
    oof_proba[(train2["kfold"] == fold).to_numpy()] = tt_pred

level2_df["xgboost2"] = oof_proba
df_proba["xgboost2"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

Mean accuracy after 5 folds 0.7789

In [None]:
lgbm = LGBMClassifier()

# Size the buffers from the frames this model is actually trained and
# evaluated on.
test_proba = np.zeros(len(test2))
oof = np.zeros(len(train2))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of train2.
oof_proba = np.zeros(len(train2))
for fold in range(10):
    oof, proba, lgbm_model2, tt_pred = run_training(lgbm, train2, test2, fold, oof)
    test_proba += proba
    oof_proba[(train2["kfold"] == fold).to_numpy()] = tt_pred

level2_df["lgbm2"] = oof_proba
df_proba["lgbm2"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

Mean accuracy after 5 folds 0.7819000000000002 to beat

In [None]:
# The weights add up to 0.9, so the raw weighted sum tops out at 0.9 and a 0.5
# cut-off on it is stricter than intended. Dividing by the total weight turns
# the blend into a proper weighted average on the probability scale.
blend_weights2 = {"randomforest2": 0.2, "xgboost2": 0.2, "lgbm2": 0.5}
weighted_sum2 = sum(weight * df_proba[model] for model, weight in blend_weights2.items())
df_proba["wavg2"] = weighted_sum2 / sum(blend_weights2.values())
df_proba["binary_wavg2"] = np.where(df_proba["wavg2"] > 0.5, 1, 0)
df_proba.head()

## Ensemble submission 2

In [None]:
submission = sample.copy()

submission["Survived"] = df_proba["binary_wavg2"].values
submission.to_csv("ensemble_sub1.csv",index=False)
submission["Survived"] = np.where(df_proba["lgbm2"] > 0.5, 1, 0)
submission.to_csv("lgbm_sub10fold.csv",index=False)

## LGBM + optuna

In [None]:
X = train2.drop(["kfold", "Survived"], axis=1)
y = train2["Survived"].values

In [None]:
import lightgbm as lgbm

def objective(trial):
    """
    Optuna objective: hold-out accuracy of a LightGBM model.

    Splits the notebook-level ``X`` / ``y`` 80/20 with a fixed seed,
    standardises the features, trains LightGBM with the hyperparameters
    suggested by ``trial`` and returns the accuracy of the rounded
    predictions on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the held-out 20 %.
    """
    # Imported locally under its own alias: elsewhere in the notebook the name
    # `lgbm` is rebound to LGBMClassifier instances.
    import lightgbm as lgb

    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=45)
    sc = StandardScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)

    dtrain = lgb.Dataset(xtrain, label=ytrain)

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        # Sampled under its own name; reusing "lambda_l1" would tie both
        # regularisation strengths to a single sampled value.
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100)
    }

    gbm = lgb.train(params, dtrain)
    preds = gbm.predict(xvalid)
    # predict() yields probabilities; round them to 0/1 labels.
    pred_labels = np.rint(preds)
    return accuracy_score(yvalid, pred_labels)

study = optuna.create_study(direction="maximize", pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=100)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
study.best_value

In [None]:
submission = sample.copy()

sc = StandardScaler()
Xtrain_full = sc.fit_transform(X)
Xtest = sc.transform(test2)

lgbm_model = LGBMClassifier(**study.best_params)
lgbm_model.fit(Xtrain_full, y)

sub_preds = lgbm_model.predict(Xtest)
submission["Survived"] = sub_preds
submission.to_csv("lgbm_optuna2.csv",index=False)

In [None]:
study.best_value

In [None]:
lgbm = LGBMClassifier(**study.best_params)

# Size the buffers from the frames this model is actually trained and
# evaluated on.
test_proba = np.zeros(len(test2))
oof = np.zeros(len(train2))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of train2.
oof_proba = np.zeros(len(train2))
for fold in range(10):
    oof, proba, lgbm_model, tt_pred = run_training(lgbm, train2, test2, fold, oof)
    test_proba += proba
    oof_proba[(train2["kfold"] == fold).to_numpy()] = tt_pred

level2_df["lgbm_optuna2"] = oof_proba
df_proba["lgbm_optuna2"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

In [None]:
submission = sample.copy()
submission["Survived"] = np.where(df_proba["lgbm_optuna2"] > 0.5, 1, 0)
submission.to_csv("lgbm_optuna2_10folds_2.csv", index=False)

## Explain LGBMClassifier predictions with Shap

In [None]:
shap.initjs()

In [None]:
# Build a tree explainer for the LightGBM model and compute SHAP values for
# every row of X.
# NOTE(review): `lgbm_model` appears to be the estimator last returned by
# run_training(), which fits on StandardScaler-transformed features, while X
# here is unscaled -- confirm the explanations are computed in the same
# feature space the model was trained in.
explainer = shap.TreeExplainer(lgbm_model)
shap_values = explainer.shap_values(X)

In [None]:
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], X.iloc[0,:])

In [None]:
shap.force_plot(explainer.expected_value[1], shap_values[1][:1000,:], X.iloc[:1000,:])

In [None]:
shap.summary_plot(shap_values[1], X)

In [None]:
shap.summary_plot(shap_values, X)

## Xgboost + optuna

In [None]:
train2.head()

In [None]:
X = train2.drop(["Survived", "kfold"], axis=1)
y = train2["Survived"].values

In [None]:
import xgboost as xgb

In [None]:
def objective(trial):
    """
    Optuna objective: hold-out accuracy of an XGBoost model.

    Splits the notebook-level ``X`` / ``y`` 80/20 with a fixed seed,
    standardises the features, trains a booster with the hyperparameters
    suggested by ``trial`` and returns the accuracy of the rounded
    predictions on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the held-out 20 %.
    """
    # Imported locally under its own alias: elsewhere in the notebook the name
    # `xgb` is rebound to XGBClassifier instances.
    import xgboost as xgb_lib

    Xtrain, Xvalid, ytrain, yvalid = train_test_split(X,
                                                      y,
                                                      test_size=0.2,
                                                      random_state=101)

    sc = StandardScaler()
    Xtrain = sc.fit_transform(Xtrain)
    Xvalid = sc.transform(Xvalid)

    dtrain = xgb_lib.DMatrix(Xtrain, label=ytrain)
    dvalid = xgb_lib.DMatrix(Xvalid, label=yvalid)

    # Only booster parameters belong here; `use_label_encoder` is an argument
    # of the scikit-learn wrapper and is therefore left out.
    params = {
        "objective": "binary:logistic",
        "eta": trial.suggest_loguniform("eta", 1e-2, 2e-1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 12),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "gamma": trial.suggest_loguniform("gamma", 1e-4, 1.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-4, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-4, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.3, 1.0)
    }
    xgb_optuna = xgb_lib.train(params, dtrain)
    preds = xgb_optuna.predict(dvalid)
    # predict() yields probabilities; round them to 0/1 labels.
    pred_labels = np.rint(preds)
    return accuracy_score(yvalid, pred_labels)

study = optuna.create_study(direction="maximize", pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=100)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
study.best_value

In [None]:
xgb = XGBClassifier(**study.best_params, use_label_encoder=False)

test_proba = np.zeros(len(test2))
oof = np.zeros(len(train2))
# Out-of-fold probabilities are written to each row's own position.
# Concatenating the per-fold arrays would order them by fold, which is not
# the row order of train2.
oof_proba = np.zeros(len(train2))
for fold in range(10):
    oof, proba, xgb_model2, tt_pred = run_training(xgb, train2, test2, fold, oof)
    test_proba += proba
    oof_proba[(train2["kfold"] == fold).to_numpy()] = tt_pred

level2_df["xgboost_optuna_5fold2"] = oof_proba
df_proba["xgboost_optuna_5fold2"] = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

## Xgboost predictions with shap

In [None]:
explainer = shap.TreeExplainer(xgb_model2, X)
shap_values = explainer.shap_values(X)

In [None]:
shap.initjs()
# Explain the first row: the SHAP vector and the feature values must come from
# the same row, and the third argument is the row's values, not its column index.
shap.force_plot(explainer.expected_value, shap_values[0, :], X.iloc[0, :])

In [None]:
shap.summary_plot(shap_values, X)

In [None]:
shap.force_plot(explainer.expected_value, shap_values[:1000,:], X.iloc[:1000,:])

In [None]:
submission["Survived"] = np.where(df_proba["xgboost_optuna_5fold2"] > 0.5, 1, 0)
submission.to_csv("xgboost_optuna_10fold_2.csv", index=False)

The best prediction I've achieved so far is with lgbm_optuna2_5folds_2 which scored 0.80423

## Level 2 model approach

In [None]:
df_proba = df_proba.drop(['wavg', 'binary_wavg','wavg2','binary_wavg2'], axis=1)

In [None]:
new_train2 = pd.concat([level2_df, train2], axis=1)
new_test2 = pd.concat([df_proba, test2], axis=1)

In [None]:
from sklearn.linear_model import LogisticRegressionCV

lr_model = LogisticRegressionCV(max_iter=100000)

test_proba = np.zeros(len(test2))
oof = np.zeros(len(train2))
for fold in range(10):
    oof, proba, lr_model,_ = run_training(lr_model, new_train2, new_test2, fold, oof)
    test_proba += proba
    
final_preds = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

In [None]:
submission["Survived"] = np.where(final_preds > 0.5, 1, 0)
submission.to_csv("level2_sub2.csv", index=False)

In [None]:
X = new_train2.drop(["Survived", "kfold"], axis=1)
y = new_train2["Survived"].values

import lightgbm as lgbm

def objective(trial):
    """
    Optuna objective: hold-out accuracy of a LightGBM model.

    Splits the notebook-level ``X`` / ``y`` 80/20 with a fixed seed,
    standardises the features, trains LightGBM with the hyperparameters
    suggested by ``trial`` and returns the accuracy of the rounded
    predictions on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the held-out 20 %.
    """
    # Imported locally under its own alias: elsewhere in the notebook the name
    # `lgbm` is rebound to LGBMClassifier instances.
    import lightgbm as lgb

    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=45)
    sc = StandardScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)

    dtrain = lgb.Dataset(xtrain, label=ytrain)

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        # Sampled under its own name; reusing "lambda_l1" would tie both
        # regularisation strengths to a single sampled value.
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100)
    }

    gbm = lgb.train(params, dtrain)
    preds = gbm.predict(xvalid)
    # predict() yields probabilities; round them to 0/1 labels.
    pred_labels = np.rint(preds)
    return accuracy_score(yvalid, pred_labels)

study = optuna.create_study(direction="maximize", pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=100)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
study.best_value

In [None]:
lgbm = LGBMClassifier(**study.best_params)

test_proba = np.zeros(len(test2))
oof = np.zeros(len(train2))
for fold in range(10):
    oof, proba,_ ,_ = run_training(lgbm, new_train2, new_test2, fold, oof)
    test_proba += proba
    
final_preds = test_proba / 10
print(f"Mean accuracy after 10 folds {np.mean(oof)}")

In [None]:
submission["Survived"] = np.where(final_preds > 0.48, 1, 0)
submission.to_csv("level2_sub3.csv", index=False)

## Finding optimal weights

In [None]:
from sklearn.metrics import roc_auc_score
from functools import partial
from scipy.optimize import fmin

In [None]:
class OptimizerAUC:
    """
    Find blending weights for a set of model predictions by maximising ROC AUC.

    Parameters
    ----------
    random_state : int, optional
        Seed for the random starting point of the search. When omitted the
        starting point is drawn from NumPy's global random state, so results
        differ from run to run unless that state is seeded.

    Attributes
    ----------
    coef_ : numpy.ndarray
        One weight per column of the fitted ``X`` (``0`` before ``fit``).
    """

    def __init__(self, random_state=None):
        self.coef_ = 0
        self.random_state = random_state

    def auc_(self, coef, X, y):
        """Return the negative ROC AUC of the ``coef``-weighted row sums of ``X`` against ``y``."""
        x_coef = X * coef
        predictions = np.sum(x_coef, axis=1)
        auc_score = roc_auc_score(y, predictions)
        # fmin minimises, so the score is negated to maximise AUC.
        return -1.0 * auc_score

    def fit(self, X, y):
        """Search for the weights that maximise AUC on ``X`` / ``y`` and store them in ``coef_``."""
        partial_loss = partial(self.auc_, X=X, y=y)
        # Start from a random point on the simplex (non-negative, sums to 1).
        concentration = np.ones(X.shape[1])
        if self.random_state is None:
            init_coef = np.random.dirichlet(concentration)
        else:
            init_coef = np.random.RandomState(self.random_state).dirichlet(concentration)
        self.coef_ = fmin(partial_loss, init_coef, disp=True)

    def predict(self, X):
        """Return the ``coef_``-weighted sum of the columns of ``X`` for every row."""
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions
    
    
def run_training2(pred_df, fold, col_names):
    """
    Fit blending weights on every fold except ``fold`` and report the AUC on ``fold``.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Frame with the prediction columns ``col_names`` plus ``Survived``
        and ``kfold``.
    fold : int
        Value of ``kfold`` that marks the validation rows.
    col_names : list of str
        Prediction columns to blend.

    Returns
    -------
    numpy.ndarray
        The fitted weights, one per entry of ``col_names``.
    """
    fold_train = pred_df[pred_df.kfold != fold].reset_index(drop=True)
    fold_valid = pred_df[pred_df.kfold == fold].reset_index(drop=True)

    xtrain = fold_train[col_names].values
    xvalid = fold_valid[col_names].values

    sc = StandardScaler()
    xtrain = sc.fit_transform(xtrain)
    # The validation features get the scaling fitted on the training part, so
    # the weights are evaluated on the scale they were fitted on.
    xvalid = sc.transform(xvalid)

    ytrain = fold_train.Survived.values
    yvalid = fold_valid.Survived.values

    opt = OptimizerAUC()
    opt.fit(xtrain, ytrain)
    preds = opt.predict(xvalid)

    fold_auc = roc_auc_score(yvalid, preds)
    print(f"fold={fold}, auc={fold_auc}")

    return opt.coef_

In [None]:
level2_df["Survived"] = train2["Survived"].values
level2_df["kfold"] = train2["kfold"].values

In [None]:
col_names = [col for col in level2_df.columns if col not in ["Survived", "kfold"]]

In [None]:
level2_df.columns

In [None]:
col_names = ['lgbm_optuna_10fold', 'lgbm2', 'xgboost_optuna_5fold2']

In [None]:
coef = []
for j in range(10):
    coef.append(run_training2(level2_df, j, col_names))

In [None]:
coef = np.array(coef)
coef_mean = np.mean(coef, axis=0)
print(coef_mean)

In [None]:
col_names

In [None]:
# Blend the three out-of-fold prediction columns with the mean fitted weights
# and score the thresholded blend against the training labels.
# NOTE(review): the weights come from an unconstrained fmin search on
# standardised columns and are not normalised to sum to 1, while here they
# are applied to the raw probability columns and cut at 0.5 -- confirm that
# threshold is meaningful on this scale.
wt_avg = (
    coef_mean[0] * level2_df["lgbm_optuna_10fold"].values
    + coef_mean[1] * level2_df["lgbm2"].values
    + coef_mean[2] * level2_df["xgboost_optuna_5fold2"].values
)
print("Optimal acc after finding coefs")
wt_acc = accuracy_score(level2_df["Survived"], np.where(wt_avg > 0.5, 1, 0))
print(f"Optimized weighted avg of acc: {wt_acc}")

In [None]:
wt_avg_sub = (
    coef_mean[0] * df_proba["lgbm_optuna_10fold"].values
    + coef_mean[1] * df_proba["lgbm2"].values
    + coef_mean[2] * df_proba["xgboost_optuna_5fold2"].values
)

In [None]:
submission["Survived"] = np.where(wt_avg_sub > 0.5, 1, 0)
submission.to_csv("optimal_weights_sub.csv", index=False)