In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score#
from sklearn.preprocessing import StandardScaler

import optuna
from sklearn.model_selection import train_test_split
from optuna.pruners import SuccessiveHalvingPruner

In [None]:
# Read the training table, the test table and the sample submission template.
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-apr-2021/train.csv",
        "/kaggle/input/tabular-playground-series-apr-2021/test.csv",
        "/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv",
    )
)

In [None]:
# Display the last five rows of the training frame.
train_df.tail(n=5)

In [None]:
# Remove the PassengerId column from both frames, in place.
train_df.drop(columns=["PassengerId"], inplace=True)
test_df.drop(columns=["PassengerId"], inplace=True)

In [None]:
def cabin_feat(df):
    """Replace the ``Cabin`` column of ``df`` with two derived columns, in place.

    Missing cabins are first filled with the placeholder string "None".
    ``has_Cabin`` is 0 where the filled value equals "None" and 1 otherwise;
    ``Deck`` is the first character of the filled value (so "N" for missing
    cabins). The ``Cabin`` column itself is removed. Returns ``None``.
    """
    cabin = df.pop("Cabin").fillna("None")
    df["has_Cabin"] = cabin.map(lambda value: 0 if value == "None" else 1)
    df["Deck"] = cabin.map(lambda value: value[0])
    
# Derive the cabin features on both frames (each one is modified in place).
for _frame in (train_df, test_df):
    cabin_feat(_frame)

In [None]:
def fill_nan(df, group_col, col):
    """Fill missing values of ``col`` with the mean of ``col`` in the row's group.

    Groups are defined by ``group_col``. The frame is modified in place and
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    group_col : str
        Column whose values define the groups.
    col : str
        Numeric column whose NaN entries are replaced.

    Notes
    -----
    A missing entry stays NaN when its group has no non-missing value of
    ``col`` (the group mean is NaN) or when its ``group_col`` value has no
    computed mean.
    """
    # Aggregate only the target column. Averaging the whole frame raises a
    # TypeError on non-numeric columns in pandas >= 2.0, and computes means
    # that are never used on older versions.
    group_means = df.groupby(group_col)[col].mean()
    missing_mask = df[col].isna()
    df.loc[missing_mask, col] = df.loc[missing_mask, group_col].map(group_means)
    
# Age is imputed per Pclass group and Fare per Deck group; each frame uses
# its own group means.
for _frame in (train_df, test_df):
    fill_nan(_frame, "Pclass", "Age")
    fill_nan(_frame, "Deck", "Fare")

In [None]:
# Natural-log transform of Fare; exact zeros are kept at 0 instead of being
# passed to the logarithm.
for _frame in (train_df, test_df):
    _frame["Fare"] = _frame["Fare"].map(lambda fare: 0 if fare == 0 else np.log(fare))

In [None]:
# Missing Embarked values are replaced by each frame's own most frequent value.
for _frame in (train_df, test_df):
    _most_common_port = _frame["Embarked"].mode().iloc[0]
    _frame["Embarked"] = _frame["Embarked"].fillna(_most_common_port)

In [None]:
# Reduce every ticket to its first two characters. Missing tickets are first
# replaced by the placeholder "NAN", so they end up as "NA".
for _frame in (train_df, test_df):
    _frame["Ticket"] = _frame["Ticket"].fillna("NAN").map(lambda ticket: str(ticket)[:2])

In [None]:
# FamilySize is the element-wise sum of the SibSp and Parch columns.
train_df["FamilySize"] = train_df["SibSp"].add(train_df["Parch"])
test_df["FamilySize"] = test_df["SibSp"].add(test_df["Parch"])

In [None]:
# Split "Last, First" names on the comma and derive three columns:
#   Name_length - length of last name + stripped first name (comma excluded)
#   Last_name   - text before the first comma
#   First_name  - text between the first and second comma, whitespace-stripped
# The raw Name column is dropped afterwards. Columns are added in the same
# order on both frames.
for _frame in (train_df, test_df):
    _name_parts = _frame["Name"].map(lambda full_name: full_name.split(","))
    _last = _name_parts.map(lambda parts: parts[0])
    _first = _name_parts.map(lambda parts: parts[1].strip())
    _frame["Name_length"] = (_last + _first).map(len)
    _frame["Last_name"] = _last
    _frame["First_name"] = _first
    _frame.drop(columns=["Name"], inplace=True)

In [None]:
def age_feat(x):
    """Map a numeric age to a coarse age-band label.

    Bands, with inclusive upper bounds: up to 5 -> "baby", up to 16 -> "teen",
    up to 30 -> "yound_adult", up to 50 -> "adult". Anything else, including
    NaN (which fails every comparison), is labelled "elder".
    """
    upper_bounds = ((5, "baby"), (16, "teen"), (30, "yound_adult"), (50, "adult"))
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    return "elder"

In [None]:
# Bucket every age into its age-band label.
train_df["age_range"] = train_df["Age"].map(age_feat)
test_df["age_range"] = test_df["Age"].map(age_feat)

In [None]:
# Names of every object-dtype column of the training frame; these get encoded.
enc_cols = train_df.select_dtypes("object").columns.tolist()


def label_encoder():
    """Integer-encode the ``enc_cols`` columns of ``train_df`` and ``test_df`` in place.

    One ``LabelEncoder`` per column is fitted on the train and test values
    together, so a value that occurs only in the test frame still gets a code.
    Operates on the notebook-level frames and returns ``None``.
    """
    for col in enc_cols:
        le = LabelEncoder()
        le.fit(train_df[col].values.tolist() + test_df[col].values.tolist())
        # Plain column assignment replaces the column, so it takes the integer
        # dtype of the codes. ``df.loc[:, col] = codes`` writes into the
        # existing object-dtype column on pandas >= 2.0, which leaves the
        # encoded column as object dtype.
        train_df[col] = le.transform(train_df[col].values)
        test_df[col] = le.transform(test_df[col].values)


label_encoder()

In [None]:
def run_training(algo, df, test_df, fold, oof):
    """Train ``algo`` on every fold except ``fold`` and score it on the held-out fold.

    Features are standardised with a ``StandardScaler`` fitted on the training
    part only; the same fitted scaler is applied to the held-out part and to
    ``test_df``.

    Parameters
    ----------
    algo
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place and the same object is returned.
    df : pandas.DataFrame
        Frame with the features, a ``Survived`` target column and a ``kfold``
        column holding the fold number of every row.
    test_df : pandas.DataFrame
        Test features. Assumed to have the same feature columns as ``df``
        without ``Survived``/``kfold`` -- TODO confirm against the caller.
    fold : int
        Fold number to hold out (printed 1-based).
    oof : numpy.ndarray
        One slot per row of ``df``, in row order. The slots of the held-out
        rows are incremented by this fold's accuracy, in place. After all
        folds have run, ``np.mean(oof)`` is the row-weighted mean of the fold
        accuracies.

    Returns
    -------
    tuple
        ``(oof, sub_proba, algo, train_proba)``. ``sub_proba`` and
        ``train_proba`` are the second column of ``predict_proba`` for the
        test rows and the held-out rows respectively.
    """
    # Locate the held-out rows from ``df`` itself instead of reading a
    # ``valid_idx`` variable that merely happens to exist at notebook level.
    valid_mask = (df["kfold"] == fold).to_numpy()
    fit_df = df[~valid_mask].reset_index(drop=True)
    valid_df = df[valid_mask].reset_index(drop=True)

    xtrain = fit_df.drop(["Survived", "kfold"], axis=1)
    xvalid = valid_df.drop(["Survived", "kfold"], axis=1)
    ytrain = fit_df["Survived"].values
    yvalid = valid_df["Survived"].values

    # Fit the scaler on the training part only, then reuse it everywhere else.
    sc = StandardScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)
    xtest = sc.transform(test_df)

    algo.fit(xtrain, ytrain)
    preds = algo.predict(xvalid)
    sub_proba = algo.predict_proba(xtest)[:, 1]
    train_proba = algo.predict_proba(xvalid)[:, 1]

    fold_acc = accuracy_score(yvalid, preds)

    print(f"fold={fold+1}, accuracy={fold_acc}")
    print("\n")
    oof[valid_mask] += fold_acc

    return oof, sub_proba, algo, train_proba

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5)
train_df["kfold"] = -1  # placeholder, overwritten for every row below

# Shuffle the rows before splitting. The seed is fixed so that the fold
# assignment, and every score computed from it, is the same on each run.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Tag every row with the number of the fold in which it is held out.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X=train_df, y=train_df["Survived"])):
    train_df.loc[valid_idx, "kfold"] = fold

In [None]:
# Fixed seed: an unseeded forest yields different scores and submissions on
# every run.
rfc = RandomForestClassifier(n_estimators=150, random_state=42)

# level2_df collects held-out probabilities per model, df_proba the averaged
# test probabilities per model.
level2_df = pd.DataFrame()
df_proba = pd.DataFrame()

test_proba = np.zeros(len(test_df))
oof = np.zeros(len(train_df))
train_pred = []
for fold in range(5):
    oof, proba, rfc_model, tt_pred = run_training(rfc, train_df, test_df, fold, oof)
    test_proba += proba
    train_pred.append(tt_pred)

# Held-out probabilities are concatenated fold by fold.
# NOTE(review): the rows therefore follow fold order, presumably not the row
# order of train_df -- confirm before joining this column with train_df.
level2_df["randomforest"] = np.hstack(train_pred)
# Test probabilities averaged over the 5 fold models.
df_proba["randomforest"] = test_proba / 5
print(f"Mean accuracy after 5 folds {np.mean(oof)}")

In [None]:
from xgboost import XGBClassifier

# XGBoost with default settings, run through the same 5-fold procedure.
xgb = XGBClassifier(use_label_encoder=False)

train_pred = []
oof = np.zeros(train_df.shape[0])
test_proba = np.zeros(test_df.shape[0])
for fold in range(5):
    oof, proba, xgb_model, tt_pred = run_training(xgb, train_df, test_df, fold, oof)
    train_pred.append(tt_pred)
    test_proba = test_proba + proba

# Held-out probabilities, concatenated fold by fold.
level2_df["xgboost"] = np.concatenate(train_pred)
# Test probabilities averaged over the 5 fold models.
df_proba["xgboost"] = test_proba / 5
print(f"Mean accuracy after 5 folds {np.mean(oof)}")

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with default settings, run through the same 5-fold procedure.
lgbm = LGBMClassifier()

train_pred = []
oof = np.zeros(train_df.shape[0])
test_proba = np.zeros(test_df.shape[0])
for fold in range(5):
    oof, proba, lgbm_model, tt_pred = run_training(lgbm, train_df, test_df, fold, oof)
    train_pred.append(tt_pred)
    test_proba = test_proba + proba

# Held-out probabilities, concatenated fold by fold.
level2_df["lgbm"] = np.concatenate(train_pred)
# Test probabilities averaged over the 5 fold models.
df_proba["lgbm"] = test_proba / 5
print(f"Mean accuracy after 5 folds {np.mean(oof)}")

In [None]:
# Blend the three models' test probabilities with fixed weights
# (0.1 / 0.2 / 0.7), then threshold the blend at 0.5 to get 0/1 labels.
df_proba["wavg"] = (
    df_proba["randomforest"] * 0.1
    + df_proba["xgboost"] * 0.2
    + df_proba["lgbm"] * 0.7
)
df_proba["binary_wavg"] = (df_proba["wavg"] > 0.5).astype(int)

In [None]:
# Work on a copy so the loaded sample submission stays untouched.
submission = sample_df.copy(deep=True)

In [None]:
# Submission from the baseline LightGBM probabilities, thresholded at 0.5.
submission["Survived"] = (df_proba["lgbm"] > 0.5).astype(int).to_numpy()
submission.to_csv("new_5fold_lgbm.csv", index=False)

In [None]:
# Submission from the thresholded weighted average of the three models.
submission = submission.assign(Survived=df_proba["binary_wavg"])
submission.to_csv("new_sub_wavg5.csv", index=False)

## Optuna

In [None]:
import lightgbm as lgbm

# Best LightGBM parameters found for each held-out fold, keyed by fold number.
fold_params_dict = {}


def _make_objective(fold):
    """Return an Optuna objective that tunes LightGBM with ``fold`` held out.

    The train/validation split and the scaling do not depend on the trial, so
    they are computed once here instead of once per trial.
    """
    train = train_df[train_df.kfold != fold].reset_index(drop=True)
    valid = train_df[train_df.kfold == fold].reset_index(drop=True)

    ytrain = train["Survived"].values
    yvalid = valid["Survived"].values

    # Scaler is fitted on the training part only.
    sc = StandardScaler()
    xtrain = sc.fit_transform(train.drop(["Survived", "kfold"], axis=1))
    xvalid = sc.transform(valid.drop(["Survived", "kfold"], axis=1))

    def objective(trial):
        """Train one model with sampled parameters; return held-out accuracy."""
        # A fresh Dataset pair per trial, so no Dataset state is shared
        # between parameter sets.
        dtrain = lgbm.Dataset(xtrain, label=ytrain)
        dvalid = lgbm.Dataset(xvalid, label=yvalid)
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; the sampled ranges are unchanged.
        params = {
            "objective": "binary",
            "metric": "binary_logloss",
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        }

        # Early stopping goes through the callback API: the
        # ``early_stopping_rounds`` keyword of ``train`` was removed in
        # LightGBM 4.0.
        # NOTE(review): with the default number of boosting rounds a
        # 100-round patience may never trigger -- confirm.
        gbm = lgbm.train(
            params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            callbacks=[lgbm.early_stopping(stopping_rounds=100)],
        )
        preds = gbm.predict(xvalid)
        pred_labels = np.rint(preds)  # round probabilities to 0/1 labels
        return accuracy_score(yvalid, pred_labels)

    return objective


for fold in range(5):
    objective = _make_objective(fold)

    # NOTE(review): the objective never reports intermediate values to the
    # trial, so this pruner presumably never prunes anything -- confirm.
    study = optuna.create_study(direction="maximize", pruner=SuccessiveHalvingPruner())
    study.optimize(objective, n_trials=200)
    fold_params_dict[fold] = study.best_trial.params

    print("Number of finished trials:", len(study.trials))
    print("Best trial:", study.best_trial.params)

In [None]:
# Re-run the 5-fold procedure, this time building each fold's LightGBM from
# the best parameters Optuna found for that fold.
train_pred = []
oof = np.zeros(train_df.shape[0])
test_proba = np.zeros(test_df.shape[0])
for fold in range(5):
    lgbm = LGBMClassifier(**fold_params_dict[fold])
    oof, proba, lgbm_model, tt_pred = run_training(lgbm, train_df, test_df, fold, oof)
    train_pred.append(tt_pred)
    test_proba = test_proba + proba

# Held-out probabilities, concatenated fold by fold.
level2_df["lgbm_optuna"] = np.concatenate(train_pred)
# Test probabilities averaged over the 5 fold models.
df_proba["lgbm_optuna"] = test_proba / 5
print(f"Mean accuracy after 5 folds {np.mean(oof)}")

Baseline to beat: mean accuracy after 5 folds = 0.7857200000000003

In [None]:
# Submission from the Optuna-tuned LightGBM probabilities. The "lgbm" column
# holds the untuned baseline, so the tuned file must read "lgbm_optuna".
submission["Survived"] = np.where(df_proba["lgbm_optuna"] > 0.5, 1, 0)
submission.to_csv("new_optuna_lgbm_5fold.csv", index=False)

## Torch model

In [None]:
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable