# Loading

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Single place to change when the competition data lives somewhere else.
DATA_DIR = "../input/tabular-playground-series-apr-2021"

# Load the labelled training rows, the unlabelled test rows and the sample submission file.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
print(train.shape, test.shape)

In [None]:
train.head()

In [None]:
test.head()

# Missing Values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# releases removed the old names, so use the new name when this install has it
# and fall back to the legacy one otherwise.
_WHITE_STYLE = "seaborn-v0_8-white"
plt.style.use(_WHITE_STYLE if _WHITE_STYLE in plt.style.available else "seaborn-white")

# Fraction of missing values in every column of the training frame.
missing_ratio = train.isnull().sum() / train.shape[0]
sns.barplot(x = missing_ratio.index, y = missing_ratio.values)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Fraction of missing values in every column of the test frame.
test_missing_ratio = test.isnull().sum() / len(test)
sns.barplot(x = test_missing_ratio.index, y = test_missing_ratio.values)
plt.xticks(rotation = 90)
plt.show()

In [None]:
train.isnull().sum()

In [None]:
def preprocess(train, test):
    """Impute missing ``Age``, ``Fare`` and ``Embarked`` values in both frames.

    The fill values are computed from ``train`` only (median for the numeric
    columns, most frequent value for ``Embarked``) and applied to both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``Age``, ``Fare`` and ``Embarked`` columns. Both are
        modified in place.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``train`` and ``test`` objects that were passed in.
    """
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that form operates on a temporary
    # column object and, under pandas Copy-on-Write, never reaches the frame.
    for col in ("Age", "Fare"):
        fill_value = train[col].median()
        train[col] = train[col].fillna(fill_value)
        test[col] = test[col].fillna(fill_value)

    # mode() returns a Series sorted by value; [0] picks the first most-frequent one.
    fill_value = train["Embarked"].mode()[0]
    train["Embarked"] = train["Embarked"].fillna(fill_value)
    test["Embarked"] = test["Embarked"].fillna(fill_value)
    return train, test

train, test = preprocess(train, test)

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

# Name

In [None]:
train["Name"].values[:100]

In [None]:
import re
from wordcloud import WordCloud

def name_cloud(df):
    """Draw a word cloud built from the ``Name`` column of ``df``.

    Commas are removed from every name, the names are joined with spaces and
    the resulting text is rendered as an 800x800 word cloud on an 8x8 figure.
    """
    comma_free = (re.sub(",", "", raw_name) for raw_name in df["Name"].values)
    corpus = " ".join(comma_free)
    cloud = WordCloud(width = 800, height = 800, collocations = False).generate(corpus)
    plt.figure(figsize = (8, 8))
    plt.imshow(cloud)
    plt.show()
name_cloud(train)

In [None]:
name_cloud(train.loc[train["Survived"] == 0])

In [None]:
name_cloud(train.loc[train["Survived"] == 1])

In [None]:
# Split "Name" on ", ": the text before the separator goes to "first",
# the text after it goes to "family".
# NOTE(review): if names are stored as "Surname, Given", these two column names
# are swapped relative to their content — confirm against the data.
for frame in (train, test):
    name_parts = frame["Name"].apply(lambda full_name: full_name.split(", "))
    frame["first"] = name_parts.apply(lambda parts: parts[0])
    frame["family"] = name_parts.apply(lambda parts: parts[1])
train.head()

# Ticket

In [None]:
def clean(text):
    """Normalise a ticket string.

    Upper-cases ``text`` and removes every space, dot, forward slash and
    backslash, returning the resulting string.
    """
    # One character class replaces the four separate single-character substitutions.
    return re.sub(r"[ ./\\]", "", text.upper())

# Cast tickets to str first (missing tickets become the literal text "nan"),
# then store the normalised form alongside the raw value.
for frame in (train, test):
    frame["Ticket"] = frame["Ticket"].astype(str)
    frame["cleaned_ticket"] = frame["Ticket"].apply(clean)

In [None]:
# "NAN" is what a missing ticket looks like after the str cast and upper-casing.
# ticket_feat keeps only the non-digit characters of the cleaned ticket.
for frame in (train, test):
    frame["nan_ticket"] = frame["cleaned_ticket"].eq("NAN").astype(int)
    frame["ticket_feat"] = frame["cleaned_ticket"].apply(lambda ticket: re.sub("[0-9]*", "", ticket))
train.head()

In [None]:
def ticket_cloud(df):
    """Draw a word cloud of the non-digit parts of ``df["cleaned_ticket"]``.

    Digits are stripped from every cleaned ticket; tickets that become empty
    (purely numeric ones) are skipped. The remaining fragments are joined with
    spaces and rendered as an 800x800 word cloud on an 8x8 figure.
    """
    digit_free = (re.sub("[0-9]*", "", ticket) for ticket in df["cleaned_ticket"].values)
    corpus = " ".join(fragment for fragment in digit_free if fragment != "")
    cloud = WordCloud(width = 800, height = 800, collocations = False).generate(corpus)
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(cloud)
    plt.show()
    
ticket_cloud(train)

In [None]:
ticket_cloud(train.loc[train["Survived"] == 0])

In [None]:
ticket_cloud(train.loc[train["Survived"] == 1])

# Cabin

In [None]:
# Mark missing cabins with the sentinel "NAN". The result is assigned back
# because ``df["Cabin"].fillna(..., inplace=True)`` works on a temporary column
# object and does not update the frame under pandas Copy-on-Write.
train["Cabin"] = train["Cabin"].fillna("NAN")
test["Cabin"] = test["Cabin"].fillna("NAN")

In [None]:
def create_cabin_feat(cabin):
    """Return "N" for the "NAN" sentinel, otherwise the first character of ``cabin``."""
    return "N" if cabin == "NAN" else cabin[0]
# Derive the one-character cabin feature for both frames.
for frame in (train, test):
    frame["cabin_feat"] = frame["Cabin"].apply(create_cabin_feat)

In [None]:
# Print the distinct cabin_feat values of each frame so the two sets can be compared.
print(train["cabin_feat"].unique())
print(test["cabin_feat"].unique())

In [None]:
# Row counts per cabin_feat value, split by the Survived label.
sns.countplot(x = train["cabin_feat"], hue = train["Survived"])
plt.show()

# Preprocessing

In [None]:
train.head()

In [None]:
from sklearn.model_selection import StratifiedKFold
# Five stratified folds on the Survived label (no shuffling requested).
# fold_dict maps fold number -> (train_idx, valid_idx) as yielded by split().
kfold = StratifiedKFold(n_splits = 5)
fold_dict = dict(enumerate(kfold.split(train, train["Survived"])))

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the categorical columns. The encoder is refitted per column on
# the train and test values together so both frames share one mapping.
lbl = LabelEncoder()
for col in ["Sex", "Embarked", "ticket_feat", "cabin_feat"]:
    combined_values = pd.concat([train[col], test[col]])
    lbl.fit(combined_values)
    train[col] = lbl.transform(train[col])
    test[col] = lbl.transform(test[col])
train.head()

In [None]:
# Replace each name part by how often it occurs in train; values that never
# occur in train get 0 in test.
for col in ["first", "family"]:
    train_counts = train[col].value_counts().to_dict()
    train[col] = train[col].map(train_counts)
    test[col] = test[col].map(train_counts).fillna(0)

# family_size is the sum of the SibSp and Parch columns.
for frame in (train, test):
    frame["family_size"] = frame["SibSp"] + frame["Parch"]
train.head()

# Model

In [None]:
# Feature columns fed to the model, in this order.
use_cols = [
    # columns read straight from the CSV
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
    # columns created earlier in this notebook
    "first", "family", "nan_ticket", "ticket_feat", "cabin_feat", "family_size",
]

In [None]:
import lightgbm as lgb
# Baseline LightGBM parameters passed to training().
# NOTE(review): LightGBM documents that bagging_fraction is only applied when
# bagging_freq is non-zero; no bagging_freq is set here, so that value is
# probably inert — confirm before relying on it.
params = dict(
    objective = "binary",
    metric = "auc",
    verbosity = -1,
    num_leaves = 76,
    lambda_l1 = 0.00018,
    lambda_l2 = 0.0010,
    feature_fraction = 0.65,
    bagging_fraction = 0.56,
    min_child_samples = 29,
)

def training(params):
    """Train one LightGBM booster per CV fold and collect out-of-fold predictions.

    Reads the module-level ``train`` frame, the ``use_cols`` feature list and
    ``fold_dict`` (fold number -> ``(train_idx, valid_idx)`` positional arrays).

    Parameters
    ----------
    params : dict
        LightGBM parameters, forwarded unchanged to ``lgb.train``.

    Returns
    -------
    OOF : numpy.ndarray
        One prediction per row of ``train``; each row is predicted by the
        booster of the fold in which that row was held out.
    models : list
        The trained boosters, in fold order.
    """
    features = train[use_cols]
    target = train["Survived"]
    OOF = np.zeros(train.shape[0])
    models = []
    # Iterate over the folds that actually exist instead of assuming five.
    for fold in sorted(fold_dict):
        train_idx, valid_idx = fold_dict[fold]
        # The fold arrays hold row positions, so index with .iloc; .loc only
        # gave the same rows while the frame kept its default RangeIndex.
        X_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
        X_valid, y_valid = features.iloc[valid_idx], target.iloc[valid_idx]
        train_set = lgb.Dataset(X_train, y_train)
        valid_set = lgb.Dataset(X_valid, y_valid)
        model = lgb.train(
            params = params, train_set = train_set, valid_sets = [train_set, valid_set],
            num_boost_round = 100,
            # LightGBM 4 removed the early_stopping_rounds / verbose_eval keyword
            # arguments of train(); the callbacks below are their replacement.
            callbacks = [
                lgb.early_stopping(stopping_rounds = 10),
                lgb.log_evaluation(period = 20),
            ],
        )
        OOF[valid_idx] = model.predict(X_valid)
        models.append(model)
    return OOF, models
OOF, models = training(params)

In [None]:
from sklearn.metrics import accuracy_score
# Turn the out-of-fold scores into 0/1 labels at a 0.5 threshold and score them.
oof_labels = (OOF > 0.5).astype(int)
acc = accuracy_score(y_true = train["Survived"], y_pred = oof_labels)
print(acc)

In [None]:
import optuna

def objective(trial):
    """Optuna objective: sample LightGBM parameters and return their OOF accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw ``num_leaves``, ``lambda_l1``, ``lambda_l2``,
        ``feature_fraction``, ``bagging_fraction`` and ``min_child_samples``.

    Returns
    -------
    float
        Accuracy of the out-of-fold predictions from ``training``, thresholded
        at 0.5, against ``train["Survived"]``.
    """
    tune_params = {
        # This key was misspelled "objectve", so the binary objective never
        # reached LightGBM and tuned models did not match the baseline setup.
        "objective" : "binary",
        "metric" : "auc",
        "verbosity" : -1,
        "num_leaves" : trial.suggest_int("num_leaves", 2, 256),
        "lambda_l1" : trial.suggest_loguniform("lambda_l1", 1e-6, 1),
        "lambda_l2" : trial.suggest_loguniform("lambda_l2", 1e-6, 1),
        "feature_fraction" : trial.suggest_uniform("feature_fraction", 0.5, 1.0),
        "bagging_fraction" : trial.suggest_uniform("bagging_fraction", 0.5, 1.0),
        "min_child_samples" : trial.suggest_int("min_child_samples", 5, 100)
    }
    OOF, _ = training(tune_params)
    acc = accuracy_score(y_true = train["Survived"], y_pred = np.where(OOF > 0.5, 1, 0))
    return acc

# Set TUNING to True to run the Optuna search; it is off by default, so the
# block below is skipped and the hand-written `params` above stay in use.
TUNING = False
if TUNING:
    # `objective` returns accuracy, hence the study maximizes it.
    study = optuna.create_study(direction = "maximize")
    study.optimize(objective, n_trials = 10)
    print("=" * 100)
    # Best sampled parameter values found over the 10 trials.
    print(study.best_params)

# Inference

In [None]:
# Average the per-fold predictions on the test set, threshold at 0.5 and write
# the PassengerId / Survived pairs as the submission file.
fold_preds = [model.predict(test[use_cols]) for model in models]
preds = np.mean(fold_preds, axis = 0)
test["Survived"] = np.where(preds > 0.5, 1, 0)
test[["PassengerId", "Survived"]].to_csv("submission.csv", index = False)

In [None]:
import shap
# Load SHAP's JavaScript support into the notebook.
shap.initjs()
# Only the first fold's booster (models[0]) is explained here.
explainer = shap.TreeExplainer(model = models[0])
# SHAP values are computed on the full training feature matrix.
shap_values = explainer.shap_values(X = train[use_cols])
shap.summary_plot(shap_values, train[use_cols])