In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tqdm.auto import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import os

# The original Titanic dataset is attached next to the synthetic one for comparison.
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


I added the original Titanic dataset so I can do a bit of comparison, because I tried copying features from my earlier Titanic notebooks and some of them did not work.

In [None]:
%%time
# Load the original Titanic training set (kept only for comparison) and preview its first rows.
df_train_orig = pd.read_csv("/kaggle/input/titanic/train.csv")
df_train_orig.head()

In [None]:
# Load the Tabular Playground Series (April 2021) training set and preview its first rows.
df_train = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
df_train.head()

One difference seems to be that the synthetic data in this dataset has no titles for people. Thus the title is not useful as a feature in this case.

In [None]:
# Load the Tabular Playground Series (April 2021) test set and preview its first rows.
df_test = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")
df_test.head()

Combine train and test data to build some common features.

In [None]:
# Flag each row's origin so the combined frame can be split back apart later.
df_train["train"] = 1
df_test["train"] = 0
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it the
# train and test row labels overlap, which makes label-based selection and any
# index-aligned operation on df_all ambiguous.
df_all = pd.concat([df_train, df_test], sort=False, ignore_index=True)
df_all.head()

In [None]:
def parse_cabin_type(x):
    """Return the first character of a cabin string, or None when the value is missing."""
    return None if pd.isnull(x) else x[0]

In [None]:
def parse_cabin_num(x):
    """Return the numeric part of a cabin string as an int.

    The first character is treated as the cabin type and skipped; the rest is
    parsed as an integer.

    Returns -1 when the value is missing or when the remainder is not a valid
    integer (e.g. a cabin that consists of a letter only), so the result is
    always an int and a later integer cast cannot fail.
    """
    if pd.isnull(x):
        return -1
    try:
        return int(x[1:])
    except ValueError:
        # No parsable number after the type letter: use the same sentinel as "missing".
        return -1

In [None]:
# Cabin type for every row; the parser is handed to apply() directly.
cabin_type = df_all["Cabin"].apply(parse_cabin_type)


In [None]:
# Display the cabin-type Series computed from the Cabin column.
cabin_type

In [None]:
# Cabin number for every row; the parser is handed to apply() directly.
cabin_num = df_all["Cabin"].apply(parse_cabin_num)
cabin_num

In [None]:
# List the distinct cabin-number values.
cabin_num.unique()

Also, the original dataset had people who were marked as having multiple cabins. This synthetic dataset does not have that.

In [None]:
def parse_cabin_count(x):
    """Count the whitespace-separated cabins in a cabin string.

    A typical passenger has a single cabin, but some had several, listed
    separated by spaces. Returns NaN when the value is missing.
    """
    if pd.isnull(x):
        return np.nan
    return len(x.split())


In [None]:
# Split the Cabin column into a type feature and a number feature.
cabin_col = df_all["Cabin"]
df_all["cabin_type"] = cabin_col.apply(parse_cabin_type)
# parse_cabin_num uses -1 for missing cabins; cast so the whole column is integer-typed.
df_all["cabin_num"] = cabin_col.apply(parse_cabin_num).astype(int)
#no multiple cabins in this set
#df_all["cabin_count"] = df_all["Cabin"].apply(lambda x: parse_cabin_count(x))
df_all.head()

In [None]:
# Combine the SibSp and Parch counts; the +1 presumably accounts for the passenger themselves.
df_all["family_size"] = df_all["SibSp"] + df_all["Parch"] + 1

In [None]:
#there are no titles in this dataset

#df_all['Title'] = df_all['Name'].str.extract('([A-Za-z]+)\.', expand=True)
#df_all.head()


In [None]:
# Capture the first run of letters that is immediately followed by a period.
# The pattern is a raw string so the "\." escape reaches the regex engine as written;
# in a normal string literal "\." is an invalid escape sequence and triggers a warning.
# expand=False returns a Series (one capture group), which is what a single column needs.
df_train_orig['Title'] = df_train_orig['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_train_orig.head()


So the above shows the titles in the original dataset, extracted to the "Title" column. The new synthetic one gives nothing if you run that on it.

There are some missing values. Age is one, so it needs to be imputed, meaning the blanks need to be filled in.

In [None]:
# Count the rows whose Age is missing.
df_all['Age'].isnull().sum()

In [None]:
# Number of distinct non-missing Age values.
df_all["Age"].nunique()

There are 175 different age values, so the age must be reported in fractions of a year. The following confirms this:

In [None]:
# Show the distinct values found in the Age column.
df_all["Age"].unique()


In [None]:
# Show the rows whose Fare is missing.
df_all[df_all["Fare"].isnull()]

The number of missing fares by passenger group (Pclass):

In [None]:
# For each Pclass, count how many rows have a missing Fare.
df_all.groupby('Pclass').agg({'Fare': lambda x: x.isnull().sum()})

Strangely, the fare also seems to vary quite a lot inside each passenger class. The following shows over 20k different values for class 1 alone:

In [None]:
# Number of distinct Fare values within each Pclass.
df_all.groupby('Pclass')["Fare"].nunique()

In [None]:
# Median fare of the Pclass 3 passengers.
# The filter previously selected Pclass == 2 although the variable is named p3 and the
# following cells examine class 3. Re-check any median value quoted in the text after re-running.
p3_median_fare = df_all.loc[df_all["Pclass"] == 3, "Fare"].median()
p3_median_fare

The values are mostly concentrated at the low end of the range, with the median of 21.7 computed above:

In [None]:
# Histogram (100 bins) of Fare for the Pclass 3 passengers.
df_all[df_all["Pclass"] == 3].hist(column="Fare", bins=100)

See how the fares vary quite a bit within the class, although the differences are mostly not too big:

In [None]:
# Frequency of each distinct Fare value among the Pclass 3 passengers.
df_all[df_all["Pclass"] == 3]["Fare"].value_counts()

This fills the missing fares with the median of the passenger's class. So a passenger in class 1 with a missing fare gets a new fare value that is the median of all reported fares in class 1:

In [None]:
# Per-row median Fare of the row's own Pclass, used to fill the missing fares.
class_median_fare = df_all.groupby('Pclass')['Fare'].transform('median')
df_all['Fare'] = df_all['Fare'].fillna(class_median_fare)

In [None]:
# Count the Fare values that are still missing after the fill.
df_all["Fare"].isnull().sum()

In [None]:
# Preview the combined frame after imputing the fares.
df_all.head()

Change all categorical columns to pandas categorical data type to make use of LGBM's built-in categorical data handling. Thus no need for one-hot encoding:

In [None]:
# pd.Int64Dtype seems to be an integer dtype that also accepts NaN. Using it for
# "Survived" causes an "unknown label type" error in LGBM, so that column stays float.
#df_all["Survived"] = df_all["Survived"].astype(pd.Int64Dtype())
# Convert the categorical feature columns to the pandas category dtype.
for cat_col in ("Sex", "Embarked", "cabin_type"):
    df_all[cat_col] = df_all[cat_col].astype('category')

In [None]:
# PassengerId ranked high in feature importance at some point, for no obvious reason
# (it could perhaps indicate boarding order). It is removed anyway, together with
# Cabin, Name and Ticket.
unused_cols = ["Cabin", "Name", "Ticket", "PassengerId"]
df_all = df_all.drop(columns=unused_cols)
df_all.head()

In [None]:
# Inspect the column dtypes after the casts and column removal.
df_all.dtypes

Now that the data is all processed and features added, split it back to the original train/test set:

In [None]:
# Rows flagged train == 1 came from the training file.
is_train_row = df_all["train"] == 1
df_train = df_all[is_train_row]
df_train.head()

In [None]:
# Rows flagged train == 0 came from the test file; remove the target and the flag column.
df_test = df_all[df_all["train"] == 0].drop(columns=["Survived", "train"])
df_test.head()


In [None]:
# Separate the feature matrix from the target column.
X = df_train.drop(columns=["Survived", "train"])
y = df_train["Survived"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows as a validation split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=314,
    stratify=y,
)

In [None]:
%%time
import lightgbm as lgbm


In [None]:
# Inspect the column dtypes of the training frame.
df_train.dtypes

Check our categorical columns are still correct:

In [None]:
# Names of the columns that carry the pandas category dtype.
cat_cols = df_train.select_dtypes(include='category').columns
cat_cols

Set the parameters to use for LGBM fit() function:

In [None]:
# Keyword arguments passed to the LGBM fit() calls below.
# NOTE(review): whether fit() accepts `early_stopping_rounds` and `verbose` depends on the
# installed LightGBM version (newer releases use callbacks instead) - confirm before upgrading.
fit_params = {
    "eval_metric": ["binary_logloss", "auc"],
    #"n_estimators": [2000, 5000, 10000, 15000],
    "early_stopping_rounds": 50,
    "eval_set": [(X_test, y_test)],
    "verbose": 100,  # this results in printing info every 100th round
    "categorical_feature": 'auto',
    #"categorical_feature": cat_cols,
}

    

This is a method to define a range of values to explore for the Random Search algorithm:

In [None]:
from scipy.stats import randint as sp_randint

# Frozen discrete-uniform distribution over the integers 6..49 (the upper bound is exclusive).
frozen = sp_randint(6, 50)
# Seeded so the sample, and the histogram drawn from it, is the same on every run.
# 314 is the seed this notebook uses elsewhere.
frozen_results = frozen.rvs(size=1000, random_state=314)


Just to see what types of data the above generated:

In [None]:
# Histogram of the values sampled from the frozen randint distribution.
plt.hist(frozen_results)

Do a randomized search over the search space:

In [None]:
%%time
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

param_space={'num_leaves': sp_randint(6, 100), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

clf = lgbm.LGBMClassifier(max_depth=-1, random_state=314, 
                         silent=True, metric='None', 
                         n_jobs=4, n_estimators=5000)

gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_space, 
    n_iter=100,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

Now run the search that was just configured above.

Unfortunately, the following will also print excessive error messages about overriding some categorical value. A quick search turned up no solution, so I am just leaving it as is.

In [None]:
%%time
# Run the randomized search on the training split, passing fit_params through as keyword arguments.
gs.fit(X_train, y_train, **fit_params)


In [None]:
# Report the best score found by the search and the parameters that produced it.
print(f'Best score reached: {gs.best_score_} with params: {gs.best_params_} ')

Take the best parameters that the above search found, and re-train the LGBM with those.

In [None]:
# New classifier with the same fixed settings as the search estimator, plus the best parameters found.
clf = lgbm.LGBMClassifier(max_depth=-1, random_state=314, 
                         silent=True, metric='None', 
                         n_jobs=4, n_estimators=5000, **gs.best_params_)

In [None]:
# Train on the training split, reusing the fit_params defined earlier.
clf.fit(X_train, y_train, **fit_params)


Just for interest, plot the highest ranked features:

In [None]:
# Pair each feature name with its importance from the fitted classifier.
features = X.columns
importances = clf.feature_importances_
feat_importances = pd.Series(data=importances, index=features)

# Horizontal bar chart of the (up to) 30 most important features, sorted ascending.
top_features = feat_importances.nlargest(30).sort_values()
top_features.plot.barh(color='#86bf91', figsize=(10, 8))
plt.show()

Make the predictions for submission:

In [None]:
# Predict labels for the processed test frame.
predictions = clf.predict(df_test)

In [None]:
# Display the predicted labels.
predictions

And save them. The Kaggle system does not seem to like floats for 1/0 survived here, so have to convert them to ints.

In [None]:
# Only PassengerId is needed from the raw test file, so read just that column. This also
# avoids assigning into a column subset of another frame (SettingWithCopyWarning).
sub_df = pd.read_csv(
    "/kaggle/input/tabular-playground-series-apr-2021/test.csv",
    usecols=["PassengerId"],
)
# The submission expects integer 0/1 labels, so cast the predictions before storing them.
sub_df = sub_df.assign(Survived=np.asarray(predictions).astype(int))

In [None]:
# Write the submission file without the index column.
sub_df.to_csv("sub.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
sub_df.head()

In [None]:
# Show the first lines of the written file.
!head sub.csv

In [None]:
# Show the last lines of the written file.
!tail sub.csv