In [None]:
# --- Standard library ---
import os
from pathlib import Path

# --- Numerics, data frames, plotting ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- scikit-learn helpers and baseline estimators ---
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for found_path in (os.path.join(dirname, filename) for filename in filenames):
        print(found_path)

# Switch matplotlib figures to seaborn's default theme.
sns.set()

# Directory the notebook reads train.csv, test.csv and sample_submission.csv from.
input_path = Path('/kaggle/input/tabular-playground-series-apr-2021/')

In [None]:
# Read the training table (indexed by PassengerId) and preview its first rows.
train_csv = input_path / 'train.csv'
train = pd.read_csv(train_csv, index_col='PassengerId')
display(train.head())

In [None]:
# Summary of the freshly loaded training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Read the test table (indexed by PassengerId) and preview its first rows.
test_csv = input_path / 'test.csv'
test = pd.read_csv(test_csv, index_col='PassengerId')
display(test.head())

In [None]:
# Read the sample submission (indexed by PassengerId) to see the expected output layout.
sample_csv = input_path / 'sample_submission.csv'
submission = pd.read_csv(sample_csv, index_col='PassengerId')
display(submission.head())

# Exploratory Analysis

In [None]:
# Split the training columns into a numerical view and a categorical view.
num_cols = ['Age', 'SibSp', 'Parch', 'Fare']
cat_cols = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

df_num = train[num_cols]
df_cat = train[cat_cols]

In [None]:
# Missing-value count per training column, largest first.
train.isna().sum().sort_values(ascending=False)

In [None]:
# Missing-value count per test column, largest first.
test.isna().sum().sort_values(ascending=False)

Numerical variables

In [None]:
# Histograms (with a KDE overlay) of the numerical variables, one panel per column.
fig, axes = plt.subplots(1, 4, figsize=(22,6))
fig.suptitle('Distribution of numerical variables')

for panel, column in zip(axes, ['Age', 'SibSp', 'Parch', 'Fare']):
    sns.histplot(x=train[column], kde=True, ax=panel)


In [None]:
# Pairwise correlations of the numerical columns: printed table, then heatmap.
num_corr = df_num.corr()
print(num_corr)
sns.heatmap(num_corr)

In [None]:
# Average of each numerical column for survivors vs. deceased
# (pivot_table aggregates with the mean by default).
train.pivot_table(index='Survived', values=df_num.columns)

In [None]:
# Age distribution in one panel per passenger class, coloured by survival.
sns.displot(
    data=train,
    x='Age',
    hue='Survived',
    col='Pclass',
    kde=True,
)

In [None]:
# Age distribution in one panel per sex, coloured by survival.
sns.displot(
    data=train,
    x='Age',
    hue='Survived',
    col='Sex',
    kde=True,
)

In [None]:
# Average of Survived per category for Sex, Pclass and Embarked, one panel each.
fig, axes = plt.subplots(1, 3, figsize=(22,6))
fig.suptitle('Distribution of categorical variables')

for panel, column in zip(axes, ['Sex', 'Pclass', 'Embarked']):
    sns.barplot(x=column, y='Survived', data=train, ax=panel)

In [None]:
# Passenger-class counts per port of embarkation. Southampton has a greater
# proportion of 3rd class, which could be linked with its lower survival rate.
sns.catplot(
    data=train,
    x='Pclass',
    col='Embarked',
    kind='count',
)

# Feature Engineering

In [None]:
# Cabin letter: first character of the cabin string. Missing cabins become 'n'
# because str(NaN) is 'nan'.
train['cabin_lett'] = train.Cabin.apply(lambda x: str(x)[0])
test['cabin_lett'] = test.Cabin.apply(lambda x: str(x)[0])

# Passenger counts per cabin letter and survival outcome. display() is needed
# because this is not the last expression of the cell; a bare expression here
# would be evaluated and silently discarded.
display(pd.pivot_table(train, index = 'Survived', columns = 'cabin_lett', values = 'Ticket', aggfunc = 'count'))

# Cabin number: first run of digits in the cabin string, as a number (NaN when
# the cabin is missing or has no digits). The pattern is a raw string so that
# "\d" reaches the regex engine instead of being an invalid Python escape.
for df in (train, test):
    df['cabin_num'] = pd.to_numeric(df.Cabin.str.extract(r'(\d+)', expand=False))

In [None]:
# Average of Survived for each cabin letter.
sns.barplot(
    data=train,
    x='cabin_lett',
    y='Survived',
)

In [None]:
# Log-transform the fare (shifted by 1 so that a zero fare maps to 0) in both
# frames, then plot its distribution by survival outcome.
for frame in (train, test):
    frame['log_Fare'] = np.log(frame.Fare + 1)

sns.displot(data=train, x='log_Fare', hue='Survived', kde=True)

In [None]:
# Family size: siblings/spouses plus parents/children travelling with the passenger.
train['family'] = train.SibSp + train.Parch
test['family'] = test.SibSp + test.Parch

# Density of family size by survival outcome.
# `fill=True` is the current spelling of the deprecated `shade=True`.
sns.kdeplot(data = train, x = 'family', fill=True, hue = 'Survived')

In [None]:
# Log-transformed family size (shifted by 1 so that a family of 0 maps to 0).
train['log_Fam'] = np.log(train.family+1)
test['log_Fam'] = np.log(test.family+1)

# Density of the transformed feature by survival outcome.
# `fill=True` is the current spelling of the deprecated `shade=True`.
sns.kdeplot(data = train, x = 'log_Fam', fill=True, hue = 'Survived')

In [None]:
# Combined sex/class label: the Sex string followed by the class number, stored as 'Who'.
for frame in (train, test):
    frame['Who'] = frame.Sex + frame.Pclass.map(str)

In [None]:
# Name split: the part before the first ", " goes to last_name and the rest to
# first_name (presumably names are stored as "Last, First" - confirm against the data).
# n=1 caps the split at two pieces, so a name containing a second ", " cannot
# produce a third column and break the two-column assignment.
for df in (train, test):
    df[['last_name', 'first_name']] = df.Name.str.split(", ", n=1, expand=True)

In [None]:
# Ticket split: numeric part, its log, and the non-digit remainder.
# Patterns are raw strings so that "\d" reaches the regex engine instead of
# being an invalid Python string escape.
for df in (train, test):
    # First run of digits in the ticket, as a number (NaN when there is none).
    df['ticket_num'] = pd.to_numeric(df.Ticket.str.extract(r'(\d+)', expand=False))
    # Shift by 1 before the log so a ticket number of 0 maps to 0.
    df['log_ticket_num'] = np.log(df.ticket_num + 1)
    # Whatever is left of the ticket once every digit is removed.
    df['ticket_lett'] = df.Ticket.replace(r'(\d)', '', regex=True)

#train.ticket_lett.unique()

In [None]:
# Re-check columns, dtypes and non-null counts now that the engineered features exist.
train.info()

In [None]:
# Preview the training frame including the engineered columns.
train.head()

# Preprocessing

In [None]:
# Impute missing values.
# Every statistic is computed on the training set, once, before anything is
# filled, and the same fill value is applied to both frames. Imputing the test
# set with its own means/medians would encode an identical missing entry
# differently in train and test.
# The raw Fare and Cabin columns are intentionally left as they are here.
fill_values = {
    'Embarked': 'N',          # explicit placeholder category for an unknown port
    'Age': train.Age.mean(),
    'log_Fare': train.log_Fare.median(),
    'Ticket': 0,
    'ticket_lett': '',
    'ticket_num': train.ticket_num.mean(),
    'cabin_num': train.cabin_num.mean(),
    'log_ticket_num': train.log_ticket_num.median(),
}

for df in (train, test):
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

In [None]:
# Remove the raw columns that the engineered features were derived from, plus
# log_ticket_num, so they are not fed to the models.
# A single drop with errors='ignore' keeps the cell re-runnable (a second run of
# pop() raises KeyError) and avoids echoing the last popped Series as output.
train = train.drop(
    columns=['Cabin', 'Ticket', 'Sex', 'Fare', 'Name', 'log_ticket_num'],
    errors='ignore',
)

In [None]:
# Remove the raw columns that the engineered features were derived from, plus
# log_ticket_num, so they are not fed to the models.
# A single drop with errors='ignore' keeps the cell re-runnable (a second run of
# pop() raises KeyError) and avoids echoing the last popped Series as output.
test = test.drop(
    columns=['Cabin', 'Ticket', 'Fare', 'Sex', 'Name', 'log_ticket_num'],
    errors='ignore',
)

In [None]:
# Label-encode the categorical (object-dtype) columns. Each encoder is fitted on
# the train and test values together so both frames share one integer mapping.
object_cols = [name for name in train.columns if train[name].dtype == 'object']

for col in object_cols:
    encoder = LabelEncoder()
    encoder.fit(list(train[col].values) + list(test[col].values))
    train[col] = encoder.transform(train[col].values)
    test[col] = encoder.transform(test[col].values)

display(train.head())

In [None]:
# Preview the training frame after label encoding.
train.head()

In [None]:
# Preview the test frame after label encoding.
test.head()

# **Split data into train and validation**

In [None]:
# Detach the label column from the features, then keep the first 75% of rows for
# training and hold out the remainder for validation (row order is preserved
# because shuffling is off).
target = train.pop('Survived')
split_options = {'train_size': 0.75, 'shuffle': False}
X_train, X_valid, y_train, y_valid = train_test_split(train, target, **split_options)

# Model tuning

In [None]:
# XGBoost
# (Section label. As a bare name in a code cell it raised NameError on execution.)

In [None]:
# Import libraries, then run a broad grid search to find the best parameters
# for the XGBoost model (5-fold CV, scored by accuracy, on the full training data).
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
params_xgb = {
    'n_estimators': [10, 100, 250, 500],
    'max_depth': [2, 4, 6, 8],
    'learning_rate': [0.1, 0.05, 0.01],
    'min_child_weight': [1, 2, 4, 6, 8],
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
)
grid_search.fit(train, target)

# Best mean CV accuracy, then the parameter combination that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')

Base model 0.77925 {'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 8, 'n_estimators': 250}
Next, add the new features from the feature engineering section to see whether they improve the model, then tune the parameters again.

In [None]:
# Grid search again, this time over narrower parameter ranges and with the
# learning rate fixed at 0.05.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    learning_rate=0.05,
    use_label_encoder=False,
    eval_metric="logloss",
)
params_xgb = {
    'n_estimators': [200, 225, 250, 275, 300, 325, 350, 375],
    'max_depth': [3, 4, 5],
    'min_child_weight': [7, 8, 9, 10],
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
)
grid_search.fit(train, target)

# Best mean CV accuracy, then the parameter combination that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')

# LightGBM 

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# LightGBM with all parameters fixed except reg_lambda, which is the only one
# searched in this pass (5-fold CV, scored by accuracy).
lgbm = LGBMClassifier(
    learning_rate=0.05,
    num_leaves=35,
    max_depth=7,
    n_estimators=160,
    feature_fraction=0.7,
    reg_alpha=0.6,
)
params_lgb = {'reg_lambda': [0.5, 0.6, 0.7]}

grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=params_lgb,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
)
grid_search.fit(train, target)

# Best mean CV accuracy, then the parameter combination that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')

# CatBoost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# CatBoost baseline; the grid below overrides `iterations` for each candidate.
clf_model = CatBoostClassifier(iterations =575, depth=6, learning_rate = 0.05, l2_leaf_reg=0.4, eval_metric = "Logloss")

# The grid key must be a real estimator parameter name. It was the empty string
# '', which is not a parameter. The values look like iteration counts, so
# 'iterations' is used here.
# NOTE(review): confirm 'iterations' is the parameter that was meant to be searched.
params_clf = [
    {'iterations':[100,200,500]}]

grid_search = GridSearchCV(clf_model, params_clf, cv=5, scoring='accuracy', n_jobs=1)
grid_search.fit(train,target)

# Best mean CV accuracy, then the parameter combination that achieved it.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7844300000000001
{0.05 learning rate, 'depth': 6, 'iterations': 500}

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# CatBoost with everything fixed except border_count, which is searched below
# (5-fold CV, scored by accuracy).
clf_model = CatBoostClassifier(
    learning_rate=0.05,
    iterations=575,
    depth=6,
    l2_leaf_reg=10,
    border_count=32,
    eval_metric="Logloss",
)
params_clf = {'border_count': [32, 5, 10, 20, 50, 100, 200]}

grid_search = GridSearchCV(
    estimator=clf_model,
    param_grid=params_clf,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
)
grid_search.fit(train, target)

# Best mean CV accuracy, then the parameter combination that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')

0.7845799999999999
{'depth': 6, 'iterations': 575}

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Fit CatBoost on the training split, passing the hold-out split as the
# evaluation set, with per-iteration logging switched off.
clf = CatBoostClassifier(iterations =575, depth=6, learning_rate = 0.05, l2_leaf_reg=0.4, eval_metric = "Logloss")
clf.fit(X_train, y_train,
        eval_set=(X_valid, y_valid),
        verbose=False
)

# Predict the hold-out rows and report accuracy. accuracy_score takes
# (y_true, y_pred); the arguments were previously passed the other way round.
predictions = clf.predict(X_valid)
print(predictions)
print(accuracy_score(y_valid, predictions))

# Model fitting

In [None]:
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier, plot_importance

# Fit XGBoost on the full training data, predict the test set, and plot the
# feature importances of the fitted model.
model = XGBClassifier(
    n_estimators=275,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=8,
    use_label_encoder=False,
    eval_metric="logloss",
)
model.fit(train, target)

predictions = model.predict(test)
print(predictions)

plot_importance(model)

In [None]:
# Fit CatBoost on the full training data and predict the test set.
clf = CatBoostClassifier(
    iterations=575,
    depth=6,
    learning_rate=0.05,
    l2_leaf_reg=0.4,
    eval_metric="Logloss",
)
clf.fit(train, target)

predictions = clf.predict(test)
print(predictions)

In [None]:
# Fit LightGBM on the full training data and predict the test set.
lgbm = LGBMClassifier(
    learning_rate=0.05,
    num_leaves=35,
    max_depth=7,
    n_estimators=160,
    feature_fraction=0.7,
    reg_alpha=0.6,
)
lgbm.fit(train, target)

predictions = lgbm.predict(test)
print(predictions)

In [None]:
# Create the submission file: one Survived prediction per test PassengerId.
# NOTE(review): `predictions` holds the output of whichever model cell ran last.
# With the cells run top to bottom that is the LightGBM model, while the file
# name suggests CatBoost - confirm the intended model before submitting.
sub = pd.DataFrame(
    {
        'PassengerId': test.index,
        'Survived': predictions,
    }
)
submission = sub.set_index('PassengerId')
submission.to_csv('cat-a.csv')

Here are my final accuracy scores for my predictions made via my three models. 
XGBClassifier:      private score = 0.79520 (best score)
                    public score  = 0.79724 
CatBoostClassifier: private score = 0.79362 (top 43%)
                    public score  = 0.79757
LGBMClassifier:     private score = 0.79515
                    public score  = 0.79700

XGBClassifier outperformed the other two models on the private leaderboard with 79.520% accuracy. The CatBoostClassifier model achieved the best public score and was automatically used as the final submission for the Kaggle rankings, placing me in the top 43% of entrants.