### **TITANIC EDA and Model**  - Public Leader Board 0.79385

In [None]:
import gc
import json
import os
from pathlib import Path

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import seaborn as sns
#import torch
#from torch import nn, optim
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

### Read the data

In [None]:
# Load the labelled train set, the unlabelled test set and the submission template
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)
train.shape, test.shape

In [None]:
# Split the columns into numeric and non-numeric groups through the public dtype
# API rather than the private `_get_numeric_data` helper.
numcols = train.select_dtypes(include=['number', 'bool']).columns
# Keep the frame's own column order so `catcols` is the same on every run
# (a set difference has no defined order).
catcols = [col for col in train.columns if col not in numcols]
target = 'Survived'

In [None]:
# Count the missing values of every training column once, then report them
missing_per_column = train.isnull().sum()
for col in train.columns:
    print(col, " Missing Data Count: ", missing_per_column[col])

The Age, Ticket, Fare, Cabin and Embarked features have missing values that need to be imputed.

In [None]:
# Horizontal bar chart of the number of passengers in each target class
survived_counts = train['Survived'].value_counts()
survived_counts.plot.barh(color="gray")

The distribution of data across both classes of the target variable seems reasonable.

In [None]:
# Stack train and test so imputation and feature engineering see every row.
# ignore_index=True gives each row a unique label; without it the test rows
# repeat the train labels and label-aligned assignments can pick the wrong row.
data = pd.concat([train, test], ignore_index=True)

In [None]:
# (rows, columns) of the train, test and combined frames
tuple(frame.shape for frame in (train, test, data))

## Imputing Missing Data

### Age

In [None]:
# Age has not been imputed yet at this point, so drop the missing values:
# the histogram range must be computed from real observations only.
plt.hist(data['Age'].dropna(), edgecolor='w', color="gray", bins=25)
plt.title('Age')
plt.xlabel('Age (years)')
plt.ylabel('Count');

In [None]:
# Mean age in the test and train sets, in that order
tuple(frame['Age'].mean() for frame in (test, train))

In [None]:
# The mean age in the test data is lower than in the train data. Let's examine the correlation of Age with Survival.

In [None]:
# Compare the age distribution of non-survivors (gray) and survivors (red)
plt.figure(figsize=(10, 8))
for survived, curve_color in ((0, "gray"), (1, "red")):
    sns.kdeplot(train.loc[train['Survived'] == survived, 'Age'],
                label=f'Survived == {survived}', color=curve_color)
plt.xlabel('Age (years)')
plt.ylabel('Survived Density')
plt.title('Distribution of Ages')
# Recent seaborn versions do not draw a legend for labelled curves on their own,
# so request it explicitly; otherwise the two curves cannot be told apart.
plt.legend();

In [None]:
# It appears that survivors and non-survivors have different densities in the 20 to 40 age group.

In [None]:
# Let's look at the age of passengers in different classes

In [None]:
# Fill missing ages with the median age of passengers of the same class and sex.
# `transform` returns one value per input row in the original row order, so the
# assignment stays aligned row-for-row however `data` is indexed.
data['Age'] = (
    data.groupby(['Pclass', 'Sex'])['Age']
        .transform(lambda ages: ages.fillna(ages.median()))
)

In [None]:
# Bin the ages into ordinal groups 0..6 using right-closed intervals;
# every age above 40 ends up in group 6.
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
data['Age_bin'] = pd.cut(data['Age'], bins=age_edges, labels=False).astype(float)

# let's see how it's distributed
data['Age_bin'].value_counts().plot(kind='barh', color="gray")

In [None]:
# Rank every passenger among the passengers that share exactly the same age,
# once for each tie-breaking method.
rank_methods = (
    ("RANK", "first"),
    ("RANK_avg", "average"),
    ("RANK_max", "max"),
    ("RANK_min", "min"),
)
for rank_column, tie_method in rank_methods:
    data[rank_column] = data.groupby("Age")['Age'].rank(method=tie_method, ascending=True)

In [None]:
# Two-way age split using left-closed intervals: [-inf, 50) and [50, inf)
age_50_split = pd.cut(
    data['Age'],
    bins=[-np.inf, 50, np.inf],
    right=False,
    labels=['below 50', 'above 50'],
)
data['AgeBin2'] = age_50_split.astype(str)

In [None]:
# Preview the first rows instead of rendering the whole combined frame
data.head()

In [None]:
# Passenger counts per class, split by age bin
plt.figure(figsize=(10, 8))
age_bin_palette = sns.color_palette("icefire")
sns.countplot(data=data, x='Pclass', hue='Age_bin', palette=age_bin_palette)

In [None]:
# Number of training passengers of each sex
train['Sex'].value_counts()

In [None]:
# It is preferable to impute the missing age data with the median of their classes

### Ticket

In [None]:
# Replace missing tickets with the most frequent ticket value.
# `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data['Ticket'] = imputer.fit_transform(data['Ticket'].values.reshape(-1, 1))[:, 0]

In [None]:
#Referenced https://www.kaggle.com/dwin183287/tps-april-2021-models-feature-enginering

# Ticket code: the ticket with punctuation and spaces removed ('NA' if missing).
# `regex=True` is explicit because newer pandas treats the pattern as a literal
# string by default; raw strings avoid invalid-escape warnings for \w, \s and \d.
data['TicketCode'] = (
    data['Ticket']
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(' ', '', regex=False)
        .fillna('NA')
)

# Ticket number: the first run of digits in the ticket, 0 when there is none.
# expand=False returns a Series rather than a one-column DataFrame.
data['TicketNumber'] = (
    data['Ticket']
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
        .fillna(0)
)

### Embarked

In [None]:
# Passenger counts per embarkation port, smallest first
embarked_counts = data['Embarked'].value_counts().sort_values()
embarked_counts.plot.barh(color="gray")

In [None]:
# Fill missing ports with the most frequent port. Assigning the result back
# (rather than `inplace=True` on a column selection) guarantees `data` is updated.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

### Cabin

In [None]:
# sns.countplot(x='Survived',hue='Cabin',data=data,
#               palette=sns.color_palette("icefire"))

In [None]:
# Replace missing cabins with the most frequent cabin value.
# `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data['Cabin'] = imputer.fit_transform(data['Cabin'].values.reshape(-1, 1))[:, 0]

### Class

In [None]:
# Passenger counts per ticket class, smallest first
pclass_counts = data['Pclass'].value_counts().sort_values()
pclass_counts.plot.barh(color="gray")

In [None]:
# Survival outcome counts, split by ticket class
pclass_palette = sns.color_palette("icefire")
sns.countplot(data=data, x='Survived', hue='Pclass', palette=pclass_palette)

### Fare

In [None]:
# Compare the fare distribution of non-survivors (gray) and survivors (red)
plt.figure(figsize=(10, 8))
for survived, curve_color in ((0, "gray"), (1, "red")):
    sns.kdeplot(train.loc[train['Survived'] == survived, 'Fare'],
                label=f'Survived == {survived}', color=curve_color)
plt.xlabel('Fare')
plt.ylabel('Survived Density')
plt.title('Distribution of Fare')
# Recent seaborn versions do not draw a legend for labelled curves on their own,
# so request it explicitly; otherwise the two curves cannot be told apart.
plt.legend();

In [None]:
# Replace missing fares with the median fare.
# `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
data['Fare'] = imputer.fit_transform(data['Fare'].values.reshape(-1, 1))[:, 0]

In [None]:
# Re-check the original columns on the combined frame after imputation
missing_after_imputation = data.isnull().sum()
for col in train.columns:
    print(col, " Missing Data Count: ", missing_after_imputation[col])

## Feature Engineering

In [None]:
# Family size = siblings/spouses + parents/children + the passenger
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)

In [None]:
# Round fares to whole units
data['Fare'] = data['Fare'].round(decimals=0)

In [None]:
# Split the fare range into four equal-width bins and store each passenger's bin.
# The binned values are assigned directly: per-bin totals (`.value_counts()`) are
# indexed by bin label, cannot align with passenger rows, and would yield all NaN.
lblFare = ['Low_fare','median_fare','Average_fare','high_fare']
data['Fare_bin'] = pd.cut(data['Fare'], bins=4, labels=lblFare)

In [None]:
# Interaction feature: age multiplied by ticket class
data['Age_Class'] = data['Age'].mul(data['Pclass'])

In [None]:
# Fare divided by family size, truncated to an integer
fare_share = data['Fare'].div(data['FamilySize'])
data['Fare_Per_Person'] = fare_share.astype(int)

In [None]:
def strDeck(strcabin):
    """Return the deck label found in a cabin string.

    The deck list is scanned in its own order and the first entry that occurs
    as a substring of ``strcabin`` is returned.

    Parameters
    ----------
    strcabin : str
        Cabin value. Any non-string value (for example a missing value) is
        treated as having no deck.

    Returns
    -------
    str
        The matching deck entry, or ``'Unknown'`` when nothing matches.
    """
    deck_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    if not isinstance(strcabin, str):
        return 'Unknown'
    # next() with a default avoids the IndexError an empty match list would raise
    return next((deck for deck in deck_list if deck in strcabin), 'Unknown')

In [None]:
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
# Derive the deck of every passenger from the cabin value
data['Deck'] = data['Cabin'].map(strDeck)

In [None]:
# Rows with a known Survived value form the train set; the rest form the test set
is_labelled = data['Survived'].notna()
train = data.loc[is_labelled]
test = data.loc[~is_labelled]

### Pair Plots

In [None]:
# g = sns.pairplot(data=train, hue='Survived', palette = 'seismic',
#                  size=1.2,diag_kind = 'kde',diag_kws=dict(shade=True),plot_kws=dict(s=10) )
# g.set(xticklabels=[])

### Feature Correlation

In [None]:
# Correlate the numeric/boolean columns only: text columns cannot be correlated
# and make `DataFrame.corr()` raise on recent pandas versions.
numeric_train = train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), annot=True, vmin=0.3, vmax=0.7, linewidths=0.3)
fig = plt.gcf()
fig.set_size_inches(25, 12)
plt.show()

### Encode Categorical Variables

In [None]:
# Target-encode the categorical columns. The encoder is fitted on the labelled
# rows only, because rows without a Survived value carry no target to learn
# from; the learned mapping is then applied to every row.
encoded_cols = ['Age_bin', 'Sex', 'Deck', 'Embarked']
has_target = data[target].notna().to_numpy()
encoder = ce.TargetEncoder(cols=encoded_cols)
encoder.fit(data.loc[has_target, encoded_cols], data.loc[has_target, target])
data[encoded_cols] = encoder.transform(data[encoded_cols])

In [None]:
# Re-split after encoding. Independent copies are taken because later cells add
# and overwrite columns on both frames; writing into a slice of `data` triggers
# SettingWithCopyWarning and is not guaranteed to behave consistently.
is_labelled = data['Survived'].notna()
train = data.loc[is_labelled].copy()
test = data.loc[~is_labelled].copy()

In [None]:
# LABEL ENCODE
def encode_LE(col,train,test):
    """Label-encode ``col`` in place, consistently across ``train`` and ``test``.

    The values of both frames are pooled and factorized in sorted order, so the
    same value receives the same integer code in both frames. Missing values
    receive the code -1 (pandas ``factorize`` behaviour).

    Parameters
    ----------
    col : str
        Name of the column to encode; it must exist in both frames.
    train, test : pandas.DataFrame
        Frames whose ``col`` column is overwritten with integer codes.

    Returns
    -------
    None
        Both frames are modified in place and the column name is printed.
    """
    df_comb = pd.concat([train[col],test[col]],axis=0)
    codes, _ = df_comb.factorize(sort=True)

    # int16 holds codes up to 32767; use int32 for larger cardinalities so the
    # codes do not silently wrap around to negative values.
    fits_int16 = codes.max(initial=-1) <= np.iinfo('int16').max
    code_dtype = 'int16' if fits_int16 else 'int32'

    train[col] = codes[:len(train)].astype(code_dtype)
    test[col] = codes[len(train):].astype(code_dtype)
    del df_comb, codes
    gc.collect()
    print(col,', ',end='')

    
# FREQ ENCODE
def encode_FE(df1, df2, cols):
    """Add a relative-frequency column ``<col>_FE`` to both frames, in place.

    For each column the values of ``df1`` and ``df2`` are pooled and each value
    is mapped to its share of the pooled non-missing values. The value ``-1``
    is always mapped to ``-1``. The new columns are stored as float32 and the
    name of each new column is printed.
    """
    for col in cols:
        pooled = pd.concat([df1[col], df2[col]])
        freq_by_value = pooled.value_counts(dropna=True, normalize=True).to_dict()
        freq_by_value[-1] = -1
        encoded_name = col + '_FE'
        for frame in (df1, df2):
            frame[encoded_name] = frame[col].map(freq_by_value).astype('float32')
        print(encoded_name, ', ', end='')

In [None]:
# List the columns of the combined frame
data.columns

In [None]:
# Frequency-encode the high-cardinality columns, then label-encode Sex and Deck
encode_FE(train, test, ['Cabin', 'Ticket'])
for label_col in ('Sex', 'Deck'):
    encode_LE(label_col, train, test)

In [None]:
import category_encoders as ce

# Map the two AgeBin2 labels to ordinal codes through an explicit mapping
encoder = ce.OrdinalEncoder(
    cols=['AgeBin2'],
    return_df=True,
    mapping=[{'col': 'AgeBin2', 'mapping': {'below 50': 1, 'above 50': 2}}],
)
train = encoder.fit_transform(train)
# Apply the encoder fitted on train; an encoder must not be re-fitted on test
test = encoder.transform(test)

In [None]:
# Sum (deviation) contrast coding for the listed columns
sum_coded_cols = ["Sex", "Age_bin", "Embarked", "Fare_bin"]
ncoder = ce.SumEncoder(cols=sum_coded_cols, verbose=False)

In [None]:
# Fit the coding on train and apply that same fitted coding to test, so every
# encoded column has the same meaning in both frames (re-fitting on test could
# assign the categories to different contrast columns).
train = ncoder.fit_transform(train)
test = ncoder.transform(test)

In [None]:
# train = pd.get_dummies(train, columns = ["Sex","Age_bin","Embarked","Fare_bin"],
#                              prefix=["Sex","Age_bin","Em_type","Fare_type"])

In [None]:
# test = pd.get_dummies(test, columns = ["Sex","Age_bin","Embarked","Fare_bin"],
#                              prefix=["Sex","Age_bin","Em_type","Fare_type"])

In [None]:
# List the columns of the train frame after encoding
train.columns

In [None]:
## Extra features - Reference - https://www.kaggle.com/subinium/how-to-use-pycaret-with-feature-engineering
def converter(x):
    """Split a value into its text characters and its digits.

    ``x`` is converted to ``str`` and every '.', '/' and ' ' is removed. The
    remaining characters are separated into decimal digits and everything else,
    each group keeping its original order.

    Returns
    -------
    tuple
        ``(text, number)`` where ``number`` is the digits parsed as an ``int``,
        or ``np.nan`` when the value contains no digits.
    """
    cleaned = str(x).replace('.', '').replace('/', '').replace(' ', '')
    # isdecimal() only accepts characters int() can parse; isnumeric() also
    # accepts superscripts and fractions, on which int() raises ValueError.
    digits = ''.join(ch for ch in cleaned if ch.isdecimal())
    text = ''.join(ch for ch in cleaned if not ch.isdecimal())
    if digits:
        return text, int(digits)
    return text, np.nan
    
def create_extra_features(data):
    """Add ticket, cabin, name and family-size derived columns to ``data``.

    The columns Ticket_type, Cabin_type, Cabin_number, Name1, Name2 and isAlone
    are written into ``data`` itself, and that same object is returned.
    """
    def text_part(value):
        # Non-digit part produced by converter()
        return converter(value)[0]

    def number_part(value):
        # Numeric part produced by converter() (NaN when there are no digits)
        return converter(value)[1]

    def name_piece(position):
        # Names are split on ', ': piece 0 precedes the separator, piece 1 follows it
        return lambda full_name: full_name.split(', ')[position]

    data['Ticket_type'] = data['Ticket'].map(text_part)
    #data['Ticket_number'] = data['Ticket'].map(lambda x: converter(x)[1])

    data['Cabin_type'] = data['Cabin'].map(text_part)
    data['Cabin_number'] = data['Cabin'].map(number_part)
    data['Name1'] = data['Name'].map(name_piece(0))
    data['Name2'] = data['Name'].map(name_piece(1))
    # 1 when the passenger's family size is exactly 1, otherwise 0
    data['isAlone'] = data['FamilySize'].apply(lambda size: int(size == 1))

    return data

# Add the extra features to the train frame first, then to the test frame
train, test = (create_extra_features(frame) for frame in (train, test))

In [None]:
from category_encoders.cat_boost import CatBoostEncoder

# The encoder instance gets its own name: binding it to `ce` would overwrite the
# `category_encoders` module alias that other cells rely on.
catboost_encoder = CatBoostEncoder()

# Fit on train with the target, then apply the fitted encoding to test
column_name = ['Ticket_type', 'Cabin_type', 'Name1', 'Name2','TicketCode']
train[column_name] = catboost_encoder.fit_transform(train[column_name], train['Survived'])
test[column_name] = catboost_encoder.transform(test[column_name])

In [None]:
# train['TicketCode'] = ce.fit_transform(train['TicketCode'], train['Survived'])
# test['TicketCode'] = ce.transform(test['TicketCode'])

### Modelling

In [None]:
# Start from every column of the train frame; unwanted ones are removed below
usecols = train.columns.tolist()

In [None]:
# Columns that must not be used as model features
for dropped_col in ('PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived', 'intercept'):
    usecols.remove(dropped_col)

In [None]:
# Drop the sum-coded age-bin columns and the encoded ticket code.
# (The RANK, RANK_avg, RANK_max and RANK_min columns are kept as features.)
encoded_cols_to_drop = [f'Age_bin_{bin_no}' for bin_no in range(6)] + ['TicketCode']
for dropped_col in encoded_cols_to_drop:
    usecols.remove(dropped_col)

In [None]:
# Show the final list of model features
list(usecols)

In [None]:
# %%time
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import KFold,StratifiedKFold
# kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=2021)
# model = lgb.LGBMClassifier(objective='binary',
#                             metric='auc')

# param_grid = {
#               'boosting' : ["gbdt"],
#               'n_estimators' : [300,500],
#               'learning_rate': [0.1,0.01],
#               'max_depth': [4, 8],
#               'num_leaves': [100,150],
#               'feature_fraction': [0.3, 0.1,0.6], 
#               'bagging_fraction' : [0.65,0.25],
#                'min_child_samples': [20,150],
#               'reg_alpha' : [0.1,0.5],
#               'reg_lambda' : [0.25,0.40],
#               }

# modelf = GridSearchCV(model,param_grid = param_grid, cv=kfold, 
#                       scoring="accuracy", n_jobs= 4, verbose = 2)

# modelf.fit(train[usecols],train[target])

# # Best score
# modelf.best_score_

# # Best Estimator
# modelf.best_estimator_

In [None]:
params = {}
params["objective"] = "binary"
params["boosting"] = "gbdt"
params['metric']= "AUC",

params["max_depth"] = 45
params["min_data_in_leaf"] = 1
params["min_child_samples"] = 100
params["colsample_bytree"] = 0.18
params["subsample"] = 0.013

params["cat_l2"] =  22
params["max_bin"] =  33
params["min_data_per_group"] =  90

params["reg_alpha"] =  0.003
params["reg_lambda"] = 8.97
params["learning_rate"] = 0.002
params["bagging_fraction"] = 0.65
params["feature_fraction"] = 0.65
params["num_leaves"] = 20   #50
params["n_estimators"] = 1000
#params["cat_smooth"] = 60
params["nthread"] =  4
params["verbosity"] = -1
params['early_stopping_rounds'] = 500
num_rounds = 1000

In [None]:
%%time
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold

cv_scores = []
pred_test_full = 0
ooflgb = np.zeros(train.shape[0])
predictionslgb= np.zeros(test.shape[0])

fold=StratifiedKFold(n_splits=5,shuffle=True,random_state=2021)
i=1

for dev_index, val_index in fold.split(train[usecols],train[target]):    

    dev_X, val_X = train[usecols].loc[dev_index,:], train[usecols].loc[val_index,:]
    dev_y, val_y = train[target][dev_index], train[target][val_index]
    
    lgtrain = lgb.Dataset(dev_X, label=dev_y)
    lgtest = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, num_rounds,
                          valid_sets=[lgtest], early_stopping_rounds=300, verbose_eval=50)
    
    pred_val  = model.predict(val_X, num_iteration=model.best_iteration)
    pred_test = model.predict(test[usecols], num_iteration=model.best_iteration)
      
    ooflgb[val_index] = pred_val
    predictionslgb += pred_test
    
predictionslgb /= 5.

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def get_best_thresholds(true, preds):
    """Find the cut-off that maximises the micro-averaged F1 score.

    Candidate cut-offs are 0.00, 0.01, ..., 0.99; a prediction counts as class 1
    when it is strictly greater than the cut-off.

    Returns
    -------
    list
        A one-element list holding the best cut-off.
    """
    candidates = [step / 100 for step in range(100)]
    scores = []
    for cutoff in candidates:
        labels = (preds > cutoff) * 1
        scores.append(f1_score(true, labels, average='micro'))
    # np.argmax returns the first maximum, so ties resolve to the lowest cut-off
    return [candidates[np.argmax(scores)]]

In [None]:
#['Pclass','Embarked','Cabin','Ticket',]

In [None]:
# Give Pclass, Cabin and Ticket the pandas category dtype in both frames
for frame in (train, test):
    for categorical_col in ('Pclass', 'Cabin', 'Ticket'):
        frame[categorical_col] = frame[categorical_col].astype('category')


In [None]:
# CatBoost inputs: the shared feature list plus the Cabin column, and the target
X = train[[*usecols, 'Cabin']]
y = train[target]

In [None]:
# Number of shared features versus number of CatBoost input columns
len(usecols), X.shape[1]

In [None]:
%%time
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from catboost import CatBoostClassifier


categorical_features_indices = np.where(X.dtypes =='category')[0]
categorical_features_indices
oofcat = np.zeros(X.shape[0])

errcb=[]
y_pred_totcb=[]
y_pred_totcb = 0 

from sklearn.model_selection import KFold,StratifiedKFold
fold=StratifiedKFold(n_splits=5,shuffle=True,random_state=2021)
i=1
for train_index, test_index in fold.split(X,y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    m=CatBoostClassifier(n_estimators=50000,random_state=2021,
                         eval_metric='Accuracy',max_depth=6,min_data_in_leaf=3,
                         max_ctr_complexity=5,
                         learning_rate=0.04,
                         l2_leaf_reg=10,cat_features=categorical_features_indices,
                         od_wait=500,od_type='Iter',
                         bagging_temperature=0.80,random_strength=100,
                         use_best_model=True)
    
    m.fit(X_train,y_train,eval_set=[(X_test, y_test)], early_stopping_rounds=100,verbose=100)
    
    oofcat[test_index] = m.predict_proba(X_test)[:,-1]
    #preds=m.predict(X_test)[:,-1]

    p = m.predict_proba(test[usecols + ['Cabin']])[:,-1]
    
    y_pred_totcb += p

y_pred_totcb = y_pred_totcb/5 

In [None]:
# Equal-weight average of the LightGBM and CatBoost out-of-fold predictions
oof = 0.5 * (ooflgb + oofcat)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
# Tune the cut-off on the blended out-of-fold predictions, i.e. on the same
# vector the cut-off is applied to just below.
best_thresholds = get_best_thresholds(train[target].values, oof)
# Keep `oof` as probabilities and store the labels separately, so re-running
# this cell always tunes on probabilities.
oof_labels = (oof > best_thresholds) * 1
f1_score(train[target], oof_labels, average='micro')

In [None]:
# Weighted blend of the test predictions: 80% CatBoost, 20% LightGBM
catboost_weight, lgb_weight = 0.80, 0.20
ypred = y_pred_totcb * catboost_weight + predictionslgb * lgb_weight

In [None]:
# Turn the blended probabilities into 0/1 labels with the tuned cut-off
ypred = np.where(ypred > best_thresholds, 1, 0)

In [None]:
# Write the blended labels into the submission template and save it
sample_sub[target] = ypred
blend_submission_path = 'submission_blendcatandlgb1.csv'
sample_sub.to_csv(blend_submission_path, index=False)

In [None]:
# LightGBM-only submission with a fixed 0.50 cut-off
ypred1 = np.where(predictionslgb > 0.50, 1, 0)
sample_sub[target] = ypred1
lgb_submission_path = 'submission.csv'
sample_sub.to_csv(lgb_submission_path, index=False) # 0.79385

### Please upvote if you find this helpful...