In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing 

import math

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier, GradientBoostingClassifier

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

## References

- https://www.kaggle.com/erikgarcia/apr-21-logistic-regression
- https://www.kaggle.com/startupsci/titanic-data-science-solutions
- https://www.kaggle.com/bhavikjain/tabular-playground-series-april-ensemble
- https://www.kaggle.com/pranjalverma08/tps-april-21-ann-pseudo-label-score-81-101/#data



In [None]:
# Load the training split of the Tabular Playground Series (April 2021) data.
train = pd.read_csv("../input/tabular-playground-series-apr-2021/train.csv")

# Preview the first ten rows.
train.head(10)

Some attributes of the data

```
survival: Survival 0 = No, 1 = Yes
pclass: Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
sex: Sex
Age: Age in years
sibsp: # of siblings / spouses aboard the Titanic
parch: # of parents / children aboard the Titanic
ticket:Ticket number
fare: Passenger fare
cabin: Cabin number
embarked: C = Cherbourg, Q = Queenstown, S = Southampton
```

## Exploratory Data Analysis

In [None]:
# Column names of the training frame (identifier and target columns included).
print(train.columns)

In [None]:
# Data type of every column in the training frame.
print(train.dtypes)

In [None]:
# Summary statistics of the training frame.
print(train.describe())

In [None]:
# Number of missing values in each column.
print(train.isna().sum())

In [None]:
# Count plots for the categorical features, each split by survival outcome.
columns_str = ["Pclass","Sex","SibSp","Parch","Embarked","Survived"]

fig, axs = plt.subplots(3, 2, sharex=False, sharey=True, figsize=(30,30))

# axs.flat walks the 3x2 grid row by row, so pairing it with the column list
# fills the panels in the same order as manual row/column counters would.
for column, ax in zip(columns_str, axs.flat):
    sns.countplot(x=column, hue='Survived', data=train, ax=ax)
    ax.set_title("Frequency distribution for: " + str(column))

In [None]:
# Reduce each cabin to its first character (the deck letter); rows without a
# cabin are labelled "N".
train["Cabin"] = train["Cabin"].str.get(0).fillna("N")

# Row count and mean survival for every deck.
deck_summary = train.groupby(["Cabin"]).agg({"Cabin":"count",'Survived': 'mean'})
print(deck_summary)


def _ticket_prefix(raw_ticket):
    """Return the first whitespace-separated token of a ticket, or 'X'.

    Tickets made of a single token (no whitespace) are mapped to 'X'.
    """
    tokens = str(raw_ticket).split()
    if len(tokens) > 1:
        return tokens[0]
    return 'X'


# Replace the ticket with its prefix token.
train['Ticket'] = train['Ticket'].map(_ticket_prefix)

- Unknown cabins have a lower survivability (Lower mean)
- Cabin A has a similar survivability to the missing decks. 
- Let's analyse further with factor plots

In [None]:
# Label missing ports as "N" so they show up as their own category.
train["Embarked"] = train["Embarked"].fillna("N")
columns_str = ["Pclass","Sex","SibSp","Parch","Embarked","Cabin"]

# Mean survival per category, one figure per feature.
# sns.factorplot is deprecated (it was renamed to catplot) and is not available
# in recent seaborn releases; catplot(kind="point") draws the same point plot.
# catplot is a figure-level function: it creates its own figure, which is why
# it cannot be placed on a pre-made matplotlib axis.
for column in columns_str:
    sns.catplot(x=column, y='Survived', data=train, kind="point")

- Cabins B and E, F and G, and D and C have pairwise similar survivability
- Cabin N (missing) has a survivability close to that of A
- Hence B can be merged with E, G with F, D with C, and the missing values with A
- Null values in Embarked can be substituted with Q

In [None]:
# Overlay the Age distribution of survivors (blue) and non-survivors (red).
# NOTE(review): sns.distplot is marked deprecated in newer seaborn releases -
# confirm against the seaborn version this notebook is pinned to.
f, ax = plt.subplots(1, 1,figsize=(15,15))

for outcome, colour, tag in ((1, "blue", "Survived"), (0, "red", "Expired")):
    ages = train.loc[train['Survived'] == outcome, 'Age']
    sns.distplot(ax=ax, a=ages, color=colour, label=tag)

plt.legend(labels=['Survived', 'Expired'])
ax.set_xlabel("Age")
plt.show()

- A higher proportion of people above the age of 40 have survived than between 0 and 10

In [None]:
# Overlay the Fare distribution of survivors (blue) and non-survivors (red).
# NOTE(review): sns.distplot is marked deprecated in newer seaborn releases -
# confirm against the seaborn version this notebook is pinned to.
f, ax = plt.subplots(1, 1,figsize=(15,15))

for outcome, colour, tag in ((1, "blue", "Survived"), (0, "red", "Expired")):
    fares = train.loc[train['Survived'] == outcome, 'Fare']
    sns.distplot(ax=ax, a=fares, color=colour, label=tag)

plt.legend(labels=['Survived', 'Expired'])
ax.set_xlabel("Fare")
plt.show()

In [None]:
# See if any features are related to Fare and Age for imputation.
# Each entry is (column to group by, wording used in the plot title).
boxplot_specs = [
    ("Pclass", "Pclass"),
    ("SibSp", "number of siblings"),
    ("Embarked", "port of destination"),
    ("Sex", "Sex"),
    ("Parch", "number of Parents/Children"),
    ("Cabin", "Cabin no."),
]

for feature, wording in boxplot_specs:
    # One figure per feature: Age on the left panel, Fare on the right.
    # NOTE(review): sharey=True puts Age and Fare on one y-scale - confirm
    # that this is intended.
    fig, axes = plt.subplots(1, 2, sharex=False, sharey=True, figsize=(20,5))
    for target, panel in zip(("Age", "Fare"), axes):
        box = sns.boxplot(x=feature, y=target, data=train, ax=panel)
        box.set_title("Boxplot of " + target + " with " + wording)
    plt.show()


- Age and Fare both vary clearly with Pclass, so their missing values can be imputed with the per-class mean

## Feature engineering


In [None]:
# Impute missing Fare and Age with the mean of the passenger's ticket class.
# transform("mean") returns a Series aligned to train's own index, so it can be
# passed straight to fillna. groupby(...).apply(...) is avoided here because on
# recent pandas versions its result is indexed by the group key as well, and
# assigning that back to the frame fails.
for column in ("Fare", "Age"):
    class_mean = train.groupby("Pclass")[column].transform("mean")
    train[column] = train[column].fillna(class_mean)

# Treat the ticket class as a categorical (string) feature.
train["Pclass"] = train["Pclass"].apply(str)

# Passenger plus siblings/spouses plus parents/children.
train['Family_Size'] = train['SibSp'] + train['Parch'] + 1

# Merge decks: missing ("N") into A, D into C, G into F, B into E.
for merged_from, merged_into in (("N", "A"), ("D", "C"), ("G", "F"), ("B", "E")):
    train.loc[train["Cabin"] == merged_from, "Cabin"] = merged_into

# Missing ports (labelled "N") are assigned to Q.
train.loc[train["Embarked"] == "N", "Embarked"] = "Q"

# 1 when the passenger travels with at least one family member, else 0.
train["Not Alone"] = 0
train.loc[train["Family_Size"] > 1, "Not Alone"] = 1

# Drop the identifier and name columns; errors="ignore" keeps the cell
# re-runnable after the columns are already gone.
train = train.drop(columns=["PassengerId", "Name"], errors="ignore")

In [None]:
# Re-check the per-column missing-value counts after the feature engineering step.
print(train.isna().sum())

## Loading test set and performing feature engineering

In [None]:
# Separate the target from the features without mutating `train`: an in-place
# drop would make this cell fail with a KeyError when it is run a second time.
# The target is kept as an (n_samples, 1) column array.
y_train = train["Survived"].to_numpy()
y_train = y_train.reshape((len(y_train), 1))

X_train = train.drop(columns=["Survived"])

In [None]:
# Load the test split and apply the same feature engineering as the train split.
test = pd.read_csv("../input/tabular-playground-series-apr-2021/test.csv")

# Deck letter only; missing cabins go straight to deck A.
test["Cabin"] = test["Cabin"].str[0]
test["Cabin"] = test["Cabin"].fillna("A")
test["Pclass"] = test["Pclass"].apply(str)

# Merge decks: G into F, B into E, D into C.
for merged_from, merged_into in (("G", "F"), ("B", "E"), ("D", "C")):
    test.loc[test["Cabin"] == merged_from, "Cabin"] = merged_into

# Missing ports are assigned to Q.
test["Embarked"] = test["Embarked"].fillna("Q")

# First whitespace-separated token of the ticket; single-token tickets become 'X'.
test['Ticket'] = test['Ticket'].map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')

# Impute missing Fare and Age with the mean of the passenger's ticket class.
# transform("mean") returns a Series aligned to test's own index, so it can be
# passed straight to fillna. groupby(...).apply(...) is avoided here because on
# recent pandas versions its result is indexed by the group key as well, and
# assigning that back to the frame fails.
for column in ("Fare", "Age"):
    class_mean = test.groupby("Pclass")[column].transform("mean")
    test[column] = test[column].fillna(class_mean)

# `test` keeps PassengerId, which is read again when the submission is built.
test_without_ID = test.drop(columns=["PassengerId","Name"])

test_without_ID['Family_Size'] = test_without_ID['SibSp'] + test_without_ID['Parch'] + 1

# 1 when the passenger travels with at least one family member, else 0.
test_without_ID["Not Alone"] = 0
test_without_ID.loc[test_without_ID["Family_Size"] > 1, "Not Alone"] = 1

X_test = test_without_ID


In [None]:
# Stack train rows on top of test rows so both are encoded together.
# join="inner" keeps only the columns present in both frames.
encode_df = pd.concat([X_train,X_test], join="inner")

In [None]:
# One-hot encode the stacked frame, so train and test end up with the same dummy columns.
encode_df = pd.get_dummies(encode_df)

In [None]:
# Cut the encoded frame back into its two parts: the first len(X_train) rows
# are the training rows, everything after them is the test set.
n_train_rows = len(X_train)
X_train, X_test = encode_df.iloc[:n_train_rows, :], encode_df.iloc[n_train_rows:, :]

## Implementation with voting classifier

- Logistic regression
- LightGBM
- ExtraTreesClassifier
- GradientBoostingClassifier
- Neural network

### References for models
- https://www.kaggle.com/bhavikjain/tabular-playground-series-april-ensemble
- https://www.kaggle.com/pranjalverma08/tps-april-21-ann-pseudo-label-score-81-101/#data

In [None]:
# Logistic regression with library defaults.
# NOTE(review): the features are not scaled and warnings are silenced at the top
# of the notebook, so a convergence warning would go unnoticed - confirm it converges.
LR = LogisticRegression()


In [None]:
# LightGBM classifier with DART boosting; seeded for repeatable runs.
LG = LGBMClassifier(
    boosting_type='dart',
    num_leaves=32,
    max_depth=10,
    colsample_bytree=0.8,
    extra_trees=True,
    n_jobs=-1,
    random_state=42,
)


In [None]:
# Extra-trees ensemble of 1000 trees; seeded for repeatable runs.
ext = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=17,
    min_samples_split=25,
    min_samples_leaf=18,
    n_jobs=-1,
    random_state=42,
)


In [None]:
# Gradient boosting with shallow (depth-2) trees. Seeded with the same
# random_state as the other tree ensembles so every member of the voting
# ensemble is reproducible.
gb = GradientBoostingClassifier(max_depth=2, n_estimators=400, random_state=42)

In [None]:
def build_model():
    """Build and compile the feed-forward network used in the ensemble.

    The network has two hidden ELU layers of 30 units, each followed by 20%
    dropout, and a single sigmoid output unit. The input width is read from
    the module-level ``X_train`` at call time.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    n_features = len(X_train.columns)

    model = Sequential([
        Dense(units=30, kernel_initializer='normal', activation='elu', input_dim=n_features),
        Dropout(0.2),
        Dense(units=30, kernel_initializer='normal', activation='elu'),
        Dropout(0.2),
        Dense(units=1, kernel_initializer='normal', activation='sigmoid'),
    ])
    model.compile(optimizer="Adam", loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Wrap the Keras network in a scikit-learn style estimator so it can be passed
# to VotingClassifier alongside the other models.
# NOTE(review): tf.keras.wrappers.scikit_learn is a legacy API that is no longer
# shipped with recent TensorFlow releases - confirm against the pinned version.
# NOTE(review): build_model uses layers imported from the standalone `keras`
# package while the wrapper comes from `tf.keras`; verify the two are compatible.
keras_clf = tf.keras.wrappers.scikit_learn.KerasClassifier(
                            build_model,
                            epochs=200,batch_size=16,
                        verbose = 0)

# Presumably set so that scikit-learn's ensemble code treats the wrapper as a
# classifier - TODO confirm it is still required.
keras_clf._estimator_type = "classifier"

In [None]:
# Soft-voting ensemble: the predicted class probabilities of the five models
# are averaged.
clf = VotingClassifier(estimators=[('LR',LR),('LGBM' , LG),("EXT",ext),("GB",gb),("NN",keras_clf)], voting='soft')

# scikit-learn expects a 1-D label vector, while y_train is built as an
# (n_samples, 1) column; flatten it at the call site. np.ravel is a no-op on an
# array that is already 1-D.
clf.fit(X_train, np.ravel(y_train))
pred = clf.predict(X_test)

## For submission

In [None]:
# Assemble the submission: passenger id plus the predicted label.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': pred,
})
print(submission.head(5))

# Write the file without the index column.
submission.to_csv("Submission.csv", index=False)