# **Titanic Competition**

The goal was to apply supervised machine learning methods to predict if a person survived the Titanic or did not. 


## Code Imports and Data Loading

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE,ADASYN
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,plot_confusion_matrix,accuracy_score,recall_score,precision_score,f1_score
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.impute import KNNImputer

In [2]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [3]:
# Load the training data and index it by passenger id in a single expression.
df_train = pd.read_csv("/kaggle/input/titanic/train.csv").set_index('PassengerId')


In [4]:
# Preview the first rows of the training frame.
df_train.head()

**The dataset**

Variable	|Definition	|Key
-------------------|------------------|----------
survival	|Survival	|0 = No, 1 = Yes
pclass	|Ticket class	|1 = 1st, 2 = 2nd, 3 = 3rd
sex	|Sex|	
Age|	Age in years|	
sibsp|	# of siblings / spouses aboard the Titanic	|
parch|	# of parents / children aboard the Titanic	|
ticket|	Ticket number	|
fare|	Passenger fare	|
cabin|	Cabin number|	
embarked|	Port of Embarkation|	C = Cherbourg, Q = Queenstown, S = Southampton


I do not know what I can do with the cabin number, the ticket number and the person's name. I will remove them and start the analysis.

In [5]:
# Target first, then the features kept for modelling (Name and Ticket are left out).
interesting_cols = ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']
# .copy() makes ndf_train an independent frame: later cells assign into it with .loc,
# which would otherwise operate on a slice of df_train (SettingWithCopyWarning).
ndf_train = df_train[interesting_cols].copy()

ndf_train.describe()

In [6]:
# Missing values per column.
ndf_train.isna().sum()

Some of the ages and ports of embarkation are blank. For starters, I will impute the average and the mode of these features.

In [7]:
# values = {'Age':ndf_train['Age'].median(),
#           'Embarked':ndf_train['Embarked'].mode()[0],
#           'Cabin':'N'}
# ndf_train = ndf_train.fillna(value=values)

# ndf_train.isnull().sum(axis=0)

In [8]:
# Turn Cabin into a "has a cabin recorded" flag: 1 where the raw value exists, 0 otherwise.
has_cabin = df_train['Cabin'].notnull()
ndf_train.loc[has_cabin, ['Cabin']] = 1
ndf_train.loc[~has_cabin, ['Cabin']] = 0

In [9]:
#ndf_train['Cabin'] = ndf_train['Cabin'].str[0]
#ndf_train['Cabin'] = ndf_train['Cabin'].astype(str).str[0]

In [10]:
ndf_train['Embarked'] = ndf_train['Embarked'].astype('object')

fig, axs = plt.subplots(3, 3, figsize=(20, 16))
cols = ndf_train.columns

# One stacked histogram per column, split by survival, filling the 3x3 grid row by row.
for column, axis in zip(cols, axs.ravel()):
  sns.histplot(data=ndf_train, x=column, hue='Survived', ax=axis,
               palette='mako', multiple="stack")


We can notice that sex is a very important factor for survival, since most of the women succeeded in being rescued.

In [11]:
# Inspect the working frame before one-hot encoding and imputation.
ndf_train

In [12]:
# One-hot encode the two categorical columns.
ndf_train = pd.get_dummies(ndf_train, columns=['Sex', 'Embarked'])
cols = ndf_train.columns
# Every column except the first one (the 'Survived' target) goes through the imputer.
feature_cols = cols[1:]
imputer = KNNImputer(n_neighbors=11, weights='distance')
imputed_values = imputer.fit_transform(ndf_train[feature_cols])
ndf_train[feature_cols] = pd.DataFrame(imputed_values,
                                       columns=feature_cols,
                                       index=ndf_train.index)

In [13]:
# Missing values per column after imputation.
ndf_train.isna().sum()

In [14]:
# Separate the target from the feature matrix.
Y = ndf_train['Survived']
X = ndf_train.drop(columns='Survived')

In [15]:
# Feature matrix (target column removed).
X

In [16]:
# Baseline: predict "survived" exactly for the passengers flagged as female.
baseline_pred = X.Sex_female
baseline_report = [
  ('Accuracy is:', accuracy_score(Y, baseline_pred)),
  ('Minimun Recall is:', recall_score(Y, baseline_pred, average=None).min()),
  ('Minimum Precision is:', precision_score(Y, baseline_pred, average=None).min()),
  ('f1 is:', f1_score(Y, baseline_pred, average=None).min()),
]
print('Guessing all and only women survived:')
for label, value in baseline_report:
  print(label)
  print(value)

In [17]:
# Fit a random-forest regressor on the full training data and plot its feature importances.
# random_state is fixed so the importance ranking is the same on every run.
model = RandomForestRegressor(n_estimators=100, max_depth=50, random_state=0)
model.fit(X, Y)
features = X.columns
importances = model.feature_importances_
rfc_imp = pd.DataFrame({'features': features, 'importances': importances})

fig, ax1 = plt.subplots(figsize=(10, 8))
sns.barplot(data=rfc_imp.sort_values(by='importances', ascending=False),
            x='importances', y='features', ax=ax1, palette='mako')

The plot of the importances using a regressor gives us a clearer idea of how decisive a feature can be in determining the class. We have to remember that this takes into account both classes, survived and not survived. Probably, that is the reason sex was less important than Fare and Age. 

## Oversampling

In [18]:
# Balance the classes with SMOTE.
# The sampler is NOT called `os`: that name is the imported standard-library module,
# and rebinding it breaks any later (or re-run) cell that calls os.walk / os.path.
oversampler = SMOTE(k_neighbors = 5, random_state=0)
#oversampler = ADASYN(n_neighbors = 11, random_state=0)

# Keep the original (unbalanced) data around before X and Y are replaced.
X_backup = X.copy(); Y_backup = Y.copy()
# NOTE(review): resampling happens before cross-validation, so synthetic rows built from
# a validation fold can end up in its training folds — confirm this is acceptable.
X,Y = oversampler.fit_resample(X,Y)

In [19]:
# Histogram of the target after oversampling.
fig, ax1 = plt.subplots(figsize=(8, 6))
sns.histplot(data=Y, ax=ax1, palette='mako', multiple="stack")

## KNN

In [20]:
def appKNN(n_neighbors,weights,X,y):
  """Score a standardised k-nearest-neighbours classifier with 10-fold cross-validation.

  Parameters:
    n_neighbors: passed to KNeighborsClassifier(n_neighbors=...).
    weights: passed to KNeighborsClassifier(weights=...).
    X: feature matrix.
    y: target; `.squeeze()` is called on it before cross-validation.

  Returns:
    (accuracy, f1[0], f1[1]) where f1 is the per-class F1 array computed on the
    out-of-fold predictions.
  """
  classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
  pipeline = make_pipeline(StandardScaler(), classifier)
  y_pred = cross_val_predict(pipeline, X, y.squeeze(), cv=10)
  per_class_f1 = f1_score(y, y_pred, average=None)
  return accuracy_score(y, y_pred), per_class_f1[0], per_class_f1[1]

In [21]:
# Sweep k (odd values 1..41) and the weighting scheme, then plot the three metrics.
cols=['n_neighbors','weights','accuracy','f1_0','f1_1']

# Rows are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0. Loop variables are named differently from the
# `n_neighbors` / `weights` globals so re-running the sweep does not overwrite them.
knn_rows = []
for k in range(1, 42, 2):
  for weighting in ['uniform','distance']:
    accuracy, f1_0, f1_1 = appKNN(k, weighting, X, Y)
    knn_rows.append([k, weighting, accuracy, f1_0, f1_1])

KNN_results = pd.DataFrame(knn_rows, columns=cols)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 6))
for axis, metric in zip((ax1, ax2, ax3), ('accuracy', 'f1_0', 'f1_1')):
  axis.grid(True)
  sns.lineplot(data=KNN_results, y=metric, x='n_neighbors', hue='weights', ax=axis)

In [22]:
# KNN hyperparameters carried forward to later cells.
# NOTE(review): the markdown right after this cell names "distance" and n_neighbors=5 —
# confirm which pair is the intended one.
n_neighbors = 9
weights = 'uniform'

Clearly "distance" is a more accurate method for this dataset, and n_neighbors=5.

## SVM

In [23]:
def appSVM(C,kernel,degree,X,y):
  """Score a standardised support-vector classifier with 10-fold cross-validation.

  Parameters:
    C, kernel, degree: forwarded to SVC (which is built with random_state=0).
    X: feature matrix.
    y: target; `.squeeze()` is called on it before cross-validation.

  Returns:
    (accuracy, f1[0], f1[1]) where f1 is the per-class F1 array computed on the
    out-of-fold predictions.
  """
  classifier = SVC(C=C, kernel=kernel, degree=degree, random_state=0)
  pipeline = make_pipeline(StandardScaler(), classifier)
  y_pred = cross_val_predict(pipeline, X, y.squeeze(), cv=10)
  per_class_f1 = f1_score(y, y_pred, average=None)
  return accuracy_score(y, y_pred), per_class_f1[0], per_class_f1[1]

In [24]:
# Sweep C, kernel and (for the polynomial kernel) degree, then plot the three metrics.
cols=['C','kernel','degree','accuracy','f1_0','f1_1']

# Rows are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0. Loop variables are named differently from the `C` / `kernel`
# globals so re-running the sweep does not overwrite the chosen values.
svm_rows = []
# The original list read [1,3,57,9,11]: a missing comma had fused 5 and 7 into 57.
for c_value in [1,3,5,7,9,11]:
  for kernel_name in ['rbf','poly']:
    # degree is only used by the 'poly' kernel, so rbf is fitted once (degree=3);
    # this yields the same rows the original kept after dropping rbf/degree==5.
    degrees = [3,5] if kernel_name == 'poly' else [3]
    for poly_degree in degrees:
      accuracy, f1_0, f1_1 = appSVM(c_value, kernel_name, poly_degree, X, Y)
      svm_rows.append([c_value, kernel_name, poly_degree, accuracy, f1_0, f1_1])

SVM_results = pd.DataFrame(svm_rows, columns=cols)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
for axis, metric in zip((ax1, ax2, ax3), ('accuracy', 'f1_0', 'f1_1')):
  axis.grid(True)
  sns.lineplot(data=SVM_results, y=metric, x='C', hue='kernel', ax=axis)

In [25]:
# SVM hyperparameters carried forward to the ensemble.
C = 3
kernel = 'rbf'

## Random Forest

In [26]:
def print_confMat (y,predictions,graph):
  """Compute accuracy and per-class F1; either plot the confusion matrix or return the metrics.

  Parameters:
    y: true labels.
    predictions: predicted labels.
    graph: truthy -> draw the confusion matrix (metrics shown in the title) and
      return None; falsy -> return the metrics without plotting.

  Returns:
    (accuracy, f1[0], f1[1]) when `graph` is falsy, otherwise None.
  """
  accuracy = accuracy_score(y, predictions)
  f1 = f1_score(y, predictions, average=None)
  if not graph:
    return accuracy, f1[0], f1[1]

  title = f'Accuracy = {accuracy:.2f}; f1_0 = {f1[0]:.2f}; f1_1 = {f1[1]:.2f}; '
  cm = confusion_matrix(y, predictions)
  fig, ax = plt.subplots(figsize=(12, 9))
  ax.grid(False)
  ax.set_title(title)
  ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap=plt.cm.Blues, ax=ax)
  plt.show()

In [27]:
# Sweep forest size and split criterion on the feature matrix, then plot the three metrics.
cols=['n_estimators','criterion','accuracy','f1_0','f1_1']

# Rows are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0. The loop uses its own names so it does not rebind the
# `rfc` / `criterion` globals used by other cells.
rf_rows = []
for n_trees in [20,50,80,100,200,300,500,1000]:
  for split_criterion in ['gini','entropy']:
    candidate = RandomForestClassifier(n_estimators=n_trees, criterion=split_criterion,
                                       verbose=0, random_state=1)
    predictions = cross_val_predict(candidate, X, Y.squeeze(), cv=10)
    accuracy, f1_0, f1_1 = print_confMat(Y, predictions, graph=0)
    rf_rows.append([n_trees, split_criterion, accuracy, f1_0, f1_1])

rfc_results = pd.DataFrame(rf_rows, columns=cols)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
for axis, metric in zip((ax1, ax2, ax3), ('accuracy', 'f1_0', 'f1_1')):
  axis.grid(True)
  sns.lineplot(data=rfc_results, y=metric, x='n_estimators', hue='criterion', ax=axis)

In [28]:
# Random-forest hyperparameters carried forward to the ensemble.
n_estimators = 50
criterion = 'gini'

## Logistic Regression

In [29]:
def appLogReg(penalty,X,y):
  """Score a standardised logistic regression with 10-fold cross-validation.

  Parameters:
    penalty: forwarded to LogisticRegression (which is built with random_state=0).
    X: feature matrix.
    y: target; `.squeeze()` is called on it before cross-validation.

  Returns:
    (accuracy, f1[0], f1[1]) where f1 is the per-class F1 array computed on the
    out-of-fold predictions.
  """
  classifier = LogisticRegression(penalty=penalty, random_state=0)
  pipeline = make_pipeline(StandardScaler(), classifier)
  y_pred = cross_val_predict(pipeline, X, y.squeeze(), cv=10)
  per_class_f1 = f1_score(y, y_pred, average=None)
  return accuracy_score(y, y_pred), per_class_f1[0], per_class_f1[1]

In [30]:
# Compare unpenalised and L2-penalised logistic regression, then plot the three metrics.
cols=['penalty','accuracy','f1_0','f1_1']

# Rows are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0.
lr_rows = []
for penalty in ['none','l2']:
  accuracy, f1_0, f1_1 = appLogReg(penalty, X, Y)
  lr_rows.append([penalty, accuracy, f1_0, f1_1])

LR_results = pd.DataFrame(lr_rows, columns=cols)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
for axis, metric in zip((ax1, ax2, ax3), ('accuracy', 'f1_0', 'f1_1')):
  axis.grid(True)
  sns.barplot(data=LR_results, y=metric, x='penalty', ax=axis)

## Naive Bayes

In [31]:
def appNB(model,X,y):
  """Score the given (unfitted) estimator with 10-fold cross-validation.

  Parameters:
    model: estimator handed to cross_val_predict as-is (no scaling step).
    X: feature matrix.
    y: target; `.squeeze()` is called on it before cross-validation.

  Returns:
    (accuracy, f1[0], f1[1]) where f1 is the per-class F1 array computed on the
    out-of-fold predictions.
  """
  y_pred = cross_val_predict(model, X, y.squeeze(), cv=10)
  per_class_f1 = f1_score(y, y_pred, average=None)
  return accuracy_score(y, y_pred), per_class_f1[0], per_class_f1[1]

In [32]:
# Compare the four naive Bayes variants, then plot the three metrics.
cols=['model','accuracy','f1_0','f1_1']

# Each estimator is paired with its display name, replacing the manual index counter
# and the `for model in model` loop that rebound the list to its last element.
nb_models = [
  ('BernoulliNB', BernoulliNB()),
  ('ComplementNB', ComplementNB()),
  ('GaussianNB', GaussianNB()),
  ('MultinomialNB', MultinomialNB()),
]

# Rows are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0.
nb_rows = []
for nb_name, nb_estimator in nb_models:
  accuracy, f1_0, f1_1 = appNB(nb_estimator, X, Y)
  nb_rows.append([nb_name, accuracy, f1_0, f1_1])

NB_results = pd.DataFrame(nb_rows, columns=cols)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
for axis, metric in zip((ax1, ax2, ax3), ('accuracy', 'f1_0', 'f1_1')):
  axis.grid(True)
  sns.barplot(data=NB_results, y=metric, x='model', ax=axis)

## Multi-layer Perceptron classifier

In [33]:
def appRNA(layer,activation,solver):
  """Score a standardised multi-layer perceptron with 10-fold cross-validation.

  Reads the module-level `X` and `Y` (they are not parameters).

  Parameters:
    layer: forwarded as MLPClassifier(hidden_layer_sizes=...).
    activation: forwarded as MLPClassifier(activation=...).
    solver: forwarded as MLPClassifier(solver=...).

  Returns:
    (accuracy, min_recall, min_precision), where the minima are taken over the
    per-class recall / precision of the out-of-fold predictions.
  """
  # random_state is fixed so that configurations are compared on the same weight
  # initialisation and the sweep is reproducible, like the other seeded helpers.
  classifier = MLPClassifier(activation = activation,
                             hidden_layer_sizes = layer,
                             solver = solver,
                             max_iter = 1000,
                             learning_rate = 'invscaling',
                             random_state = 0,
                             verbose=False)
  RNApipe = make_pipeline(StandardScaler(), classifier)
  predict = cross_val_predict(RNApipe,X,Y.squeeze(),cv=10)

  accuracy = accuracy_score(Y,predict)
  min_recall = recall_score(Y,predict,average = None).min()
  min_precision = precision_score(Y,predict,average = None).min()

  return accuracy, min_recall, min_precision

In [None]:
# Sweep hidden-layer layouts and activations for the MLP.
layer =  [(10,),(10,5),(15,5),(35,10),(15,10,5)]
activation = ['relu','logistic','tanh']
solver = ['adam']

cols = ['layer','activation','solver','accuracy',
                                    'min_recall','min_precision']

# Rows are collected in a list and the frame is built once, instead of concatenating
# inside the loop (quadratic copying, and every row ended up with index 0).
mlp_rows = []
for l in layer:
  for act in activation:
    for sol in solver:
      accuracy, min_recall, min_precision = appRNA(l,act,sol)
      mlp_rows.append([l,act,sol,accuracy,min_recall,min_precision])

results = pd.DataFrame(mlp_rows, columns=cols)

results

In [None]:
# MLP hyperparameters carried forward to the ensemble.
layer = (35, 10)
solver = 'adam'
activation = 'tanh'

## Ensemble Voting

In [None]:
# Out-of-fold predictions of every base model plus a simple vote across them.
cols = ['KNN','SVM','RF','LR','NB','MLPC','Vote','Survived']
pred_ensemble = pd.DataFrame(columns=cols)
pred_ensemble['Survived'] = Y

# The KNN slot was previously built with SVC (copy-paste of the SVM pipeline), so the
# "KNN" column duplicated the SVM column; it now uses the chosen KNN hyperparameters.
pipeKNN = make_pipeline(StandardScaler(),
                        KNeighborsClassifier(n_neighbors=n_neighbors,
                                             weights=weights))
pred_ensemble['KNN'] = cross_val_predict(pipeKNN,X,Y.squeeze(),cv=10)

pipeSVM = make_pipeline(StandardScaler(), SVC(C=C,kernel=kernel,
                                                    random_state=0))
pred_ensemble['SVM'] = cross_val_predict(pipeSVM,X,Y.squeeze(),cv=10)

rfc = RandomForestClassifier(n_estimators=n_estimators,
                             criterion=criterion,verbose=0,
                             random_state=1)
pred_ensemble['RF'] = cross_val_predict(rfc,X,Y.squeeze(),cv=10)

pipeLR = make_pipeline(StandardScaler(), LogisticRegression())
pred_ensemble['LR'] = cross_val_predict(pipeLR,X,Y.squeeze(),cv=10)

pred_ensemble['NB'] = cross_val_predict(BernoulliNB(),X,Y.squeeze(),cv=10)

# random_state is fixed so the MLP column (and everything stacked on it) is reproducible.
RNApipe = make_pipeline(StandardScaler(),MLPClassifier(activation = activation,
                                      hidden_layer_sizes = layer,
                                      solver = solver,
                                      max_iter = 5000,
                                      learning_rate = 'invscaling',
                                      random_state = 0,
                                      verbose=False))
pred_ensemble['MLPC'] = cross_val_predict(RNApipe,X,Y.squeeze(),cv=10)

# Vote: count how many of the six models predicted 1; more than 2 votes (i.e. at
# least 3 of 6, so a 3-3 tie counts as survived) gives a final prediction of 1.
a = pred_ensemble[['KNN','SVM','RF','LR','NB','MLPC']].sum(axis=1).to_numpy()
pred_ensemble['Vote'] = np.where(a > 2, 1, 0)

In [None]:
# Confusion matrix of the voting ensemble against the true labels.
print_confMat(Y, pred_ensemble['Vote'], graph=True)

In [None]:
# Stack the base-model predictions next to the original features.
# `cols` is the ensemble column list; 'Vote' and 'Survived' are not used as features.
stacked_cols = [name for name in cols if name not in ('Vote', 'Survived')]
df_comp = X.copy()
df_comp[stacked_cols] = pred_ensemble[stacked_cols]
df_comp


In [None]:
# Cross-validate the random forest on features + base-model predictions.
test_pred_ensemble = cross_val_predict(rfc,df_comp,Y.squeeze(),cv=10)
accuracy = accuracy_score(Y,test_pred_ensemble)
min_recall = recall_score(Y,test_pred_ensemble,average = None).min()
min_precision = precision_score(Y,test_pred_ensemble,average = None).min()

print ('Accuracy is:')
print (accuracy)
print ('Minimun Recall is:')
print (min_recall)
print ('Minimum Precision is:')
# Previously printed min_recall a second time under the precision label.
print (min_precision)

In [None]:
# Confusion matrix of the stacked model's out-of-fold predictions.
print_confMat(Y, test_pred_ensemble, graph=True)

In [None]:
# Sweep forest size and split criterion on the stacked feature set, then plot the metrics.
cols=['n_estimators','criterion','accuracy','f1_0','f1_1']

# The loop uses its own names: rebinding `rfc` / `criterion` here would make the later
# `rfc.fit(X, Y)` train the last sweep candidate instead of the base random forest.
# Rows are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0.
rf_rows = []
for n_trees in [20,50,80,100,200,300,500,1000]:
  for split_criterion in ['gini','entropy']:
    candidate = RandomForestClassifier(n_estimators=n_trees, criterion=split_criterion,
                                       verbose=0, random_state=1)
    predictions = cross_val_predict(candidate, df_comp, Y.squeeze(), cv=10)
    accuracy, f1_0, f1_1 = print_confMat(Y, predictions, graph=0)
    rf_rows.append([n_trees, split_criterion, accuracy, f1_0, f1_1])

rfc_results = pd.DataFrame(rf_rows, columns=cols)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
for axis, metric in zip((ax1, ax2, ax3), ('accuracy', 'f1_0', 'f1_1')):
  axis.grid(True)
  sns.lineplot(data=rfc_results, y=metric, x='n_estimators', hue='criterion', ax=axis)

In [None]:
# Second-level random forest, trained on features + base-model predictions.
rfc_ens = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    verbose=0,
    random_state=1,
)

I guess this is the best that I can do for now. I will try to create a model then.

## Model

In [None]:
# Load the test set and apply the same preprocessing as the training data.
df_test = pd.read_csv("/kaggle/input/titanic/test.csv").set_index('PassengerId')

interesting_cols = ['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']
# .copy() so the .loc assignments below write to an independent frame, not a slice of df_test.
ndf_test = df_test[interesting_cols].copy()

# Cabin becomes a "has a cabin recorded" flag, as for the training data.
has_cabin = df_test['Cabin'].notnull()
ndf_test.loc[has_cabin,['Cabin']]=1
ndf_test.loc[~has_cabin,['Cabin']]=0

ndf_test = pd.get_dummies(ndf_test,columns=['Sex','Embarked'])
# Align to the training feature layout (same columns, same order) before reusing the
# fitted imputer: a dummy column missing from the test set is added as all zeros, and
# one unseen during training is dropped. A no-op when the layouts already match.
ndf_test = ndf_test.reindex(columns=X.columns, fill_value=0)
cols = ndf_test.columns
ndf_test = pd.DataFrame(imputer.transform(ndf_test),columns=cols,index=ndf_test.index)

In [None]:
# Missing values per column in the prepared test frame.
ndf_test.isna().sum()


In [None]:
# Fit every base estimator on the full training data (X, Y as currently bound).
# NOTE(review): `rfc` is whichever classifier was last bound to that name — confirm it
# is the tuned base model and not a leftover from a hyperparameter sweep.
y_fit = Y.squeeze()
knn_m = pipeKNN.fit(X, y_fit)
svm_m = pipeSVM.fit(X, y_fit)
rfc_m = rfc.fit(X, y_fit)
LR_m = pipeLR.fit(X, y_fit)
NB_aux = BernoulliNB()
NB_m = NB_aux.fit(X, y_fit)
RNA_m = RNApipe.fit(X, y_fit)

In [None]:
# Add each base model's prediction on the test set as a new column of ndf_test.
cols = ['KNN','SVM','RF','LR','NB','MLPC']

# Feature-only snapshot handed to the base models. Any prediction columns left over
# from a previous run of this cell are dropped first, so re-running the cell does not
# feed earlier predictions back into models that were trained without them.
df_copy = ndf_test.drop(columns=cols, errors='ignore')

fitted_models = [knn_m, svm_m, rfc_m, LR_m, NB_m, RNA_m]
for col_name, fitted in zip(cols, fitted_models):
  ndf_test[col_name] = fitted.predict(df_copy)

In [None]:
# Train the second-level forest on the stacked training features and score the test set.
y_stack = Y.squeeze()
rfc_ens_m = rfc_ens.fit(df_comp, y_stack)
final_predictions = rfc_ens_m.predict(ndf_test)

In [None]:
# Build the submission file: one row per test passenger with its predicted label.
submission = pd.DataFrame(index = df_test.index)
# Cast to int so the file contains 0/1 rather than 0.0/1.0 if predictions come back as floats.
submission['Survived'] = np.asarray(final_predictions).astype(int)
# The index holds PassengerId and must be written: with index=False the CSV contained
# only the Survived column, leaving no way to match predictions to passengers.
submission.to_csv("./submission.csv", index=True, index_label='PassengerId')
