In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in newer matplotlib
# releases and the old name no longer resolves there; use whichever name this
# installation actually ships so the cell runs on both old and new versions.
_seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(_seaborn_style)

# Enlarge every font for the wide figures used throughout the notebook.
sns.set(font_scale=2.5)
input_path = Path('/kaggle/input/tabular-playground-series-apr-2021/')

# Load data

In [None]:
# Read the three competition files; each one is indexed by PassengerId.
train, test, sample_submission = (
    pd.read_csv(input_path / csv_name, index_col='PassengerId')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Preview the training frame and the expected submission layout.
display(train.head())
display(sample_submission)

# **1. Exploratory Data Analysis**

## **1.1 Profile Report** 

In [None]:
from pandas_profiling import ProfileReport

# Build a full-width HTML profiling report of the training frame; the bare
# expression on the last line makes the notebook render it.
report_html_options = {'style': {'full_width': True}}
profile_train = ProfileReport(
    train,
    title='Pandas Train Profiling Report',
    html=report_html_options,
)
profile_train

<font size="4"> I remove the features that couldn't have a reasonable effect on the dependent variable: ['Name', 'Ticket', 'Fare']. 'Fare' is obviously related to other variables like Pclass and Embarked. 
In the following exploratory analysis I show how the features influence the dependent variable.</font>

In [None]:
# Running list of columns excluded from the first round of models; later cells
# extend it as more features are ruled out.
col_to_drop=['Name','Ticket', 'Fare']

## **1.2 Pclass and Sex**

In [None]:
# Survival counts per passenger class and per sex, side by side.
f, ax = plt.subplots(1, 2, figsize=(18, 8), gridspec_kw={'wspace': 1})
for axis, feature in zip(ax, ('Pclass', 'Sex')):
    sns.countplot(x=feature, hue='Survived', data=train, ax=axis)
    axis.set_title(f'{feature} and Survived counts')
plt.show()

# Row-normalised survival rates for each feature taken on its own.
for feature in ('Pclass', 'Sex'):
    display(pd.crosstab(train[feature], train['Survived'], normalize='index'))

# Class mix inside every (Sex, Survived) column, with a 'Total' margin.
sex_and_outcome = [train['Sex'], train['Survived']]
display(pd.crosstab(train['Pclass'], sex_and_outcome,
                    margins=True, margins_name='Total', normalize='columns'))

<font size="4"> The first table summarizes the survival rates for the different classes. A passenger belonging to the first class has a 60% chance of survival. The percentages shown here do not take into account the sex of the passenger.
The second table shows the survival rate according to the sex of the passenger. Here, it is clear that being a woman increased the chances of surviving. In this case, the class of the passenger is not taken into account.
In the final table we break down the first one: we show the rate of survival according to the class, differentiating men from women. Almost 60% of the men who did not survive are from the third class, while the majority of the surviving men belong to the first class. The difference in survival rate between classes is visible for women too. However, in this case the difference in rate is smaller. </font>  

## **1.3 Parch and SibSp**

In [None]:
# Survival counts for the two family-size features, then their row-normalised rates.
family_features = ('Parch', 'SibSp')
f, ax = plt.subplots(1, 2, figsize=(18, 8), gridspec_kw={'wspace': 1})
for axis, feature in zip(ax, family_features):
    sns.countplot(x=feature, hue='Survived', data=train, ax=axis)
plt.show()

for feature in family_features:
    display(pd.crosstab(train[feature], train['Survived'], normalize='index'))

<font size='4'> The variables Parch and SibSp do not seem to have a very strong predictive power. In fact, having one or more siblings or parents aboard doesn't significantly improve the chances of surviving.</font>

In [None]:
# Parch and SibSp are excluded as well (see the survival-rate tables above).
# NOTE: this extends the existing list, so re-running the cell appends the names again.
col_to_drop= col_to_drop+['Parch', 'SibSp']

## **1.4 Age**

In [None]:
# Left: overall age density. Right: age density split by outcome.
fig, ax = plt.subplots(1, 2, figsize=(18, 5), gridspec_kw={'wspace': 0.5})
overall_ax, by_outcome_ax = ax
sns.kdeplot(data=train, x="Age", ax=overall_ax)
# Survivors are drawn first so the curve order matches the legend labels below.
for outcome in (1, 0):
    sns.kdeplot(train.loc[train['Survived'] == outcome, 'Age'], ax=by_outcome_ax)
plt.legend(['Survived', 'Not Survived'])
plt.show()

<font size = "4">From the graph on the right, a change in the probability of surviving is visible at the age of 40. This is quite curious if one thinks that younger people should have higher chances to survive. But the age distribution probably changes across other categories which, as we have seen, also have a strong influence on the prediction (Sex and Pclass). So, for example, the older people are mostly in the Pclass which, as we have seen, has higher chances to survive.</font>

### **1.4.2 Age and Pclass/Sex vs Survived**

In [None]:
#Check the age distribution for Pclass and Sex
#frequency shows the number of observation divided by the bin width
# Age histograms split by Pclass (left) and by Sex (right).
# stat='frequency' plots the number of observations divided by the bin width.
f, ax = plt.subplots(1, 2, figsize=(18, 8), gridspec_kw={'wspace': 0.5})
shared_hist_kwargs = dict(data=train, x='Age', stat='frequency', binwidth=10, multiple="layer")

sns.histplot(hue='Pclass', palette='Set1', ax=ax[0], **shared_hist_kwargs)
ax[0].set_title('Pclass and Age')

sns.histplot(hue='Sex', ax=ax[1], **shared_hist_kwargs)
ax[1].set_title('Sex and Age');



In [None]:
# Number of passengers of each sex within every class.
plt.figure(figsize=(7,8))
sns.countplot(data=train,x='Pclass',  hue='Sex')
plt.title('Pclass and Sex')

# Row count for every (Pclass, Sex, Survived) combination; a later cell turns
# these counts into percentages.
p_counts=train.groupby(['Pclass','Sex','Survived']).agg({'Survived':'count'})

In [None]:
# Age distribution per class (left) and per sex (right), without splitting by outcome.
f, ax = plt.subplots(1, 2, figsize=(18, 12), gridspec_kw={'wspace': 0.5})
age_ticks = range(0, 110, 10)

sns.boxplot(data=train, x='Pclass', y='Age', palette="muted", ax=ax[0])
ax[0].set_title('Pclass and Age')
ax[0].set_yticks(age_ticks)

sns.boxplot(data=train, x='Sex', y='Age', ax=ax[1])
ax[1].set_title('Sex and Age')
ax[1].set_yticks(age_ticks)

# These plots carry no `hue`, so there are no legend handles: the former
# 'Survive' legend (labels 'No'/'Yes') had nothing to describe and is not drawn.
plt.show()

In [None]:
# Recompute the counts here so this cell can be re-run: it renames the columns and
# resets the index below, which would otherwise break a second execution.
p_counts = train.groupby(['Pclass','Sex','Survived']).agg({'Survived':'count'})

# Share of each outcome within its (Pclass, Sex) group, in percent.
# `transform('sum')` returns totals aligned to p_counts' own index, so the division
# cannot misalign the way a groupby.apply result can.
group_totals = p_counts.groupby(level=[0, 1])['Survived'].transform('sum')
p_counts["pct"] = 100 * p_counts['Survived'] / group_totals

# Suffix the value columns so that reset_index() does not collide with the
# 'Survived' index level.
p_counts.columns = [col + '_l0' for col in p_counts.columns.values]
p_counts = p_counts.reset_index()
p_counts

<font size="4">In the third class the number of males is much larger than the number of females. This is not true for the higher classes, first and second. For these two the difference is not as significant as for the third class and the number of females seems to be slightly higher. </font>


In [None]:
# Age distribution per class (left) and per sex (right), split by outcome.
f, ax = plt.subplots(1, 2, figsize=(18, 12), gridspec_kw={'wspace': 0.5})
age_ticks = range(0, 110, 10)
panels = (
    (ax[0], 'Pclass', {'palette': "muted"}),
    (ax[1], 'Sex', {}),
)
for axis, feature, extra_kwargs in panels:
    sns.boxplot(data=train, x=feature, y='Age', hue='Survived', ax=axis, **extra_kwargs)
    axis.set_title(f'{feature} and Age')
    axis.set_yticks(age_ticks)

# Hide the left panel's own legend and draw a single shared one outside the plot area.
ax[0].legend([], [], frameon=False)
handles, labels = ax[0].get_legend_handles_labels()
plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0, title='Survive',
           handles=handles, labels=['No', 'Yes'])


<font size="4"> This might explain the change in survival with age.
The third class has a very low mean age compared to the other classes. However, third-class passengers have a lower chance of surviving. Furthermore, the higher percentage of males in the third class makes the chances of this group of young passengers even lower. </font>

<font size="4"> These two figures show the influence of the passenger's age and Pclass (or Sex) on the probability of surviving. 
In the first figure, the mean ages of the survivors and of the non-survivors are very similar to the mean age of the class they belong to (around 50 for the first class, 40 for the second, and 30 for the third).
This can also be another indication that Age is not a very strong predictive feature. </font>

### **1.4.3 Imputation of missing values**

<font size="4"> To use the Age feature we need to take care of the missing values. We know there are missing values from the profile report at the beginning of the EDA. <br> From the previous plot we see that the different classes show a different mean age. So for the missing values we can use the mean value for the different Pclasses. We also separate the imputation by gender, using the mean value for the specific class and gender. 
</font>
****

In [None]:
#train['Age'].fillna(train.groupby(['Pclass','Sex'])['Age'].transform('mean'))

In [None]:
def _fill_age_with_group_mode(frame):
    """Return frame['Age'] with gaps filled by the mode of its (Pclass, Sex) group.

    `transform` yields a Series aligned to `frame`'s own index, so the result can be
    assigned straight back; assigning a groupby.apply result instead can fail to
    align when the group keys are prepended to its index.
    """
    group_mode = frame.groupby(['Pclass', 'Sex'])['Age'].transform(
        lambda ages: ages.mode().iloc[0]
    )
    return frame['Age'].fillna(group_mode)


# NOTE(review): the surrounding text talks about the group *mean*; the code has
# always used the group *mode*, which is kept here - confirm which one is intended.
# Each frame is imputed from its own statistics, as before.
train['Age'] = _fill_age_with_group_mode(train)
test['Age'] = _fill_age_with_group_mode(test)

## **1.5 Embarked** 

In [None]:
# Four views of the embarkation port: raw counts, then split by outcome, class and sex.
f, ax = plt.subplots(2, 2, figsize=(18, 20))
embarked_panels = (
    (ax[0, 0], None, '1- Embarked counts'),
    (ax[0, 1], 'Survived', '2- Embarked vs Survived'),
    (ax[1, 0], 'Pclass', '3- Embarked vs Pclass'),
    (ax[1, 1], 'Sex', '4- Embarked vs Sex'),
)
for axis, hue_column, panel_title in embarked_panels:
    sns.countplot(data=train, x='Embarked', hue=hue_column, ax=axis)
    axis.set_title(panel_title)

plt.subplots_adjust(wspace=0.5, hspace=0.4)
plt.show()

<font size="4"> From figure 1 we understand that the larger portion of passengers embarked on the ship from S. From figure 2 we notice that the frequency of survivors who embarked from S is low. This observation makes us think that passengers from Southampton were damned :P. However, figures 3 and 4 show that a large portion of the third class embarked from S and that the majority of males also embarked from there. So, the embarked information doesn't seem to add extra information. <br>  Ideally, we could train models with and without this feature. But, I will train models only without this feature. </font>

## **1.6 Cabin** 

<font size="4"> In a first approach I try to not use this feature to train the model because there are too many missing values for this category. <br>
In the attempt to increase the score of the accuracy in the last section of the notebook I try to include some of the features that I excluded in the first place.</font>

In [None]:
# Cabin values among the passengers who did not survive.
not_survived = train.loc[train.Survived == 0]
sns.countplot(data=not_survived, x='Cabin')

In [None]:
# Embarkation ports of the passengers whose Cabin is missing.
cabin_missing_mask = train['Cabin'].isnull()
cabin_na = train.loc[cabin_missing_mask]
cabin_na['Embarked'].value_counts()

In [None]:
## Embarked and Cabin are added to the list of features to drop.
## The first model is trained on a data set that does not include these categories.
## NOTE: this extends the existing list, so re-running the cell appends the names again.
col_to_drop=col_to_drop+['Embarked', 'Cabin']

# **2. Preprocessing of the data**

<font size ="4">Before starting with the development of the model, let's prepare the data set for the training.
We split the training set into two groups: training and validation. None of the data sets (train, validation, and test) should include the columns we have decided to exclude.<br>
Also, very importantly for some of the models we will use, we need to encode the categorical features that have non-numerical values.</font>


In [None]:
## Removing the features that we don't use in the model.
train_dropped=train.drop(col_to_drop, axis=1)
test_dropped=test.drop(col_to_drop, axis=1)

In [None]:
# One-hot encode the categorical columns, scale Age to [0, 1], then split.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

# Fixed seed so the train/validation split (and every score derived from it)
# is the same on each run.
SPLIT_SEED = 42

# pop() removes the label column from train_dropped; the guard keeps the cell
# re-runnable once the column has already been removed.
if 'Survived' in train_dropped.columns:
    target = train_dropped.pop('Survived')

ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), ['Pclass', 'Sex']),
        ('scaler', MinMaxScaler(), ['Age']),
    ],
    remainder='passthrough',
)
# The transformer is fitted on the training frame only and re-applied to the test frame.
train_encoded = ct.fit_transform(train_dropped)
test_encoded = ct.transform(test_dropped)

## Split: 70% for training, the rest for validation.
X_train, X_val, y_train, y_val = train_test_split(
    train_encoded, target, train_size=0.70, random_state=SPLIT_SEED
)

In [None]:
# Check the (rows, encoded columns) size of the training split.
X_train.shape

In [None]:
#### Scores

def print_score(classifier, X_val, y_val):
    """Print the accuracy and ROC-AUC of a fitted classifier on a held-out set.

    classifier: the trained classifier (must expose predict and predict_proba)
    X_val: the inputs to score
    y_val: the true labels for X_val
    """
    predicted_labels = classifier.predict(X_val)
    score_accuracy = accuracy_score(y_val, predicted_labels.T)

    # Column 1 of predict_proba is the probability assigned to the positive class.
    positive_probability = classifier.predict_proba(X_val)[:, 1]
    score_auc = roc_auc_score(y_val, positive_probability)

    report = (
        ('The accuracy scores:', score_accuracy),
        ('The auc score:', score_auc),
    )
    for heading, value in report:
        print(heading)
        print(f'{value:0.5f}')

def visualize_binary_predictions(y_prob,y_pred, y_val):
    """Plot predicted probabilities and predicted labels against the true labels.

    y_prob: predicted probability of the positive class
    y_pred: predicted labels
    y_val: true labels (a pandas Series; its .values are used)
    """
    # Overlaid histograms of the predicted probability, one per true class.
    plt.figure(figsize=(8, 4))
    for true_label, legend_name in ((0, 'Not Surv'), (1, 'Surv')):
        plt.hist(y_prob[np.where(y_val == true_label)], alpha=0.5, label=legend_name)
    plt.legend()
    plt.show()

    # Count of each predicted label, coloured by the true label.
    result = pd.DataFrame({'y_pred': y_pred, 'y_val': y_val.values})
    plt.figure(figsize=(8, 4))
    sns.countplot(x=result.y_pred, hue=result.y_val)
    plt.show()
    
####Hyperparameter search
def hyperparameter_summary(gridresults, name=None):
    ## hyperparameter_summary is a method to create a table with the results of hyperparameters tuning.
    
    params=list(gridresults['params'])#gridresults['params'] we are passing a list of dictionaries
                                      #We use list() because we make a copy of the values. So that the oriinal is not modified
    para_scores=[]
    for i in range(len(params)):
        d = dict(params[i])
        d.update({'Score':gridresults['mean_test_score'][i] ,
                 'Rank':gridresults['rank_test_score'][i]})
        para_scores.append(d)
    
    df = pd.DataFrame(para_scores)
    df1 = df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    df1.set_properties(**{'text-align': 'center'})
    
    if name!=None:
        df1.set_caption('Table with the results of the hyperparameter search on %s' %name)
    
    return df1


# **3. Logistic Regression**

Using GridSearchCV we will train the model with cross-validation and also select the best hyperparameters for each model with a grid search algorithm.
The classifier needs to predict whether a passenger, given certain features, will survive the catastrophe or not. I think for this problem it is better to train the model optimizing the accuracy. We care that the total of true positives (TP) and true negatives (TN) over the total positive and negative outcomes is as high as possible. 

## **3.1 Building and training the model**

In [None]:
from sklearn.linear_model import LogisticRegression

# 5-fold grid search over the L2 regularisation strength, optimising accuracy.
lr = LogisticRegression()
parameters = {
    'penalty': ['l2'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0],
}
grid_lr = GridSearchCV(estimator=lr, param_grid=parameters, scoring='accuracy', cv=5, n_jobs=-1)
grid_lr.fit(X_train, y_train)
print(grid_lr.best_params_)

# Score the refitted best model on the held-out validation split.
y_pred = grid_lr.predict(X_val)
score_accuracy = accuracy_score(y_val, y_pred)

y_prob = grid_lr.predict_proba(X_val)[:, 1]  # probability of the positive class
score_auc = roc_auc_score(y_val, y_prob)

print('The accuracy scores:', f'{score_accuracy:0.5f}',
      'The auc score:', f'{score_auc:0.5f}', sep='\n')

hyperparameter_summary(grid_lr.cv_results_)

## **3.2 Visualization of the predictions**

In [None]:
# Plot the validation predictions, then show for each predicted label the share
# of true labels behind it (every row sums to 1).
y_prob = grid_lr.predict_proba(X_val)[:, 1]
y_pred = grid_lr.predict(X_val)
visualize_binary_predictions(y_prob, y_pred, y_val)
pd.crosstab(y_pred, y_val, rownames=['Predictions'], normalize='index')

<font size= '4'> The wrong predictions are equally distributed between positive and negative labels </font>

In [None]:
# Select the misclassified validation rows by PassengerId.
# Indexing train_dropped with `y_val.loc[mask]` would pass the 0/1 label *values* to
# .loc, i.e. look up passengers 0 and 1 over and over instead of the wrong predictions.
misclassified_ids = y_val.index[(y_val != y_pred).to_numpy()]

# .copy() so that adding the label column does not write into a slice of train_dropped.
wrong_predictions = train_dropped.loc[misclassified_ids].copy()
wrong_predictions['Survived'] = y_val.loc[misclassified_ids].values
wrong_predictions

In [None]:
# Share of each true label among the wrong predictions.
survived_counts = wrong_predictions.Survived.value_counts()
survived_counts / len(wrong_predictions.Survived)

In [None]:
# Share of each passenger class among the wrong predictions.
pclass_counts = wrong_predictions.Pclass.value_counts()
pclass_counts / len(wrong_predictions.Pclass)

## **3.3 Submit predictions**

In [None]:
# Refit on the whole training set with the best grid-search parameters and
# write the test-set predictions to disk.
lr = LogisticRegression(**grid_lr.best_params_)
lr.fit(train_encoded, target)
y_pred_LR = lr.predict(test_encoded)
sub_LR = pd.Series(y_pred_LR, index=test.index, name="Survived").to_frame()
sub_LR.to_csv("submission_LogistRegression.csv")

In [None]:
# Display the logistic-regression submission frame.
sub_LR

# **4. Random Forest Classifier**

## **4.1 Building and training the model**

In [None]:
## Choose classifier
clf = RandomForestClassifier()
pipeline=make_pipeline(clf)
parameters = {
  'randomforestclassifier__n_estimators':[100,200,300],
  'randomforestclassifier__max_depth': [5,7,11,15]
  }

grid_RF=GridSearchCV(pipeline, cv=5, n_jobs=-1, param_grid=parameters ,scoring='accuracy')

grid_RF.fit(X_train,y_train)

print(grid_RF.best_params_)

y_pred = grid_RF.predict(X_val) # This grabs the positive class prediction
score_accuracy= accuracy_score(y_val, y_pred)

y_prob = grid_RF.predict_proba(X_val)[:, 1] # This grabs the positive class prediction
score_auc = roc_auc_score(y_val, y_prob)



print('The accuracy scores:')
print(f'{score_accuracy:0.5f}') 
print('The auc score:')
print(f'{score_auc:0.5f}')

hyperparameter_summary(grid_RF.cv_results_)

## **4.2 Visualization of the predictions**

In [None]:
# grid_RF already holds the best pipeline refitted on X_train (GridSearchCV refits by
# default), and the predictions below have always come from it. The extra pipeline
# that used to be fitted here was never used, so that wasted training run is gone.
y_pred = grid_RF.predict(X_val)
y_prob = grid_RF.predict_proba(X_val)[:,1]
visualize_binary_predictions(y_prob, y_pred, y_val)

# For each predicted label, the share of true labels behind it (rows sum to 1).
pd.crosstab(y_pred, y_val, rownames=['Predictions']).apply(lambda r: r/r.sum(), axis=1)

## **4.3 Submit Random Forest predictions**

<font size='4'> We first train the model on the entire training data set with the best parameters obtained with GridSearchCV. <br>
Afterwards, we make the predictions on the test set and submit them.</font> 

In [None]:
# Fresh forest with the best grid-search parameters, trained on the whole training set.
clf_RF = RandomForestClassifier()
pipeline = make_pipeline(clf_RF).set_params(**grid_RF.best_params_)
pipeline.fit(train_encoded, target)

y_pred_RF = pipeline.predict(test_encoded)
sub_RF = pd.Series(y_pred_RF, index=test.index, name="Survived").to_frame()
sub_RF.to_csv("submission_RandomForest.csv")



# **5. XGBoost Classifier**

## **5.1 Building and training the model**

In [None]:
## Choose classifier

from xgboost import XGBClassifier

## XGBoost inside a pipeline; make_pipeline names the step 'xgbclassifier',
## which is the prefix the grid keys rely on.
clf = XGBClassifier(use_label_encoder=False)
pipeline = make_pipeline(clf)
parameters = {
    'xgbclassifier__n_estimators': [50, 100, 150, 200],
    'xgbclassifier__learning_rate': [0.01, 0.001, 0.0001],
    'xgbclassifier__max_depth': [2, 4, 6, 8],
}

# 5-fold grid search optimising accuracy.
grid_XGB = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='accuracy', cv=5, n_jobs=-1)
grid_XGB.fit(X_train, y_train)
print(grid_XGB.best_params_)

# Score the refitted best pipeline on the validation split.
y_pred = grid_XGB.predict(X_val)
score_accuracy = accuracy_score(y_val, y_pred)

y_prob = grid_XGB.predict_proba(X_val)[:, 1]  # probability of the positive class
score_auc = roc_auc_score(y_val, y_prob)

print('The accuracy scores:', f'{score_accuracy:0.5f}',
      'The auc score:', f'{score_auc:0.5f}', sep='\n')

hyperparameter_summary(grid_XGB.cv_results_)

## **5.2 Visualization of the predictions**

In [None]:
# Plot the validation predictions, then show for each predicted label the share
# of true labels behind it (every row sums to 1).
y_prob = grid_XGB.predict_proba(X_val)[:, 1]
y_pred = grid_XGB.predict(X_val)
visualize_binary_predictions(y_prob, y_pred, y_val)
pd.crosstab(y_pred, y_val, rownames=['Predictions'], normalize='index')

## **5.3 Submit XGB predictions**


In [None]:
# Fresh XGBoost model with the best grid-search parameters, trained on the whole training set.
clf_XGB = XGBClassifier(use_label_encoder=False)
pipeline = make_pipeline(clf_XGB).set_params(**grid_XGB.best_params_)
pipeline.fit(train_encoded, target)

y_pred_XGB = pipeline.predict(test_encoded)
sub_XGB = pd.Series(y_pred_XGB, index=test.index, name="Survived").to_frame()
sub_XGB.to_csv("submission_XGB.csv")

# **6. Neural Network**

<font size='4'> Let's build a NN model. <br>
Before doing that, I need to scale the age column to use it together with the NeuralNetwork. <br>
I will keep this section short and I will not make a hyperparameter search for the NN. </font>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from numpy.random import seed

# Seed numpy and TensorFlow before any layer is created.
seed(1)
tf.random.set_seed(89)

input_shape = [X_train.shape[1]]

# One small hidden layer with dropout, then a sigmoid unit for the binary outcome.
model = keras.Sequential()
model.add(layers.Dense(units=4, activation='relu', input_shape=input_shape))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(units=1, activation='sigmoid'))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['binary_accuracy'])

# Stop when the monitored quantity has not improved by at least 0.01 for 20 epochs,
# and roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    min_delta=0.01,
    patience=20,
    restore_best_weights=True,
)

fit_options = dict(batch_size=1000, epochs=60, callbacks=[early_stopping])
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), **fit_options)

# Learning curves: loss and accuracy on the training and validation data.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [None]:
# Predicted probability of the positive class on the validation split.
y_prob = model.predict(X_val)

# Threshold a copy at 0.5 to obtain hard labels; y_prob itself stays untouched.
y_pred = np.copy(y_prob)
at_most_half = y_pred <= 0.5
above_half = y_pred > 0.5
y_pred[at_most_half] = 0.
y_pred[above_half] = 1.

score_accuracy = accuracy_score(y_val, y_pred)
score_auc = roc_auc_score(y_val, y_prob)

print('The accuracy scores:', f'{score_accuracy:0.5f}',
      'The auc score:', f'{score_auc:0.5f}', sep='\n')


In [None]:
# Reuse the shared plotting helper instead of repeating its body here: it draws the
# same probability histograms and the same predicted-vs-true count plot.
# y_pred is a single-column 2-D array, so its only column is passed as the labels.
visualize_binary_predictions(y_prob, y_pred[:, 0], y_val)

# For each predicted label, the share of true labels behind it (rows sum to 1).
result = pd.DataFrame({'y_pred': y_pred[:, 0], 'y_val': y_val.values})
pd.crosstab(result['y_pred'], result['y_val']).apply(lambda r: r/r.sum(), axis=1)

In [None]:

# This fit has no validation data, so no `val_loss` is produced. The shared
# `early_stopping` callback was created without a `monitor` argument and therefore
# watches Keras' default `val_loss`, which makes it a no-op here. Use a dedicated
# callback that watches the training loss instead.
final_early_stopping = callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.01,
    patience=20,
    restore_best_weights=True,
)

# NOTE: `model` keeps the weights learned above, so this continues training on the
# full training set rather than starting from scratch.
history = model.fit(
    train_encoded, target,
    batch_size=1000,
    epochs=60,
    callbacks=[final_early_stopping]
)

# Probabilities for the test set; the copy is thresholded into labels further down.
y_prob = model.predict(test_encoded)
y_pred_NN = np.copy(y_prob)
#



In [None]:
# Turn the test-set probabilities into hard 0/1 labels (threshold 0.5), in place.
at_most_half = y_pred_NN <= 0.5
above_half = y_pred_NN > 0.5
y_pred_NN[at_most_half] = 0.
y_pred_NN[above_half] = 1.


In [None]:
# y_pred_NN is a single-column 2-D array; its only column becomes the submission.
nn_labels = y_pred_NN[:, 0]
sub_NN = pd.Series(nn_labels, index=test.index, name="Survived").to_frame()
sub_NN.to_csv("submission_NN.csv")


# **7. Improving the accuracy score**

<font size = '4'> My first attempt to improve the score is to include some of the features I have previously excluded. For example, I first try to include Embarked. Embarked has some missing values, so we create a new class for them.  </font>

In [None]:
col_to_drop_1 = ['Name', 'Ticket', 'Fare', 'Parch', 'SibSp', 'Cabin']

## Removing the features that we don't use in the model (Embarked is kept this time).
train_dropped = train.drop(col_to_drop_1, axis=1)
test_dropped = test.drop(col_to_drop_1, axis=1)

# Missing ports become their own category 'U'.
train_dropped.Embarked = train_dropped.Embarked.fillna('U')
test_dropped.Embarked = test_dropped.Embarked.fillna('U')

target = train_dropped.pop('Survived')
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), ['Pclass', 'Sex', 'Embarked']),
        ('scaler', MinMaxScaler(), ['Age']),
    ],
    remainder='passthrough',
)
train_encoded = ct.fit_transform(train_dropped)
test_encoded = ct.transform(test_dropped)

## Split
# A fixed seed keeps the validation rows the same on every run, so score changes
# between experiments come from the features and not from a different random split.
X_train, X_val, y_train, y_val = train_test_split(
    train_encoded, target, train_size=0.70, random_state=42
)

# 5-fold grid search over the L2 regularisation strength, optimising accuracy.
lr = LogisticRegression()
parameters = {
    'penalty': ['l2'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0]
}
grid_lr = GridSearchCV(lr, cv=5, param_grid=parameters, n_jobs=-1, scoring='accuracy')
grid_lr.fit(X_train, y_train)

print(grid_lr.best_params_)

# Score the refitted best model on the validation split.
y_pred = grid_lr.predict(X_val)
score_accuracy = accuracy_score(y_val, y_pred)

y_prob = grid_lr.predict_proba(X_val)[:, 1]  # probability of the positive class
score_auc = roc_auc_score(y_val, y_prob)

print('The accuracy scores:')
print(f'{score_accuracy:0.5f}')
print('The auc score:')
print(f'{score_auc:0.5f}')

hyperparameter_summary(grid_lr.cv_results_)

In [None]:
# Select the misclassified validation rows by PassengerId.
# Indexing train_dropped with `y_val.loc[mask]` would pass the 0/1 label *values* to
# .loc, i.e. look up passengers 0 and 1 over and over instead of the wrong predictions.
misclassified_ids = y_val.index[(y_val != y_pred).to_numpy()]

# .copy() so that adding the label column does not write into a slice of train_dropped.
wrong_predictions = train_dropped.loc[misclassified_ids].copy()
wrong_predictions['Survived'] = y_val.loc[misclassified_ids].values

# Share of each passenger class among the wrong predictions.
wrong_predictions.Pclass.value_counts() / wrong_predictions.Pclass.size

In [None]:
# Age distribution of the wrong predictions, per sex and split by the true outcome.
sns.boxplot(data=wrong_predictions, x='Sex', y='Age', hue='Survived')

<font size = '4'> The wrong predictions seem to correspond to the passengers for whom we imputed the Age.
Specifically, the wrong predictions correspond to males of the 1st and 3rd class. <br> 
I will try to train the model using a dataset where the missing values are removed from the training set instead of being imputed. I will keep the imputation in the test dataset.</font>

In [None]:
# Reload the raw training data: `train` has had its Age gaps imputed in place above,
# so an untouched copy is needed for the "drop instead of impute" experiment.
train2 = pd.read_csv(input_path / 'train.csv', index_col='PassengerId')
# Preview the frame that was just loaded (train2, not the imputed `train`).
display(train2.head());

In [None]:
col_to_drop_1 = ['Name', 'Ticket', 'Fare', 'Parch', 'SibSp', 'Cabin']

## Removing the features that we don't use in the model.
# Start from the freshly loaded train2: `train` already had its missing ages imputed
# in place, so dropna(subset=['Age']) on it would not remove a single row.
train_dropped = train2.drop(col_to_drop_1, axis=1).dropna(subset=['Age'])
# The test set keeps its imputed ages.
test_dropped = test.drop(col_to_drop_1, axis=1)

# Missing ports become their own category 'U'.
train_dropped.Embarked = train_dropped.Embarked.fillna('U')
test_dropped.Embarked = test_dropped.Embarked.fillna('U')

target = train_dropped.pop('Survived')
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), ['Pclass', 'Sex', 'Embarked']),
        ('scaler', MinMaxScaler(), ['Age']),
    ],
    remainder='passthrough',
)
train_encoded = ct.fit_transform(train_dropped)
test_encoded = ct.transform(test_dropped)

## Split
# A fixed seed keeps the split the same on every run, so score changes between
# experiments are not caused by a different random partition.
X_train, X_val, y_train, y_val = train_test_split(
    train_encoded, target, train_size=0.70, random_state=42
)

# 5-fold grid search over the L2 regularisation strength, optimising accuracy.
lr = LogisticRegression()
parameters = {
    'penalty': ['l2'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0]
}
grid_lr = GridSearchCV(lr, cv=5, param_grid=parameters, n_jobs=-1, scoring='accuracy')
grid_lr.fit(X_train, y_train)

print(grid_lr.best_params_)

# Score the refitted best model on the validation split.
y_pred = grid_lr.predict(X_val)
score_accuracy = accuracy_score(y_val, y_pred)

y_prob = grid_lr.predict_proba(X_val)[:, 1]  # probability of the positive class
score_auc = roc_auc_score(y_val, y_prob)

print('The accuracy scores:')
print(f'{score_accuracy:0.5f}')
print('The auc score:')
print(f'{score_auc:0.5f}')

hyperparameter_summary(grid_lr.cv_results_)

In [None]:
# Select the misclassified validation rows by PassengerId.
# Indexing with `y_val.loc[mask]` would pass the 0/1 label *values* to .loc, i.e.
# look up passengers 0 and 1 over and over instead of the wrong predictions.
misclassified_ids = y_val.index[(y_val != y_pred).to_numpy()]

# The full `train` frame is used here so that Cabin is available for inspection;
# .copy() keeps the assignment below from writing into a slice of `train`.
wrong_predictions = train.loc[misclassified_ids].copy()
wrong_predictions['Survived'] = y_val.loc[misclassified_ids].values

display(wrong_predictions.Cabin.value_counts())
print(pd.isnull(wrong_predictions.Cabin).sum())  # number of wrong predictions without a Cabin
display(wrong_predictions.Pclass.value_counts())
wrong_predictions.shape

In [None]:
# Plot the validation predictions and, for each predicted label, the share of
# true labels behind it (every row sums to 1).
y_prob = grid_lr.predict_proba(X_val)[:, 1]
y_pred = grid_lr.predict(X_val)
visualize_binary_predictions(y_prob, y_pred, y_val)
display(pd.crosstab(y_pred, y_val, rownames=['Predictions'], normalize='index'))

# Refit on the whole encoded training set with the best parameters and write the submission.
lr = LogisticRegression(**grid_lr.best_params_)
lr.fit(train_encoded, target)
y_pred_LR = lr.predict(test_encoded)
sub_LR = pd.Series(y_pred_LR, index=test.index, name="Survived").to_frame()
sub_LR.to_csv("submission_LogistRegression_withEmbarked.csv")

<font size = '4'> The accuracy increases; in particular, 3% more of the predicted positives are correct. </font>

<font size = '4'> I will now try adding the feature 'Fare'. My reason for excluding it in the first place was that how much you paid for the ticket should not have a direct influence on the survival rate. The ticket price is related to the passenger class, already included in the model, and maybe to the port of embarkation, which we have now added. </font> 

In [None]:
# Reload the raw (non-imputed) training data for the Fare experiment.
train2 = pd.read_csv(input_path / 'train.csv', index_col='PassengerId')
# Preview the frame that was just loaded (train2, not the imputed `train`).
display(train2.head());

In [None]:
col_to_drop_2 = ['Name', 'Ticket', 'Parch', 'SibSp', 'Cabin']

## Removing the features that we don't use in the model, and the na in Age from the training set.
# train2 is the raw frame, so its missing ages must really be dropped here: the scaled
# Age column would otherwise still contain NaN, which LogisticRegression cannot fit on.
train_dropped = train2.drop(col_to_drop_2, axis=1).dropna(subset=['Age'])
test_dropped = test.drop(col_to_drop_2, axis=1)

# Missing ports become their own category 'U'.
train_dropped.Embarked = train_dropped.Embarked.fillna('U')
test_dropped.Embarked = test_dropped.Embarked.fillna('U')

# Fill missing fares with the mean fare of the same (Pclass, Embarked) group,
# computed separately within each frame.
train_dropped['Fare'] = train_dropped['Fare'].fillna(
    train_dropped.groupby(['Pclass', 'Embarked'])['Fare'].transform('mean')
)
test_dropped['Fare'] = test_dropped['Fare'].fillna(
    test_dropped.groupby(['Pclass', 'Embarked'])['Fare'].transform('mean')
)

target = train_dropped.pop('Survived')
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), ['Pclass', 'Sex', 'Embarked']),
        ('scaler', MinMaxScaler(), ['Age', 'Fare']),
    ],
    remainder='passthrough',
)
train_encoded = ct.fit_transform(train_dropped)
test_encoded = ct.transform(test_dropped)

## Split
# A fixed seed keeps the split the same on every run, so score changes between
# experiments are not caused by a different random partition.
X_train, X_val, y_train, y_val = train_test_split(
    train_encoded, target, train_size=0.70, random_state=42
)

# 5-fold grid search over the L2 regularisation strength, optimising accuracy.
lr = LogisticRegression()
parameters = {
    'penalty': ['l2'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0]
}
grid_lr = GridSearchCV(lr, cv=5, param_grid=parameters, n_jobs=-1, scoring='accuracy')
grid_lr.fit(X_train, y_train)

print(grid_lr.best_params_)

# Score the refitted best model on the validation split.
y_pred = grid_lr.predict(X_val)
score_accuracy = accuracy_score(y_val, y_pred)

y_prob = grid_lr.predict_proba(X_val)[:, 1]  # probability of the positive class
score_auc = roc_auc_score(y_val, y_prob)

print('The accuracy scores:')
print(f'{score_accuracy:0.5f}')
print('The auc score:')
print(f'{score_auc:0.5f}')

hyperparameter_summary(grid_lr.cv_results_)

<font size = '4'> As expected 'Fare' doesn't seem to add anything to the model. </font> 