In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
train_df=pd.read_csv('../input/titanic/train.csv')
test_df=pd.read_csv('../input/titanic/test.csv')
gender_sub_df=pd.read_csv('../input/titanic/gender_submission.csv')

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
for df in [train_df, test_df]:
    df.set_index("PassengerId", inplace=True)
train_df.head()

In [None]:
train_df.info()

In [None]:
ntrain = train_df.shape[0]
ntest = test_df.shape[0]
y_train = train_df['Survived'].values
all_data = pd.concat((train_df, test_df), axis=0)
all_data.drop(['Survived'], axis=1, inplace=True)

In [None]:
all_data.shape

In [None]:
numerical_features = all_data.select_dtypes([int, float]).columns
numerical_features

In [None]:
categorical_features = all_data.select_dtypes([object]).columns
categorical_features

We can see below that our label is binary, so we can use a binary classifier.

In [None]:
train_df.Survived.value_counts()

In [None]:
# Translate the 0/1 label into readable names and count each outcome.
survival_counts = (
    train_df['Survived']
    .map({1: 'Survived', 0: 'Deceased'})
    .value_counts()
    .rename('Survived')
)
# Plot the counts Series directly: the slice labels come from its index, so the
# cell no longer depends on the column names produced by reset_index(), which
# differ between pandas versions.
survival_counts.plot(kind='pie', title='Pie chart of survival rate',
                     autopct='%1.1f%%', shadow=False, legend=False,
                     fontsize=14, figsize=(12, 12));

Feature Exploration:

Categorical features:

Let's start exploring the gender of passengers:

In [None]:
all_data.Sex.value_counts()

In [None]:
# Count passengers per gender over train + test.
gender_counts = all_data['Sex'].value_counts().rename('Sex')
# Plot the counts Series directly: the slice labels come from its index, so the
# cell no longer depends on the column names produced by reset_index(), which
# differ between pandas versions.
gender_counts.plot(kind='pie', title='Gender of passengers',
                   autopct='%1.1f%%', shadow=False, legend=False,
                   fontsize=14, figsize=(12, 12));

We can see below that the Name and Ticket features are almost unique for each passenger, so as a following step we will find out whether they are worth using in the prediction:

In [None]:
len(all_data.Name.unique())

In [None]:
len(all_data.Ticket.unique())

Unfortunately, about 77% of the values of the Cabin feature are NaN, which makes it extremely hard to impute proper values; later we will decide whether to use this feature or not.

In [None]:
len(all_data.Cabin.unique())

In [None]:
all_data.Cabin.isna().sum()

In [None]:
#"C = Cherbourg, Q = Queenstown, S = Southampton".
# Spell out the port codes and count passengers per port (missing values are
# ignored by value_counts).
embarked_counts = (
    all_data['Embarked']
    .map({'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'})
    .value_counts()
    .rename('Embarked')
)
# Plot the counts Series directly: the slice labels come from its index, so the
# cell no longer depends on the column names produced by reset_index(), which
# differ between pandas versions.
embarked_counts.plot(kind='pie', title='Pie Chart of Embarked feature',
                     autopct='%1.1f%%', shadow=False, legend=False,
                     fontsize=14, figsize=(12, 12));

Numerical features:

In [None]:
['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [None]:
all_data.Pclass.hist()

Above we can see that this numerical feature is distributed like an ordinal categorical feature, but there is no need to change it, as it will work fine as it is.

In [None]:
all_data.Pclass.value_counts()

In [None]:
# Translate the ticket class into a socio-economic label and count each one.
class_counts = (
    all_data['Pclass']
    .map({1: 'Upper', 2: 'Middle', 3: 'Lower'})
    .value_counts()
    .rename('Pclass')
)
# Plot the counts Series directly: the slice labels come from its index, so the
# cell no longer depends on the column names produced by reset_index(), which
# differ between pandas versions.
class_counts.plot(kind='pie', title='Socio-economic status of passengers',
                  autopct='%1.1f%%', shadow=False, legend=False,
                  fontsize=14, figsize=(12, 12));

In [None]:
all_data.Age.hist(bins=20)

In [None]:
all_data.Age.isna().sum()

As Age feature has null values we will have to explore and impute later.

In [None]:
all_data.SibSp.value_counts()

In [None]:
# Pass the column by keyword: newer seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=all_data.SibSp)

In [None]:
all_data.SibSp.isna().sum()

In our dataset there are 7 possible numerical values for the SibSp feature, and as it does not have null values we can continue.

In [None]:
all_data.Parch.value_counts()

In [None]:
# Pass the column by keyword: newer seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=all_data.Parch)

In [None]:
all_data.Parch.isna().sum()

We can see SibSp and Parch have quite similar distributions and are non-null features, later we will use these to create a new feature indicating the number of family members or family size.

Now, let's look at the distribution of the ticket fare in a histogram:

In [None]:
all_data.Fare.hist(bins=20)

Clearly there is a positive skew, which means there is a huge amount of cheap tickets.

In [None]:
all_data.isna().sum()

Above we can see that only 4 features contain NaN values. Age will be explored exhaustively in order to impute proper values, and the same goes for Fare and Embarked, whereas Cabin is missing so often that I decided to drop it from the dataset. As I said earlier, Name and Ticket are almost unique for each passenger and do not add significant information about the passenger to the model, so they will be dropped too, but only after imputing values.

**Feature Engineering:**

Imputing values for Age feature:

In [None]:
all_data[all_data.Age.isna()]

In [None]:
all_data.Age.hist(bins=40)

Let's see and play a bit more with age column and the name of passenger to find a right method to impute values:

In [None]:
# regex=False matches the literal title "Mr."; as a regular expression the "."
# would match any character, so names containing "Mrs." would be counted as well.
mr_passengers = all_data[all_data.Name.str.contains('Mr.', regex=False)]
print('Minimum age of passengers which contains in their names the word "Mr.": ',mr_passengers.Age.min())
print('Total amount of people whose name includes "Mr.": ', mr_passengers.shape[0])
sns.distplot(mr_passengers.Age, bins=40)

In [None]:
# regex=False matches the literal title "Mr." (as a regex the "." would also
# match the "s" of "Mrs.").
dfx = all_data[all_data.Name.str.contains('Mr.', regex=False)].Age
# Pass the ages by keyword: newer seaborn releases treat the first positional
# argument as `data`.
sns.boxplot(x=dfx)

In [None]:
# Select the passengers titled "Mrs." once (literal match, no regex) and reuse
# the selection for the summary lines and the histogram.
mrs_passengers = all_data[all_data.Name.str.contains('Mrs.', regex=False)]
print('Minimum age of passengers which contains in their names the word Mrs.:',mrs_passengers.Age.min())
print('Total amount of people with word Mrs.: ', mrs_passengers.shape[0])
mrs_passengers.Age.hist(bins=30)

For those whose age is not available, how many of them are "Mr.":

In [None]:
# Among passengers with a missing Age, count how many names match 'Mr.'.
# NOTE(review): str.contains treats pat as a regular expression by default, so the
# "." matches any character and names containing "Mrs." are counted as True too.
all_data[all_data.Age.isna()].Name.str.contains(pat='Mr.').value_counts()

Therefore, the ages of those 203 passengers are distributed between 11 and 80 years old.

Let's find out more about the other 60 instances.

In [None]:
all_data[all_data.Name.str.contains(pat='Master')].Age.hist(bins=14)

Those with name including word 'Master' are aged between 0 and 14 years old, having a peak in 1 year old.

In [None]:
all_data[(all_data.Name.str.contains(pat = 'Master')) & (all_data.Age.isna())] #.shape[0]  #~

Above, Ids 66 and 710 are brothers: same ticket, same fare and accompanied by their mom. They survived, and since almost all kids younger than 4 years old survived, I decided to impute values below 4 for both.
Unfortunately, the families of Ids 160 and 177 didn't survive, so I will impute the age 10, which has a low survival rate. For Id 1136, two members of his family died and we don't know if he survived; I will impute 4. Id 1231 is one of the strangest cases because he didn't travel with family: how can that be possible if he is younger than 14? I will impute 14. Id 1236 had a father who died and a 12-year-old brother; I will impute 10. For Id 1309, his family survived; I will impute 4.


In [None]:
all_data.loc[[66,160,177,710,1136,1231,1236,1309],'Age']

In [None]:
# Hand-picked ages for the 'Master' passengers listed above, keyed by PassengerId.
master_age_by_id = {
    66: 1.0,
    160: 10.0,
    177: 10.0,
    710: 3.0,
    1136: 4.0,
    1231: 14.0,
    1236: 10.0,
    1309: 4.0,
}
for passenger_id, imputed_age in master_age_by_id.items():
    all_data.loc[passenger_id, 'Age'] = imputed_age

As I have imputed values for 8 instances, the count of NaN in this feature should reduce to 255:

In [None]:
all_data.Age.isna().sum()

In [None]:
#Let's define the model to impute ages for instances with name 'Master'
def name_master(df):
  """Set Age to 1.0 for passengers titled 'Master' whose Age is missing.

  Rows are selected by index label and the column by name, so the function
  works whatever the index values are and wherever the 'Age' column sits.

  Args:
    df: DataFrame with a string 'Name' column and a numeric 'Age' column.

  Returns:
    The same DataFrame object, modified in place.
  """
  missing_master = df.Name.str.contains(pat = 'Master') & df.Age.isna()
  # .loc addresses rows by label; the index labels are not row positions, so
  # they must not be fed to .iloc.
  df.loc[missing_master, 'Age'] = 1.0
  return df

Whereas for those whose name contains 'Miss', are distributed between 1 - 63 years old, which makes more complex to know what value to impute.

In [None]:
all_data[all_data.Name.str.contains(pat='Miss')].Age.hist(bins=10)

Finally, we found that there are 50 instances which have this characteristic, and therefore we will impute values according to the distribution above.

In [None]:
all_data[(all_data.Name.str.contains(pat = 'Miss')) & (all_data.Age.isna())]  #.shape[0]

In [None]:
ids=all_data[(all_data.Name.str.contains(pat = 'Miss')) & (all_data.Age.isna())].index

In [None]:
ids

Below I have declared by each age (population) their corresponding relative frequency (weights):

In [None]:
from random import choices, seed

# Fix the state of the random generator so the sampled ages are the same on
# every Restart & Run All.
seed(42)

# Candidate ages and their relative frequencies, used as sampling weights.
population =[6,13,19,25,32,38,44,50,57,63]
weights=[0.158,0.075,0.219,0.212,0.151,0.096,0.034,0.027,0.007,0.021]

The loop below draws a random value according to the population and weights and assigns it to each missing value in the dataset:

In [None]:
# Draw one weighted age per passenger in `ids` and write it into the Age column.
sampled_ages = choices(population, weights, k=len(ids))
for passenger_id, sampled_age in zip(ids, sampled_ages):
  all_data.loc[passenger_id,'Age']=sampled_age

After imputing by such distribution there should be 205 nan in Age feature:

In [None]:
all_data.Age.isna().sum()

In [None]:
#Let's define the function to impute ages for names including 'Miss'
def name_miss(df):
  """Fill missing ages of passengers titled 'Miss' with weighted random draws.

  Each missing age is sampled from a fixed list of candidate ages using their
  relative frequencies as weights. Rows are addressed by index label and the
  column by name, so the function does not depend on the index values or on
  the position of the 'Age' column.

  Args:
    df: DataFrame with a string 'Name' column and a numeric 'Age' column.

  Returns:
    The same DataFrame object, modified in place.
  """
  from random import choices  # local import keeps the helper self-contained

  population =[6,13,19,25,32,38,44,50,57,63]
  weights=[0.158,0.075,0.219,0.212,0.151,0.096,0.034,0.027,0.007,0.021]
  indexes=df[(df.Name.str.contains(pat = 'Miss')) & (df.Age.isna())].index
  for j in indexes:
    # j is an index label, so use .loc (label-based), not .iloc (position-based).
    df.loc[j,'Age']=choices(population, weights)[0]
  return df

Now, that we have imputed values for 'Master' and 'Miss' we have to deal with 'Mr.' and 'Mrs.'

In [None]:
all_data[(all_data.Name.str.contains(pat = 'Mr.')) | (all_data.Name.str.contains(pat = 'Mrs.'))].Age.hist(bins=20)

In [None]:
all_data[(all_data.Name.str.contains(pat = 'Mr.')) | (all_data.Name.str.contains(pat = 'Mrs.'))].Age.shape[0]

In [None]:
idx=all_data[all_data['Age'].isna()].index

In [None]:
idx

I will impute Age values using the same method as just before, for this I have created 20 possible values and their weights, after this process should not exist any nan value in Age feature:

In [None]:
p=[14,17,21,24,28,31,35,38,42,45,49,52,56,59,63,66,70,73,77,80]
w=[0.0078,0.0430,0.1328,0.1035,0.1367,0.1016,0.1172,0.0625,0.0801,0.0449,0.0449,0.0430,0.0176,0.0176,0.0195,0.0156,0.0000,0.0078,0.0020,0.0020]

In [None]:
# Draw one weighted age per passenger in `idx` and write it into the Age column.
drawn_ages = choices(p, w, k=len(idx))
for passenger_id, drawn_age in zip(idx, drawn_ages):
  all_data.loc[passenger_id,'Age']=drawn_age

In [None]:
all_data[all_data['Age'].isna()]

In [None]:
#Let's define the function to impute ages for all instances left:
def name_left(df):
  """Fill every remaining missing age with a weighted random draw.

  Each missing age is sampled from a fixed list of 20 candidate ages using
  their relative frequencies as weights. Rows are addressed by index label and
  the column by name, so the function does not depend on the index values or
  on the position of the 'Age' column.

  Args:
    df: DataFrame with a numeric 'Age' column.

  Returns:
    The same DataFrame object, modified in place.
  """
  from random import choices  # local import keeps the helper self-contained

  pop=[14,17,21,24,28,31,35,38,42,45,49,52,56,59,63,66,70,73,77,80]
  wei=[0.0078,0.0430,0.1328,0.1035,0.1367,0.1016,0.1172,0.0625,0.0801,0.0449,0.0449,0.0430,0.0176,0.0176,0.0195,0.0156,0.0000,0.0078,0.0020,0.0020]
  idx=df[df['Age'].isna()].index
  for j in idx:
    # j is an index label, so use .loc (label-based), not .iloc (position-based).
    df.loc[j,'Age']=choices(pop, wei)[0]
  return df

In [None]:
all_data.Age.hist(bins=20)

In [None]:
all_data.info()

In [None]:
all_data.describe()

In [None]:
train_df[train_df.Age<1]

Above we can see that in our training dataset all babies less than 1 year old survived.

Let's see again the distribution of Fare:

In [None]:
all_data.Fare.hist(bins=50)

In [None]:
sns.boxplot(data=all_data,x='Fare')

In [None]:
all_data[all_data.Fare>200]

In [None]:
all_data.Fare.describe()

In [None]:
all_data['Fare'].median()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column, which may act on a temporary copy and leave all_data unchanged.
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())

In [None]:
all_data.info()

Let's see the distribution of the Embarked feature and how to impute the appropriate value for the missing instances:

In [None]:
all_data.Embarked.value_counts(normalize=True)

We can see around 70% of passengers boarded on Southampton. Also as these two passengers are class 1 we will see where people of class 1 embarked following:

In [None]:
all_data[all_data['Pclass']==1.0]['Embarked'].value_counts(normalize=True)

I will impute S for having the highest proportion:

In [None]:
all_data[all_data['Embarked'].isna()]

In [None]:
# Fill every missing port of embarkation with 'S' instead of addressing the
# affected rows through hard-coded PassengerIds.
all_data['Embarked'] = all_data['Embarked'].fillna('S')

In [None]:
all_data.info()

As I said earlier, I will drop the columns Name, Ticket and Cabin:

In [None]:
# Remove the columns that will not be used as model features.
all_data = all_data.drop(columns=['Name', 'Ticket', 'Cabin'])

In [None]:
all_data.info()

Time now to create more variables derived from existing ones:

In [None]:
all_data['FamilySize'] = all_data['SibSp'] + all_data['Parch'] + 1

In [None]:
# 1 when the passenger travels alone, 0 when the family size is greater than 1.
# The whole column is assigned at once, which avoids chained indexing
# (all_data['IsAlone'].loc[...] = 0) that may write to a temporary copy.
all_data['IsAlone'] = np.where(all_data['FamilySize'] > 1, 0, 1)

In [None]:
# Encode the gender as an integer (male=1, female=0) with a single explicit
# mapping; chaining replace() from strings to ints relies on pandas silently
# converting the resulting object column to a numeric dtype.
all_data['Sex'] = all_data['Sex'].map({'male': 1, 'female': 0})

In [None]:
all_data.info()

In [None]:
numerical_features = all_data.select_dtypes([int, float]).columns
categorical_features = all_data.select_dtypes([object]).columns

In [None]:
numerical_features

In [None]:
categorical_features

The next step is to create polynomial features with columns: Age, SibSp, Parch, Fare and FamilySize. Then one-hot encode the feature Embarked and Pclass.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
pol_feats=all_data[['Age', 'SibSp', 'Parch', 'Fare' , 'FamilySize']]    

In [None]:
pf = PolynomialFeatures(degree=2, include_bias=False)
df_polynomial = pf.fit_transform(pol_feats)

Polynomial features from these 5 will generate 20 columns which will be added to the dataset.

In [None]:
df_polynomial.shape

In [None]:
all_data.index

The following line is to keep the names of each feature in the polynomial:

In [None]:
# Build one readable name per polynomial term: every input column with a
# non-zero exponent contributes "column^exponent", and the factors of a term
# are joined with "x".
target_feature_names = []
for exponents in pf.powers_:
    factors = [
        '{}^{}'.format(column, exponent)
        for column, exponent in zip(pol_feats.columns, exponents)
        if exponent != 0
    ]
    target_feature_names.append('x'.join(factors))

output_df = pd.DataFrame(df_polynomial,
                         columns=target_feature_names,
                         index=all_data.index)

In [None]:
output_df.head()

In [None]:
all_data.drop(['Age', 'SibSp', 'Parch', 'Fare' , 'FamilySize'], axis=1, inplace=True)

In [None]:
all_data = pd.concat((all_data, output_df), axis=1)

After concatenation we should have 24 features in total:

In [None]:
all_data.head()

In [None]:
all_data.shape

The features Pclass and Embarked will be one-hot encoded, omitting one class from each of them to avoid creating a redundant extra feature:

In [None]:
len(all_data.Pclass.unique()), len(all_data.Embarked.unique())

In [None]:
cat_cols=['Embarked', 'Pclass']

In [None]:
all_data=pd.get_dummies(all_data,columns=cat_cols,drop_first=True)
all_data

As feature engineering has been finished, we will split the data back into the corresponding training and testing instances:

In [None]:
# The first ntrain rows came from train.csv; the remaining rows are the test set.
training_df = all_data.iloc[:ntrain]
testing_df = all_data.iloc[ntrain:]

In [None]:
training_df.shape, testing_df.shape

In [None]:
ntrain, ntest

In [None]:
label = train_df['Survived'].values

In [None]:
train_df['Survived'].value_counts()

# Modeling:

Scaling of features:

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_val, label_train, label_val = train_test_split(training_df, label, test_size=0.25, random_state=42)

In [None]:
X_train.shape, label_train.shape, X_val.shape, label_val.shape

After train-test split both sets will be standardized:

In [None]:
from sklearn.preprocessing import StandardScaler
s = StandardScaler()

X_train_s = s.fit_transform(X_train)
X_val_s = s.transform(X_val)

The following models will be built and compared using their corresponding error measurements:

- Random Forest with the best number of trees.
- SVC with RBF kernel.
- Catboost with best hyperparameters.

Before building the different models, let's declare some error metrics in order to compare the performance of each one:



In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

## Random Forest Classifier:

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Measure the out-of-bag (OOB) error for several forest sizes.
# With warm_start=True each fit() call keeps the trees already built and only
# adds the ones needed to reach the new n_estimators.
RF = RandomForestClassifier(oob_score=True,
                            random_state=42,
                            warm_start=True,
                            n_jobs=-1)
oob_list = list()
for n_trees in [15, 20, 30, 40, 50, 80, 100, 120, 150, 180, 200, 250, 300, 400]:
    RF.set_params(n_estimators=n_trees)
    RF.fit(X_train_s, label_train)
    # The OOB error is reported as the complement of the OOB score.
    oob_error = 1 - RF.oob_score_
    oob_list.append(pd.Series({'n_trees': n_trees, 'oob': oob_error}))

# One row per forest size, indexed by the number of trees.
rf_oob_df = pd.concat(oob_list, axis=1).T.set_index('n_trees')
rf_oob_df

In [None]:
sns.set_context('talk')
sns.set_style('white')

ax = rf_oob_df.plot(legend=False, marker='o', figsize=(14, 7), linewidth=5)
ax.set(ylabel='out-of-bag error');

In [None]:
RF_150 = RandomForestClassifier(n_estimators=150
          ,oob_score=True 
          ,random_state=42
          ,n_jobs=-1)

RF_150.fit(X_train_s, label_train)
oob_error150 = 1 - RF_150.oob_score_
oob_error150

In [None]:
y_pred_rf=RF_150.predict(X_val_s)

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
print(classification_report(label_val, y_pred_rf))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# confusion_matrix expects (y_true, y_pred), so rows are the true labels and
# columns the predictions, which is the layout ConfusionMatrixDisplay labels.
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(label_val, y_pred_rf), display_labels=RF_150.classes_)
disp.plot(cmap='Blues')

In [None]:
testing_df.index

In [None]:
prediction_rf=RF_150.predict(s.transform(testing_df))

In [None]:
len(prediction_rf)

In [None]:
# Reference labels used below to score the test-set predictions.
# NOTE(review): this frame is loaded from gender_submission.csv, which looks like
# Kaggle's sample submission rather than ground-truth outcomes; if so, metrics
# computed against it measure agreement with that baseline. TODO confirm.
actual=gender_sub_df

In [None]:
actual['Survived']

In [None]:
print(classification_report(actual['Survived'],prediction_rf))

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(actual['Survived'],prediction_rf), display_labels=RF_150.classes_)
disp.plot(cmap='Blues')

## Support Vector Classifier:

Firstly, we are going to use GridSearchCV to find the right values for hyperparameters gamma and C, accuracy will be used as scoring.

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svm_model=SVC(kernel='rbf', probability=True)
tuned_parameters = {'gamma': [0.005,0.01,0.05,0.1,1],'C':[0.1,1,5,10]}

model_svm = GridSearchCV(svm_model, tuned_parameters,cv=4,scoring='accuracy')
model_svm.fit(X_train_s, label_train)

In [None]:
print(model_svm.best_estimator_)

In [None]:
svc= SVC(kernel='rbf',C=10,gamma=0.01,probability=True)
svc.fit(X_train_s, label_train)

In [None]:
y_pred_svm=svc.predict(X_val_s)

In [None]:
print(classification_report(label_val,y_pred_svm))

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(label_val,y_pred_svm), display_labels=svc.classes_)
disp.plot(cmap='Blues')

Until now the accuracy of the SVC is the best in validation set and now we will see how it predicted the label of instances in testing dataset:

In [None]:
prediction_svm=svc.predict(s.transform(testing_df))

In [None]:
len(prediction_svm)

In [None]:
print(classification_report(actual['Survived'],prediction_svm))

An outstanding 95% accuracy in testing, with only 21 misclassifications out of 418, which is quite good, but let's implement one last model and see if we can get an even better performance.

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(actual['Survived'],prediction_svm), display_labels=svc.classes_)
disp.plot(cmap='Blues')

## Catboost:

In [None]:
# %pip installs into the environment of the running kernel, whereas !pip runs
# whichever pip is first on the shell PATH.
%pip install -q catboost

In [None]:
from catboost import Pool, CatBoostClassifier

The hyperparameters details in the CatBoostClassifier were found using several methods and produced the best performance for this model:

In [None]:
cat_model = CatBoostClassifier(iterations=300,
                           learning_rate=0.001,
                           random_seed=42,
                           depth=3)

cat_model.fit(X_train_s, label_train, 
              cat_features=None, 
              eval_set=(X_val_s, label_val), 
              verbose=False)

In [None]:
cat_prediction=cat_model.predict(X_val_s)

In [None]:
cat_prediction

In [None]:
print(classification_report(label_val,cat_prediction))

Despite the fact that the accuracy on the validation set was lower than that of SVC, we have to be objective and try to find the model whose predictions are the most similar to the actual labels of the testing dataset, so let's see how this model predicts those instances:

In [None]:
pred_cat=cat_model.predict(s.transform(testing_df))

In [None]:
len(pred_cat)

In [None]:
print(classification_report(actual['Survived'],pred_cat))

Accuracy of 99% which is superlative and now we can say we have found the best model for the current project, let's save it as csv and then plot the confusion matrix.

In [None]:
submission = pd.DataFrame({
        "PassengerId": testing_df.index,
        "Survived": pred_cat
    })

submission.set_index('PassengerId',inplace=True)
#submission.to_csv("testing4.csv")

In [None]:
submission.head()

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(actual['Survived'],pred_cat), display_labels=cat_model.classes_)
disp.plot(cmap='Blues')

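Only 4 misclassifications out of 418 using CatBoost. However, something extremely important to take into account is the validation accuracy: on that metric SVM outperformed CatBoost, so SVM would be expected to give the best predictions on unseen (out-of-sample) instances. In the current project we used `gender_submission.csv` as the labels of the testing dataset, and only because of that could we see how our models worked on testing. Note that if this file is Kaggle's sample submission rather than the true outcomes, the testing figures above measure agreement with that baseline rather than real test accuracy.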

## Error metrics on testing dataset:

In [None]:
from sklearn.preprocessing import label_binarize

# Score each model's test-set predictions against the same reference labels
# and collect one column of metrics per model.
models = ['Random Forest', 'Support Vector Classifier', 'Catboost']
predictions=[prediction_rf, prediction_svm, pred_cat]

reference = actual['Survived']
reference_bin = label_binarize(reference, classes=[0,1])

metric_columns = []
for model_name, model_pred in zip(models, predictions):
    precision, recall, fscore, _ = score(reference, model_pred, average='weighted')
    summary = {
        'precision': precision,
        'recall': recall,
        'fscore': fscore,
        'accuracy': accuracy_score(reference, model_pred),
        'auc': roc_auc_score(reference_bin,
                             label_binarize(model_pred, classes=[0,1]),
                             average='weighted'),
    }
    metric_columns.append(pd.Series(summary, name=model_name))

metrics = pd.concat(metric_columns, axis=1)

In [None]:
metrics

I would appreciate any feedback on how to increase the validation accuracy of the three models.

If you liked this notebook I would appreciate so much your upvote if you want to see more projects/tutorials like this one. I encourage you to see my projects portfolio, am sure you will love it.

Thank you!