In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

> Import libraries

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

> import dataset

In [None]:
train_df = pd.read_csv('../input/titanic-machine-learning-from-disaster/train.csv')
test_df = pd.read_csv('../input/titanic-machine-learning-from-disaster/test.csv')

# 1. Exploratory Data Analysis

**1.1 Overview**
* PassengerId is the unique id of the row and it doesn't have any effect on target
* Survived is the target variable we are trying to predict (0 or 1):
    * 1 = Survived
    * 0 = Not Survived
* Pclass (Passenger Class) is the socio-economic status of the passenger and it is a categorical ordinal feature which has 3 unique values (1, 2 or 3):
    * 1 = Upper Class
    * 2 = Middle Class
    * 3 = Lower Class
* Name, Sex and Age are self-explanatory
* SibSp is the total number of the passengers' siblings and spouse
* Parch is the total number of the passengers' parents and children
* Ticket is the ticket number of the passenger
* Fare is the passenger fare
* Cabin is the cabin number of the passenger
* Embarked is port of embarkation and it is a categorical feature which has 3 unique values (C, Q or S):
    * C = Cherbourg
    * Q = Queenstown
    * S = Southampton

In [None]:
train_df.info() # we only have 204 cabin info in the training set

From the table above, we can note a few things. First of all, we need to convert several features into numeric ones later on, so that the machine learning algorithms can process them. Furthermore, the features have widely different ranges, which we will need to bring to roughly the same scale. We can also spot some features that contain missing values (NaN = not a number), which we need to deal with.

In [None]:
test_df.info() #we only have 91 cabine info in the test set

In [None]:
train_df.describe()

Above we can see that 38% out of the training-set survived the Titanic. We can also see that the passenger ages range from 0.4 to 80. 

In [None]:
test_df.describe()

Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
April 10, 1912 - The Titanic sets sail on its maiden voyage from Southampton, England, to New York. 

In [None]:
# The full dataset (train + test) is needed for imputing missing values and
# for spotting outliers.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the drop-in replacement. The original row labels are kept on purpose
# (no ignore_index): later cells address rows by these labels, so train and
# test rows both start at label 0 in whole_df.
whole_df = pd.concat([train_df, test_df], sort=False)
whole_df.info()

In [None]:
#checking number of columns of each data type for general EDA
whole_df.dtypes.value_counts()

**1.2 Numerical Variables**

In [None]:
print(whole_df.select_dtypes(['int64','float64']).columns)

*1) Age*

In [None]:
whole_df['Age'].hist(bins=70)
plt.show()

In [None]:
# Age density per sex over the combined train + test data.
fig = sns.FacetGrid(whole_df, hue = 'Sex', aspect = 4)
fig.map(sns.kdeplot, 'Age', shade = True)

# Take the x-axis limit from the frame that is actually plotted (the limit
# used to come from train_df although whole_df is drawn).
oldest = whole_df['Age'].max()
fig.set(xlim = (0, oldest))
fig.add_legend()
plt.show()

Most passengers are between roughly 15 and 40 years old.

In [None]:
whole_df['Age'].mean()  #get the mean age of all passengers, around 30yr

In [None]:
# Age density per passenger class over the combined train + test data.
fig = sns.FacetGrid(whole_df, hue = 'Pclass', aspect = 4)
fig.map(sns.kdeplot, 'Age', shade = True)

# Take the x-axis limit from the frame that is actually plotted (the limit
# used to come from train_df although whole_df is drawn).
oldest = whole_df['Age'].max()
fig.set(xlim = (0, oldest))
fig.add_legend()
plt.show()

class 1 shows a normal distribution. However, class 2 and class 3 are skewed towards younger age.

In [None]:
# Bin Age into the intervals listed in `interval` and label each bin with a
# life-stage category (Infant, Toddler, Kid, Teen, Young Adult, Adult, Senior).
# pd.cut uses right-closed bins by default, e.g. (0, 2], (2, 4], ...
interval = (0,2,4,10,19,35,60,100)
categories = ['Infant','Toddler','Kid','Teen','Young Adult','Adult','Senior']
whole_df['Age_cats'] = pd.cut(whole_df.Age, interval, labels = categories)

# Survived / not-survived counts per age category.
ax = sns.countplot(x = 'Age_cats',  data = whole_df, hue = 'Survived', palette = 'Set1')

ax.set(xlabel='Age Categorical', ylabel='Total',
       title="Age Categorical Survival Distribution")

plt.show()

More infants and toddlers survived than died. In the kid group, the number who died is almost the same as the number who survived. Among young adults and adults, far more people died than survived.

*2) Fare*

In [None]:
# Distribution of ticket fares over the combined train + test data.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot /
# displot replace it) -- switch once the target seaborn version is confirmed.
sns.distplot(whole_df.Fare)
plt.show()

In [None]:
whole_df.Fare.describe()

Fare of some tickets is ZERO. It is worth exploring further

In [None]:
whole_df[whole_df.Fare == 0]

It looks strange that so many tickets were sold at a fare of zero. This could be genuine, or it could be errors in the dataset.

In [None]:
# Use a pandas plotting method to plot the column 'Fare' for each value of 'Survived' on the same plot.
train_df.groupby('Survived').Fare.hist(alpha=0.6)
plt.show()

In [None]:
sns.swarmplot(x='Survived', y='Fare', data=train_df)
plt.show()

It looks like fare is correlated with survival aboard the Titanic.

In [None]:
# Use the DataFrame method .describe() to check out summary statistics of 'Fare' as a function of survival
train_df.groupby('Survived').Fare.describe()

The Embarked feature has only 2 missing values, which can easily be filled. It will be much more tricky, to deal with the ‘Age’ feature, which has 177 missing values. The ‘Cabin’ feature needs further investigation, but it looks like that we might want to drop it from the dataset, since 77 % of it are missing.


*3) PClass indicating the passenger class, totally 3 values*

In [None]:
sns.countplot(x='Pclass', hue = 'Sex',data=whole_df, palette="Set2") #Most of the males were in the 3rd class
plt.show()

most passengers were in PClass3.

In [None]:
# fig, ax = plt.subplots(1,1, figsize = (12,10))
ax = sns.countplot(x = 'Pclass', data=train_df,hue = 'Survived', palette = 'Set1')
ax.set(title = 'Passenger status (Survived/Died) against Passenger Class', 
       xlabel = 'Passenger Class', ylabel = 'Total')
plt.show()

Most victims were from the class 3.

4) SibSp - Number of Sibling Spouse

In [None]:
whole_df.SibSp.value_counts()

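In total, 891 passengers had no siblings or spouse aboard (SibSp = 0). Note that this alone does not mean they were travelling alone, since Parch is not considered here.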

In [None]:
# Sibling/spouse counts per age category.
plt.figure(figsize=(12, 5))
sns.boxplot(x='Age_cats', y='SibSp', data=whole_df)
plt.xticks(rotation=90)
plt.tight_layout()

In [None]:
# Sibling/spouse counts per age category, split by sex.
plt.figure(figsize=(12, 5))
sns.violinplot(x='Age_cats', y='SibSp', hue='Sex', data=whole_df,
               palette="Set3", split=True)
plt.show()

Infants, toddlers and kids tended to be on board with their siblings.

*5) Parch - number of parents/children aboard*

In [None]:
# Parent/child counts per age category.
plt.figure(figsize=(12, 5))
sns.boxplot(x='Age_cats', y='Parch', data=whole_df)
plt.xticks(rotation=90)
plt.tight_layout()

In [None]:
# Parent/child counts per age category, split by sex.
plt.figure(figsize=(12, 5))
sns.violinplot(x='Age_cats', y='Parch', hue='Sex', data=whole_df,
               palette="Set3", split=True)
plt.show()

1.3) Categorical Variables

In [None]:
print(whole_df.select_dtypes(['object']).columns)

*1) Cabin*

In [None]:
deck = whole_df['Cabin'].dropna()
deck.head()

In [None]:
cabin_df = DataFrame(deck)
cabin_df

In [None]:
cabin_df['Cabin'] = cabin_df['Cabin'].astype(str).str[0] 
#change the datatype to str and get the first letter
cabin_df['Cabin'].unique()  #get all the unique values of column 'Cabin'

In [None]:
cabin_df['Cabin'].value_counts()

There is only one T in the whole dataset, so it looks like an outlier.

In [None]:
whole_df[whole_df['Cabin'].str.contains('T') == True]

In [None]:
# Deck 'T' occurs only once, so it is left out of the per-deck count plot.
cabin_df = cabin_df[cabin_df.Cabin != 'T']
# Pass x by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Cabin', data=cabin_df, palette='summer', order=['A','B','C','D','E','F','G'])
plt.show()

In [None]:
train_df['Deck']= train_df['Cabin'].dropna().astype(str).str[0] 
train_df

In [None]:
sns.catplot(x="Deck", hue="Survived", col="Sex",order=['A','B','C','D','E','F','G'],
                data=train_df, kind="count",
                height=5, aspect=1.2)
plt.show()

All women from Cabin A, B, D and F survived. <br>
More men than women survived in Cabin A.

2) Sex

In [None]:
#Sex
sns.set(style="darkgrid")
sns.countplot(x='Sex', data=whole_df, palette="Set2")
plt.tight_layout()
plt.show()

In [None]:
# Survived / not-survived counts per sex (training data only, where the
# Survived label is known).
ax = sns.countplot(x = 'Sex', data=train_df, hue = 'Survived', palette = 'Set1')
# The title used to say "against Passenger Class" (copied from the Pclass plot).
ax.set(title = 'Passenger status (Survived/Died) against Passenger Sex',
       xlabel = 'Passenger Sex', ylabel = 'Total')
plt.show()

More female than male survived.
Till now, we can see that men in the 3rd class are likely not to survive in the wreck.

3) Embarked

In [None]:
# Explore the Embarked feature: passengers per port, split by passenger class.
# Pass x by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Embarked', data = whole_df, hue = 'Pclass', order=['C','Q','S'], palette = 'husl')
plt.show()

In [None]:
sns.catplot(x="Embarked", hue="Survived", col="Pclass",
                data=train_df, kind="count",
                height=5, aspect=1.2)
plt.show()

Most passengers embarked from Southampton port. <br>
Those who embarked from Queenstown were all in Pclass3  <br>
C = Cherbourg; Q = Queenstown; S = Southampton

'Name' and 'Ticket' will be analyzed in the next section

It looks like passengers with the same name also have the same ticket number. 

# **2. Imputation of Missing Data**

In [None]:
whole_df['Fare'].describe()

In [None]:
whole_df.isna().sum() 
# Cabin has too many missing values to impute; it is turned into a Deck
# feature further down and then dropped.
# Age_cats was only needed for the plots above and is dropped later as well.
# Missing values in Age, Embarked and Fare are filled in below.

**1) Fill in missing data for Age column** <br>
First, I am going to compute the median age for each passenger title and then impute that median into the missing Age values.

In [None]:
whole_df.Name.head(20)

In [None]:
whole_df['Title'] = whole_df.Name.str.extract(r'([A-Za-z]+)\.', expand = False)
whole_df.Title.value_counts()
#[a-zA-Z]+: a word consisting of only Latin characters with a length at least one
#+: something repeating once or more

In [None]:
# Rows whose name contains the title 'Col.'. A raw string is used because
# '\.' in a normal string literal is an invalid escape sequence.
whole_df[whole_df['Name'].astype(str).str.contains(r'Col\.')]

In [None]:
# Rows whose name contains the title 'Don.'. A raw string is used because
# '\.' in a normal string literal is an invalid escape sequence.
whole_df[whole_df['Name'].astype(str).str.contains(r'Don\.')]

In [None]:
whole_df[whole_df['Name'].astype(str).str.contains('Master') == True]

In [None]:
# Collapse rare titles into the four common ones; whatever is still uncommon
# afterwards becomes 'Others'.
Common_Title = ['Mr','Miss','Mrs','Master']
title_aliases = {
    'Ms': 'Miss', 'Mme': 'Miss', 'Mlle': 'Miss', 'Dona': 'Miss',
    'Lady': 'Mrs',
    'Sir': 'Mr', 'Rev': 'Mr', 'Capt': 'Mr', 'Col': 'Mr', 'Don': 'Mr', 'Major': 'Mr',
}
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: the in-place form acts on a temporary object and does not
# reliably update whole_df (it is a no-op under pandas copy-on-write).
whole_df['Title'] = whole_df['Title'].replace(title_aliases)
# Single .loc write instead of chained indexing, for the same reason.
whole_df.loc[~whole_df['Title'].isin(Common_Title), 'Title'] = 'Others'

I am trying to identify the title in the 'Others' category

In [None]:
whole_df[whole_df['Title'] == 'Others']

Among all doctors, only one is female and she was married, I am going to change her title to Mrs.

In [None]:
# Re-title the one female doctor (see the 'Others' listing above) as 'Mrs'.
# NOTE(review): whole_df keeps the original train/test row labels, so label
# 796 is only unambiguous if the test set has fewer than 797 rows -- confirm.
whole_df.loc[796,'Title'] ='Mrs'

In [None]:
# Doctors that are still tagged 'Others'. Raw string avoids the invalid '\.'
# escape; na=False makes missing names count as "no match".
whole_df[whole_df['Name'].str.contains(r'Dr\.', na=False) & (whole_df['Title'] == 'Others')]

In [None]:
# Row labels of the doctors still tagged 'Others'. These labels are not
# unique in whole_df (train and test labels both start at 0).
whole_df[whole_df['Name'].str.contains(r'Dr\.', na=False) & (whole_df['Title'] == 'Others')].index

In [None]:
# Every doctor still tagged 'Others' is treated as 'Mr'. The rows are selected
# with the same condition used to list them above rather than with hard-coded
# row labels: train and test labels both start at 0 in whole_df, so a label
# list also overwrote unrelated passengers that happen to share a label.
remaining_doctors = whole_df['Name'].str.contains(r'Dr\.', na=False) & (whole_df['Title'] == 'Others')
whole_df.loc[remaining_doctors, 'Title'] = 'Mr'

In [None]:
whole_df[whole_df['Title'] == 'Others']

I looked these passengers up on https://www.encyclopedia-titanica.org/. Reuchlin, Jonkheer. John George was married at that time, so I am going to change his title to Mr. The Countess of Rothes (Lucy Noël Martha Dyer-Edwards) was also married at that time, so I am going to change her title to Mrs.

In [None]:
# Manual overrides from the encyclopedia-titanica lookup described above.
# NOTE(review): presumably label 759 is the Countess of Rothes and 822 is
# Jonkheer Reuchlin -- verify against the 'Others' listing. These labels are
# only unambiguous if the test set has fewer rows than the labels used here.
whole_df.loc[759,'Title'] ='Mrs'
whole_df.loc[822,'Title'] ='Mr'

In [None]:
whole_df['Title'].value_counts()

In [None]:
#train_df = whole_df[:len(train_df)]
#test_df = whole_df[len(train_df):]
#train_df

In [None]:
# compute mean per group and find index after sorting
sorted_index = whole_df.groupby('Title')['Age'].mean().sort_values().index

In [None]:
sorted_index

In [None]:
sns.boxplot(x='Title', y = 'Age', data = whole_df, order=sorted_index)
plt.show()

Find the median of Age in each title.

In [None]:
AgeMedian_by_titles = whole_df.groupby('Title')['Age'].median()
AgeMedian_by_titles

In [None]:
# Impute each missing Age with the median age of the passenger's title.
# A single .loc write replaces the chained form whole_df['Age'][mask] = value,
# which may assign into a temporary copy and leave whole_df unchanged.
for title, median_age in AgeMedian_by_titles.items():
    needs_age = whole_df['Age'].isnull() & (whole_df['Title'] == title)
    whole_df.loc[needs_age, 'Age'] = median_age

In [None]:
whole_df.info()

**2) Fill in missing data for Fare column** <br>
There is one missing value in columns 'Fare' <br>
We will impute the value based on Pclass and Sex.

In [None]:
whole_df[whole_df.Fare.isnull() == True]

In [None]:
plt.figure(figsize=(15,8))
sns.violinplot(x='Pclass', y = 'Fare', data = whole_df, hue='Sex',
              palette="Set3",split=True)
plt.show()

In [None]:
# Median fare for every (Pclass, Sex) combination.
med_fare = whole_df.groupby(['Pclass', 'Sex']).Fare.median()
# Fill the missing Fare with the median fare of 3rd-class male passengers.
# The value is selected by label: med_fare[3][1] relied on positional
# fallback for an integer key on a string-labelled Series, which is deprecated.
whole_df['Fare'] = whole_df['Fare'].fillna(med_fare.loc[(3, 'male')])

This male passenger was travelling alone in Pclass3. We can assume that Fare is related to the Sex and Pclass features. Median Fare value of a male with a third class ticket is a logical choice to fill the missing value.

In [None]:
whole_df[whole_df.Fare == 0].sort_values('Ticket')

It is very likely that the tickets recorded with a fare of zero did not really cost nothing. I am going to impute the median fare for those tickets.

In [None]:
med_fare = whole_df.groupby(['Pclass', 'Sex']).Fare.median()
med_fare

In [None]:
whole_df[(whole_df.Fare == 0) & (whole_df.Pclass == 1)].index

In [None]:
# 1st-class tickets recorded with a fare of 0 get the 1st-class male median
# (the same value that was used before). Rows are selected with a boolean mask
# instead of hard-coded labels: train and test labels both start at 0 in
# whole_df, so a label present in both halves overwrote two passengers.
zero_fare_first = (whole_df.Fare == 0) & (whole_df.Pclass == 1)
whole_df.loc[zero_fare_first, 'Fare'] = med_fare.loc[(1, 'male')]

In [None]:
whole_df[(whole_df.Fare == 0) & (whole_df.Pclass == 2)].index

In [None]:
# 2nd-class tickets recorded with a fare of 0 get the 2nd-class male median
# (the same value that was used before). A boolean mask replaces the
# hard-coded labels because row labels are not unique in whole_df.
zero_fare_second = (whole_df.Fare == 0) & (whole_df.Pclass == 2)
whole_df.loc[zero_fare_second, 'Fare'] = med_fare.loc[(2, 'male')]

In [None]:
whole_df[(whole_df.Fare == 0) & (whole_df.Pclass == 3)].index

In [None]:
# 3rd-class tickets recorded with a fare of 0 get the 3rd-class male median
# (the same value that was used before). A boolean mask replaces the
# hard-coded labels because row labels are not unique in whole_df.
zero_fare_third = (whole_df.Fare == 0) & (whole_df.Pclass == 3)
whole_df.loc[zero_fare_third, 'Fare'] = med_fare.loc[(3, 'male')]

In [None]:
whole_df.Fare.describe()

**3) Fill in missing data for Embarked column**

In [None]:
#Embarked
#For the dataset, there are only 2 missing values in the training dataset 
whole_df[whole_df['Embarked'].isnull()== True]

Embarked is a categorical feature and there are only 2 missing values in whole data set. Both of those passengers are female, upper class and they have the same ticket number. This means that they know each other and embarked from the same port together. The mode Embarked value for an upper class female passenger is C (Cherbourg), but this doesn't necessarily mean that they embarked from that port.

When I googled Stone, Mrs. George Nelson (Martha Evelyn), I found that she embarked from S (Southampton) with her maid Amelie Icard, on this page [Martha Evelyn Stone: Titanic Survivor](https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html).

Mrs Stone boarded the Titanic in Southampton on 10 April 1912 and was travelling in first class with her maid Amelie Icard. She occupied cabin B-28.

Missing values in Embarked are filled with S with this information.

In [None]:
# Filling the missing values in Embarked with S
whole_df['Embarked'] = whole_df['Embarked'].fillna('S')

**4) Fill in missing data for Cabin column**

A large portion of the Cabin feature is missing, but the feature itself can't be ignored completely because some of the cabins might have higher survival rates. It turns out that the first letter of a Cabin value is the deck on which the cabin is located. Those decks were mainly reserved for one passenger class, but some of them were used by multiple passenger classes.

In [None]:
# Creating Deck column from the first letter of the Cabin column (M stands for Missing)
whole_df['Deck'] = whole_df['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

# Passenger count per (Deck, Pclass), transposed so the (Deck, Pclass) pairs
# become the columns and the single row is named 'Count'. Counting only the
# Name column replaces a hand-maintained list of columns to drop, which left
# extra rows behind for every column added to whole_df since it was written.
df_all_decks = (
    whole_df.groupby(['Deck', 'Pclass'])[['Name']]
    .count()
    .rename(columns={'Name': 'Count'})
    .transpose()
)

def get_pclass_dist(df):
    """Summarise how passenger classes are distributed over the decks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a (Deck, Pclass) MultiIndex and whose first
        row holds the passenger count of each combination.

    Returns
    -------
    deck_counts : dict
        ``{deck: {pclass: count}}`` for pclass 1..3. Combinations missing
        from ``df`` are recorded as 0; decks that do not appear in ``df`` at
        all keep an empty inner dict.
    deck_percentages : dict
        ``{deck: [pct_class1, pct_class2, pct_class3]}``, each count as a
        percentage of the deck total.
    """
    # Creating a dictionary for every passenger class count in every deck
    deck_counts = {'A': {}, 'B': {}, 'C': {}, 'D': {}, 'E': {}, 'F': {}, 'G': {}, 'M': {}, 'T': {}}
    decks = df.columns.levels[0]
    for deck in decks:
        for pclass in range(1, 4):
            try:
                # Take the first row by position. A bare [0] on this
                # label-indexed Series depends on deprecated positional
                # fallback; as a label lookup it raises KeyError, which the
                # handler below would silently turn into a count of 0.
                count = df[deck][pclass].iloc[0]
            except KeyError:
                # This (deck, pclass) combination has no passengers.
                count = 0
            deck_counts[deck][pclass] = count

    df_decks = pd.DataFrame(deck_counts)
    deck_percentages = {}

    # Creating a dictionary for every passenger class percentage in every deck
    for col in df_decks.columns:
        deck_percentages[col] = [(count / df_decks[col].sum()) * 100 for count in df_decks[col]]

    return deck_counts, deck_percentages

def display_pclass_dist(percentages):
    """Draw a stacked bar chart of the passenger-class share on every deck.

    ``percentages`` maps each deck to its three class percentages (class 1,
    2, 3 in that order), as returned by ``get_pclass_dist``.
    """
    share = pd.DataFrame(percentages).transpose()
    deck_names = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'M', 'T')
    positions = np.arange(len(deck_names))
    width = 0.85

    # (column in `share`, bar colour, legend label) for each stacked layer.
    layers = (
        (0, '#b5ffb9', 'Passenger Class 1'),
        (1, '#f9bc86', 'Passenger Class 2'),
        (2, '#a3acff', 'Passenger Class 3'),
    )

    plt.figure(figsize=(20, 10))
    stacked = None  # running height of the layers already drawn
    for column, colour, label in layers:
        heights = share[column]
        plt.bar(positions, heights, bottom=stacked, color=colour,
                edgecolor='white', width=width, label=label)
        stacked = heights if stacked is None else stacked + heights

    plt.xlabel('Deck', size=15, labelpad=20)
    plt.ylabel('Passenger Class Percentage', size=15, labelpad=20)
    plt.xticks(positions, deck_names)
    for axis_name in ('x', 'y'):
        plt.tick_params(axis=axis_name, labelsize=15)

    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), prop={'size': 15})
    plt.title('Passenger Class Distribution in Decks', size=18, y=1.05)

    plt.show()

all_deck_count, all_deck_per = get_pclass_dist(df_all_decks)
display_pclass_dist(all_deck_per)

* 100% of A, B and C decks are 1st class passengers
* Deck D has 87% 1st class and 13% 2nd class passengers
* Deck E has 83% 1st class, 10% 2nd class and 7% 3rd class passengers
* Deck F has 62% 2nd class and 38% 3rd class passengers
* 100% of G deck are 3rd class passengers
* There is one person on the boat deck in T cabin and he is a 1st class passenger. T cabin passenger has the closest resemblance to A deck passengers so he is grouped with A deck
* Passengers labeled as M are the missing values in Cabin feature. I don't think it is possible to find those passengers' real Deck so I decided to use M like a deck

In [None]:
# Passenger in the T deck is changed to A.
# A boolean mask is used instead of the row's label: labels are not unique in
# whole_df (train and test both start at 0), so assigning through the label
# also changed the Deck of the other passenger carrying the same label.
whole_df.loc[whole_df['Deck'] == 'T', 'Deck'] = 'A'

Deck feature has high-cardinality right now so some of the values are grouped with each other based on their similarities.

* A, B and C decks are labeled as ABC because all of them have only 1st class passengers
* D and E decks are labeled as DE because both of them have similar passenger class distribution and same survival rate
* F and G decks are labeled as FG because of the same reason above
* M deck doesn't need to be grouped with other decks because it is very different from others and has the lowest survival rate.

In [None]:
# Merge decks with a similar class mix into three groups; 'M' (missing cabin)
# is not in the mapping and therefore stays as it is.
deck_groups = {
    'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
whole_df['Deck'] = whole_df['Deck'].replace(deck_groups)

whole_df['Deck'].value_counts()

In [None]:
whole_df.columns

In [None]:
# Drop Cabin (replaced by Deck) and Age_cats (only used for the EDA plots).
whole_df = whole_df.drop(['Cabin','Age_cats'],axis=1)

# 3. Feature Engineering

**1) Family Size**

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
whole_df['FamilySize'] = whole_df.SibSp + whole_df.Parch + 1
# Pass x by keyword: seaborn >= 0.12 treats the first positional argument as
# `data`, not as the variable to count.
sns.countplot(x='FamilySize', data=whole_df)
plt.show()

Family size 1 dominates - most passengers were traveling alone.

In order to understand the relationship between family size and whether those passengers survived or not, the whole_df needs to be split into train_df and test_df

In [None]:
facet = sns.FacetGrid(whole_df, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'FamilySize',shade= True)
facet.set(xlim=(0, whole_df['FamilySize'].max()))
facet.add_legend()
plt.xlim(0)
plt.show()

Survival probability is worst for large families.

**2) Alone - whether the passenger was traveling alone or not**

In [None]:
# Alone = 1 when the passenger has no family aboard (FamilySize == 1), else 0.
whole_df['Alone'] = (whole_df['FamilySize'] == 1).astype(int)
# Pass x by keyword: seaborn >= 0.12 treats the first positional argument as
# `data`, not as the variable to count.
sns.countplot(x='Alone', data=whole_df)
plt.show()

In [None]:
sns.barplot(x='Alone', y='Survived', data=whole_df)
plt.show()

Passengers travelling alone were less likely to survive (~30% vs ~50%).

In [None]:
sns.catplot(x='Alone', hue='Sex', col= 'Survived',
                data=whole_df, kind="count",
                height=5, aspect=1.2)
plt.show()

For people who were not traveling alone, males are less likely to survive than females.

**3) Title**

In [None]:
sns.catplot(x='Title', hue='Alone', col= 'Survived',
                data=whole_df, kind="count",
                height=5, aspect=1.2)
plt.show()

In [None]:
sns.barplot(x='Title', y='Survived', data=whole_df)
plt.show() #It is obviously that Title Mr. is much less likely to survive compared to others .

**4) Connected Survival**

It is natural to think that family members would help each other escape the disaster. 
To find family groups, apart from the passengers' surnames (the same surname may belong to different families), let’s also look at Ticket.

In [None]:
whole_df[['Name', 'Ticket']].sort_values('Name').head(20)
#It appears that passengers with same surnames have the same Ticket names.

In [None]:
whole_df[whole_df['Ticket']== 'LINE']

Mr Johan Vilhelm Henrik Törnqvis and his fellow American Line employees (William Cahoone Johnson Jr., August (Alfred) Johnson, Lionel Leonard (Andrew Shannon), Alfred Carver and Thomas Storey) were given third class accommodation aboard the Titanic to make the trip back to New York (ticket number 370160), where they could resume work. 
Their ticket number is 370160

In [None]:
# Tickets recorded as 'LINE' belong to the American Line employees, whose real
# ticket number is 370160. Match the whole value (as the lookup above does with
# == 'LINE') so that a ticket merely containing those letters is never
# rewritten by a substring replace.
whole_df.loc[whole_df['Ticket'] == 'LINE', 'Ticket'] = '370160'

In [None]:
# Surname = everything before the first comma of Name. The previous pattern
# ([A-Za-z]+), kept only the last run of plain letters before the comma, which
# truncated surnames containing spaces, hyphens or apostrophes
# (e.g. "O'Brien" -> "Brien").
whole_df['Surname'] = whole_df.Name.str.extract(r'^([^,]+),', expand=False)
whole_df['Surname']

In [None]:
whole_df['TicNum'] = whole_df.Ticket.str.extract(r'([0-9]*$)', expand=False)
whole_df['TicNum']
## *: zero or more (0+), e.g., [0-9]* matches zero or more digits. 
## . (dot): ANY ONE character except newline. Same as [^\n]
## \d, \D: ANY ONE digit/non-digit character. Digits are [0-9]

In [None]:
whole_df['SurTix'] = whole_df['Surname'] + whole_df['TicNum']

In [None]:
# IsFamily = 1 when the surname + ticket-number key is shared with at least
# one other passenger, else 0.
whole_df['IsFamily'] = whole_df.SurTix.duplicated(keep=False)*1
# Pass x by keyword: seaborn >= 0.12 treats the first positional argument as
# `data`, not as the variable to count.
sns.countplot(x='IsFamily', data=whole_df)
plt.show()

Around 1/3 of the passengers are travelling with families.

In [None]:
sns.catplot(col='IsFamily', x= 'Survived',
                data=whole_df, kind="count",
                height=5, aspect=1.2)
plt.show()

In [None]:
whole_df.sort_values('SurTix')

In [None]:
# Split whole_df back into the training part: the first len(train_df) rows.
# .copy() makes train_df an independent frame, so the columns added to it and
# dropped from it below do not act on a slice of whole_df
# (SettingWithCopyWarning).
train_df = whole_df.iloc[:len(train_df)].copy()
train_df.head()

In [None]:
# The remaining rows are the test part. .copy() makes test_df an independent
# frame, so later column changes do not act on a slice of whole_df.
test_df = whole_df.iloc[len(train_df):].copy()
test_df.head()

In [None]:
train_df.columns

In [None]:
correlation = train_df.select_dtypes(include=[np.number]).corr()
print(correlation['Survived'].sort_values(ascending=False))

In [None]:
# Heatmap of correlation of numeric features
plt.figure(figsize=(25,14))
plt.title('Correlation Between Numeric Features', size=15)

sns.heatmap(correlation, square=True, vmax=0.8, cmap='coolwarm', linewidths=0.01,annot= True, annot_kws={"size": 8})

plt.show()

FamilySize and SibSp are strongly correlated (0.89) <br>
FamilySize and Parch are strongly correlated (0.78) <br>
I will drop SibSp and Parch, as well as FamilySize itself (the derived Alone flag is kept)

In [None]:
# Drop the strongly correlated family-size columns from both splits.
# drop() returns new frames instead of mutating in place (train_df / test_df
# may be slices of whole_df), and errors='ignore' keeps the cell re-runnable
# once the columns are already gone.
family_columns = ['SibSp', 'Parch', 'FamilySize']
train_df = train_df.drop(columns=family_columns, errors='ignore')
test_df = test_df.drop(columns=family_columns, errors='ignore')

In [None]:
correlation = train_df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(25,14))
plt.title('Correlation Between Numeric Features', size=15)

sns.heatmap(correlation, square=True, vmax=0.8, cmap='coolwarm', linewidths=0.01,annot= True, annot_kws={"size": 8})

plt.show()

Survived has positive relationship with Fare, IsFamily, and negative relationship with Alone and PClass.

# 4. Encoding Categorical Variables

In [None]:
whole_df.dtypes.value_counts() #there are 9 categorical variables 

In [None]:
print(whole_df.select_dtypes(['object']).columns)

I will encode 
* Sex
* Embarked
* Title
* Deck
* and drop Name, Ticket, Surname, TicNum and SurTix
* Also, I will put Age into different bins and encode those bins

In [None]:
# Encode the string categories as integers for modelling.
# Sex: female -> 1, male -> 0
sex_codes = {'female':1, 'male':0}
for frame in (train_df, test_df):
    frame['Sex_Code'] = frame['Sex'].map(sex_codes).astype('int')

In [None]:
# Embarked: S -> 0, C -> 1, Q -> 2
embarked_codes = {'S':0, 'C':1, 'Q':2}
for frame in (train_df, test_df):
    frame['Embarked_Code'] = frame['Embarked'].map(embarked_codes).astype('int')

In [None]:
# Title: Mr -> 0, Others -> 1, Master -> 2, Miss -> 3, Mrs -> 4
title_codes = {'Mr':0,'Others':1, 'Master':2,'Miss':3, 'Mrs':4}
for frame in (train_df, test_df):
    frame['Title_Code'] = frame.Title.map(title_codes).astype('int')

In [None]:
# Deck group: M (missing) -> 0, ABC -> 1, DE -> 2, FG -> 3
deck_codes = {'M':0, 'ABC':1, 'DE':2,'FG':3}
for frame in (train_df, test_df):
    frame['Deck_Code'] = frame['Deck'].map(deck_codes).astype('int')

In [None]:
#Age
interval = (0,2,4,10,19,35,60,100)
categories = ['Infant','Toddler','Kid','Teen','Young Adult','Adult','Senior']
train_df['Age_category'] = pd.cut(train_df.Age, interval, labels = categories)
test_df['Age_category'] = pd.cut(test_df.Age, interval, labels = categories)

In [None]:
# Replace the age-category labels with ordinal integer codes (youngest = 0).
age_category_codes = {'Infant':0,'Toddler':1,'Kid':2,
                      'Teen':3,'Young Adult':4,'Adult':5,'Senior':6}
for frame in (train_df, test_df):
    frame['Age_category'] = frame['Age_category'].map(age_category_codes).astype('int')

In [None]:
# Defining the map function
#def dummies(x,df):
#    temp = pd.get_dummies(df[x], drop_first = True)
#    df = pd.concat([df, temp], axis = 1)
#    df.drop([x], axis = 1, inplace = True)
#    return df


# 5. Feature Scaling

# **6. Feature Selection**

In [None]:
train_df.columns

In [None]:
#drop unused columns
X_train = train_df.drop(['PassengerId', 'Name', 'Sex', 'Age','Ticket','Embarked',
       'Title', 'Deck','Surname', 'TicNum','SurTix','Survived'], axis=1)

In [None]:
y_train = train_df['Survived']

In [None]:
X_train

In [None]:
X_test = test_df.drop(['PassengerId', 'Name', 'Sex', 'Age','Ticket','Embarked',
       'Title', 'Deck','Surname', 'TicNum','SurTix','Survived'], axis=1)
X_test

**#Model 1: Random Forest**

In [None]:
model = RandomForestClassifier(n_estimators=400, random_state=2)

In [None]:
#feature importance
model.fit(X_train,y_train)
importance = pd.DataFrame({'feature':X_train.columns, 'importance': np.round(model.feature_importances_,3)})
importance = importance.sort_values('importance', ascending=False).set_index('feature')
importance.plot(kind='bar', rot=90)
plt.show()

Choose six of the most important features for modelling (Fare, Title_Code, Sex_Code, Pclass, Age_category and Deck_Code). Keep the number of features small to reduce the risk of over-fitting.

In [None]:
final = ['Fare', 'Title_Code','Sex_Code','Pclass','Age_category','Deck_Code']

In [None]:
# Tune the Random Forest hyper-parameters with an exhaustive grid search
# (5-fold CV, accuracy) on the selected features.
grid_param = {
 'n_estimators': [10, 15, 20, 30,50,100,200,300,400,800],
 'criterion':['gini', 'entropy'],
 'min_samples_split': [2, 4, 10, 20],
 'min_samples_leaf': [1,2,5],
 # 'auto' is no longer accepted by RandomForestClassifier (removed in
 # scikit-learn 1.3) and was only an alias of 'sqrt' for classifiers, so it
 # added failing / duplicate fits to the search.
 'max_features':["sqrt", "log2"],
 'bootstrap': [True, False],
}
gd_sr = GridSearchCV(estimator=model,
 param_grid=grid_param,
 scoring='accuracy',
 cv=5,
 n_jobs=-1)
gd_sr.fit(X_train[final], y_train)
best_parameters = gd_sr.best_params_
print(best_parameters)

n_estimators = number of trees in the forest <br>
max_features = max number of features considered for splitting a node <br>
max_depth = max number of levels in each decision tree <br>
min_samples_split = min number of data points placed in a node before the node is split <br>
min_samples_leaf = min number of data points allowed in a leaf node <br>
bootstrap = method for sampling data points (with or without replacement)<br>

In [None]:
# Set the model parameters after tuning.
# NOTE(review): these values are hard-coded rather than read from
# best_parameters, so they only reflect one earlier grid-search run; also
# random_state is 5 here while the model that was searched used 2.
model = RandomForestClassifier(bootstrap=False,criterion= 'gini',  
                               min_samples_leaf=5, min_samples_split=20,
                               max_features='sqrt' , n_estimators=800, 
                               random_state=5)

In [None]:
# Calculate the accuracy of prediction using 10-fold cross-validation (cv=10).
all_accuracies = cross_val_score(estimator=model, X=X_train[final], y=y_train, cv=10)
all_accuracies


In [None]:
print('Accuracy: %.3f stdev: %.2f' % (np.mean(np.abs(all_accuracies)), np.std(all_accuracies)))

In [None]:
X_test = test_df[final]

In [None]:

model.fit(X_train[final],y_train)
prediction = model.predict(X_test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': prediction.astype(int)})
output.to_csv('my_submission.csv', index=False)

**#Model 2: XGBoost**

In [None]:
# Instantiate XGB classifier - its hyperparameters are tuned through SkLearn Grid Search below

XGBmodel = XGBClassifier(n_estimators=400, random_state=5)

In [None]:
scores = cross_val_score(XGBmodel, X_train[final], y_train, cv=10, n_jobs=1, scoring='accuracy')
XGBmodel.fit(X_train[final],y_train)
print(scores)
print('Accuracy: %.3f stdev: %.2f' % (np.mean(np.abs(scores)), np.std(scores)))

In [None]:
#Tune XGB classification model parameters
xgbcParams = {
    'max_depth': range (3, 10, 1),
    'n_estimators': [100,200,300,400,800],
    'learning_rate': [0.002, 0.006, 0.1, 0.01, 0.05],
    'reg_lambda':[0,0.10, 0.50, 1],
    'subsample': [0.3, 0.9],
    'colsample_bytree': (0.5, 0.9),
    'min_child_weight': [1, 2, 3, 4],
}
grid_search = GridSearchCV(estimator=XGBmodel,
    param_grid=xgbcParams,
    scoring = 'accuracy',
    n_jobs = 4,
    cv = 5,
    verbose=True
)
grid_search.fit(X_train[final], y_train)

In [None]:
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# Set the model parameters after tuning.
# The L2 regularisation keyword is `reg_lambda`; it was misspelled
# `reg_lamda`, so the intended setting was passed as an unknown parameter
# instead of being applied.
XGBmodel = XGBClassifier(max_depth = 4,
                       n_estimators=400, 
                       learning_rate=0.1, 
                       reg_lambda= 1, 
                       subsample =0.3 ,
                       colsample_bytree =0.9 ,
                       min_child_weight =3 ,
                       random_state=5)


In [None]:
# Calculate the accuracy of prediction using 10-fold cross-validation (cv=10).
all_accuracies = cross_val_score(estimator=XGBmodel, X=X_train[final], y=y_train, cv=10)
all_accuracies

In [None]:
print('Accuracy: %.3f stdev: %.2f' % (np.mean(np.abs(all_accuracies)), np.std(all_accuracies)))

In [None]:
X_test = test_df[final]
XGBmodel.fit(X_train[final],y_train)
prediction = XGBmodel.predict(X_test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': prediction.astype(int)})
output.to_csv('my_submission2.csv', index=False)

**#Model 3: ANN**

In [None]:
#feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train2 = sc.fit_transform(X_train[final])

In [None]:
X_test2 = sc.transform(X_test)

In [None]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
def build_classifier(optimizer='adam'):
    """Build and compile the feed-forward network used for the ANN model.

    The network takes 6 input features, has two hidden ReLU layers of 6 units
    each and a single sigmoid output unit, and is compiled with binary
    cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    optimizer : str or keras optimizer, default 'adam'
        Optimizer handed to ``compile``. This argument used to be ignored
        ('adam' was hard-coded), so a different optimizer supplied through
        the wrapper had no effect.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    classifier = Sequential()
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 6))
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
classifier = KerasClassifier(build_fn = build_classifier, epochs = 500, batch_size = 36,optimizer = 'adam')
accuracies = cross_val_score(estimator=classifier, X=X_train2, y=y_train, cv=10) 

In [None]:
# Mean and spread of the cross-validated accuracies.
mean = accuracies.mean()
# NOTE: despite its name, `variance` holds the standard deviation (.std()).
variance = accuracies.std()
print(mean)
print(variance)

In [None]:
classifier.fit(X_train2, y_train)

In [None]:
y_pred = classifier.predict(X_test2)

In [None]:
y_pred = (y_pred > 0.5)
y_pred = y_pred.astype(int)

In [None]:
# Write the ANN predictions in submission format, then read the file back and
# print it as a check of what was written.
ann_predictions = pd.DataFrame({'PassengerId': test_df['PassengerId'],
                                'Survived': y_pred[:, -1]})
ann_predictions.to_csv("submission3.csv", index=False)
submission = pd.read_csv('submission3.csv')
print(submission)