## Titanic Disaster Survival

In [None]:
from IPython.display import Image

# Banner artwork displayed at the top of the notebook, referenced by URL.
TITANIC_BANNER_URL = "https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/5095eabce4b06cb305058603/5095eabce4b02d37bef4c24c/1352002236895/100_anniversary_titanic_sinking_by_esai8mellows-d4xbme8.jpg"
Image(url=TITANIC_BANNER_URL)

## TRAINING DATA
The training set should be used to build your machine learning models. 
For the training set, we provide the outcome (also known as the “ground truth”) 
for each passenger. Your model will be based on “features” like passengers’ gender and class.
You can also use feature engineering to create new features.

## TESTING DATA
The test set should be used to see how well your model performs on unseen data. 
For the test set, we do not provide the ground truth for each passenger. 
It is your job to predict these outcomes. For each passenger in the test set, 
use the model you trained to predict whether or not they survived the sinking of the Titanic.

## Data description
    Survived: 0 = Did not survive, 1 = Survived

    Pclass: Ticket class where 1 = First class, 2 = Second class, 3 = Third class. This can also be seen as a proxy for socio-economic status.

    Sex: Male or female

    Age: Age in years, fractional if less than 1

    SibSp: Number of siblings or spouses aboard the Titanic

    Parch: Number of parents or children aboard the Titanic

    Ticket: Passenger ticket number

    Fare: Passenger fare

    Cabin: Cabin number

    Embarked: Point of embarkation where C = Cherbourg, Q = Queenstown, S = Southampton


## IMPORT LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
import matplotlib.patches as  mpatches

## IMPORTING THE DATASETS

In [None]:
# Labelled passenger records used to fit the models.
TRAIN_CSV_PATH = "../input/titanic/train.csv"
train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first ten training rows.
train.iloc[:10]

In [None]:
# Show the training DataFrame as the cell output.
train

In [None]:
# Passenger records without the outcome column; these are the rows to predict.
TEST_CSV_PATH = "../input/titanic/test.csv"
test = pd.read_csv(TEST_CSV_PATH)

In [None]:
# Preview the first ten test rows.
test.iloc[:10]

In [None]:
# Report (rows, columns) of both frames.
print(f"Training data shape:  {train.shape}")
print(f"Testing data shape:  {test.shape}")


## EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train.info()
print("-----------------------------------------")
test.info()

In [None]:
# Missing values per training column, most affected columns first.
train.isna().sum().sort_values(ascending=False)

In [None]:
# Heatmap of the boolean missing-value mask of the training data.
color = sns.dark_palette("#69d", reverse=True, as_cmap=True)
sns.heatmap(data=train.isna(), cmap=color)

In [None]:
# Missing values per test column, most affected columns first.
test.isna().sum().sort_values(ascending=False)

In [None]:
# Heatmap of the boolean missing-value mask of the test data.
color = sns.color_palette("ch:start=.2,rot=-.3", as_cmap=True)
sns.heatmap(data=test.isna(), cmap=color)

## Summary Statistics of Training data

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
train.describe(include="all").transpose()

## Summary Statistics of Testing data

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
test.describe(include="all").transpose()

## Feature Analysis

In [None]:
# Passenger counts per sex; missing values are counted as their own category.
train["Sex"].value_counts(dropna = False)

In [None]:
# Survival rate per sex, highest first
(
    train.groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Bar chart of the mean of Survived for each sex.
sex_ax = sns.barplot(x="Sex", y="Survived", data=train)
sex_ax.set_xlabel("Sex", fontsize=15)
sex_ax.set_ylabel("Survival Probability", fontsize=15)
sex_ax.set_title("Survival Probability by Gender", fontsize=20)
plt.show()

# Categorical variable : Pclass

In [None]:
# Passenger counts per ticket class; missing values are counted as their own category.
train["Pclass"].value_counts(dropna = False)

In [None]:
# Survival rate per passenger class, highest first
(
    train.groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Bar chart of the mean of Survived for each passenger class.
sns.barplot(x = "Pclass",y = "Survived", data = train, palette = "muted")
# The x axis shows Pclass; it was previously labelled "Sex" (copy-paste slip).
plt.xlabel("Passenger class",fontsize=15)
plt.ylabel("Survival Probability",fontsize=15)
plt.title("Survival Probability by Passenger class", fontsize=20)
plt.show()

In [None]:
# Survival by gender and passenger class.
# catplot is the current name of the former factorplot, which has been
# removed from recent seaborn releases.
plot = sns.catplot(x = 'Pclass', y = 'Survived', hue = 'Sex',
                   data = train, kind = 'bar')
plot.despine(left = True)
plt.ylabel("Survival Probability",fontsize=15)
plt.title("Survival Probability by Sex and Passenger class", fontsize=20)
plt.show()

# Categorical variable : Embarked

In [None]:
# Passenger counts per port of embarkation; missing values are counted as their own category.
train["Embarked"].value_counts(dropna = False)


In [None]:
# Survival rate per port of embarkation, highest first
(
    train.groupby('Embarked', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Bar chart of the mean of Survived for each port of embarkation.
# (A colormap was previously built here but never used; the plot takes its
# colours from the "Paired" palette.)
sns.barplot(x = "Embarked",y = "Survived", data = train, palette = "Paired")
plt.xlabel("Embarked",fontsize=15)
plt.ylabel("Survival Probability",fontsize=15)
plt.title("Survival Probability by Embarked", fontsize=20)
plt.show()

In [None]:
# Passenger-class counts, one panel per port of embarkation.
# catplot replaces the removed factorplot; x is passed by keyword because
# current seaborn no longer accepts it positionally.
sns.catplot(x = 'Pclass', col = 'Embarked', data = train, kind ='count',
            palette = "OrRd")

# Survival probability by all categorical variables

In [None]:
# Survival rate by class and sex, one row of panels per port of embarkation.
# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(train, row = 'Embarked', height = 3, aspect = 1.6)
# Fix the category orders explicitly: when pointplot is mapped without them,
# each facet derives its own order from its subset of rows, so the same
# colour/position can end up meaning different categories across panels.
g.map(sns.pointplot,'Pclass','Survived','Sex', palette = 'YlGn',
      order = [1, 2, 3], hue_order = ['male', 'female'])
g.add_legend()
plt.show()

# Numerical variables' correlation with survival

In [None]:
# Pairwise correlations between survival and the numeric features.
survival_corr = train[['Survived', 'SibSp', 'Parch', 'Age', 'Fare']].corr()
sns.heatmap(survival_corr, annot=True, fmt='.2f', cmap='coolwarm')

# Numerical variable : SibSp

In [None]:
# Passenger counts per SibSp value; missing values are counted as their own category.
train['SibSp'].value_counts(dropna = False)

In [None]:
# Survival rate per SibSp value, highest first
(
    train.groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Bar chart of the mean of Survived for each SibSp value.
sns.barplot(x = 'SibSp', y = 'Survived', data =  train,
            palette = "copper_r")
plt.ylabel('Survived Probability')
# The descriptive text used to be set as the x label; the x axis is SibSp and
# the description belongs in the title.
plt.xlabel('SibSp')
plt.title('Survived Probability by SibSp')
plt.show()

# Numerical variable : Parch

In [None]:
# Passenger counts per Parch value; missing values are counted as their own category.
train['Parch'].value_counts(dropna = False)

In [None]:
# Survival rate per Parch value, highest first
(
    train.groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Bar chart of the mean of Survived for each Parch value.
sns.barplot(x = 'Parch', y = 'Survived', data =  train,
            palette = "hot_r")
plt.ylabel('Survived Probability')
# The descriptive text used to be set as the x label; the x axis is Parch and
# the description belongs in the title.
plt.xlabel('Parch')
plt.title('Survived Probability by Parch')
plt.show()

# Numerical variable : Age

In [None]:
# Number of training passengers with no recorded age.
train["Age"].isna().sum()

In [None]:
# Passenger age distribution, with the sample skewness shown in the legend.
# NOTE(review): seaborn marks distplot as deprecated in favour of
# histplot/displot; it is kept here so the figure itself is unchanged.
age_skew = train['Age'].skew()
sns.distplot(train["Age"], label=f'Skewness: {age_skew:.2f}')
plt.legend(loc = 'best')
# Title typo fixed ("Distibution" -> "Distribution").
plt.title('Passenger Age Distribution')
plt.show()

In [None]:
# Age distribution by survival: one panel per value of Survived,
# each showing the distribution of Age for that subset of rows.
g = sns.FacetGrid(train, col = 'Survived')
g.map(sns.distplot,'Age', color = "red")
plt.show()

In [None]:
# Overlaid age densities for non-survivors and survivors.
# One call with `hue` draws both curves in distinct colours with a legend.
# The previous version made two calls, each pairing an already-filtered Age
# series with the full-length Survived column as `hue`, which relied on
# implicit index alignment. common_norm=False normalises each group on its
# own, as two separate calls would.
sns.kdeplot(data = train, x = 'Age', hue = 'Survived', common_norm = False)
plt.xlabel('Age')
plt.title("Passenger Age Distribution by Survival")
plt.show()

In [None]:
# Number of training passengers with no recorded fare.
train["Fare"].isna().sum()

In [None]:
# Passenger fare distribution, with the sample skewness shown in the legend.
# NOTE(review): seaborn marks distplot as deprecated in favour of
# histplot/displot; it is kept here so the figure itself is unchanged.
fare_skew = train['Fare'].skew()
sns.distplot(train["Fare"], label=f'Skewness: {fare_skew:.2f}')
plt.legend(loc = 'best')
# This plot shows Fare; the title previously said "Age" (copy-paste slip).
plt.title('Passenger Fare Distribution')
plt.show()

# Data Preprocessing

# drop and fill missing values

In [None]:
# Remove the Ticket and Cabin columns from both the training and test frames.
unused_columns = ['Ticket', 'Cabin']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [None]:
# Most frequent port of embarkation in the training data
# (Series.mode ignores missing values by default).
mode = train["Embarked"].mode().iloc[0]
mode

In [None]:
# Fill missing values in Embarked with the mode.
# Assign the result back to the column: an in-place fillna on the selected
# column is chained assignment, which pandas warns about and which does not
# update `train` when Copy-on-Write is enabled.
train["Embarked"] = train["Embarked"].fillna(mode)

In [None]:
# Missing values in test 
# Remaining missing values per test column, most affected columns first.
test.isna().sum().sort_values(ascending=False)

In [None]:
# Median of the recorded fares in the test data
# (Series.median skips missing values by default).
median = test['Fare'].median()
median

In [None]:
# Fill missing values in Fare with the median.
# Assign the result back to the column: an in-place fillna on the selected
# column is chained assignment, which pandas warns about and which does not
# update `test` when Copy-on-Write is enabled.
test['Fare'] = test['Fare'].fillna(median)

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so both sets go
# through the same preprocessing.
combine = pd.concat([train, test], axis=0, ignore_index=True)
combine.iloc[:5]

In [None]:
# Missing values in the combined data
# Missing values per column of the combined frame, most affected first.
combine.isna().sum().sort_values(ascending=False)

# Data Wrangling

In [None]:
# Encode Sex numerically. Values absent from the mapping become NaN, so this
# cell must not be run twice on the same frame.
sex_codes = {'male': 0, 'female': 1}
combine["Sex"] = combine["Sex"].map(sex_codes)

In [None]:
# Box plots of Age against Sex (split by Pclass), Parch and SibSp, used to
# judge which features relate to age before imputing it.
# catplot is the current name of the former factorplot, which has been
# removed from recent seaborn releases.
sns.catplot(y = 'Age', x = 'Sex', hue = 'Pclass', kind = 'box',
            data = combine, palette = 'bright')
sns.catplot(y = 'Age', x = 'Parch', kind = 'box', data = combine,
            palette = 'bright')
sns.catplot(y = 'Age', x = 'SibSp', kind = 'box', data = combine,
            palette = 'bright')
plt.show()

In [None]:
# Correlations among the remaining numeric features.
# Embarked still holds strings at this point; recent pandas raises on
# non-numeric columns in corr() instead of silently skipping them, so only
# numeric columns are kept.
numeric_features = combine.drop(['Survived','Name','PassengerId','Fare'],
                                axis = 1).select_dtypes(include = 'number')
sns.heatmap(numeric_features.corr(), annot = True,cmap = 'coolwarm')
plt.show()

In [None]:
# Index labels of the rows whose Age is missing, and how many there are.
age_nan_indices = combine.index[combine["Age"].isna()].tolist()
len(age_nan_indices)

In [None]:
# Impute each missing age with the median age of passengers that share the
# same SibSp, Parch and Pclass; fall back to the overall median when nobody
# in that group has a recorded age.
# Both medians are computed once from the recorded ages, so the result does
# not depend on the order in which rows are filled. The single column
# assignment replaces the chained `combine["Age"].iloc[i] = ...` writes,
# which trigger SettingWithCopyWarning and are not applied under
# Copy-on-Write.
overall_median_age = combine["Age"].median()
group_median_age = combine.groupby(["SibSp", "Parch", "Pclass"])["Age"].transform("median")
combine["Age"] = combine["Age"].fillna(group_median_age).fillna(overall_median_age)

In [None]:
# Missing ages left after imputation.
combine["Age"].isna().sum()

# Data Transformation

In [None]:
# Fare distribution over the combined data, with its skewness in the legend.
combined_fare_skew = combine["Fare"].skew()
sns.distplot(combine['Fare'], label=f'Skewness: {combined_fare_skew:.2f}')
plt.legend(loc='best')
plt.title("Passenger Fare Distribution")
plt.show()

# Feature Engineering

In [None]:
# Extract the title from Name: the text between the first comma and the
# following period, with surrounding whitespace removed.
combine['Title'] = combine["Name"].map(
    lambda full_name: full_name.split(",")[1].split('.')[0].strip()
)
combine[['Name', 'Title']].iloc[:5]

In [None]:
# Number of passengers per extracted title.
combine["Title"].value_counts()

In [None]:
# Number of distinct titles.
combine["Title"].nunique()

In [None]:
# Simplify title: fold uncommon titles into 'Rare' and map the French/alternate
# forms onto their common equivalents.
# 'Capt' is included in the rare group; it was missing from the list, which
# left it as a separate category and later as its own one-hot column.
combine["Title"] = combine["Title"].replace(['Dr','Rev','Col',
                                'Major','Lady','Jonkheer','Don','Capt',
                                 'the Countess','Sir','Dona'],'Rare')
combine['Title'] = combine['Title'].replace(['Mlle','Ms'],'Miss')
combine['Title'] = combine['Title'].replace('Mme','Mrs')

In [None]:
# Passenger count per simplified title.
# The column is passed as `x` by keyword: current seaborn treats the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x = "Title", data = combine, edgecolor=(0,0,0),
              linewidth=2)
plt.show()

In [None]:
# Survival rate per title, highest first
(
    combine.groupby(['Title'], as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)
    

In [None]:
# Bar chart of the mean of Survived per title.
# catplot is the current name of the former factorplot, which has been
# removed from recent seaborn releases.
sns.catplot(x = 'Title', y = 'Survived', data = combine, kind = 'bar')
plt.ylabel("Survived Probability")
plt.title("Mean of Survival by Title")
plt.show()

In [None]:
# Name is no longer needed once Title has been extracted from it.
combine = combine.drop(columns=['Name'])
combine.iloc[:10]

# IsAlone

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
combine["Family Size"] = combine["SibSp"].add(combine["Parch"]).add(1)
combine[["SibSp", "Parch", "Family Size"]].iloc[:10]

In [None]:
# Survival rate per family size, highest first
(
    combine.groupby('Family Size', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# IsAlone is 1 for passengers whose family size is exactly 1, otherwise 0.
travelling_alone = combine["Family Size"] == 1
combine["IsAlone"] = travelling_alone.astype('int64')

In [None]:
# Survival rate for passengers travelling alone vs. with family, highest first
(
    combine.groupby('IsAlone', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# SibSp, Parch and Family Size are dropped now that IsAlone is derived from them.
combine = combine.drop(columns=['SibSp', 'Parch', 'Family Size'])
combine.iloc[:5]

In [None]:
# Split Age into five equal-width bands and show the survival rate per band
# (first step towards the Age*Class feature).
combine["Age Band"] = pd.cut(combine["Age"], bins=5)
age_band_survival = combine[['Age Band', 'Survived']].groupby('Age Band', as_index=False).mean()
age_band_survival.sort_values(by='Age Band')

In [None]:
# Replace Age by the ordinal (0-4) of the band it falls into.
# Conditions are checked in order and the first match wins, so each later
# condition only applies to ages above the previous upper bound. Rows that
# match nothing (missing ages) keep their original value.
age_values = combine["Age"]
combine["Age"] = np.select(
    [
        age_values <= 16.136,
        age_values <= 32.102,
        age_values <= 48.068,
        age_values <= 64.034,
        age_values > 64.034,
    ],
    [0, 1, 2, 3, 4],
    default=age_values,
)

In [None]:
# The helper band column is no longer needed once Age holds the ordinal.
combine = combine.drop(columns='Age Band')
combine

In [None]:
# Age and Pclass data types (checked before Age is converted to integer
# and the two columns are multiplied)
combine[["Age", "Pclass"]].dtypes

In [None]:
# Show the combined DataFrame as the cell output.
combine

In [None]:
# Convert Ordinal Age into Integer
# (astype raises if any Age is still missing, so imputation must have run first)
combine["Age"] = combine["Age"].astype('int')
combine["Age"].dtype

In [None]:
# Interaction feature: ordinal age band multiplied by ticket class.
combine["Age*Class"] = combine["Age"].mul(combine["Pclass"])
combine[["Age", "Pclass", "Age*Class"]].iloc[:10]

# Feature Encoding

In [None]:
# One-hot encode Title (default prefix, i.e. the column name) and then
# Embarked (prefix 'Em'), in that order.
for column_name, dummy_prefix in (("Title", None), ("Embarked", 'Em')):
    combine = pd.get_dummies(combine, columns=[column_name], prefix=dummy_prefix)
combine.iloc[:5]

In [None]:
# Split Fare into four equal-width bands and show the survival rate per band.
combine['Fare band'] = pd.cut(combine["Fare"], bins=4)
fare_band_survival = combine[["Fare band", "Survived"]].groupby(['Fare band'], as_index=False).mean()
fare_band_survival.sort_values(by='Fare band')

In [None]:
# Replace Fare by the ordinal (0-3) of its fare band.
# Conditions are checked in order and the first match wins; rows that match
# nothing (missing fares) keep their original value.
fare_values = combine["Fare"]
combine["Fare"] = np.select(
    [
        fare_values <= 128.0,
        fare_values <= 256.0,
        fare_values <= 384.0,
        fare_values > 384.0,
    ],
    [0, 1, 2, 3],
    default=fare_values,
)

In [None]:
# Convert the ordinal Fare codes into integers
# (astype raises if any Fare is still missing)
combine["Fare"] = combine["Fare"].astype('int')
combine["Fare"]

In [None]:
# The helper band column is no longer needed once Fare holds the ordinal.
combine = combine.drop(columns='Fare band')

In [None]:
# Preview the first ten rows of the fully engineered frame.
combine.head(10)

In [None]:
# Split the engineered frame back into its parts: the first len(train) rows
# are the training rows, the remainder are the test rows.
n_train_rows = len(train)
train = combine.iloc[:n_train_rows]
test = combine.iloc[n_train_rows:]

In [None]:
# Preview the first ten rows of the engineered training set.
train.head(10)

In [None]:
# PassengerId is removed from the training set only; the test set keeps it
# because the submission file needs it.
train = train.drop(columns='PassengerId')
train.iloc[:5]

In [None]:
# Converting Survived back to integer
# (presumably it became float because the test rows had no Survived value
# when the two frames were concatenated -- confirm with the dtypes above)
train["Survived"] = train["Survived"].astype('int')
train.head()

In [None]:
# Preview the engineered test set.
test.head()

In [None]:
# Remove the Survived column from the test set; PassengerId is kept for the
# submission file.
test = test.drop(columns=["Survived"])
test.iloc[:5]

In [None]:
# Train and test stacked again with a fresh 0..n-1 index, for inspection.
updated = pd.concat([train, test], axis=0, ignore_index=True)
updated

# Splitting the data into training and testing sets

In [None]:
# Features and target for fitting, plus the test features for prediction
# (PassengerId is an identifier, so it is left out of the test features).
y_train = train["Survived"]
X_train = train.drop(columns="Survived")
X_test = test.drop(columns='PassengerId').copy()

In [None]:
# Sanity check: X_train and X_test should have the same number of columns.
print(f"X_train shape:  {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_train shape:  {y_train.shape}")

# Modeling

  # 1.Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit logistic regression on the training data and predict the test set.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions1 = lr.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
lr.score(X_train,y_train)

# 2. Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Fit a support vector classifier with default settings and predict the test set.
svc = SVC().fit(X_train, y_train)
predictions2 = svc.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
svc.score(X_train,y_train)

# 3. Perceptron

In [None]:
from sklearn.linear_model import Perceptron

In [None]:
# Fit a perceptron with default settings and predict the test set.
perceptron = Perceptron().fit(X_train, y_train)
predictions3 = perceptron.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
perceptron.score(X_train,y_train)

# 4. SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Fit a linear classifier trained with stochastic gradient descent (default
# settings, no fixed random_state) and predict the test set.
sgd = SGDClassifier().fit(X_train, y_train)
predictions4 = sgd.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
sgd.score(X_train,y_train)

# 5. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest with default settings (no fixed random_state) and
# predict the test set.
random_forest = RandomForestClassifier().fit(X_train, y_train)
predictions5 = random_forest.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
random_forest.score(X_train,y_train)

# 6. KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit a k-nearest-neighbours classifier with default settings and predict the test set.
knn = KNeighborsClassifier().fit(X_train, y_train)
predictions6 = knn.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
knn.score(X_train,y_train)

# 7. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree with default settings (no fixed random_state) and
# predict the test set.
decision_tree = DecisionTreeClassifier().fit(X_train, y_train)
predictions7 = decision_tree.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
decision_tree.score(X_train,y_train)

# 8. Linear SVC 

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Fit a linear support vector classifier with default settings and predict the test set.
linear_svc = LinearSVC().fit(X_train, y_train)
predictions8 = linear_svc.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
linear_svc.score(X_train,y_train)

# 9. Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit Gaussian naive Bayes and predict the test set.
naive = GaussianNB().fit(X_train, y_train)
predictions9 = naive.predict(X_test)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
naive.score(X_train,y_train)

# 10. Boosting

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Fit a CatBoost classifier with default settings and predict the test set.
# NOTE(review): `catboost` is also the name of the package this class is
# imported from; the variable name is kept because later cells refer to it.
catboost = CatBoostClassifier()
catboost.fit(X_train,y_train)
predictions10 = catboost.predict(X_test)

In [None]:
# Score on the training data itself (not a held-out estimate).
catboost.score(X_train,y_train)

# Model Evaluation and Hyperparameter Tuning

In [None]:
from sklearn.model_selection import cross_val_score

# Training Accuracy

In [None]:
# Training-set accuracy of every fitted model, best first.
# The dict keeps each display name next to the estimator it belongs to.
fitted_models = {
    'Support vector Machines': svc,
    'KNN': knn,
    'Logistic Regression': lr,
    'Random Forest': random_forest,
    'Naive Bayes': naive,
    'Perceptron': perceptron,
    'Stochastic Gradient Descent': sgd,
    'Linear SVC': linear_svc,
    'Decision tree': decision_tree,
    'Cat Boost': catboost,
}
models = pd.DataFrame({
    "Model": list(fitted_models),
    'Score': [estimator.score(X_train, y_train)
              for estimator in fitted_models.values()],
})
models.sort_values(by='Score', ascending=False, ignore_index=True)

# K-Fold cross validation

In [None]:
# Fresh, unfitted estimators for cross-validation. The order here must match
# the model names used when the results table is built.
classifiers = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    GaussianNB(),
    Perceptron(),
    LinearSVC(),
    SGDClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    CatBoostClassifier(),
]
len(classifiers)

In [None]:
# 10-fold cross-validated accuracy for each classifier (one score array each).
cv_results = [
    cross_val_score(classifier, X_train, y_train, scoring='accuracy', cv=10)
    for classifier in classifiers
]

# Mean and Standard deviation of cross validation results

In [None]:
# Mean and standard deviation of the fold scores, per classifier.
cv_mean = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

In [None]:
# Cross-validation summary table, best mean accuracy first.
# The names are listed in the same order as the `classifiers` list.
cv_model_names = [
    'Logistic Regression', 'Support Vector Machine',
    'KNN', 'GausianNB', 'Perceptron',
    'Linear SVC', 'Stochastic Gradient Descent',
    'Decision Tree', 'Random Forest', 'Cat Boost',
]
cv = pd.DataFrame({
    'Cross Validation Mean': cv_mean,
    'Cross Validation Std': cv_std,
    'Models': cv_model_names,
})
cv.sort_values(by='Cross Validation Mean', ascending=False,
               ignore_index=True)

In [None]:
# Bar chart of the mean cross-validation accuracy per model.
plt.rcParams['figure.dpi'] = 250
fig = plt.figure(figsize=(5, 1.5), facecolor='#f6f5f5')
# NOTE(review): tight_layout is called before any axes exist, so it most
# likely has no effect on the final figure -- confirm and consider removing.
fig.tight_layout(pad=100)
gs = fig.add_gridspec(1, 2)
gs.update(wspace=0.4, hspace=10)


background_color = "#f6f5f5"
sns.set_palette(['#00A4CCFF']*6)

# Only the left cell of the 1x2 grid is used.
ax = fig.add_subplot(gs[0, 0])
for s in ["right", "top"]:
    ax.spines[s].set_visible(False)
ax.set_facecolor(background_color)
ax_sns = sns.barplot(ax=ax, x=cv['Models'], 
                      y=cv['Cross Validation Mean'], 
                      zorder=2, linewidth=0, alpha=1, saturation=1, color = '#2874A6')
ax_sns.set_ylabel("Cross Val Score",fontsize=8, weight='bold')
plt.xticks(rotation = 90)
# Axis label typo fixed ("Alogorithm's Name" -> "Algorithm Name").
ax_sns.set_xlabel("Algorithm Name",fontsize=8, weight='bold')
ax_sns.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
ax_sns.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
ax_sns.tick_params(labelsize=4, width=0.5, length=1.5)



In [None]:
# Bar chart of the cross-validation standard deviation per model.
plt.rcParams['figure.dpi'] = 250
fig = plt.figure(figsize=(5, 1.5), facecolor='#f6f5f5')
# NOTE(review): tight_layout is called before any axes exist, so it most
# likely has no effect on the final figure -- confirm and consider removing.
fig.tight_layout(pad=100)
gs = fig.add_gridspec(1, 2)
gs.update(wspace=0.4, hspace=10)


background_color = "#f6f5f5"
sns.set_palette(['#00A4CCFF']*6)

# Only the left cell of the 1x2 grid is used.
ax = fig.add_subplot(gs[0, 0])
for s in ["right", "top"]:
    ax.spines[s].set_visible(False)
ax.set_facecolor(background_color)
ax_sns = sns.barplot(ax=ax, x=cv['Models'], 
                      y=cv['Cross Validation Std'], 
                      zorder=2, linewidth=0, alpha=1, saturation=1, color = '#ff3333')
ax_sns.set_ylabel("Cross Val Standard Deviation",fontsize=8, weight='bold')
plt.xticks(rotation = 90)
# Axis label typo fixed ("Alogorithm's Name" -> "Algorithm Name").
ax_sns.set_xlabel("Algorithm Name",fontsize=8, weight='bold')
ax_sns.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
ax_sns.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
ax_sns.tick_params(labelsize=4, width=0.5, length=1.5)



# Hyperparameter Tuning for SVM

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over C and gamma for an RBF-kernel SVC. refit=True means
# the best combination is refitted on the whole training set afterwards.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid,
                    refit=True, verbose=3)
grid.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters and the refitted estimator.
# Output typo fixed ("paramters" -> "parameters").
print("Best parameters: ",grid.best_params_)
print("Best estimator: ",grid.best_estimator_)

In [None]:
# Trainig Accuracy
svc = SVC(C = 100, gamma = 0.01, kernel = 'rbf')
svc.fit(X_train,y_train)
y_prediction = svc.predict(X_test)
s = svc.score(X_train,y_train)
print("Our SVC Training Accuracy Score increase ",s)

In [None]:
# Predicted Survived labels for the test set, in test-row order.
y_prediction

In [None]:
# Number of predictions; should equal the number of test rows.
len(y_prediction)

In [None]:
# Submission table: one row per test passenger with the predicted outcome.
submission_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': y_prediction,
}
final_submission = pd.DataFrame(submission_columns)
final_submission

In [None]:
# (rows, columns) of the submission table.
final_submission.shape

In [None]:
# Write the submission to Titanic.csv in the working directory, without the index column.
final_submission.to_csv("Titanic.csv",index=False)