# Machine Learning - Titanic Project


In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

#visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Acquiring data



In [None]:
# Read the training and test splits from the shared input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Cell output: first rows of the test split.
test_df.head()

In [None]:
# Preview the first rows of the training split.
train_df.head()


In [None]:
# Keep both splits in one list so that each feature-engineering step can be
# applied to the training and the test data in a single loop.
combine = [train_df, test_df]

# Show only the shapes: echoing `combine` itself would dump the plain-text
# repr of both complete frames into the cell output.
[dataset.shape for dataset in combine]

## Exploratory Data Analysis



In [None]:
# List the column names available in the training split.
train_df.columns

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the last rows of the training split.
train_df.tail()

In [None]:
# Column dtypes and non-null counts of the training split.
train_df.info()
# Observation: Cabin, Age and Embarked have missing values.
#
# Data dictionary:
#   Survived    : survival
#   PassengerId : unique id of a passenger
#   Pclass      : ticket class
#   Sex         : sex
#   Age         : age in years
#   SibSp       : number of siblings / spouses aboard the Titanic
#   Parch       : number of parents / children aboard the Titanic
#   Ticket      : ticket number
#   Fare        : passenger fare
#   Cabin       : cabin number
#   Embarked    : port of embarkation

In [None]:
# Dtypes and non-null counts for both splits, with a rule between the two reports.
for frame, rule_after in ((train_df, True), (test_df, False)):
    frame.info()
    if rule_after:
        print('_'*40)

In [None]:
# Missing values per column of the training split, as an absolute count and
# as a percentage of all rows.
total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = (train_df.isnull().sum() / train_df.isnull().count()) * 100
percent_2 = round(percent_1, 1).sort_values(ascending=False)

# Build the table in one step. `keys` names the two columns directly, so the
# result does not depend on how DataFrame() treats an unnamed Series, and the
# intermediate values no longer sit mid-cell as expressions that display nothing.
missing_data_table = pd.concat([total, percent_2], axis=1, keys=['Total', 'Percentage'])
missing_data_table


In [None]:
# Headline proportions of the training split, collected in one Series so that
# all of them are displayed (a cell only echoes its last expression).
# Every share uses the total number of rows as denominator; for Age this means
# rows with a missing age count as "not 65 or older".
n_rows = len(train_df)
pd.Series(
    {
        'Survived': (train_df['Survived'] == 1).sum() / n_rows,
        'No parents/children aboard (Parch == 0)': (train_df['Parch'] == 0).sum() / n_rows,
        'No siblings/spouse aboard (SibSp == 0)': (train_df['SibSp'] == 0).sum() / n_rows,
        'Fare >= 512': (train_df['Fare'] >= 512).sum() / n_rows,
        'Age >= 65': (train_df['Age'] >= 65).sum() / n_rows,
    },
    name='Share of passengers',
)

In [None]:
# Summary statistics of the training split (describe() with its default column selection).
train_df.describe()


In [None]:
# Number of distinct passenger names.
print('Unique names boarded on the titanic', train_df['Name'].nunique())
# Share of male passengers, in percent.
print('The % of males on the boat is',((train_df['Sex']=='male').sum()/(len(train_df['Sex']))*100))
# Most frequent cabins. Printed explicitly: a bare expression in the middle of
# a cell is evaluated but never displayed.
print(train_df['Cabin'].value_counts().head(10))
# Passengers per port of embarkation (cell output).
train_df['Embarked'].value_counts()

In [None]:
# Summary statistics restricted to the string (object-dtype) columns.
train_df.describe(include=['O'])

## Analyzing individual features against survival rate


In [None]:
# Mean survival per ticket class, best-surviving class first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Passenger counts: one row per Survived value, one column per Sex.
pd.crosstab(index=train_df['Survived'], columns=train_df['Sex'])

In [None]:
# Mean survival per sex, highest first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean survival per number of siblings/spouses aboard, highest first.
(
    train_df[["SibSp", "Survived"]]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean survival per number of parents/children aboard, highest first.
(
    train_df[['Parch', 'Survived']]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

## Visualizing Data




In [None]:
# Age histograms of survivors versus non-survivors, one panel per sex.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
women = train_df[train_df['Sex'] == 'female']
men = train_df[train_df['Sex'] == 'male']

# Each subset is filtered with a mask built from that same subset. Filtering
# `women`/`men` with a mask computed on the full `train_df` hands pandas a
# boolean key whose index does not match the frame being filtered.
for panel, subset, panel_title in ((axes[0], women, 'Female'), (axes[1], men, 'Male')):
    ax = sns.distplot(subset[subset['Survived'] == 1]['Age'].dropna(), ax=panel, kde=False, bins=18, label='Alive')
    ax = sns.distplot(subset[subset['Survived'] == 0]['Age'].dropna(), ax=panel, kde=False, bins=40, label='Dead')
    panel.set_title(panel_title)
    panel.legend()




In [None]:
# Age distribution in 20 bins, one facet per Survived value.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(sns.distplot, 'Age', bins=20)

In [None]:

# Age histograms on a grid: one row per Pclass, one column per Survived value.
# NOTE(review): `size=` looks like the older seaborn spelling of the facet
# height argument — confirm it matches the installed seaborn version.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Age',alpha=.5, bins=20)
# The trailing semicolon suppresses the echo of the call's return value.
grid.add_legend();


In [None]:
# Second way of drawing the Pclass x Survived grid of Age histograms:
# seaborn's distplot with the KDE curve switched off and default facet size.
grid=sns.FacetGrid(train_df,col='Survived', row='Pclass')
grid.map(sns.distplot,'Age',kde=False)

In [None]:
# Survival rate by passenger class and sex, one facet row per embarkation port.
# FacetGrid.map calls pointplot once per facet, so without `order`/`hue_order`
# every facet derives its own category order from the rows it contains and the
# x positions and hue colours can differ between facets. Pinning both keeps the
# facets comparable. 'male'/'female' are the raw Sex values at this point in
# the notebook (Sex is encoded numerically only later).
FacetGrid = sns.FacetGrid(train_df, row='Embarked', size=5, aspect=2)
FacetGrid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', order=[1, 2, 3], hue_order=['male', 'female'])
# The legend is needed to tell the two hue levels apart.
FacetGrid.add_legend()

In [None]:
# Compact version of the class/sex survival plot, one facet row per port.
# `order` and `hue_order` are fixed explicitly because FacetGrid.map draws each
# facet independently; left unspecified, a facet may order its categories
# (and assign hue colours) differently from its neighbours.
grid = sns.FacetGrid(train_df, row='Embarked', size=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep', order=[1, 2, 3], hue_order=['male', 'female'])
grid.add_legend()

In [None]:
# Mean fare per sex on an Embarked (rows) x Survived (columns) grid.
# `order` pins the bar order so that every facet puts the same sex at the same
# x position; FacetGrid.map would otherwise let each facet choose its own order.
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=2.2, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['male', 'female'])
grid.add_legend()

In [None]:
# Mean survival per ticket class, with one bar per sex inside each class.
sns.barplot(
    data=train_df,
    x='Pclass',
    y='Survived',
    hue='Sex',
)

## Feature Engineering

In [None]:
# Shapes before the drop, for comparison with the tuple shown at the end.
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

# Remove Ticket and Cabin from both splits, then re-point `combine` at the
# new frames returned by drop().
unused_columns = ['Ticket', 'Cabin']
train_df = train_df.drop(unused_columns, axis=1)
test_df = test_df.drop(unused_columns, axis=1)
combine = [train_df, test_df]

("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

In [None]:
# Derive a Title feature from Name: the first run of letters that is directly
# followed by a period (e.g. "Mr." -> "Mr").
# The pattern is a raw string so that `\.` reaches the regex engine as written;
# in a normal string literal `\.` is an invalid escape sequence.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r'([A-Za-z]+)\.', expand=False)

# Titles (rows) against Sex (columns) in the training split.
pd.crosstab(train_df['Title'], train_df['Sex'])

In [None]:
# Titles pooled into a single 'Rare' label, and spelling variants folded into
# their common form.
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare').replace(title_synonyms)

# Mean survival per normalised title, highest first.
(
    train_df[['Title', 'Survived']]
    .groupby(['Title'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Numeric code per title; any title without a code ends up as 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)
train_df.head()


In [None]:
# Drop Name from both splits (Title was extracted from it earlier).
# PassengerId is dropped from the training split only; test_df keeps it.
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
# drop() returns new frames, so `combine` has to be rebuilt to reference them.
combine = [train_df, test_df]

In [None]:
# Preview the training split after dropping Name and PassengerId.
train_df.head()
#test_df.head()

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'].add(dataset['Parch']).add(1)

# Mean survival per family size, highest first.
(
    train_df[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
# Alternative way of deriving a column from a condition:
#train_df["not_alone"]=np.where(train_df['relatives']>0, '0', '1')

In [None]:
# IsAlone is 1 for passengers whose FamilySize is exactly 1, otherwise 0.
for dataset in combine:
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int64')

# Mean survival for passengers with family (0) and alone (1).
(
    train_df[['IsAlone', 'Survived']]
    .groupby(['IsAlone'], as_index=False)
    .mean()
)

In [None]:
# Mean survival per family size, in ascending FamilySize order.
(
    train_df[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
)

In [None]:
# Preview the training split with the new FamilySize and IsAlone columns.
train_df.head()

In [None]:
# Number of training passengers travelling alone (1) versus with family (0).
train_df['IsAlone'].value_counts()

In [None]:
# Family-size histograms of survivors and non-survivors, drawn on the same axes.
fig, axes = plt.subplots()
sizes_alive = train_df.loc[train_df['Survived'] == 1, 'FamilySize'].dropna()
sizes_dead = train_df.loc[train_df['Survived'] == 0, 'FamilySize'].dropna()
ax = sns.distplot(sizes_alive, kde=False, bins=18, label='Alive')
ax = sns.distplot(sizes_dead, kde=False, bins=40, label='Dead')
ax.legend()
ax.set_title('Survival Numbers Depending on Relatives')

In [None]:
# Mean survival indexed by family size, highest first.
(
    train_df[['Survived', 'FamilySize']]
    .groupby(['FamilySize'])
    .mean()
    .sort_values(by='Survived', ascending=False)
)


In [None]:
# Passenger counts: one row per FamilySize, one column per Survived value.
pd.crosstab(index=train_df['FamilySize'], columns=train_df['Survived'])

In [None]:
# Survival rate as a function of family size.
axes = sns.factorplot(x='FamilySize', y='Survived', data=train_df, aspect=2.5)

In [None]:
# Remove Parch, SibSp and FamilySize from both splits and re-point `combine`
# at the resulting frames.
family_columns = ['Parch', 'SibSp', 'FamilySize']
train_df, test_df = (
    frame.drop(family_columns, axis=1) for frame in (train_df, test_df)
)
combine = [train_df, test_df]

train_df.head()

In [None]:
# Encode Sex numerically: female -> 1, male -> 0.
sex_codes = {'female':1,'male':0}
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_codes)

train_df.info()

In [None]:
# Null values: Age
#
# Look at the Age distribution for every Pclass (rows) x Sex (columns) group;
# the KDE curve is switched off, so each facet is a plain histogram.
grid=sns.FacetGrid(train_df,row='Pclass',col='Sex')
grid.map(sns.distplot,'Age',kde=False)


In [None]:
# 2 x 3 zero-filled table that will hold one age guess per group
# (first axis: Sex code, second axis: Pclass - 1).
guessed_array = np.zeros(shape=(2, 3))
guessed_array

In [None]:
# Fill missing ages with the median age of the passenger's (Sex, Pclass) group.
#
# The guesses are computed and applied inside the same iteration, so each
# dataset is filled with its own group medians. Computing the guesses for all
# datasets first and filling afterwards would leave only the last dataset's
# medians in the table and write those into every dataset.
combine = [train_df, test_df]
for dataset in combine:
    # Fresh 2 x 3 table per dataset: rows are Sex codes (0 = male, 1 = female),
    # columns are Pclass 1..3.
    guessed_array = np.zeros((2, 3))
    for i in range(0, 2):
        for j in range(0, 3):
            age_per_category = dataset[(dataset['Sex'] == i) & (dataset['Pclass'] == j + 1)]['Age'].dropna()
            age_guess = age_per_category.median()
            # Round the median to the nearest half year.
            guessed_array[i, j] = int(age_guess / 0.5 + 0.5) * 0.5

    # .loc with a boolean mask writes into the frame itself; chained indexing
    # (frame[mask]['Age'] = ...) would only modify a temporary copy.
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[(dataset['Age'].isnull()) & (dataset['Sex'] == i) & (dataset['Pclass'] == j + 1), 'Age'] = guessed_array[i, j]

    # No gaps remain, so Age can be stored as whole years.
    dataset['Age'] = dataset['Age'].astype(int)
train_df.head()
            
            
# Practice note for the cell above: because of chained indexing, the first line below would not update the Age column of the original DataFrame; use the .loc accessor instead.
#(train_df[(train_df['Age'].isnull()) & (train_df['Sex']==0) & (train_df['Pclass']==3)])['Age']=2
#(train_df.loc[(train_df.Age.isnull()) & (train_df.Sex == 0) & (train_df.Pclass == 3)])       

In [None]:
# Cut Age into five equal-width intervals and count the passengers in each.
train_df['AgeBand'] = pd.cut(train_df['Age'], bins=5)
train_df['AgeBand'].value_counts()


In [None]:
# Mean survival per age band.
(
    train_df[['AgeBand', 'Survived']]
    .groupby(["AgeBand"])
    .mean()
)

In [None]:
# Passenger counts: one row per age band, one column per Survived value.
pd.crosstab(index=train_df['AgeBand'], columns=train_df['Survived'])

In [None]:
# Scratch experiment with pivot tables, left disabled for reference:
#train_df['Survived1']=train_df['Survived']
#pd.pivot_table(train_df,index=['AgeBand'], columns='Survived',values='Survived1',aggfunc=np.sum)
#train_df = train_df.drop(['Survived1'], axis=1)
# Preview the training split, which now includes the AgeBand column.
train_df.head()

In [None]:
# Replace Age by an ordinal code: 0 (<= 16), 1 (16-32], 2 (32-48], 3 (48-64], 4 (> 64).
# The bands are applied in ascending order, exactly one assignment per band.
combine = [train_df, test_df]
for dataset in combine:
    dataset.loc[dataset['Age'].le(16), 'Age'] = 0
    for age_code, (lower, upper) in enumerate(((16, 32), (32, 48), (48, 64)), start=1):
        dataset.loc[dataset['Age'].gt(lower) & dataset['Age'].le(upper), 'Age'] = age_code
    dataset.loc[dataset['Age'].gt(64), 'Age'] = 4
train_df.head(5)



In [None]:
# Remove the AgeBand column from the training split (it was only ever added to train_df).
train_df=train_df.drop(['AgeBand'],axis=1)
# drop() returned a new frame, so `combine` is rebuilt to reference it.
combine=[train_df,test_df]


In [None]:
# Preview the training split after removing AgeBand.
train_df.head()

In [None]:
# Interaction feature: product of the Age code and the ticket class.
for dataset in combine:
    dataset['Age*Class'] = dataset['Age'] * dataset['Pclass']

# First ten rows of the new feature next to its two inputs.
train_df[['Age*Class', 'Age', 'Pclass']].head(10)

# Equivalent selection through .loc:
#train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

In [None]:
# Most frequent embarkation port in the training split (first entry of the mode).
Embarked_max = train_df['Embarked'].mode().iloc[0]


In [None]:
# Replace missing Embarked values in both splits with the most frequent training port.
combine = [train_df, test_df]
for dataset in combine:
    port = dataset['Embarked']
    dataset['Embarked'] = port.where(port.notnull(), Embarked_max)

In [None]:
# Mean survival per embarkation port, highest first.
(
    train_df[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Encode the Embarked port letters as numeric codes, in place in both splits.
Embarked_dic = {
    'S': 0,
    'C': 1,
    'Q': 2,
}
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(Embarked_dic)

In [None]:
# Preview the training split with the numerically encoded Embarked column.
train_df.head()

In [None]:
# Fill the missing Fare value(s) of the test split with the test-split median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the `test_df['Fare']` selection is a chained in-place update that may act on
# a temporary object and leave `test_df` unchanged. median() already skips
# missing values, so no dropna() is needed.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

In [None]:
# Histogram of the training fares (KDE curve switched off).
sns.distplot(train_df['Fare'],kde=False)

In [None]:
# Split Fare into four quantile-based bands and show mean survival per band,
# ordered from the cheapest band to the most expensive one.
train_df['FareBand'] = pd.qcut(train_df['Fare'], q=4)
(
    train_df[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)

In [None]:
# Preview the training split with the new FareBand column.
train_df.head()

In [None]:
# Replace Fare by an ordinal code: 0 (<= 7.91), 1 (7.91-14.454], 2 (14.454-31], 3 (> 31),
# applied band by band in ascending order, then stored as integers.
for dataset in combine:
    dataset.loc[dataset['Fare'].le(7.91), 'Fare'] = 0
    for fare_code, (lower, upper) in enumerate(((7.91, 14.454), (14.454, 31)), start=1):
        dataset.loc[dataset['Fare'].gt(lower) & dataset['Fare'].le(upper), 'Fare'] = fare_code
    dataset.loc[dataset['Fare'].gt(31), 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)


In [None]:
# Number of training passengers per Fare code.
train_df['Fare'].value_counts()

In [None]:
# First ten rows of the engineered training split.
train_df.head(10)

In [None]:
# First ten rows of the engineered test split.
test_df.head(10)

In [None]:
# Remove the FareBand column from the training split (it was only ever added to train_df).
train_df = train_df.drop(['FareBand'], axis=1)
# drop() returned a new frame, so `combine` is rebuilt to reference it.
combine = [train_df, test_df]

## Prediction and Modelling
 
We can narrow down our choice of models to a few candidates. These include:

- Logistic Regression
- KNN or k-Nearest Neighbors
- Support Vector Machines
- Naive Bayes classifier
- Decision Tree
- Random Forest
- Perceptron
- Artificial neural network
- RVM or Relevance Vector Machine

In [None]:
# Training features are every column except the Survived target; the test
# features are a copy of the test split without PassengerId.
Y_train = train_df["Survived"]
X_train = train_df.drop("Survived", axis=1)
X_test = test_df.drop("PassengerId", axis=1).copy()

# Cell output: shapes of the three objects.
tuple(part.shape for part in (X_train, Y_train, X_test))

In [None]:
# Logistic Regression (LogisticRegression is imported in the notebook's import cell)
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# score(X, y) predicts on X internally and compares the predictions with y, so
# no separate prediction step is needed. It is evaluated on the training data
# here, so the value is a training accuracy in percent.
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

In [None]:
# First row of the fitted logistic-regression coefficient array.
logreg.coef_[0]

In [None]:
# Pair every logistic-regression coefficient with its feature name.
# The names come from X_train, the frame the model was fitted on, so they line
# up with coef_ by construction; deriving them from train_df by deleting
# column 0 only works while 'Survived' happens to be the first column.
# The 'Correlation' column holds the model coefficients; the label is kept
# unchanged for compatibility.
coeff_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Correlation': logreg.coef_[0],
})

coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
# Support Vector Machines: fit, predict the test split, report training accuracy in percent.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc

In [None]:
# k-Nearest Neighbours with k = 3: fit, predict the test split, report training accuracy in percent.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

In [None]:
# Gaussian Naive Bayes: fit, predict the test split, report training accuracy in percent.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)
acc_gaussian

In [None]:
# Perceptron
# random_state is fixed because training shuffles the data; without a seed the
# fitted model, and therefore the reported score, can change between runs.
perceptron = Perceptron(random_state=0)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
# Accuracy on the training data itself, in percent.
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

In [None]:
# Linear SVC
# random_state is fixed so that the solver's random data ordering, and with it
# the reported score, is reproducible across runs.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
# Accuracy on the training data itself, in percent.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

In [None]:
# Stochastic Gradient Descent
# random_state is fixed because SGD shuffles the training data; without a seed
# the score differs from run to run.
sgd = SGDClassifier(random_state=0)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
# Accuracy on the training data itself, in percent.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

In [None]:
# Decision Tree
# random_state is fixed so that the tree's randomised split selection gives
# the same model on every run.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy on the training data itself, in percent.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

.

In [None]:
# Random Forest
# random_state is fixed because the forest's sampling is random; without a
# seed the model and its score change between runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy on the training data itself, in percent. The score is computed once;
# the extra unassigned score() call only repeated the same work.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

### Model evaluation



In [None]:
# Score table with one (model name, training accuracy in percent) pair per
# classifier, displayed with the best score first.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Decent', acc_sgd),
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)