# Titanic Survival Prediction Noteook

### importing the libs

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pickle

### loading test and train raw data


In [2]:
# loading the training data into data frame and view it

train_raw_data_path= os.path.join(os.path.pardir,"data","raw","train.csv")
train_data=pd.read_csv(train_raw_data_path,index_col="PassengerId")  #PassengerId as index column
train_data.head(5) # to view first 5 rows of data frame

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# loading the training data into data frame and view it

test_raw_data_path= os.path.join(os.path.pardir,"data","raw","test.csv")

test_data=pd.read_csv(test_raw_data_path,index_col="PassengerId")
test_data['Survived']=888  # creating "Survived" new column assigning default value in test data frame
test_data.head(5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,888
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,888
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,888
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,888
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,888


In [4]:
# merging both test and train data frames into one

df_data=pd.concat([train_data,test_data],axis=0)
df_data.head(5)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1,STON/O2. 3101282
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1,113803
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0,373450


## Feature Engineering

In [5]:
# function to get tittle from the name
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

# replace extra tittles with appropriate one    
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title



df_data["Title"]=df_data["Name"].map(lambda x : get_title(x))   # creating a new column in data frame for Tittle

df_data['Title'] = df_data.apply(replace_titles, axis=1) 
df_data.head(5)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0,A/5 21171,Mr
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1,PC 17599,Mrs
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1,STON/O2. 3101282,Miss
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1,113803,Mrs
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0,373450,Mr


In [6]:
# creating a new feature "Familysize"

df_data["Familysize"]=df_data['Parch']+df_data['SibSp']+1 

df_data.head(5)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,Familysize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0,A/5 21171,Mr,2
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1,PC 17599,Mrs,2
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1,STON/O2. 3101282,Miss,1
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1,113803,Mrs,2
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0,373450,Mr,1


### Droping irrelavent features

In [7]:
df_data.drop(['Name','Ticket','Cabin','Parch','SibSp'],axis=1,inplace=True)

df_data.head(5)

Unnamed: 0_level_0,Age,Embarked,Fare,Pclass,Sex,Survived,Title,Familysize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,22.0,S,7.25,3,male,0,Mr,2
2,38.0,C,71.2833,1,female,1,Mrs,2
3,26.0,S,7.925,3,female,1,Miss,1
4,35.0,S,53.1,1,female,1,Mrs,2
5,35.0,S,8.05,3,male,0,Mr,1


In [8]:
# to check the type and number of missing values of all the columns or features in data frame

df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 8 columns):
Age           1046 non-null float64
Embarked      1307 non-null object
Fare          1308 non-null float64
Pclass        1309 non-null int64
Sex           1309 non-null object
Survived      1309 non-null int64
Title         1309 non-null object
Familysize    1309 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 92.0+ KB


 ### Missing Value Substitution

In [9]:

# replace missing values of age column with median of age
df_data.Age.fillna(df_data['Age'].median(),inplace=True)

# replace missing values of fare column with mean of fare
df_data.Fare.fillna(df_data['Fare'].mean(),inplace=True)

In [10]:
# to check distribution of passengers across Embarked points

df_data.Embarked.value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [11]:
# S embarked point have max passenger so replace missing values of embarked with S

df_data.Embarked.fillna('S',inplace=True)

In [12]:
# convert Categorical features like Embarked,Sex and Tittle into numerical feature

df_data=pd.get_dummies(df_data,['Embarked','Sex','Title'])
df_data.head(5)

Unnamed: 0_level_0,Age,Fare,Pclass,Survived,Familysize,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Title_Dona,Title_Master,Title_Miss,Title_Mr,Title_Mrs
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,22.0,7.25,3,0,2,0,0,1,0,1,0,0,0,1,0
2,38.0,71.2833,1,1,2,1,0,0,1,0,0,0,0,0,1
3,26.0,7.925,3,1,1,0,0,1,1,0,0,0,1,0,0
4,35.0,53.1,1,1,2,0,0,1,1,0,0,0,0,0,1
5,35.0,8.05,3,0,1,0,0,1,0,1,0,0,0,1,0


In [13]:
# to check data type and missing values of data frame

df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 15 columns):
Age             1309 non-null float64
Fare            1309 non-null float64
Pclass          1309 non-null int64
Survived        1309 non-null int64
Familysize      1309 non-null int64
Embarked_C      1309 non-null uint8
Embarked_Q      1309 non-null uint8
Embarked_S      1309 non-null uint8
Sex_female      1309 non-null uint8
Sex_male        1309 non-null uint8
Title_Dona      1309 non-null uint8
Title_Master    1309 non-null uint8
Title_Miss      1309 non-null uint8
Title_Mr        1309 non-null uint8
Title_Mrs       1309 non-null uint8
dtypes: float64(2), int64(3), uint8(10)
memory usage: 74.1 KB


In [14]:
# Sorting or rearranging of columns in data frame so survived columns should be at last

ColumnName =    [ColumnName for ColumnName in df_data if ColumnName != 'Survived'] + ['Survived']
print(ColumnName)
df_data=df_data[ColumnName]
df_data.head(5)

['Age', 'Fare', 'Pclass', 'Familysize', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Sex_female', 'Sex_male', 'Title_Dona', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Survived']


Unnamed: 0_level_0,Age,Fare,Pclass,Familysize,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Title_Dona,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,22.0,7.25,3,2,0,0,1,0,1,0,0,0,1,0,0
2,38.0,71.2833,1,2,1,0,0,1,0,0,0,0,0,1,1
3,26.0,7.925,3,1,0,0,1,1,0,0,0,1,0,0,1
4,35.0,53.1,1,2,0,0,1,1,0,0,0,0,0,1,1
5,35.0,8.05,3,1,0,0,1,0,1,0,0,0,1,0,0


In [15]:
# write the processed train and test data into proceesed folder 

df_data[df_data.Survived!=888].to_csv("../data/processed/train.csv")
test_data=df_data[df_data.Survived==888]
test_data.drop(['Survived'],axis=1,inplace=True)
test_data.to_csv("../data/processed/test.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [16]:
process_train_data=pd.read_csv("../data/processed/train.csv",index_col="PassengerId")
process_train_data.head()



Unnamed: 0_level_0,Age,Fare,Pclass,Familysize,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Title_Dona,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,22.0,7.25,3,2,0,0,1,0,1,0,0,0,1,0,0
2,38.0,71.2833,1,2,1,0,0,1,0,0,0,0,0,1,1
3,26.0,7.925,3,1,0,0,1,1,0,0,0,1,0,0,1
4,35.0,53.1,1,2,0,0,1,1,0,0,0,0,0,1,1
5,35.0,8.05,3,1,0,0,1,0,1,0,0,0,1,0,0


In [17]:
# Separate the features and prediction columns into two data frames

columns=process_train_data.columns

features=columns[:-1]
label=columns[-1:]

X=process_train_data[features]
Y=process_train_data[label]


# split the training data into 30% ratio of train and test data
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.30,random_state=0)



In [18]:
# load RandomForestClassifier and train the model

from sklearn.ensemble  import RandomForestClassifier

rfc_model=RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
rfc_model.fit(x_train,y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [19]:
# check the prediction score

y_predict=rfc_model.predict(x_test)

accuracy_score(y_test,y_predict)

0.7873134328358209

In [20]:
# load LogisticRegression Model and train the model

lgr_model=LogisticRegression(random_state=0)

parameters = {
        "C" : [0.1,1.0,10.0,50.0,100.0],
        "penalty" : ["l1","l2"]
}

# Use grid search to find the best hyperparameters values

clf = GridSearchCV(lgr_model,param_grid=parameters,cv=3)

clf.fit(x_train,y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1.0, 10.0, 50.0, 100.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [22]:
# check the accuracy score of logistic regression model
y_predict_lgr=clf.predict(x_test)
accuracy_score(y_test,y_predict_lgr)

0.8246268656716418

In [23]:
#load the processed Test data

process_test_data=pd.read_csv("../data/processed/test.csv",index_col="PassengerId")

process_test_data.head(5)

Unnamed: 0_level_0,Age,Fare,Pclass,Familysize,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Title_Dona,Title_Master,Title_Miss,Title_Mr,Title_Mrs
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
892,34.5,7.8292,3,1,0,1,0,0,1,0,0,0,1,0
893,47.0,7.0,3,2,0,0,1,1,0,0,0,0,0,1
894,62.0,9.6875,2,1,0,1,0,0,1,0,0,0,1,0
895,27.0,8.6625,3,1,0,0,1,0,1,0,0,0,1,0
896,22.0,12.2875,3,3,0,0,1,1,0,0,0,0,0,1


In [24]:
#predict the survival with logistic regreesion model and make dataframe in required submission format

y_predict_lgr=clf.predict(process_test_data)
submission_data=pd.DataFrame(y_predict_lgr,process_test_data.index)
submission_data.columns=['Survived']

In [25]:
#write the predictions in required format for submission

submission_file_path= os.path.join(os.path.pardir,"data","external","submission.csv")

submission_data.to_csv(submission_file_path)

In [32]:
# dump trained model for later use 

model_path= os.path.join(os.path.pardir,"models","lgr_model")

pickle.dump(clf,open(model_path,"wb"))