In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Predicting titanic survivor using Logistic Regression

The data provided by https://www.kaggle.com

link : https://www.kaggle.com/c/titanic/overview

In [36]:
#loading the data into data frame

#this is our data frame for our training data
df_all_passengers_train = pd.read_csv('E:\\for github\\predictive analysis\\titanic\\train.csv')

#this one is our competition data that we will try to predict later 
df_all_passengers_competition = pd.read_csv('E:\\for github\\predictive analysis\\titanic\\test.csv')

#examine the df
df_all_passengers_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
#okay lets proceed to our train data to build our LR model

#now, we select The features we want to use in our logistic regression model

#i am going to use:
    #Pclass (passenger class (divided into 3 class), higher class means top priority, 
        #and have better room placement in ship)
    
    #Sex (well we know women, children, and old people are top priority in case of emergency)
    
    #Age (children and the olds are priority in case of emergency)
    
#We are also going to use Survived column as our training predictive outcome
    #0 means deceased, 1 means survived

#Next put the column name of our features and traning outcome in lists 
predicting_feature = ['Pclass', 'Sex', 'Age']
training_outcome = ['Survived']


#looks like we need to change the sex columns value to INT so we can put it in our ML model
df_all_passengers_train['Sex'] = df_all_passengers_train['Sex'].map({'male':0, 'female':1})
#examine again
df_all_passengers_train.head()
#works fine

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [38]:
#next we are going to do a little bit of data cleaning

#now we check the data types of our features, are they all numbers?
df_all_passengers_train[predicting_feature+training_outcome].dtypes
#good, they are all numbers, Age is float and not integer but thats okay. 

Pclass        int64
Sex           int64
Age         float64
Survived      int64
dtype: object

In [39]:
#now we check for any nan values
#it must be done because NAN values will mess with our ML models
df_all_passengers_train[predicting_feature+training_outcome].isnull().values.any()
#comes out true, it means we have NAN values somewhere

True

In [40]:
#check each columns separately to check where is the nan value
for i in predicting_feature+training_outcome:
    print('There are ' + str(df_all_passengers_train[i].isnull().sum()) + ' nan values in '+ i + ' Columns')
#lucky, only 177 nan values in 1 of our feature

There are 0 nan values in Pclass Columns
There are 0 nan values in Sex Columns
There are 177 nan values in Age Columns
There are 0 nan values in Survived Columns


In [41]:
#now we have to think a way to make that NAN values not NAN anymore

#how about we delete the row that has nan value in Age column?
#but 177 is a big number, we are going to delete 177 rows? 

# i dont think so, Age is only one of our features that has nan value, our other features and training outcome has no NAN

#more rows means our ML models predictive output might have higher precision because more training data

#lets just change NAN value in our Age columns into its MEAN 
df_all_passengers_train['Age'].fillna(value=df_all_passengers_train['Age'].mean(), inplace=True)

#check if it works
df_all_passengers_train['Age'].isnull().sum()
#it works

0

In [42]:
#Now, one last step before we feed our features in our ML models
#remember, our Pclass columns values is either 1, 2, or 3. this represents the passenger class and 1 is the highest
# we need to further split this values into their own passenger class columns
#why?
#so we can have either 0 or 1 as the values
df_all_passengers_train['First_Class'] = df_all_passengers_train['Pclass'].apply(lambda x: 1 if x == 1 else 0)
df_all_passengers_train['Second_Class'] = df_all_passengers_train['Pclass'].apply(lambda x: 1 if x == 2 else 0)
df_all_passengers_train['Third_Class'] = df_all_passengers_train['Pclass'].apply(lambda x: 1 if x == 3 else 0)

#check for the last time if it works
df_all_passengers_train.head()
#it works
#done with data cleaning

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First_Class,Second_Class,Third_Class
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0,1


In [43]:
#check our data statistic to see if it's sensible
df_all_passengers_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,First_Class,Second_Class,Third_Class
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208,0.242424,0.20651,0.551066
std,257.353842,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,0.42879,0.405028,0.497665
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0
50%,446.0,0.0,3.0,0.0,29.699118,0.0,0.0,14.4542,0.0,0.0,1.0
75%,668.5,1.0,3.0,1.0,35.0,1.0,0.0,31.0,0.0,0.0,1.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0


In [44]:
#Now we're ready to begin

# first step, split our train data into train and test data
#this is important to see our model prediction score

#put our features into new dataframes
features_data = df_all_passengers_train[['Sex', 'Age', 'First_Class', 'Second_Class', 'Third_Class']]

#put our training outcome data into pandas series
outcome_training = df_all_passengers_train['Survived']

#now split them (default split value is 80-20)
train_features, test_features, train_labels, test_labels = train_test_split(features_data, outcome_training)

#next,  normalize/standardize our features first, so mean of our features is 0 and standard deviation is 1
#normalize/standardize our data is important before applying it into any machine learning models
#i use standardscaler to standardize the data
scale = StandardScaler()
#standardize train_features
train_features = scale.fit_transform(train_features)
#standardize test_features
test_features = scale.transform(test_features)

In [45]:
#Create our logistic regression object
logistic_regression = LogisticRegression()

#fit our training data that has been split and standardize
logistic_regression.fit(train_features, train_labels)

#now we see our model score on test features
logistic_regression.score(test_features, test_labels)


0.8475336322869955

In [46]:
#now we back on our competition Data frame that we've already load 

#this is the data we are going to use to predict which passenger will likely to survive

#we are already trained our model, so this will be the predictive outcome

#this is part of kaggle fun competition

df_all_passengers_competition.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [47]:
#same as our training data, we need to change the Sex column to 0 for male and 1 for female
df_all_passengers_competition['Sex'] = df_all_passengers_competition['Sex'].map({'male':0, 'female':1})

predicting_feature = ['Pclass', 'Sex', 'Age']
#check dtypes again
df_all_passengers_competition[predicting_feature].dtypes
#good, they are all numbers, Age is float and not integer but thats okay. 

#check for nan values again
df_all_passengers_competition[predicting_feature].isnull().values.any()
#okay it comes out true, we have nan value somewhere

#check each columns separately to check where is the nan value
#for i in predicting_feature:
    #print('There are ' + str(df_all_passengers_competition[i].isnull().sum()) + ' nan values in '+ i + ' Columns')
#lucky, only 86 nan values in 1 of our feature

#lets just change NAN value in our Age columns into its MEAN 
df_all_passengers_competition['Age'].fillna(value=df_all_passengers_competition['Age'].mean(), inplace=True)

#check if it works
df_all_passengers_competition['Age'].isnull().sum()
#it works

#split the Pclass column again
df_all_passengers_competition['First_Class'] = df_all_passengers_train['Pclass'].apply(lambda x: 1 if x == 1 else 0)
df_all_passengers_competition['Second_Class'] = df_all_passengers_train['Pclass'].apply(lambda x: 1 if x == 2 else 0)
df_all_passengers_competition['Third_Class'] = df_all_passengers_train['Pclass'].apply(lambda x: 1 if x == 3 else 0)

#check for the last time if it works
df_all_passengers_competition.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First_Class,Second_Class,Third_Class
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S,1,0,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q,0,0,1
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S,1,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S,0,0,1


In [48]:
#check our data statistic to see if it's sensible
df_all_passengers_competition.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,First_Class,Second_Class,Third_Class
count,418.0,418.0,418.0,418.0,418.0,418.0,417.0,418.0,418.0,418.0
mean,1100.5,2.26555,0.363636,30.27259,0.447368,0.392344,35.627188,0.22488,0.210526,0.564593
std,120.810458,0.841838,0.481622,12.634534,0.89676,0.981429,55.907576,0.418004,0.408171,0.496404
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958,0.0,0.0,0.0
50%,1100.5,3.0,0.0,30.27259,0.0,0.0,14.4542,0.0,0.0,1.0
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.5,0.0,0.0,1.0
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,1.0,1.0,1.0


In [49]:
#finally we can begin predicting our model

#put our features into new dataframes
features_data = df_all_passengers_competition[['Sex', 'Age', 'First_Class', 'Second_Class', 'Third_Class']]


#as always, standardize it first
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_data)

#now we predict
prediction_titanic = logistic_regression.predict(scaled_features)
#print(prediction_titanic)
#(0 means deceased, 1 means survived)

#we have see wheter a passenger survived or not
#how about their probablity?
prediction_probablity_titanic = logistic_regression.predict_proba(scaled_features)
#print(prediction_probablity_titanic)
#each list represent each passenger probablity of survival
#number at list index 0 means likeliness of decease, and number at list index 1 is likeliness of survival


In [50]:
#to see it better who survived and deceased, we are going to pair the predictive outcome with it respective passenger
new_data_frame_titanic = df_all_passengers_competition[['PassengerId', 'Name', 'Age', 'Sex', 'Pclass']].reset_index(drop=True)
new_data_frame_titanic['Survive'] = prediction_titanic
new_data_frame_titanic['Sex'] = new_data_frame_titanic['Sex'].apply(lambda x: 'Male' if x==0 else 'Female')
new_data_frame_titanic['Survive'] = new_data_frame_titanic['Survive'].apply(lambda x: 'Yes' if x==1 else 'No')
new_data_frame_titanic.head()


Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Survive
0,892,"Kelly, Mr. James",34.5,Male,3,No
1,893,"Wilkes, Mrs. James (Ellen Needs)",47.0,Female,3,Yes
2,894,"Myles, Mr. Thomas Francis",62.0,Male,2,No
3,895,"Wirz, Mr. Albert",27.0,Male,3,Yes
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,Female,3,Yes


In [51]:
#ignore this part, i need to make csv of the data  for kaggle
passenger_id = new_data_frame_titanic['PassengerId']
survived = prediction_titanic
kaggle_titanic = pd.DataFrame({'PassengerId': passenger_id, 'Survived' : survived})
kaggle_titanic.to_csv('titanic_submission.csv', index=False)