In [64]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

df = pd.read_csv('passengers.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [65]:
#Convert 'Sex' column to numerical value. Male will be 0, Female will be 1
df.loc[df['Sex'] == 'male', 'Sex'] = 0
df.loc[df['Sex'] == 'female', 'Sex'] = 1
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,Q


There is an 'NaN' value in age for entry 888. Using a model with a value as NaN will lead to an error. There are several ways to 
approach this problem. 1. Remove the entire entry 2. Use a descriptive statistic to abridge the error (mean, median) 3. Build own machine learning model
to find missing value.

Using a descriptive statistic to estimate mean will probably be the best in this situation.

In [66]:
df = df.fillna(df['Age'].median())
df.tail() #looks like median age is 28

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,28,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,28.0,1,2,W./C. 6607,23.45,28,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,28,Q


In [67]:
df['first_class'] = 0
df['second_class'] = 0
df['third_class'] = 0

df.loc[df['Pclass'] == 1, 'first_class'] = 1
df.loc[df['Pclass'] == 2, 'second_class'] = 1
df.loc[df['Pclass'] == 3, 'third_class'] = 1
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,first_class,second_class,third_class
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,28,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,28,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,S,1,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,28,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0000,28,S,0,1,0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0000,B42,S,1,0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,28.0,1,2,W./C. 6607,23.4500,28,S,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,C148,C,1,0,0


In [68]:
features = df[['first_class','second_class','third_class','Sex','Age']]
label = df['Survived']

In [69]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features, label, test_size = .2)

In [70]:
from sklearn.preprocessing import StandardScaler
#Need to normalize data, otherwise Age will have an exaggerated weight on the Logistic Regression
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [71]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train, y_train)
print(model.coef_) #These coefficients are stating how each column impacts the model
#[[1,2,3, Sex, Age]] =  [[ 0.51917804  0.06797277 -0.49655559  1.2296514  -0.43777542]]
#Sex has largest impact, then classes 1,3, then Age, then 2nd.
#Seems like being in 2nd class doesnt really matter. People in 2nd class' survival correlates more with Sex and Age
print(model.intercept_)

[[ 0.51917804  0.06797277 -0.49655559  1.2296514  -0.43777542]]
[-0.71037408]


In [72]:
model.score(x_train, y_train)
#80% accuracy is pretty decent, especially for a model of this size

0.8033707865168539

In [73]:
model.score(x_test, y_test)

0.7821229050279329

In [77]:
#Predict sample passengers with model
jack = [0,0,1,0,20]
rose = [1,0,0,1,20]
me = [0,1,0,0,19]
sample_pass = np.array([jack,rose,me])
sample_pass = scaler.transform(sample_pass)

model.predict(sample_pass)

array([0, 1, 0], dtype=int64)