In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import tree

In [2]:
df = pd.read_csv('titanic_dataset/train.csv')
df.drop(['Cabin', 'Ticket', 'PassengerId', 'Name'], axis=1, inplace=True)

In [3]:
# df.dropna(inplace=True)
# replace missing Ages with Mean age since data is normally distributed
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [4]:
df = pd.get_dummies(df, columns=['Sex', 'Embarked'])
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


In [5]:
#scaling
values = df.values
scaler = MinMaxScaler(feature_range=(0,1))
scaled_df = scaler.fit_transform(values)
scaled_df.shape

(891, 11)

In [6]:
#Train-Test Set
X = scaled_df[:, 1:]
Y = scaled_df[:, 0]

training_sample_size = int(0.8 * len(scaled_df))
X_train = X[:training_sample_size, :]
Y_train = Y[:training_sample_size]

X_test = X[training_sample_size:, :]
Y_test = Y[training_sample_size:]
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(712, 10) (712,) (179, 10) (179,)


In [7]:
model = sm.Logit(Y_train, X_train)
result = model.fit()
result.summary()

         Current function value: 0.452741
         Iterations: 35




0,1,2,3
Dep. Variable:,y,No. Observations:,712.0
Model:,Logit,Df Residuals:,702.0
Method:,MLE,Df Model:,9.0
Date:,"Sun, 17 Jun 2018",Pseudo R-squ.:,0.3232
Time:,12:18:38,Log-Likelihood:,-322.35
converged:,False,LL-Null:,-476.29
,,LLR p-value:,5.554e-61

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-2.1727,0.321,-6.765,0.000,-2.802,-1.543
x2,-2.7141,0.687,-3.949,0.000,-4.061,-1.367
x3,-2.1597,0.970,-2.227,0.026,-4.060,-0.259
x4,-0.9126,0.813,-1.123,0.262,-2.506,0.681
x5,0.1887,1.360,0.139,0.890,-2.477,2.854
x6,20.1672,1.28e+04,0.002,0.999,-2.51e+04,2.51e+04
x7,17.4838,1.28e+04,0.001,0.999,-2.51e+04,2.51e+04
x8,-16.1851,1.28e+04,-0.001,0.999,-2.51e+04,2.51e+04
x9,-16.1234,1.28e+04,-0.001,0.999,-2.51e+04,2.51e+04


In [8]:
Y_pred_scaled = result.predict(X_test)
Y_pred_scaled = pd.DataFrame(Y_pred_scaled)

In [9]:
#invert scaling for forecast
prediction = np.concatenate((Y_pred_scaled, X_test), axis=1)

In [10]:
prediction = scaler.inverse_transform(prediction)
Y_pred = prediction[:,0]
final_Y_pred = [1.0 if x > 0.5 else 0.0 for x in Y_pred]

In [11]:
print("RMSE : ", mean_squared_error(final_Y_pred, Y_test))

RMSE :  0.16759776536312848


In [12]:
#load validation dataset
validation_X_df = pd.read_csv('titanic_dataset/test.csv')
validation_X_df.drop(['Cabin', 'Ticket', 'PassengerId', 'Name'], axis=1, inplace=True)
validation_X_df = pd.get_dummies(validation_X_df, columns=['Sex', 'Embarked'])
validation_X_df = pd.DataFrame(validation_X_df.values)

In [13]:
validation_Y_df = pd.read_csv('titanic_dataset/gender_submission.csv')
validation_Y_df = pd.DataFrame(validation_Y_df.values[:,1:])

In [14]:
validation_df = pd.concat([validation_Y_df, validation_X_df], axis=1)
validation_df.shape

(418, 11)

In [15]:
validation_df.isnull().sum()

0     0
0     0
1    86
2     0
3     0
4     1
5     0
6     0
7     0
8     0
9     0
dtype: int64

In [16]:
validation_df.dropna(inplace=True)

In [17]:
#scaling
values = validation_df.values
scaler = MinMaxScaler(feature_range=(0,1))
scaled_val_df = scaler.fit_transform(values)
scaled_val_df.shape

(331, 11)

In [18]:
X_val = scaled_val_df[:, 1:]
Y_val = scaled_val_df[:, 0]

In [19]:
Y_pred_scaled = result.predict(X_val)
Y_pred_scaled = pd.DataFrame(Y_pred_scaled)
#invert scaling for forecast
prediction = np.concatenate((Y_pred_scaled, X_val), axis=1)
Y_pred_scaled.shape

(331, 1)

In [20]:
prediction = scaler.inverse_transform(prediction)

In [21]:
Y_pred = prediction[:,0]
final_Y_pred = [1.0 if x > 0.5 else 0.0 for x in Y_pred]

In [22]:
print("RMSE : ", mean_squared_error(final_Y_pred, Y_val))

RMSE :  0.054380664652567974


In [23]:
log_regr = linear_model.LogisticRegression()
log_regr.fit(X_train, Y_train)
log_regr.score(X_test, Y_test)

0.8212290502793296

In [24]:
log_regr.score(X_val, Y_val)

0.9607250755287009

In [25]:
dec_tree = tree.DecisionTreeClassifier()
dec_tree.fit(X_train, Y_train)
dec_tree.score(X_test, Y_test)

0.7653631284916201