In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')
train_df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
777,778,1,3,"Emanuel, Miss. Virginia Ethel",female,5.0,0,0,364516,12.475,,S
869,870,1,3,"Johnson, Master. Harold Theodor",male,4.0,1,1,347742,11.1333,,S
843,844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C
718,719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q
307,308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9,C65,C


In [3]:
test_df = pd.read_csv('../data/raw/test.csv')
# test_df.sample(10)

In [4]:
def xtract(df,test=False):
    cpy = df.copy()
      
    cpy = pd.concat([df,pd.get_dummies(df["Sex"],prefix='sex')],axis=1)
    cpy = pd.concat([cpy,pd.get_dummies(df["Pclass"],prefix='class')],axis=1)
    
    # only columns we'll actually use
    if test:
        cpy = cpy[ [col for col in list(cpy) if col.startswith('sex')  or col.startswith('class') or col == "Age" or col == "Survived" or col == "PassengerId" ] ]
        return cpy
    else:
        cpy = cpy[ [col for col in list(cpy) if col.startswith('sex')  or col.startswith('class') or col == "Age" or col == "Survived" ] ]
        return cpy.dropna()
    
        
train_df_clean = xtract(train_df)
train_df_clean.head()

Unnamed: 0,Survived,Age,sex_female,sex_male,class_1,class_2,class_3
0,0,22.0,0,1,0,0,1
1,1,38.0,1,0,1,0,0
2,1,26.0,1,0,0,0,1
3,1,35.0,1,0,1,0,0
4,0,35.0,0,1,0,0,1


In [5]:
test_df_clean = xtract(test_df,test=True)
test_df_clean.sample(20)

Unnamed: 0,PassengerId,Age,sex_female,sex_male,class_1,class_2,class_3
393,1285,47.0,0,1,0,1,0
287,1179,24.0,0,1,1,0,0
19,911,45.0,1,0,0,0,1
94,986,25.0,0,1,1,0,0
391,1283,51.0,1,0,1,0,0
44,936,45.0,1,0,1,0,0
229,1121,36.0,0,1,0,1,0
255,1147,,0,1,0,0,1
223,1115,21.0,0,1,0,0,1
295,1187,26.0,0,1,0,0,1


In [6]:
X = []
y = []

for row in train_df_clean.values:
    X.append(row[1:])
    y.append(row[0])
    
X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [7]:
# comment this block to see the difference
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# purposefully ignoring data standardization to mean 0 and variance 1
clf = MLPClassifier(random_state=1,solver='lbfgs')
clf.fit(X_train,y_train)
metrics.accuracy_score(y_test,clf.predict(X_test))

0.80419580419580416

In [9]:
passenger_ids = []
X_out = []
y_out = []

for row in test_df_clean.values:
    passenger_ids.append(row[0])
    X_out.append(row[1:])
    
X_out = np.array(X_out)    
    
imp = Imputer()
imp.fit(X_train)

X_out = imp.transform(X_out)
    
y_out = clf.predict(X_out)

In [10]:
out_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_out
})
out_df.head()

Unnamed: 0,PassengerId,Survived
0,892.0,1.0
1,893.0,1.0
2,894.0,1.0
3,895.0,1.0
4,896.0,1.0


In [11]:
out_df["PassengerId"] = out_df["PassengerId"].apply(lambda dbl: int(dbl))
out_df["Survived"] = out_df["Survived"].apply(lambda dbl: int(dbl))
out_df.head()

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1


In [12]:
out_df.to_csv("../data/interim/nn.csv", index=False)