In [0]:
# --- Imports: standard library first, then third-party packages ---
import re

import pandas as pd
from google.colab import drive
from sklearn.preprocessing import StandardScaler

# --- Notebook configuration ---
# Mount Google Drive at /content/gdrive.
drive.mount('/content/gdrive')
# Let pandas display up to 500 columns instead of eliding wide frames.
pd.set_option('display.max_columns', 500)

def get_title(name):
  """Extract the honorific (e.g. ``Mr``, ``Miss``) from a passenger name.

  The title is the first run of ASCII letters that is immediately followed
  by a period, e.g. ``'Braund, Mr. Owen Harris'`` -> ``'Mr'``.

  Args:
    name: The full name string. Non-string values (``None``, ``NaN``) are
      accepted and treated as having no title.

  Returns:
    The title without its trailing period, or ``''`` when none is found.
  """
  # Missing names arrive as NaN/None when mapped over a pandas column;
  # re.search would raise TypeError on those, so treat them as "no title".
  if not isinstance(name, str):
    return ''

  # Raw string: in a normal literal '\.' is an invalid escape sequence,
  # which newer Python versions warn about.
  title = re.search(r'([A-Za-z]+)\.', name)

  if title:
    return title.group(1)
  return ''

def preprocess(name):
  """Load a passenger CSV and turn it into a numeric feature frame.

  Steps, in order:
    1. Fill missing ``Age`` and ``Fare`` with that column's median.
    2. Drop rows whose ``Embarked`` is missing.
    3. Add ``Family_Size`` (``SibSp + Parch``), ``Has_Cabin`` (1 when
       ``Cabin`` is present, else 0) and ``Title`` (from ``get_title``,
       with uncommon titles collapsed into ``'Rare'``).
    4. One-hot encode ``Sex``, ``Embarked``, ``Pclass`` and ``Title``,
       dropping the first level of each.
    5. Standardize ``Fare`` and ``Age`` with a ``StandardScaler``.
    6. Remove ``PassengerId``, ``SibSp``, ``Parch``, ``Ticket``, ``Cabin``
       and ``Name``.

  Note: the medians, the scaler statistics and the set of dummy columns are
  all computed from the file being read, so two files processed by separate
  calls are not guaranteed to share the same columns or scaling.

  Args:
    name: Path of the CSV file (anything ``pd.read_csv`` accepts).

  Returns:
    A tuple ``(df, ids)``: the processed DataFrame and a NumPy array with
    the ``PassengerId`` values of the rows that were kept.
  """
  df = pd.read_csv(name)

  # Replace NaNs. The filled column is assigned back instead of calling
  # fillna(inplace=True) on a column selection: that chained in-place form
  # warns in recent pandas and does not update the frame under Copy-on-Write.
  for col in ('Age', 'Fare'):
    df[col] = df[col].fillna(df[col].median())
  # Explicit copy so the column assignments below write to an independent
  # frame and do not trigger SettingWithCopyWarning.
  df = df.dropna(subset=['Embarked']).copy()

  # Add new features
  df['Family_Size'] = df['SibSp'] + df['Parch']
  df['Has_Cabin'] = df['Cabin'].notnull().astype(int)
  rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Mlle', 'Capt', 'Don',
                 'Jonkheer', 'Mme', 'Countess', 'Ms', 'Lady', 'Sir', 'Dona']
  df['Title'] = df['Name'].map(get_title).replace(rare_titles, 'Rare')

  # Deal with categorical vars
  df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Pclass', 'Title'], drop_first=True)

  # Normalize continuous vars. fit_transform refits the scaler on every
  # call, so each column is standardized with its own mean and variance.
  scaler = StandardScaler()
  for col in ('Fare', 'Age'):
    # ravel(): fit_transform returns an (n, 1) array; store it as 1-D.
    df[col] = scaler.fit_transform(df[[col]].values.astype(float)).ravel()

  # Grab the ids before the identifier column is removed.
  ids = df['PassengerId'].values
  df = df.drop(columns=['PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Name'])

  return df, ids

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import numpy as np

# Training data. preprocess() also returns the PassengerIds, which are not
# needed for fitting.
df, _ = preprocess('gdrive/My Drive/train.csv')
# Raw value matrix of the processed training frame (kept under its original
# name in case other cells use it).
data = df.values

print(df.head())

# Select the target by column name rather than by position, so the split does
# not depend on 'Survived' happening to be the first column.
Y = df['Survived'].values.astype(int)
feature_cols = [col for col in df.columns if col != 'Survived']
# astype(float): guarantees a numeric matrix whatever dtype the dummy
# columns were given.
X = df[feature_cols].values.astype(float)

df_test, ids = preprocess('gdrive/My Drive/test.csv')
# preprocess() builds the one-hot columns from each file on its own, so align
# the test frame to the training features: same columns, same order, with any
# dummy column absent from the test file filled with 0.
df_test = df_test.reindex(columns=feature_cols, fill_value=0)
X_kaggle_test = df_test.values.astype(float)

   Survived       Age      Fare  Family_Size  Has_Cabin  Sex_male  Embarked_Q  \
0         0 -0.563674 -0.500240            1          0         1           0   
1         1  0.669217  0.788947            1          1         0           0   
2         1 -0.255451 -0.486650            0          0         0           0   
3         1  0.438050  0.422861            1          1         0           0   
4         0  0.438050 -0.484133            0          0         1           0   

   Embarked_S  Pclass_2  Pclass_3  Title_Miss  Title_Mr  Title_Mrs  Title_Rare  
0           1         0         1           0         1          0           0  
1           0         0         0           0         0          1           0  
2           1         0         1           1         0          0           0  
3           1         0         0           0         0          1           0  
4           1         0         1           0         1          0           0  


In [0]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle

# Fixed seed for the shuffle and for the network's random initialisation, so
# the written predictions and the reported score can be reproduced on a
# fresh run of the notebook.
SEED = 42

X, Y = shuffle(X, Y, random_state=SEED)

# Small MLP with two hidden layers (4 and 7 units) and tanh activations.
# max_iter is set very high so training is not cut off by the iteration cap.
brain = MLPClassifier(hidden_layer_sizes=(4, 7),
                      learning_rate_init=1e-3,
                      max_iter=100000,
                      activation='tanh',
                      random_state=SEED)

# Fit on the full training set and predict the held-out test file.
brain.fit(X, Y)
preds = brain.predict(X_kaggle_test)

# Two-column table: one (id, prediction) row per test passenger.
result = np.concatenate((np.array([ids]).T, np.array([preds]).T), axis=1)
# NOTE(review): the file is written without a header row — confirm whether
# the consumer of result.csv expects one.
np.savetxt("gdrive/My Drive/result.csv", result, delimiter=",", fmt='%s')

# cross_val_score fits fresh clones of `brain` on each of the 10 folds, so
# this estimate is independent of the full-data fit above.
print("Cross validation score: ", np.mean(cross_val_score(brain, X, Y, cv=10,verbose=0)))

Cross validation score:  0.8200842696629213
