In [25]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import numpy as np

import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


%matplotlib inline

In [13]:
dat = pd.read_csv('train.csv')

train_size = dat.shape[0]

dat1 = pd.read_csv('test.csv')

test_size = dat1.shape[0]

dat1.insert(loc=1, column='Survived', value=0)

#appending test data to train data for preprocessing
dat = dat.append(dat1)

dat['Age'] = dat['Age'].fillna(dat['Age'].median())
dat['Fare'] = dat['Fare'].fillna(dat['Fare'].median())

dat.drop(['Ticket', 'Name', 'Cabin'],axis=1, inplace=True)

dat["Embarked"] = dat["Embarked"].fillna("S")

dat.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [14]:
dat['Family_Size'] = dat['Parch'] + dat['SibSp'] + 1
dat.drop(['SibSp', 'Parch'],axis=1, inplace=True)
# dat.isna().sum()


dat.head(4)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Family_Size
0,1,0,3,male,22.0,7.25,S,2
1,2,1,1,female,38.0,71.2833,C,2
2,3,1,3,female,26.0,7.925,S,1
3,4,1,1,female,35.0,53.1,S,2


In [15]:
dat['AgeBin'] = pd.cut(dat['Age'], 5)
label = LabelEncoder()
dat.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Family_Size,AgeBin
0,1,0,3,male,22.0,7.25,S,2,"(16.136, 32.102]"
1,2,1,1,female,38.0,71.2833,C,2,"(32.102, 48.068]"
2,3,1,3,female,26.0,7.925,S,1,"(16.136, 32.102]"
3,4,1,1,female,35.0,53.1,S,2,"(32.102, 48.068]"


In [16]:
dat['FareBin'] = pd.cut(dat['Fare'], 5)
dat['FareBin_Code'] = label.fit_transform(dat['FareBin'])
dat['AgeBin_Code'] = label.fit_transform(dat['AgeBin'])
dat['Sex'] = label.fit_transform(dat['Sex'])

dat.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Family_Size,AgeBin,FareBin,FareBin_Code,AgeBin_Code
0,1,0,3,1,22.0,7.25,S,2,"(16.136, 32.102]","(-0.512, 102.466]",0,1
1,2,1,1,0,38.0,71.2833,C,2,"(32.102, 48.068]","(-0.512, 102.466]",0,2
2,3,1,3,0,26.0,7.925,S,1,"(16.136, 32.102]","(-0.512, 102.466]",0,1
3,4,1,1,0,35.0,53.1,S,2,"(32.102, 48.068]","(-0.512, 102.466]",0,2
4,5,0,3,1,35.0,8.05,S,1,"(32.102, 48.068]","(-0.512, 102.466]",0,2


In [17]:
del dat['Age']
del dat['Fare']
del dat['AgeBin']
del dat['FareBin']
dat.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Embarked,Family_Size,FareBin_Code,AgeBin_Code
0,1,0,3,1,S,2,0,1
1,2,1,1,0,C,2,0,2
2,3,1,3,0,S,1,0,1
3,4,1,1,0,S,2,0,2
4,5,0,3,1,S,1,0,2


In [18]:
dat = pd.get_dummies(dat, columns = ["Embarked", "Pclass","Sex","FareBin_Code","AgeBin_Code"],prefix=["Emb","Pclass","Sex","Fare_type","Age_type"])
dat.head()

Unnamed: 0,PassengerId,Survived,Family_Size,Emb_C,Emb_Q,Emb_S,Pclass_1,Pclass_2,Pclass_3,Sex_0,Sex_1,Fare_type_0,Fare_type_1,Fare_type_2,Fare_type_3,Age_type_0,Age_type_1,Age_type_2,Age_type_3,Age_type_4
0,1,0,2,0,0,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0
1,2,1,2,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
2,3,1,1,0,0,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0
3,4,1,2,0,0,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0
4,5,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0


In [19]:
data = dat.values

train_data = data[0:train_size, :]
test_data = data[train_size:,:]

X = train_data[:,2:]
Y = train_data[:,1]

In [20]:
print(train_data.shape)
print(test_data.shape)

(891, 20)
(418, 20)


In [21]:
forest_params = dict(
    max_depth = [n for n in range(9, 14)],
    min_samples_split = [n for n in range(4, 11)],
    min_samples_leaf = [n for n in range(2, 5)],
    n_estimators = [n for n in range(50, 600, 50)],
)

In [22]:
forest = RandomForestClassifier()

In [26]:
forest_cv = GridSearchCV(estimator=forest, param_grid=forest_params, cv=5)
forest_cv.fit(X, Y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [9, 10, 11, 12, 13], 'min_samples_split': [4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [2, 3, 4], 'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
print("Best score: {}".format(forest_cv.best_score_))
print("Optimal params: {}".format(forest_cv.best_estimator_))

Best score: 0.8215488215488216
Optimal params: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [29]:
predicted = forest_cv.predict(test_data[:,2:])

In [30]:
pid = test_data[:,0]
pid = pid.reshape(pid.shape[0], 1)
predicted = predicted.reshape(predicted.shape[0], 1)

result = np.append(pid, predicted, axis =1)

In [31]:
with open("foo.csv", "wb") as f:
    f.write(b'PassengerId,Survived\n')
    np.savetxt(f, result.astype(int), fmt='%i', delimiter=",")