In [10]:
import os
import pandas as pd
import numpy as np
import re

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [37]:
def load_data(csv_path="D:\\repos\\MLPractice\\TitanicSurvivors"):
    """Read the training and test CSV files from one directory.

    Parameters
    ----------
    csv_path : str or os.PathLike, optional
        Directory that contains ``train.csv`` and ``test.csv``. The default is
        the location that used to be hard-coded in the body, so a bare
        ``load_data()`` call behaves exactly as before; pass another directory
        to run the notebook on a different machine.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(training_set, test_set)`` as parsed by ``pandas.read_csv``.
    """
    training_set = pd.read_csv(os.path.join(csv_path, "train.csv"))
    test_set = pd.read_csv(os.path.join(csv_path, "test.csv"))
    return training_set, test_set
    
# Read both CSV files once; later cells reassign these two names as the data is transformed.
training_set, test_set = load_data()

# Last expression of the cell: shows the first rows of the test frame.
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [38]:
# Model input columns, selected by name. The training file carries an extra
# 'Survived' column, so the same positional indices would point at different
# columns in the two frames; names keep both selections in sync.
feature_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


def select_and_impute(frame):
    """Return a copy of `frame` restricted to `feature_columns` with gaps filled.

    Missing 'Age' values are replaced by the mean age of `frame` itself; any
    other missing value is back-filled and then forward-filled from the
    neighbouring rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw frame that contains every column listed in `feature_columns`.

    Returns
    -------
    pandas.DataFrame
        New frame; `frame` itself is not modified.
    """
    # Explicit copy so the assignment below never writes into a slice of the raw frame.
    selected = frame.loc[:, feature_columns].copy()
    selected['Age'] = selected['Age'].fillna(selected['Age'].mean())
    # DataFrame.bfill()/ffill() replace the deprecated fillna(method=...) spelling.
    return selected.bfill().ffill()


# Target column must be captured before the frame is narrowed to the features.
training_set_labels = training_set['Survived']
training_set = select_and_impute(training_set)

# Passenger ids are kept aside for the submission file.
test_indices = test_set['PassengerId']
# NOTE(review): the test frame is imputed with its own mean age (unchanged
# behaviour); consider reusing the training mean so both frames are treated alike.
test_set = select_and_impute(test_set)

0


In [30]:
def get_title(name):
    """Extract a normalised honorific from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period and a space (e.g. ``"Kelly, Mr. James"`` gives
    ``"Mr"``). Uncommon titles are collapsed into ``'Rare'`` and the variants
    ``Mlle``/``Ms`` and ``Mme`` are mapped to ``'Miss'`` and ``'Mrs'``.

    Parameters
    ----------
    name : str
        Passenger name. Non-string values (``None``, ``NaN``) are treated as
        having no title instead of raising.

    Returns
    -------
    str
        The normalised title, or ``""`` when no title is found.
    """
    rare_group = {'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'}
    synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

    if not isinstance(name, str):
        return ""

    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    search = re.search(r' ([A-Za-z]+)\. ', name)
    if not search:
        return ""

    title = search.group(1)
    if title in rare_group:
        return 'Rare'
    # Unlisted titles (Mr, Mrs, Miss, Master, ...) pass through unchanged.
    return synonyms.get(title, title)

In [31]:
# Codes assigned to the four age bands: <13, 13-19, 20-59, >=60.
values = [1, 2, 3, 4]


def add_engineered_features(frame):
    """Return `frame` with the derived columns added and 'Name' removed.

    Added columns, in this order:

    * ``FamilySize`` - ``SibSp + Parch + 1``.
    * ``AgeGroup``   - code from `values` for the age band of ``Age``;
      rows matching no band (e.g. missing age) get 3.
    * ``Alone``      - 1 when ``FamilySize`` is 1, else 0.
    * ``Title``      - ``get_title`` applied to ``Name``.

    ``SibSp`` and ``Parch`` are kept alongside ``FamilySize``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with 'Age', 'SibSp', 'Parch' and 'Name' columns.

    Returns
    -------
    pandas.DataFrame
        New frame; `frame` itself is not modified.
    """
    age = frame['Age']
    age_bands = [
        age < 13,
        (age >= 13) & (age < 20),
        (age >= 20) & (age < 60),
        age >= 60,
    ]
    family_size = frame['SibSp'] + frame['Parch'] + 1

    engineered = frame.assign(
        FamilySize=family_size,
        AgeGroup=np.select(age_bands, values, default=3),
        Alone=np.where(family_size == 1, 1, 0),
        Title=frame['Name'].apply(get_title),
    )
    # The raw name is only needed to derive the title.
    return engineered.drop('Name', axis=1)


# One shared helper keeps the train and test transformations identical.
training_set = add_engineered_features(training_set)
test_set = add_engineered_features(test_set)

training_set.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,AgeGroup,Alone,Title
0,3,male,22.0,1,0,7.25,S,2,3,0,Mr
1,1,female,38.0,1,0,71.2833,C,2,3,0,Mrs
2,3,female,26.0,0,0,7.925,S,1,3,1,Miss
3,1,female,35.0,1,0,53.1,S,2,3,0,Mrs
4,3,male,35.0,0,0,8.05,S,1,3,1,Mr


In [32]:
# Text columns that have to be turned into integer codes for the model.
categorical_columns = ['Sex', 'Embarked', 'Title']

# OrdinalEncoder is the feature-side encoder (LabelEncoder is meant for target
# labels). It learns all three columns in one fit and keeps every mapping in
# `encoder.categories_`, whereas refitting a single LabelEncoder per column
# discards the earlier mappings. Categories that appear only in the test set
# are encoded as -1 instead of raising during transform.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# The encoder is fitted on the training data only and then reused for the test data.
train_codes = encoder.fit_transform(training_set[categorical_columns]).astype(int)
test_codes = encoder.transform(test_set[categorical_columns]).astype(int)

for position, column in enumerate(categorical_columns):
    training_set[column] = train_codes[:, position]
    test_set[column] = test_codes[:, position]

training_set.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,AgeGroup,Alone,Title
Pclass,1.0,0.1319,-0.331339,0.083081,0.018443,-0.5495,0.166223,0.065997,-0.169802,0.135207,-0.181177
Sex,0.1319,1.0,0.084153,-0.114631,-0.245489,-0.182333,0.111894,-0.200988,0.112819,0.303646,0.060299
Age,-0.331339,0.084153,1.0,-0.232625,-0.179191,0.091566,-0.033737,-0.248512,0.765504,0.179775,0.475693
SibSp,0.083081,-0.114631,-0.232625,1.0,0.414838,0.159651,0.069444,0.890712,-0.321783,-0.584471,-0.209813
Parch,0.018443,-0.245489,-0.179191,0.414838,1.0,0.216225,0.041064,0.783111,-0.322835,-0.583398,-0.117587
Fare,-0.5495,-0.182333,0.091566,0.159651,0.216225,1.0,-0.227015,0.217138,0.014665,-0.271832,-0.013273
Embarked,0.166223,0.111894,-0.033737,0.069444,0.041064,-0.227015,1.0,0.067977,-0.037143,0.061111,0.001228
FamilySize,0.065997,-0.200988,-0.248512,0.890712,0.783111,0.217138,0.067977,1.0,-0.381211,-0.690922,-0.202145
AgeGroup,-0.169802,0.112819,0.765504,-0.321783,-0.322835,0.014665,-0.037143,-0.381211,1.0,0.321646,0.494782
Alone,0.135207,0.303646,0.179775,-0.584471,-0.583398,-0.271832,0.061111,-0.690922,0.321646,1.0,0.01761


In [33]:
# Fit the scaler on the training data only and reuse its mean/variance for the
# test data. Fitting a second scaler on the test set would put the two sets on
# different scales, so the model would see shifted inputs at prediction time.
scaler = StandardScaler()
training_set = scaler.fit_transform(training_set)
test_set = scaler.transform(test_set)

In [25]:
# Estimator being tuned. Alternatives left disabled: LinearRegression(),
# DecisionTreeRegressor().
# NOTE(review): a regressor is fitted on the 'Survived' labels, which look like
# a 0/1 class target - confirm whether a classifier was intended.
model = RandomForestRegressor()

# Hyper-parameter values shared by both sub-grids (3 * 2 * 2 * 3 = 36 combinations).
shared_grid = {
    "n_estimators": [150, 200, 400],
    "max_features": [10, 11],
    "random_state": [1, 2],
    "max_depth": [100, 200, 300],
}

# First grid uses the estimator's default bootstrap setting; the second repeats
# the same values with bootstrapping switched off.
params = [
    shared_grid,
    {"bootstrap": [False], **shared_grid},
]

# 5-fold search scored by negative mean squared error; train scores are also
# recorded and per-fold progress is printed.
grid_search = GridSearchCV(
    model,
    params,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    verbose=3,
)

In [26]:
# Run the full search; GridSearchCV refits the best candidate on all training data.
grid_search.fit(training_set, training_set_labels)

print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
# Mean cross-validated score of the best candidate, in the units of the
# search's `scoring` metric. This is the figure to compare candidates with.
print("Best mean CV score:", grid_search.best_score_)
# Score on the same rows the estimator was refit on (the estimator's default
# metric, R^2 for a regressor). It is optimistic by construction and is kept
# only as a sanity check, not as a quality estimate.
print("Training-set score:", grid_search.best_estimator_.score(training_set, training_set_labels))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END max_depth=100, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.026, test=-0.161) total time=   0.1s
[CV 2/5] END max_depth=100, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.023, test=-0.150) total time=   0.1s
[CV 3/5] END max_depth=100, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.027, test=-0.113) total time=   0.1s
[CV 4/5] END max_depth=100, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.024, test=-0.163) total time=   0.1s
[CV 5/5] END max_depth=100, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.026, test=-0.124) total time=   0.1s
[CV 1/5] END max_depth=100, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.026, test=-0.159) total time=   0.1s
[CV 2/5] END max_depth=100, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.023, test=-0.152) total time=   0.1s
[CV 3/5

[CV 3/5] END max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.027, test=-0.113) total time=   0.1s
[CV 4/5] END max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.024, test=-0.163) total time=   0.1s
[CV 5/5] END max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.026, test=-0.124) total time=   0.1s
[CV 1/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.026, test=-0.159) total time=   0.1s
[CV 2/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.023, test=-0.152) total time=   0.1s
[CV 3/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.027, test=-0.115) total time=   0.1s
[CV 4/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.024, test=-0.165) total time=   0.1s
[CV 5/5] END max_depth=200, max_features=10, n_estimators=150, random

[CV 5/5] END max_depth=300, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.026, test=-0.124) total time=   0.1s
[CV 1/5] END max_depth=300, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.026, test=-0.159) total time=   0.1s
[CV 2/5] END max_depth=300, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.023, test=-0.152) total time=   0.1s
[CV 3/5] END max_depth=300, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.027, test=-0.115) total time=   0.1s
[CV 4/5] END max_depth=300, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.024, test=-0.165) total time=   0.1s
[CV 5/5] END max_depth=300, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.026, test=-0.120) total time=   0.1s
[CV 1/5] END max_depth=300, max_features=10, n_estimators=200, random_state=1;, score=(train=-0.026, test=-0.159) total time=   0.2s
[CV 2/5] END max_depth=300, max_features=10, n_estimators=200, random

[CV 1/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.009, test=-0.205) total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.007, test=-0.183) total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.009, test=-0.150) total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.008, test=-0.212) total time=   0.1s
[CV 5/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.009, test=-0.185) total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=200, random_state=1;, score=(train=-0.009, test=-0.210) total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=200, random_state=1;, sco

[CV 1/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.009, test=-0.211) total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.007, test=-0.182) total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.009, test=-0.150) total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.008, test=-0.211) total time=   0.1s
[CV 5/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.009, test=-0.188) total time=   0.2s
[CV 1/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=-0.009, test=-0.205) total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=2;, sco

[CV 1/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=400, random_state=2;, score=(train=-0.009, test=-0.239) total time=   0.6s
[CV 2/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=400, random_state=2;, score=(train=-0.007, test=-0.190) total time=   0.5s
[CV 3/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=400, random_state=2;, score=(train=-0.009, test=-0.171) total time=   0.6s
[CV 4/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=400, random_state=2;, score=(train=-0.008, test=-0.223) total time=   0.6s
[CV 5/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=400, random_state=2;, score=(train=-0.009, test=-0.211) total time=   0.6s
[CV 1/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=150, random_state=1;, score=(train=-0.009, test=-0.211) total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=150, random_state=1;, sco

[CV 1/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=1;, score=(train=-0.009, test=-0.240) total time=   0.6s
[CV 2/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=1;, score=(train=-0.007, test=-0.192) total time=   0.6s
[CV 3/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=1;, score=(train=-0.009, test=-0.171) total time=   0.6s
[CV 4/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=1;, score=(train=-0.008, test=-0.224) total time=   0.6s
[CV 5/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=1;, score=(train=-0.009, test=-0.211) total time=   0.6s
[CV 1/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=2;, score=(train=-0.009, test=-0.239) total time=   0.6s
[CV 2/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=2;, sco

In [27]:
predictions = grid_search.best_estimator_.predict(test_set)

# The estimator can return continuous scores. Casting those straight to int
# truncates toward zero, which turns every score below 1.0 into 0; threshold at
# 0.5 instead so scores are rounded to the nearer class.
survived = (np.asarray(predictions) >= 0.5).astype(int)

# Build the two-column submission frame directly; np.asarray drops the Series
# index so the ids are paired with predictions purely by row position.
pred = pd.DataFrame({
    'PassengerId': np.asarray(test_indices),
    'Survived': survived,
})

pred.to_csv('Submission.csv', index=False)