In [49]:
import os
import pandas as pd
import numpy as np
import re

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [50]:
def load_data(csv_path="D:\\repos\\MLPractice\\TitanicSurvivors"):
    """Load the Titanic training and test CSV files.

    Parameters
    ----------
    csv_path : str or os.PathLike, optional
        Directory that contains ``train.csv`` and ``test.csv``. The default is
        the location the notebook was originally written against, so a bare
        ``load_data()`` call keeps working; pass another directory to run the
        notebook on a different machine.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(training_set, test_set)`` as read from ``train.csv`` and
        ``test.csv`` respectively.
    """
    training_set = pd.read_csv(os.path.join(csv_path, "train.csv"))
    test_set = pd.read_csv(os.path.join(csv_path, "test.csv"))
    return training_set, test_set
    
# Read both CSV files once; later cells reassign these two names as the
# feature frames are built up.
training_set, test_set = load_data()

# Preview the first rows of the raw test data.
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [51]:
# Columns kept as model inputs, in the order the rest of the notebook expects.
# Selecting by name (instead of by position) lets the same code serve both
# frames even though the raw training frame has an extra 'Survived' column.
FEATURE_COLUMNS = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']


def _select_and_impute(raw_frame, age_fill):
    """Return the feature columns of ``raw_frame`` with missing values filled.

    Parameters
    ----------
    raw_frame : pandas.DataFrame
        Frame containing at least the columns in ``FEATURE_COLUMNS``.
    age_fill : float
        Value used to replace missing ``Age`` entries.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) where ``Age`` gaps are filled
        with ``age_fill``, every other gap is back-filled and then
        forward-filled, and ``Cabin`` is reduced to its first character.
    """
    # .copy() gives an independent frame, so the column assignments below do
    # not raise SettingWithCopyWarning or write into the raw data.
    features = raw_frame.loc[:, FEATURE_COLUMNS].copy()
    features['Age'] = features['Age'].fillna(age_fill)

    # bfill()/ffill() replace the deprecated fillna(method=...) spelling.
    # Back-fill first, then forward-fill to cover gaps at the end of the frame.
    features = features.bfill().ffill()

    # Keep only the first character of the cabin string.
    features['Cabin'] = features['Cabin'].str[0:1]
    return features


# Capture the target and the submission ids before the frames are narrowed
# down to the feature columns.
training_set_labels = training_set['Survived']
test_indices = test_set['PassengerId']

# The imputation value is learned from the training data only and reused for
# the test data, so both frames go through the same transformation.
mean_age = training_set['Age'].mean()

training_set = _select_and_impute(training_set, mean_age)
test_set = _select_and_impute(test_set, mean_age)

#training_set.head()

In [52]:
# Raw string: the original non-raw ' ([A-Za-z]+)\. ' relied on an invalid
# escape sequence, which newer Python versions warn about. The pattern itself
# is unchanged: a run of letters preceded by a space and followed by ". ".
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+)\. ')

# Titles that are collapsed into the single 'Rare' bucket.
_RARE_TITLES = frozenset([
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
])

# Alternative spellings that are folded into a more common title.
_TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}


def get_title(name):
    """Extract a normalised title from a passenger name.

    Parameters
    ----------
    name : str
        Passenger name, e.g. ``"Kelly, Mr. James"``. Non-string values (such
        as a missing value) are treated as having no title.

    Returns
    -------
    str
        ``'Rare'`` for titles listed in ``_RARE_TITLES``, the mapped value for
        titles listed in ``_TITLE_ALIASES``, the matched title itself
        otherwise, and ``""`` when no title is found.
    """
    # Guard against NaN/None so .apply() on a column with gaps does not raise.
    if not isinstance(name, str):
        return ""

    match = _TITLE_PATTERN.search(name)
    if match is None:
        return ""

    title = match.group(1)
    if title in _RARE_TITLES:
        return 'Rare'
    return _TITLE_ALIASES.get(title, title)

In [53]:
# Codes assigned to the four age bands below, in order.
values = [1, 2, 3, 4]


def _add_engineered_features(frame):
    """Return ``frame`` with the derived columns added and ``Name`` removed.

    Added columns, appended in this order:

    * ``FamilySize`` - ``SibSp + Parch + 1``.
    * ``AgeGroup``   - 1 for ``Age < 13``, 2 for ``13 <= Age < 20``,
      3 for ``20 <= Age < 60``, 4 for ``Age >= 60``; 3 when no band matches.
    * ``Alone``      - 1 when ``FamilySize == 1``, otherwise 0.
    * ``Title``      - ``get_title`` applied to ``Name``.

    The input frame is not modified; a new frame is returned. Building the
    result with ``assign`` avoids the in-place column writes that were
    previously duplicated for the training and the test frame.
    """
    age = frame['Age']
    age_bands = [
        age < 13,
        (age >= 13) & (age < 20),
        (age >= 20) & (age < 60),
        age >= 60,
    ]
    family_size = frame['SibSp'] + frame['Parch'] + 1

    enriched = frame.assign(
        FamilySize=family_size,
        AgeGroup=np.select(age_bands, values, default=3),
        Alone=np.where(family_size == 1, 1, 0),
        Title=frame['Name'].apply(get_title),
    )
    # The raw name is only needed to derive the title.
    return enriched.drop('Name', axis=1)


training_set = _add_engineered_features(training_set)
test_set = _add_engineered_features(test_set)

training_set.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,FamilySize,AgeGroup,Alone,Title
0,3,male,22.0,1,0,7.25,C,S,2,3,0,Mr
1,1,female,38.0,1,0,71.2833,C,C,2,3,0,Mrs
2,3,female,26.0,0,0,7.925,C,S,1,3,1,Miss
3,1,female,35.0,1,0,53.1,C,S,2,3,0,Mrs
4,3,male,35.0,0,0,8.05,E,S,1,3,1,Mr


In [54]:
# Text-valued feature columns that are replaced by integer codes.
CATEGORICAL_COLUMNS = ['Sex', 'Embarked', 'Cabin', 'Title']

# OrdinalEncoder is the feature-side counterpart of LabelEncoder: it learns
# one sorted category list per column (so the codes match what LabelEncoder
# produced) and keeps all four mappings in a single fitted object.
# Categories that appear only in the test data are encoded as -1 instead of
# raising, which LabelEncoder.transform would do.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder = encoder.fit(training_set[CATEGORICAL_COLUMNS])


def _encode_categoricals(frame):
    """Return a copy of ``frame`` with ``CATEGORICAL_COLUMNS`` integer-coded.

    Uses the module-level ``encoder`` fitted on the training data, so the
    training and test frames share the same category-to-code mapping.
    """
    # transform() returns floats; cast so the columns stay integer-typed.
    codes = encoder.transform(frame[CATEGORICAL_COLUMNS]).astype(int)
    replacements = {
        column: codes[:, position]
        for position, column in enumerate(CATEGORICAL_COLUMNS)
    }
    return frame.assign(**replacements)


training_set = _encode_categoricals(training_set)
test_set = _encode_categoricals(test_set)

training_set.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,FamilySize,AgeGroup,Alone,Title
0,3,1,22.0,1,0,7.25,2,2,2,3,0,2
1,1,0,38.0,1,0,71.2833,2,0,2,3,0,3
2,3,0,26.0,0,0,7.925,2,2,1,3,1,1
3,1,0,35.0,1,0,53.1,2,2,2,3,0,3
4,3,1,35.0,0,0,8.05,4,2,1,3,1,2


In [55]:
# Learn the per-column mean and standard deviation from the training features
# only, then apply that same transformation to the test features. Fitting a
# second scaler on the test set would scale the two sets differently, so equal
# raw values would reach the model as different numbers.
scaler = StandardScaler().fit(training_set)
training_set = scaler.transform(training_set)
test_set = scaler.transform(test_set)

In [59]:
# Estimator whose hyperparameters are searched below.
model = RandomForestClassifier()

# Hyperparameter values shared by both halves of the search.
# NOTE(review): random_state is searched like a hyperparameter here, which
# picks the better of two seeds rather than a better model - confirm intent.
shared_grid = {
    "n_estimators": [50, 100, 150, 200, 400],
    "max_features": [10, 11],
    "random_state": [1, 2],
    "max_depth": [100, 200, 300],
}

# First grid leaves `bootstrap` at the estimator's default; the second grid
# repeats the same combinations with bootstrapping switched off.
params = [
    dict(shared_grid),
    dict(shared_grid, bootstrap=[False]),
]

# 5-fold cross-validated search scored on accuracy; train scores are kept and
# per-fit progress is printed.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    scoring="accuracy",
    return_train_score=True,
    verbose=3,
)

In [60]:
# Run the cross-validated search and refit the best candidate on all the
# training data.
grid_search.fit(training_set, training_set_labels)
print(grid_search.best_params_)
print(grid_search.best_estimator_)
# Accuracy on the same data the model was fitted on - an optimistic number.
print(grid_search.best_estimator_.score(training_set, training_set_labels))
# Mean cross-validated accuracy of the winning candidate; this is the figure
# to use when judging how well the model generalises.
print(grid_search.best_score_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END max_depth=100, max_features=10, n_estimators=50, random_state=1;, score=(train=0.997, test=0.771) total time=   0.0s
[CV 2/5] END max_depth=100, max_features=10, n_estimators=50, random_state=1;, score=(train=0.997, test=0.809) total time=   0.0s
[CV 3/5] END max_depth=100, max_features=10, n_estimators=50, random_state=1;, score=(train=0.994, test=0.871) total time=   0.0s
[CV 4/5] END max_depth=100, max_features=10, n_estimators=50, random_state=1;, score=(train=0.996, test=0.809) total time=   0.0s
[CV 5/5] END max_depth=100, max_features=10, n_estimators=50, random_state=1;, score=(train=0.996, test=0.843) total time=   0.0s
[CV 1/5] END max_depth=100, max_features=10, n_estimators=50, random_state=2;, score=(train=0.999, test=0.782) total time=   0.0s
[CV 2/5] END max_depth=100, max_features=10, n_estimators=50, random_state=2;, score=(train=0.996, test=0.809) total time=   0.0s
[CV 3/5] END max_depth=100,

[CV 3/5] END max_depth=100, max_features=11, n_estimators=100, random_state=1;, score=(train=0.996, test=0.865) total time=   0.1s
[CV 4/5] END max_depth=100, max_features=11, n_estimators=100, random_state=1;, score=(train=0.997, test=0.803) total time=   0.1s
[CV 5/5] END max_depth=100, max_features=11, n_estimators=100, random_state=1;, score=(train=0.997, test=0.837) total time=   0.1s
[CV 1/5] END max_depth=100, max_features=11, n_estimators=100, random_state=2;, score=(train=0.999, test=0.782) total time=   0.1s
[CV 2/5] END max_depth=100, max_features=11, n_estimators=100, random_state=2;, score=(train=0.997, test=0.809) total time=   0.1s
[CV 3/5] END max_depth=100, max_features=11, n_estimators=100, random_state=2;, score=(train=0.996, test=0.893) total time=   0.1s
[CV 4/5] END max_depth=100, max_features=11, n_estimators=100, random_state=2;, score=(train=0.997, test=0.803) total time=   0.1s
[CV 5/5] END max_depth=100, max_features=11, n_estimators=100, random_state=2;, sco

[CV 1/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.999, test=0.793) total time=   0.2s
[CV 2/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.997, test=0.809) total time=   0.2s
[CV 3/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.996, test=0.871) total time=   0.2s
[CV 4/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.997, test=0.798) total time=   0.2s
[CV 5/5] END max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.997, test=0.831) total time=   0.2s
[CV 1/5] END max_depth=200, max_features=10, n_estimators=200, random_state=1;, score=(train=0.999, test=0.799) total time=   0.3s
[CV 2/5] END max_depth=200, max_features=10, n_estimators=200, random_state=1;, score=(train=0.997, test=0.809) total time=   0.2s
[CV 3/5] END max_depth=200, max_features=10, n_estimators=200, random_state=1;, sco

[CV 4/5] END max_depth=200, max_features=11, n_estimators=200, random_state=2;, score=(train=0.997, test=0.803) total time=   0.3s
[CV 5/5] END max_depth=200, max_features=11, n_estimators=200, random_state=2;, score=(train=0.997, test=0.826) total time=   0.3s
[CV 1/5] END max_depth=200, max_features=11, n_estimators=400, random_state=1;, score=(train=0.999, test=0.782) total time=   0.6s
[CV 2/5] END max_depth=200, max_features=11, n_estimators=400, random_state=1;, score=(train=0.997, test=0.815) total time=   0.6s
[CV 3/5] END max_depth=200, max_features=11, n_estimators=400, random_state=1;, score=(train=0.996, test=0.888) total time=   0.6s
[CV 4/5] END max_depth=200, max_features=11, n_estimators=400, random_state=1;, score=(train=0.997, test=0.792) total time=   0.6s
[CV 5/5] END max_depth=200, max_features=11, n_estimators=400, random_state=1;, score=(train=0.997, test=0.826) total time=   0.6s
[CV 1/5] END max_depth=200, max_features=11, n_estimators=400, random_state=2;, sco

[CV 2/5] END max_depth=300, max_features=11, n_estimators=50, random_state=1;, score=(train=0.997, test=0.809) total time=   0.0s
[CV 3/5] END max_depth=300, max_features=11, n_estimators=50, random_state=1;, score=(train=0.994, test=0.876) total time=   0.0s
[CV 4/5] END max_depth=300, max_features=11, n_estimators=50, random_state=1;, score=(train=0.994, test=0.809) total time=   0.0s
[CV 5/5] END max_depth=300, max_features=11, n_estimators=50, random_state=1;, score=(train=0.996, test=0.837) total time=   0.0s
[CV 1/5] END max_depth=300, max_features=11, n_estimators=50, random_state=2;, score=(train=0.999, test=0.788) total time=   0.0s
[CV 2/5] END max_depth=300, max_features=11, n_estimators=50, random_state=2;, score=(train=0.996, test=0.809) total time=   0.0s
[CV 3/5] END max_depth=300, max_features=11, n_estimators=50, random_state=2;, score=(train=0.994, test=0.876) total time=   0.0s
[CV 4/5] END max_depth=300, max_features=11, n_estimators=50, random_state=2;, score=(trai

[CV 4/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=100, random_state=1;, score=(train=0.997, test=0.758) total time=   0.1s
[CV 5/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=100, random_state=1;, score=(train=0.997, test=0.815) total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=100, random_state=2;, score=(train=0.999, test=0.721) total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=100, random_state=2;, score=(train=0.997, test=0.803) total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=100, random_state=2;, score=(train=0.996, test=0.831) total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=100, random_state=2;, score=(train=0.997, test=0.747) total time=   0.1s
[CV 5/5] END bootstrap=False, max_depth=100, max_features=10, n_estimators=100, random_state=2;, score=(train=0.

[CV 5/5] END bootstrap=False, max_depth=100, max_features=11, n_estimators=100, random_state=2;, score=(train=0.997, test=0.770) total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=100, max_features=11, n_estimators=150, random_state=1;, score=(train=0.999, test=0.732) total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=100, max_features=11, n_estimators=150, random_state=1;, score=(train=0.997, test=0.809) total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=100, max_features=11, n_estimators=150, random_state=1;, score=(train=0.996, test=0.837) total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=100, max_features=11, n_estimators=150, random_state=1;, score=(train=0.997, test=0.753) total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=100, max_features=11, n_estimators=150, random_state=1;, score=(train=0.997, test=0.764) total time=   0.2s
[CV 1/5] END bootstrap=False, max_depth=100, max_features=11, n_estimators=150, random_state=2;, score=(train=0.

[CV 1/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.999, test=0.732) total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.997, test=0.815) total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.996, test=0.831) total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.997, test=0.747) total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=150, random_state=2;, score=(train=0.997, test=0.815) total time=   0.2s
[CV 1/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=200, random_state=1;, score=(train=0.999, test=0.737) total time=   0.3s
[CV 2/5] END bootstrap=False, max_depth=200, max_features=10, n_estimators=200, random_state=1;, score=(train=0.

[CV 2/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=200, random_state=1;, score=(train=0.997, test=0.803) total time=   0.3s
[CV 3/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=200, random_state=1;, score=(train=0.996, test=0.837) total time=   0.3s
[CV 4/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=200, random_state=1;, score=(train=0.997, test=0.764) total time=   0.3s
[CV 5/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=200, random_state=1;, score=(train=0.997, test=0.764) total time=   0.3s
[CV 1/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=200, random_state=2;, score=(train=0.999, test=0.732) total time=   0.3s
[CV 2/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=200, random_state=2;, score=(train=0.997, test=0.809) total time=   0.3s
[CV 3/5] END bootstrap=False, max_depth=200, max_features=11, n_estimators=200, random_state=2;, score=(train=0.

[CV 3/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=200, random_state=2;, score=(train=0.996, test=0.837) total time=   0.3s
[CV 4/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=200, random_state=2;, score=(train=0.997, test=0.747) total time=   0.3s
[CV 5/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=200, random_state=2;, score=(train=0.997, test=0.809) total time=   0.3s
[CV 1/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=400, random_state=1;, score=(train=0.999, test=0.737) total time=   0.7s
[CV 2/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=400, random_state=1;, score=(train=0.997, test=0.815) total time=   1.0s
[CV 3/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=400, random_state=1;, score=(train=0.996, test=0.837) total time=   1.1s
[CV 4/5] END bootstrap=False, max_depth=300, max_features=10, n_estimators=400, random_state=1;, score=(train=0.

[CV 4/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=1;, score=(train=0.997, test=0.764) total time=   0.8s
[CV 5/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=1;, score=(train=0.997, test=0.764) total time=   0.8s
[CV 1/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=2;, score=(train=0.999, test=0.737) total time=   0.8s
[CV 2/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=2;, score=(train=0.997, test=0.809) total time=   0.7s
[CV 3/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=2;, score=(train=0.996, test=0.848) total time=   0.8s
[CV 4/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=2;, score=(train=0.997, test=0.764) total time=   0.8s
[CV 5/5] END bootstrap=False, max_depth=300, max_features=11, n_estimators=400, random_state=2;, score=(train=0.

In [61]:
# Predict with the refitted best model from the grid search.
predictions = grid_search.best_estimator_.predict(test_set)

# Build the two-column submission frame: integer predictions first, then the
# passenger ids inserted as the leading column.
pred = pd.DataFrame({'Survived': predictions}).astype("int")
pred.insert(0, 'PassengerId', test_indices)

# Write without the row index so the file contains only the two columns.
pred.to_csv('Submission.csv', index=False)