In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load
#from google.colab import files

In [None]:
def clean(raw):
    """Build the model's feature frame from a raw passenger frame.

    The input frame is never modified; all work happens on a copy, so the
    function can safely be called more than once on the same frame.

    Steps, in order:
      * drop the 'Name', 'Ticket' and 'Cabin' columns;
      * fill missing 'Embarked' values with the column's first mode;
      * replace 'Fare' with its natural log, using 0 for any fare that is
        not strictly positive (missing fares also become 0, because
        ``NaN > 0`` is False);
      * one-hot encode 'Sex' (columns named after the raw values),
        'Embarked' (prefix 'Embarked') and 'Pclass' (prefix 'Pclass');
      * drop 'Embarked_Q', 'SibSp', 'Age', 'Parch' and 'Pclass_2'.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the columns 'Name', 'Ticket', 'Cabin',
        'Age', 'Embarked', 'Fare', 'Sex', 'Pclass', 'SibSp' and 'Parch'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``raw`` holding the transformed
        'Fare' column followed by the retained dummy columns.

    Raises
    ------
    KeyError
        If a required column is missing, or if the data produce no
        'Embarked_Q' or 'Pclass_2' dummy column to drop.
    """
    # drop() without inplace returns a new frame, so every assignment below
    # touches only the local copy and never the caller's data.
    frame = raw.drop(columns=['Name', 'Ticket', 'Cabin'])

    # 'Age' is discarded at the end, so imputing it would be wasted work.
    frame['Embarked'] = frame['Embarked'].fillna(frame['Embarked'].mode()[0])

    frame['Fare'] = frame['Fare'].map(lambda i: np.log(i) if i > 0 else 0)

    # Replace each categorical column with its dummy columns, appended on the
    # right. prefix=None keeps the raw category values as column names.
    encodings = (('Sex', None), ('Embarked', 'Embarked'), ('Pclass', 'Pclass'))
    for column, prefix in encodings:
        dummies = pd.get_dummies(frame[column], prefix=prefix)
        frame = pd.concat([frame.drop(columns=column), dummies], axis=1)

    return frame.drop(
        columns=['Embarked_Q', 'SibSp', 'Age', 'Parch', 'Pclass_2'])

In [None]:
# Load both splits; the first CSV column becomes the index of each frame.
x_train: pd.DataFrame = pd.read_csv('../data/train.csv', index_col=0)
x_test: pd.DataFrame = pd.read_csv('../data/test.csv', index_col=0)

# Split the target off the training features (pop removes the column).
y_train: pd.Series = x_train.pop('Survived')

x_train = clean(x_train)
x_test = clean(x_test)

# Candidate tree depths 1..9 for the grid search.
params = {'max_depth': list(range(1, 10))}

base_rf = RandomForestClassifier(random_state=3)
rf = GridSearchCV(
    estimator=base_rf,
    param_grid=params,
    scoring='balanced_accuracy',
    n_jobs=-1,
    cv=5,
    return_train_score=True,  # needed for the train curve in the depth plot
)
rf.fit(x_train, y_train)

# Keep only the columns used by the overfit analysis and persist them.
summary_columns = [
    'param_max_depth',
    'mean_test_score',
    'rank_test_score',
    'mean_train_score',
]
results = pd.DataFrame(rf.cv_results_)[summary_columns]
results.to_csv('rf_results.csv', index=False)
#files.download('rf_results.csv')

In [None]:
# Plot cross-validated test and train scores against tree depth; a widening
# gap between the two curves indicates overfitting.
depths = results['param_max_depth']
for score_column, curve_label in (
    ('mean_test_score', 'test'),
    ('mean_train_score', 'train'),
):
    plt.plot(depths, results[score_column], label=curve_label)

plt.xlabel('Max Depth')
plt.ylabel('Balanced Accuracy')
plt.title('Max Depth Overfit Analysis')
plt.legend()
plt.savefig('depth.svg', format='svg')
#files.download('depth.svg')

In [None]:
# Train the final model with a fixed depth of 2 (presumably chosen from the
# depth plot -- confirm) and predict with it. `rf` is the GridSearchCV
# object: calling rf.fit here would re-run the whole search and
# rf.predict would use the search's best estimator, ignoring max_depth=2.
base_rf = RandomForestClassifier(random_state=3, max_depth=2)
base_rf.fit(x_train, y_train)

# One row per test passenger; the test frame's index supplies PassengerId.
predictions = pd.DataFrame(
    {
        'Survived': base_rf.predict(x_test),
        'PassengerId': x_test.index,
    }
)
predictions.to_csv('rf_output.csv', index=False)
# files.download('rf_output.csv')