In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import os
import sys
# Extend the import search path with the directory two levels above the
# current working directory (os.path.abspath('') resolves to the cwd).
# NOTE(review): presumably the kernel runs from this notebook's folder and
# the appended directory is the project root that holds `src` — confirm.
sys.path.append(os.path.join(
  os.path.abspath(''), '..', '..')
)

# Project-local import, placed after the sys.path tweak so that `src` can be
# resolved from the directory appended above.
from src.preprocess import Preprocess

import warnings
# Suppresses every warning category for the rest of the session, including
# deprecation and convergence warnings that could otherwise flag real issues.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw train/test tables; the first CSV column is used as the index.
x_train: pd.DataFrame = pd.read_csv('../../data/train.csv', index_col=0)
x_test: pd.DataFrame = pd.read_csv('../../data/test.csv', index_col=0)

# Pull the target out before x_train is replaced by its preprocessed version.
# NOTE(review): x_train still carries the 'Survived' column when it is handed
# to Preprocess; presumably Preprocess drops it — confirm, otherwise the label
# leaks into the features.
y_train: pd.Series = x_train['Survived']

# Preprocess both splits together, standardising with StandardScaler.
x_train, x_test = (
  Preprocess(scaler=StandardScaler())
  .apply(x_train, x_test)
)

# Candidate tree depths 1 through 9.
params = {'max_depth': list(range(1, 10))}

# 5-fold grid search over max_depth, scored by balanced accuracy. Train
# scores are kept as well so train and validation curves can be compared.
base_rf = RandomForestClassifier(random_state=3)
rf = GridSearchCV(
  estimator=base_rf,
  param_grid=params,
  cv=5,
  scoring='balanced_accuracy',
  return_train_score=True,
  n_jobs=-1,
)
rf.fit(x_train, y_train)

# Reduce the CV report to depth, mean scores and rank, then persist it.
results = pd.DataFrame(rf.cv_results_)[[
  'param_max_depth', 'mean_test_score',
  'rank_test_score', 'mean_train_score',
]]
results.to_csv('rf_results.csv', index=False)

In [None]:
# Draw the cross-validated ("test") curve first, then the training curve,
# both against the searched max_depth values. A widening gap between the two
# indicates overfitting at larger depths.
for score_column, curve_label in (
  ('mean_test_score', 'test'),
  ('mean_train_score', 'train'),
):
  plt.plot(
    results['param_max_depth'],
    results[score_column],
    label=curve_label,
  )

plt.title('Max Depth Overfit Analysis')
plt.xlabel('Max Depth')
plt.ylabel('Balanced Accuracy')
plt.legend()

# Written next to the notebook as a vector image.
plt.savefig('depth.svg', format='svg')