In [1]:
import os
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline
%load_ext autoreload
%autoreload 2

### 1a. Train, Test Split

In [2]:
csv_path = '../dataset/topagent_dataset/100_games.csv'
df = pd.read_csv(csv_path)
df.shape

(799600, 7)

In [3]:
df.head()

Unnamed: 0,round_num,machine_id,agent_id,n_pulls_self,n_success_self,n_pulls_opp,payout
0,1,69,0,1,0,0,0.1746
1,1,69,1,0,0,1,0.1746
2,1,94,1,1,0,0,0.0
3,1,94,0,0,0,1,0.0
4,2,63,0,1,1,0,0.7469


In [4]:
list(df)

['round_num',
 'machine_id',
 'agent_id',
 'n_pulls_self',
 'n_success_self',
 'n_pulls_opp',
 'payout']

In [5]:
predictor_cols = ['round_num', 'n_pulls_self', 'n_success_self', 'n_pulls_opp']
target_col = 'payout'

In [6]:
df = df.sample(frac = 1, random_state = 0).reset_index(drop = True)

In [7]:
df.head()

Unnamed: 0,round_num,machine_id,agent_id,n_pulls_self,n_success_self,n_pulls_opp,payout
0,1242,38,1,10,5,13,0.282895
1,832,26,1,16,8,14,0.304765
2,141,47,1,6,4,8,0.574496
3,1614,48,0,29,17,18,0.217427
4,648,61,0,2,1,2,0.230176


In [8]:
X = df[predictor_cols]
y = df[target_col]

In [9]:
X.shape, y.shape

((799600, 4), (799600,))

### 1b. Feature Engineering

In [None]:
# round_num: group into class
# consecutive k flag

### 2a. Grid Search Training
- Sample Code on grid search:
```
model = RandomForestRegressor()
grid = GridSearchCV(
    model, params, cv = 5, 
    scoring = wrapped_r2_score)
grid.best_params_
grid.best_score_
```

- Sample code on timing train time
```
%%time
final_rfr = RandomForestRegressor(n_jobs = 10)
final_rfr.fit(X.head(100000), y.head(100000))
```

In [11]:
sub_df = df.sample(n = 100000, random_state = 0).reset_index(drop = True)
df.shape, sub_df.shape

((799600, 7), (100000, 7))

In [12]:
sub_X = sub_df[predictor_cols]
sub_y = sub_df[target_col]

In [13]:
rfr_params = {
    'criterion': ['mse', 'mae'],
    'max_depth': [3, 6, 9],
    'min_samples_leaf': [2, 0.05, 0.1]
}

In [14]:
def wrapped_r2_score(estimator, X, y):
    y_pred = estimator.predict(X)
    score = r2_score(y, y_pred)
    return score

In [15]:
model = RandomForestRegressor()
grid = GridSearchCV(
    model, rfr_params, cv = 4,
    n_jobs = 10, scoring = wrapped_r2_score)

In [16]:
grid.fit(sub_X, sub_y)

GridSearchCV(cv=4, estimator=RandomForestRegressor(), n_jobs=10,
             param_grid={'criterion': ['mse', 'mae'], 'max_depth': [3, 6, 9],
                         'min_samples_leaf': [2, 0.05, 0.1]},
             scoring=<function wrapped_r2_score at 0x7f0ea19f00e0>)

In [22]:
pd.DataFrame(grid.cv_results_).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,4.065744,0.48621,0.089635,0.002796,mse,3,2.0,"{'criterion': 'mse', 'max_depth': 3, 'min_samp...",0.603521,0.613286,0.610812,0.613893,0.610378,0.004124,5
1,3.847039,0.436526,0.086831,0.007173,mse,3,0.05,"{'criterion': 'mse', 'max_depth': 3, 'min_samp...",0.596575,0.610304,0.604836,0.608504,0.605055,0.005277,9
2,3.283388,0.195777,0.079746,0.000559,mse,3,0.1,"{'criterion': 'mse', 'max_depth': 3, 'min_samp...",0.585994,0.598276,0.594643,0.59764,0.594138,0.004898,13
3,7.260632,0.146728,0.141156,0.012022,mse,6,2.0,"{'criterion': 'mse', 'max_depth': 6, 'min_samp...",0.65676,0.665204,0.656502,0.665516,0.660996,0.004367,3
4,4.579074,0.18341,0.107166,0.001819,mse,6,0.05,"{'criterion': 'mse', 'max_depth': 6, 'min_samp...",0.598243,0.611226,0.605757,0.609936,0.606291,0.005067,8


In [18]:
grid.best_params_

{'criterion': 'mse', 'max_depth': 9, 'min_samples_leaf': 2}

In [29]:
grid.best_score_

0.689596035503381

In [None]:
# train avg scores v.s. test avg scores

### 2b. Second Times of Grid Search with Higher `max_depth`

In [23]:
sec_rfr_params = {
    'criterion': ['mse'],
    'max_depth': [9, 12, 15, 17],
    'min_samples_leaf': [2, 5]
}

In [24]:
model = RandomForestRegressor()
sec_grid = GridSearchCV(
    model, sec_rfr_params, cv = 4,
    n_jobs = 10, scoring = wrapped_r2_score)

In [25]:
sec_grid.fit(sub_X, sub_y)

GridSearchCV(cv=4, estimator=RandomForestRegressor(), n_jobs=10,
             param_grid={'criterion': ['mse'], 'max_depth': [9, 12, 15, 17],
                         'min_samples_leaf': [2, 5]},
             scoring=<function wrapped_r2_score at 0x7f0ea19f00e0>)

In [28]:
sec_grid.best_params_

{'criterion': 'mse', 'max_depth': 12, 'min_samples_leaf': 5}

In [30]:
sec_grid.best_score_

0.6936235012673624

### Condition: Cross Validation on Full Dataset with Best Param
- Best parameters found
```
First Search: {'criterion': 'mse', 'max_depth': 9, 'min_samples_leaf': 2}
Second Search: {'criterion': 'mse', 'max_depth': 12, 'min_samples_leaf': 5}
```

### 3. Retrain with Best Param and Save Models

In [19]:
best_params = grid.best_params_

In [20]:
final_rfr = RandomForestRegressor(**best_params)
final_rfr.fit(X, y)

RandomForestRegressor(max_depth=9, min_samples_leaf=2)

In [21]:
joblib.dump(final_rfr, '100games_rfr.joblib')

['100games_rfr.joblib']

### NOT USED

In [32]:
csv_path = '../dataset/topagent_dataset/100_games.csv'
df = pd.read_csv(csv_path)
df.shape

(799600, 7)

In [34]:
df.head(800).tail(6)

Unnamed: 0,round_num,machine_id,agent_id,n_pulls_self,n_success_self,n_pulls_opp,payout
794,199,59,1,1,0,0,0.6402
795,199,59,0,0,0,1,0.6402
796,200,15,0,13,10,4,0.500494
797,200,15,1,4,2,13,0.500494
798,200,34,1,1,1,0,0.1261
799,200,34,0,0,0,1,0.1261
