In [1]:
import pandas as pd
import numpy as np
import sys, os
import pickle
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['font.family'] = 'Malgun Gothic'

In [2]:
df = pd.read_csv("./dataset/new_bgg_data_after_eda2.csv",encoding="utf-8")
df.drop(columns = ["Rank", "Id", "Name"], inplace = True)

In [3]:
df_ = df.copy()

### Data Enginerring (continue from Modeling_1.ipynb)

In [4]:
#from previous notebook

# Year
df_v1 = df_.copy()
before = df_.shape[0]
df_v1.drop(index = df_v1[df_v1.Year < 1600].index, inplace=True)
after = df_v1.shape[0]
print (f"By year, we drop {before - after} entries.")

# MaxPlayer
df_v2 = df_v1.copy() 
before = df_v1.shape[0]
df_v2.drop(index = df_v2[df_v2.Maxplayers > 20].index, inplace=True)
after = df_v2.shape[0]
print (f"By Max Player, we drop {before - after} entries.")

# drop artist
df_v3 = df_v2.copy() 
df_v3.drop(columns="Nartist", inplace=True)

By year, we drop 202 entries.
By Max Player, we drop 244 entries.


### Enviroment setting and Dataset splitting

In [5]:
test_size = 0.2
random_state = 42
target_list_ = ["Users_rated", "Rating", "Complexity"]
unused_feature_list_ = ["Sug_players", "Sug_age", "Language_dependence", "Own"]

In [6]:
from sklearn.model_selection import train_test_split

def get_Xs_Ys(df, target = target_list_, exclude = unused_feature_list_, use_val=True):

      assert type(target) == list, "target must be list"
      assert type(exclude) == list, "exclude must be list"

      train, test = train_test_split(
                        df,
                        test_size = test_size,
                        random_state = random_state)
      if use_val:
            train, val = train_test_split(
                              train,
                              test_size = test_size,
                              random_state = random_state)


      get_y = lambda data : data[target]
      get_x = lambda data : data.drop(columns = target + exclude)

      Ys = {"train" : get_y(train),
            "test" : get_y(test)}
      Xs = {"train" : get_x(train),
            "test" : get_x(test)}
      
      if use_val:
            Ys['val'] = get_y(val)
            Xs['val'] = get_x(val)
            
      return Xs, Ys, target
        
Xs, Ys, target_list = get_Xs_Ys(df)

### Modeling for Problem 1 (continue from Modeling_1.ipynb)

4. Decision Tree

4.1. with Hyperopt

In [7]:
from hyperopt import fmin, tpe, Trials, STATUS_OK, hp
from sklearn.model_selection import cross_val_score

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

def make_dt(dt_args = {}):
    
    dt_args["max_depth"] = int(dt_args["max_depth"])
         
    return DecisionTreeRegressor(random_state = random_state, **dt_args)

def find_best_param_decision_tree(df, results=False):
    
    params = {
        "max_depth": hp.quniform("max_depth", 2, 40, 2),
        "min_samples_split" : hp.uniform("min_samples_split", 0.5, 1.0),
        "min_samples_leaf" : hp.quniform("min_samples_leaf", 1, 5, 1),
        "max_features" : hp.choice("max_features", [None, "sqrt", "log2"]),
        "min_impurity_decrease" : hp.uniform("min_impurity_decrease", 0.5, 1.0),
        "criterion" : hp.choice("criterion", ["squared_error", "absolute_error", "poisson"])
    } 
    Xs, Ys, target_list = get_Xs_Ys(df, use_val=True)
    
    best_params = []
    trials = []
    
    i = 0
    def fit_and_eval(params):
    
        dt = make_dt(dt_args = params)
        
        dt.fit(Xs['train'], Ys['train'].iloc[:, i])
        
        y_pred_val = dt.predict(Xs['val'])
        
        score = mean_squared_error(Ys['val'].iloc[:, i], y_pred_val)
        
        # score = cross_val_score(dt, Xs['train'], Ys['train'].iloc[:, i], cv=5, scoring="neg_mean_squared_error")
        # avg_cv_score = np.mean(score)
        
        return {"loss" : score, "status":STATUS_OK}
    
    for j in range(len(target_list)):

        i = j
        
        trial = (Trials())       
        
        best_param = fmin(
            fn = fit_and_eval,
            trials = trial,
            space = params,
            algo = tpe.suggest,
            max_evals = 20
        )
        best_params.append(best_param)
        trials.append(trial)
    
    if results:
        return best_params, trials

In [58]:
dataset = [df_, df_v1, df_v2, df_v3]

best_params_group = []
trials_group = []

for data in dataset:
    best_params, trials = find_best_param_decision_tree(data, results= True)
    best_params_group.append(best_params)
    trials_group.append(trials)

100%|██████████| 20/20 [00:00<00:00, 31.53trial/s, best loss: 11875945.666352432]
100%|██████████| 20/20 [00:00<00:00, 31.49trial/s, best loss: 0.8613774659743214]
100%|██████████| 20/20 [00:00<00:00, 31.88trial/s, best loss: 0.6578505151420543]
100%|██████████| 20/20 [00:00<00:00, 42.39trial/s, best loss: 13715735.045520952]
100%|██████████| 20/20 [00:00<00:00, 41.26trial/s, best loss: 0.8375320598875979]
100%|██████████| 20/20 [00:00<00:00, 27.64trial/s, best loss: 0.7095266943102468]
100%|██████████| 20/20 [00:00<00:00, 61.51trial/s, best loss: 13639480.347471006]
100%|██████████| 20/20 [00:00<00:00, 38.27trial/s, best loss: 0.8929002611159038]
100%|██████████| 20/20 [00:00<00:00, 39.48trial/s, best loss: 0.6812828201096172]
100%|██████████| 20/20 [00:00<00:00, 30.04trial/s, best loss: 13639480.347471006]
100%|██████████| 20/20 [00:00<00:00, 42.04trial/s, best loss: 0.8929002611159038]
100%|██████████| 20/20 [00:00<00:00, 30.57trial/s, best loss: 0.6812828201096172]


In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def eval_models(y_pred, y_true, verbose = False):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    if verbose:
        
        return {"MSE" : mse,
                "RMSE" : rmse,
                "MAE" : mae,
                "R2": r2}
    else:
        return mse, rmse, mae, r2

def test_eval(x_test, y_test, model, target_list):

    assert len(model) == len(target_list)

    eval_tests = []
    for i in range(len(target_list)):

        eval_test = eval_models(model[i].predict(x_test), Ys['test'][target_list[i]])
        eval_tests.append(eval_test)

    return pd.DataFrame(eval_tests, index=target_list, columns=["MSE", "RMSE", "MAE", "R2"])

In [62]:
def make_best_model(params, x_train, y_train, target_list):
    
    assert len(params) == len(target_list)
    
    models = []
    
    for i, param in tqdm(enumerate(params)):
        dtree = make_dt(dt_args = param)
        dtree.fit(x_train, y_train[target_list[i]])
        
        models.append(dtree)
    
    return models


In [50]:
Xs, Ys, _ = get_Xs_Ys(df_, use_val=False)

max_features = [None, "sqrt", "log2"]
criterion = ["squared_error", "absolute_error", "poisson"]

best_params_ = [best_params_group[0][0],
                best_params_group[1][1],
                best_params_group[0][2]]

for best_param in best_params_:
    best_param["max_features"] = max_features[best_param["max_features"]]
    best_param["criterion"] = criterion[best_param["criterion"]]

In [36]:
Xs, Ys, _ = get_Xs_Ys(df_, use_val=False)
dt_0_2 = make_best_model([best_params_group[0][0],best_params_group[0][2]],
                        Xs['train'],
                        Ys['train'],
                        ["Users_rated", "Complexity"])

t1 = test_eval(Xs['test'], Ys['test'], dt_0_2, ["Users_rated", "Complexity"])

Xs, Ys, _ = get_Xs_Ys(df_v1, use_val=False)
dt_1 = make_best_model([best_params_group[1][1]],
                        Xs['train'],
                        Ys['train'],
                        ["Rating"])

t2 = test_eval(Xs['test'], Ys['test'], dt_1, ["Rating"])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [47]:
t = pd.concat([t1, t2], axis=0)
t.index = target_list_
t

Unnamed: 0,MSE,RMSE,MAE,R2
Users_rated,8426302.0,2902.809365,1084.107741,-0.00203
Rating,0.7020936,0.83791,0.674865,-0.001991
Complexity,0.8277937,0.909832,0.712981,-4e-05


In [59]:
Xs, Ys, _ = get_Xs_Ys(df_, use_val=False)

max_features = [None, "sqrt", "log2"]
criterion = ["squared_error", "absolute_error", "poisson"]

for best_param in best_params_group[0]:
    best_param["max_features"] = max_features[best_param["max_features"]]
    best_param["criterion"] = criterion[best_param["criterion"]]
    


0it [00:00, ?it/s]

Unnamed: 0,MSE,RMSE,MAE,R2
Users_rated,8426302.0,2902.809365,1084.107741,-0.00203
Rating,0.8560904,0.925252,0.71758,-0.000357
Complexity,0.7020936,0.83791,0.674865,-0.001991


In [64]:
best_params_group[0]

[{'criterion': 'squared_error',
  'max_depth': 30,
  'max_features': 'sqrt',
  'min_impurity_decrease': 0.6918248079430422,
  'min_samples_leaf': 3.0,
  'min_samples_split': 0.7472979432859868},
 {'criterion': 'poisson',
  'max_depth': 34,
  'max_features': None,
  'min_impurity_decrease': 0.8036358995125704,
  'min_samples_leaf': 2.0,
  'min_samples_split': 0.6876976184667712},
 {'criterion': 'absolute_error',
  'max_depth': 34,
  'max_features': 'log2',
  'min_impurity_decrease': 0.8334961998941997,
  'min_samples_leaf': 2.0,
  'min_samples_split': 0.6115968092528161}]

In [63]:
dt = make_best_model(best_params_group[0], Xs['train'], Ys['train'], target_list_)

test_eval(Xs['test'], Ys['test'], dt, target_list)

0it [00:00, ?it/s]

Unnamed: 0,MSE,RMSE,MAE,R2
Users_rated,8426302.0,2902.809365,1084.107741,-0.00203
Rating,0.8560904,0.925252,0.71758,-0.000357
Complexity,0.7020936,0.83791,0.674865,-0.001991


In [55]:
Ys['test'].iloc[:,0]

19594     460
15754     103
18397      72
15591      60
3263      432
         ... 
19919     433
16580      45
15345      70
535      4115
6868       84
Name: Users_rated, Length: 4004, dtype: int64

In [60]:
dt[0].predict(Xs['test'])

array([875.56207844, 875.56207844, 875.56207844, ..., 875.56207844,
       875.56207844, 875.56207844])

In [54]:
r2_score(Ys['test'].iloc[:,0], dt[0].predict(Xs['test']))

-0.0020304570973386227

4.2 with Random Search

In [12]:
from sklearn.model_selection import RandomizedSearchCV

def find_best_param_randomly(df, results=False):
    params = {
        "max_depth": np.arange(1, 16)*10,
        "min_samples_split" : np.random.uniform(0.5, 1.0, 100),
        "min_samples_leaf" : [1,2,3,4,5],
        "max_features" : [None, "sqrt", "log2"],
        "min_impurity_decrease" : np.random.uniform(0.01, 1.0, 100),
        "criterion" : ["squared_error", "absolute_error",]
    }
    Xs, Ys, target_list = get_Xs_Ys(df, use_val=False)
    
    random_searchs = []
    
    for i in tqdm(range(len(target_list))):
        random_search = RandomizedSearchCV(DecisionTreeRegressor(random_state = random_state), params, n_iter = 20,
                                           n_jobs=-1, random_state=random_state, cv=5)
        random_search.fit(Xs['train'], Ys['train'].iloc[:, i])
        
        random_searchs.append(random_search)
        print(f"Target : {target_list[i]}")
        print("Best parameter: ", random_search.best_params_)
        print("Best R2 Score: ", random_search.best_score_)
    
    if results:
        return random_searchs

In [13]:
# dataset = [df, df_v1, df_v2, df_v3]

# for i, data in tqdm(enumerate(dataset)):
#     print(f"df_v[{i}] : " ,end="")
#     find_best_param_randomly(data)
