## Librerias

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

## Importamos el dataframe

In [4]:
df = pd.read_csv('C:/Users/Ramiro/Desktop/Digital House/Desafios/HLTB Scrapper/DB Input Modelo.csv', sep ='|')
df = df.drop(columns=['Unnamed: 0'])

In [5]:
df.head()

Unnamed: 0,Id,Titulo,Main Story,Main + Extras,Completionist,All Styles,Rating,console_IOS,console_GAME BOY,console_ONLIVE,...,pub_MUMBOJUMBO,pub_VIVA MEDIA,pub_GT INTERACTIVE,pub_KEMCO,pub_DAGESTAN TECHNOLOGY,pub_BLACK SHELL MEDIA,pub_SLITHERINE,pub_SUNSOFT,pub_DEGICA,pub_EIDOS INTERACTIVE
0,1,688(I) Hunter/Killer,9.5,24.0,50.5,15.5,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Beyond Good & Evil 2,,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10,Ico and Shadow of the Colossus Collection,,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,Shadow of the Comet,,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,18,'Splosion Man,8.0,10.0,18.0,9.5,76.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#Seteamos el indice 
df = df.set_index(df['Id'])
df = df.drop(columns='Id')

# XGBoost

In [7]:
#Elimino las observaciones que no tienen completa la variable objetivo
df = df.dropna(subset = ['Rating'])

In [8]:
#Separo la matriz de features y la variable objetivo
X =  df.drop(['Titulo','Rating'], axis = 1)
y = df['Rating']

#Separo en Train y Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#Instancio el Clasificador de XGBoost
xg_reg = xgb.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear',\
                           booster='gbtree', n_jobs=-1)

In [9]:
%%time
#Fiteo el modelo
xg_reg.fit(X_train,y_train)

#Predicciones
preds = xg_reg.predict(X_test)

Wall time: 17.5 s


In [10]:
#Score
xg_reg.score(X_test, y_test)

0.2075300338990297

### Como podemos mejorar el score?

# XGBoost + GridSearch

In [None]:
# Definimos una lista de hiperparametros a tunear
l2_reg_range = [3,5,7,10]
profundidad = [2,4,6,8,10]
learn_rate = [0.01,0.03,0.1]

In [None]:
#Generamos la grilla para GridSearch
param_grid = dict(reg_lambda = l2_reg_range, max_depth=profundidad, learning_rate = learn_rate)
print(param_grid)

In [None]:
%%time
# Instanciamos el modelo de XGBoost con los parametros de GridSearch y un cross validation de 5 folds
xgb_regressor = xgb.XGBRegressor(n_estimators=100, silent=True, objective='reg:linear',booster='gbtree')
xgb_gs = GridSearchCV(xgb_regressor, param_grid, cv = 5)
xgb_gs.fit(X_train, y_train)

In [None]:
#Vemos los parametros del mejor modelo
xgb_gs.best_estimator_, xgb_gs.best_score_, xgb_gs.best_params_

## Utilizamos el mejor modelo para entrenar

# Mejor estimador??

In [12]:
xgb_best = xgb.XGBRegressor(n_estimators=100, silent=True, objective='reg:linear',booster='gbtree', n_jobs=-1, \
                           learning_rate = 0.1, max_depth=6,reg_lambda=10)
xgb_best.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=10, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [13]:
xgb_best.score(X_test, y_test)

0.22414392575899822

## CatBoost

In [36]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [41]:
#Separo la matriz de features y la variable objetivo
X_cb =  df.drop(['Titulo','Rating','Developers','Publishers','NA','EU','JP','Updated'], axis = 1)
y_cb = df['Rating'].astype(int)

In [43]:
#Separo en Train y Test
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cb, y_cb, test_size=0.25, random_state=1)

In [44]:
cb_model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=3, l2_leaf_reg=7, loss_function='RMSE')

In [45]:
cb_model.fit(X_train_cb, y_train_cb)

0:	learn: 59.5222746	total: 16.1ms	remaining: 1.59s
1:	learn: 54.1467145	total: 28.4ms	remaining: 1.39s
2:	learn: 49.3646450	total: 41.8ms	remaining: 1.35s
3:	learn: 45.1198757	total: 53.8ms	remaining: 1.29s
4:	learn: 41.3022939	total: 67.9ms	remaining: 1.29s
5:	learn: 37.9101091	total: 82.4ms	remaining: 1.29s
6:	learn: 34.9292373	total: 95ms	remaining: 1.26s
7:	learn: 32.3601524	total: 107ms	remaining: 1.23s
8:	learn: 30.0935479	total: 119ms	remaining: 1.21s
9:	learn: 28.0583158	total: 134ms	remaining: 1.2s
10:	learn: 26.3177230	total: 147ms	remaining: 1.19s
11:	learn: 24.7952239	total: 163ms	remaining: 1.2s
12:	learn: 23.5436691	total: 175ms	remaining: 1.17s
13:	learn: 22.4059498	total: 188ms	remaining: 1.16s
14:	learn: 21.4457824	total: 201ms	remaining: 1.14s
15:	learn: 20.6217964	total: 217ms	remaining: 1.14s
16:	learn: 19.9150810	total: 234ms	remaining: 1.14s
17:	learn: 19.3365551	total: 249ms	remaining: 1.13s
18:	learn: 18.8524540	total: 263ms	remaining: 1.12s
19:	learn: 18.44848

<catboost.core.CatBoostRegressor at 0xbac6147160>

In [46]:
#Predecimos sobre el set de testeo
cb_preds = cb_model.predict(X_test_cb, y_test_cb)

TypeError: cannot convert the series to <class 'int'>