In [42]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder #normalizacao
from sklearn.pipeline import Pipeline #definir sequencias de transformacoes
from sklearn.compose import ColumnTransformer #aplicacao do pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

#modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

### Import do dataset e tratamento do mesmo

In [2]:
data = pd.read_csv("databaseOPGG.csv")
data["dmg_per_death"] = data["dmg_per_death"].apply(round, ndigits=1)
data.head()

Unnamed: 0,opggScore,kda,kill_participation,dmg_per_death,vision,CS_per_min,role
0,4.2,0.5,17,1072.0,9,4.5,0
1,5.7,2.8,61,2502.0,6,6.1,1
2,7.2,2.8,61,4283.4,10,8.3,2
3,5.6,0.88,30,1744.1,7,6.8,3
4,5.5,2.0,43,1656.8,38,1.2,4


### Divisão do dataset entre teste e treino

In [3]:
x_treino, x_teste, y_treino, y_teste = train_test_split(data.drop("opggScore", axis=1), data["opggScore"], test_size=0.3)

In [4]:
pipe = Pipeline([('scaler', StandardScaler())])
role_pipe = Pipeline([("one_hot", OneHotEncoder())])
num_columns = data.drop(["opggScore", "role"], axis=1).columns
num_columns

Index(['kda', 'kill_participation', 'dmg_per_death', 'vision', 'CS_per_min'], dtype='object')

In [5]:
transformer = ColumnTransformer([("num_columns", pipe, num_columns), ("num_columns_role", role_pipe, ["role"])])
x_treino_normal = transformer.fit_transform(x_treino)
x_teste_normal = transformer.transform(x_teste)

In [6]:
#linreg = LinearRegression()
#dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
#knn = KNeighborsRegressor()

In [7]:
#linreg.fit(x_treino_normal, y_treino)
#dt.fit(x_treino_normal, y_treino)
rf.fit(x_treino_normal, y_treino)
#knn.fit(x_treino_normal, y_treino)

RandomForestRegressor()

In [8]:
#pred_linreg = linreg.predict(x_teste_normal)
#pred_dt = dt.predict(x_teste_normal)
pred_rf = rf.predict(x_teste_normal)
#pred_knn = knn.predict(x_teste_normal)

In [9]:
def evaluation(y_test, y_pred):
    print('MAE: ' +str(mean_absolute_error(y_test, y_pred)))
    print('MSE: '+str( mean_squared_error(y_test, y_pred)))
    print('RMSE: ' +str(np.sqrt(mean_squared_error(y_test, y_pred))))
    print('R2: ' +str(r2_score(y_test, y_pred)))

In [10]:
#evaluation(y_teste, pred_linreg)
#evaluation(y_teste, pred_dt)
evaluation(y_teste, pred_rf)
#evaluation(y_teste, pred_knn)

MAE: 0.3288319060651541
MSE: 0.22814758173685296
RMSE: 0.4776479684211511
R2: 0.8418251555043417


### Teste de alguns parametros para a Florest

In [11]:
rf1 = RandomForestRegressor(n_estimators=200)
rf2 = RandomForestRegressor(max_features="sqrt")
rf3 = RandomForestRegressor(max_features="log2")
rf4 = RandomForestRegressor(min_samples_split=20)
rf4 = RandomForestRegressor(oob_score=True)
rf1.fit(x_treino_normal, y_treino)
pred_rf1 = rf1.predict(x_teste_normal)
evaluation(y_teste, pred_rf1)

MAE: 0.3275293546393574
MSE: 0.22650844701450812
RMSE: 0.4759290356917805
R2: 0.8429615685131517


### Guardando o modelo salvo com joblib

In [33]:
from joblib import dump, load
dump(rf1, 'filename.joblib') 
model = load('filename.joblib')

### Rodando com um CSV teste a execução

In [40]:
tt = pd.read_csv("apagar.csv")
tt["dmg_per_death"] = tt["dmg_per_death"].apply(round, ndigits=1)
x_teste_normal_novo = transformer.transform(tt)

r = model.predict(x_teste_normal_novo)
r

array([7.834 , 6.055 , 7.4875, 7.0365, 6.6875, 4.929 , 5.9835, 7.7705,
       6.891 , 7.5155])