In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# variável usada para remover warnings do jupyter notebook
import warnings
warnings.filterwarnings('ignore')

# leitura e transformação do arquivo em dataframe
dados = pd.read_csv('dadosbinariosliteratura.csv')
#dados = dados.drop_duplicates()

dados = dados.drop(columns=['id', 'Unnamed: 0'])
dados['winner'] = dados['winner'].apply(lambda item: 1 if item == 'radiant' else 0)
dados = dados.astype(int)
dados

Unnamed: 0,winner,duration,hero1,hero2,hero3,hero4,hero5,hero6,hero7,hero8,...,hero120,hero121,hero123,hero126,hero128,hero129,hero135,hero136,hero137,hero138
0,1,2309,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1228,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1051,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2518,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1563,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,1,1839,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1383,1,2340,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1384,1,1316,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1385,0,1153,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
import mlflow
import mlflow.sklearn
import os

# função auxiliar para criar um experimento no mlflow
def criarExperimento():
    # variáveis de conexão com o mlflow
    os.environ['MLFLOW_TRACKING_URI'] = ""
    os.environ['MLFLOW_TRACKING_USERNAME'] = ""
    os.environ['MLFLOW_TRACKING_PASSWORD'] = ""
    
    mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
    mlflow.set_experiment(experiment_name='Resultados Literatura')

    tags = {
            "Projeto": "Projeto de Engenharia de Aprendizado de Máquina",
            "team": "",
            "dataset": "dota2teamsprofessionals"
           }

In [3]:
# função auxiliar que salva os dados no MLFlow
def modelosMLFlow(acuracia, modelo, nomeModelo):
    with  mlflow.start_run(run_name=nomeModelo):
        # registro das métricas
        mlflow.log_metric("Acurácia", acuracia)
        
        # registro do modelo
        mlflow.sklearn.log_model(modelo, "Modelo")
        mlflow.end_run()

In [4]:
# import utilizado para realizar a separação do treino e teste
from sklearn.model_selection import train_test_split

# função auxiliar que realiza a separação do dataset em treino e teste
def separacaoModelo(dataset, target):
    X = dataset.drop(target, axis=1)
    Y = dataset[target]
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.2, random_state=42)
    
    return xTrain, xTest, yTrain, yTest

In [5]:
# criando experimento
criarExperimento()

## Dados Binários

In [6]:
# gerando base de dados
xTrain, xTest, yTrain, yTest = separacaoModelo(dados, 'winner')

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

### Decision Tree

In [8]:
# imports necessário
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

# parâmetros que serão aleatoriezados
parameters = {'criterion': ['entropy', 'gini'], 'max_depth': [None, 50, 100], 'min_samples_split': [2, 4, 6],
              'max_features': ['auto', 'sqrt', 'log2']}

# classificador usado
dt = DecisionTreeClassifier(random_state=42)

# geração do gridsearch
clf = GridSearchCV(estimator=dt, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [9]:
# melhores parâmetros
clf.best_params_

{'criterion': 'entropy',
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_split': 4}

In [10]:
# melhor score
clf.best_score_

0.49692192192192197

In [11]:
modelosMLFlow(clf.best_score_, dt, "Decision Tree Binário")

In [50]:
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=50, min_samples_split=4)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.6151079136690647


### Logistic Regression

In [12]:
# parâmetros que serão aleatoriezados
parameters = {'penalty': ['l1', 'l2', 'elasticnet'], 'tol': [0.0001, 0.001, 0.01, 0.0004, 0.004]}

# classificador usado
lr = LogisticRegression()

# geração do gridsearch
clf = GridSearchCV(estimator=lr, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [13]:
# melhores parâmetros
clf.best_params_

{'penalty': 'l2', 'tol': 0.0001}

In [14]:
# melhor score
clf.best_score_

0.4904904904904905

In [15]:
modelosMLFlow(clf.best_score_, lr, "Logistic Regression Binário")

In [51]:
dt = LogisticRegression(penalty='l2', tol=0.0001)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.6115107913669064


### KNN

In [16]:
# parâmetros que serão aleatoriezados
parameters = {'n_neighbors': [5, 10, 15, 20], 'algorithm': ['auto', 'brute', 'kd_tree', 'ball_tree'], 
              'metric': ['cosine', 'euclidean', 'manhattan', 'minkowski']}

# classificador usado
knn = KNeighborsClassifier()

# geração do gridsearch
clf = GridSearchCV(estimator=knn, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [17]:
# melhores parâmetros
clf.best_params_

{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 5}

In [18]:
# melhor score
clf.best_score_

0.5039289289289288

In [19]:
modelosMLFlow(clf.best_score_, knn, "KNN Binário")

In [52]:
dt = KNeighborsClassifier(algorithm='auto', metric='manhattan', n_neighbors=5)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.5683453237410072


### Naive Bayes

In [20]:
# parâmetros que serão aleatoriezados
parameters = {'priors': [None, [0.1,]*10],
            'var_smoothing': [1e-9, 1e-6, 1e-12]}

# classificador usado
nb = GaussianNB()

# geração do gridsearch
clf = GridSearchCV(estimator=nb, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [21]:
# melhores parâmetros
clf.best_params_

{'priors': None, 'var_smoothing': 1e-12}

In [22]:
# melhor score
clf.best_score_

0.5464214214214215

In [23]:
modelosMLFlow(clf.best_score_, nb, "Naive Bayes Binário")

In [53]:
dt = GaussianNB(priors=None, var_smoothing=1e-12)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.564748201438849


### Random Forest

In [24]:
# parâmetros que serão aleatoriezados
parameters = {'n_estimators': [50, 100, 200], 'criterion': ['entropy', 'gini'],
              'max_depth': [None, 50, 100], 'min_samples_split': [2, 4, 6],
              'max_features': ['auto', 'sqrt', 'log2'], 'bootstrap': [True, False]}

# classificador usado
rf = RandomForestClassifier(random_state=42)

# geração do gridsearch
clf = GridSearchCV(estimator=rf, param_grid=parameters, cv=10)
clf.fit(xTrain, yTrain)

In [25]:
# melhores parâmetros
clf.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'min_samples_split': 6,
 'n_estimators': 200}

In [26]:
# melhor score
clf.best_score_

0.5031203931203931

In [27]:
modelosMLFlow(clf.best_score_, rf, "Random Forest Binário")

In [54]:
rf = RandomForestClassifier(random_state=42, bootstrap=False, criterion='entropy', max_depth=None, max_features='log2', min_samples_split=6, n_estimators=200)
rf.fit(xTrain, yTrain)
pred = rf.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.6187050359712231


### XGBoost

In [9]:
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

# parâmetros que serão aleatoriezados
parameters = {'max_depth': [6, 8, 10], 'alpha': [10, 20, 30], 'learning_rate': [0.6, 0.8, 1.0]}

# classificador usado
xgb = XGBClassifier()

# geração do gridsearch
clf = GridSearchCV(estimator=xgb, param_grid=parameters, cv=10)
clf.fit(xTrain, yTrain)

In [10]:
# melhores parâmetros
clf.best_params_

{'alpha': 20, 'learning_rate': 0.6, 'max_depth': 6}

In [11]:
# melhor score
clf.best_score_

0.5076658476658477

In [12]:
modelosMLFlow(clf.best_score_, xgb, "XGBoost Binário")

In [13]:
from sklearn.metrics import accuracy_score

xgb = XGBClassifier(alpha=20, learning_rate=0.6, max_depth=6)
xgb.fit(xTrain, yTrain)
pred = xgb.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.5323741007194245


## Dados Binários e Tempo de Partida

In [19]:
dados = dados.drop(columns=['duration'])
dados

Unnamed: 0,winner,hero1,hero2,hero3,hero4,hero5,hero6,hero7,hero8,hero9,...,hero120,hero121,hero123,hero126,hero128,hero129,hero135,hero136,hero137,hero138
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1383,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1384,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1385,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
xTrain, xTest, yTrain, yTest = separacaoModelo(dados, 'winner')

### Decision Tree

In [30]:
# imports necessário
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

# parâmetros que serão aleatoriezados
parameters = {'criterion': ['entropy', 'gini'], 'max_depth': [None, 50, 100], 'min_samples_split': [2, 4, 6],
              'max_features': ['auto', 'sqrt', 'log2']}

# classificador usado
dt = DecisionTreeClassifier(random_state=42)

# geração do gridsearch
clf = GridSearchCV(estimator=dt, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [31]:
# melhores parâmetros
clf.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'min_samples_split': 2}

In [32]:
# melhor score
clf.best_score_

0.5355605605605605

In [33]:
modelosMLFlow(clf.best_score_, dt, "Decision Tree Tempo Partida")

In [55]:
dt = DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=None, max_features='log2', min_samples_split=2)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.6151079136690647


### Logistic Regression

In [34]:
# parâmetros que serão aleatoriezados
parameters = {'penalty': ['l1', 'l2', 'elasticnet'], 'tol': [0.0001, 0.001, 0.01, 0.0004, 0.004]}

# classificador usado
lr = LogisticRegression()

# geração do gridsearch
clf = GridSearchCV(estimator=lr, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [35]:
# melhores parâmetros
clf.best_params_

{'penalty': 'l2', 'tol': 0.0001}

In [36]:
# melhor score
clf.best_score_

0.5067317317317317

In [37]:
modelosMLFlow(clf.best_score_, lr, "Logistic Regression Tempo Partida")

In [56]:
dt = LogisticRegression(penalty='l2', tol=0.0001)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.6115107913669064


### KNN

In [38]:
# parâmetros que serão aleatoriezados
parameters = {'n_neighbors': [5, 10, 15, 20], 'algorithm': ['auto', 'brute', 'kd_tree', 'ball_tree'], 
              'metric': ['cosine', 'euclidean', 'manhattan', 'minkowski']}

# classificador usado
knn = KNeighborsClassifier()

# geração do gridsearch
clf = GridSearchCV(estimator=knn, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [39]:
# melhores parâmetros
clf.best_params_

{'algorithm': 'auto', 'metric': 'cosine', 'n_neighbors': 20}

In [40]:
# melhor score
clf.best_score_

0.5400900900900901

In [41]:
modelosMLFlow(clf.best_score_, knn, "KNN Tempo Partida")

In [57]:
dt = KNeighborsClassifier(algorithm='auto', metric='cosine', n_neighbors=20)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.5719424460431655


### Naive Bayes

In [42]:
# parâmetros que serão aleatoriezados
parameters = {'priors': [None, [0.1,]*10],
            'var_smoothing': [1e-9, 1e-6, 1e-12]}

# classificador usado
nb = GaussianNB()

# geração do gridsearch
clf = GridSearchCV(estimator=nb, param_grid=parameters, cv=30)
clf.fit(xTrain, yTrain)

In [43]:
# melhores parâmetros
clf.best_params_

{'priors': None, 'var_smoothing': 1e-06}

In [44]:
# melhor score
clf.best_score_

0.5518268268268269

In [45]:
modelosMLFlow(clf.best_score_, nb, "Naive Bayes Tempo Partida")

In [58]:
dt = GaussianNB(priors=None, var_smoothing=1e-06)
dt.fit(xTrain, yTrain)
pred = dt.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.5863309352517986


### Random Forest

In [46]:
# parâmetros que serão aleatoriezados
parameters = {'n_estimators': [50, 100, 200], 'criterion': ['entropy', 'gini'],
              'max_depth': [None, 50, 100], 'min_samples_split': [2, 4, 6],
              'max_features': ['auto', 'sqrt', 'log2'], 'bootstrap': [True, False]}

# classificador usado
rf = RandomForestClassifier(random_state=42)

# geração do gridsearch
clf = GridSearchCV(estimator=rf, param_grid=parameters, cv=10)
clf.fit(xTrain, yTrain)

In [47]:
# melhores parâmetros
clf.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_split': 2,
 'n_estimators': 100}

In [48]:
# melhor score
clf.best_score_

0.5184848484848484

In [49]:
modelosMLFlow(clf.best_score_, rf, "Random Forest Tempo Partida")

In [59]:
rf = RandomForestClassifier(random_state=42, bootstrap=False, criterion='gini', max_depth=None, min_samples_split=2, n_estimators=50)
rf.fit(xTrain, yTrain)
pred = rf.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.6187050359712231


### XGBoost

In [21]:
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

# parâmetros que serão aleatoriezados
parameters = {'max_depth': [6, 8, 10], 'alpha': [10, 20, 30], 'learning_rate': [0.6, 0.8, 1.0]}

# classificador usado
xgb = XGBClassifier()

# geração do gridsearch
clf = GridSearchCV(estimator=xgb, param_grid=parameters, cv=10)
clf.fit(xTrain, yTrain)

In [22]:
# melhores parâmetros
clf.best_params_

{'alpha': 10, 'learning_rate': 1.0, 'max_depth': 10}

In [23]:
# melhor score
clf.best_score_

0.5121539721539722

In [24]:
modelosMLFlow(clf.best_score_, xgb, "XGBoost Tempo Partida")

In [25]:
from sklearn.metrics import accuracy_score

xgb = XGBClassifier(alpha=10, learning_rate=1.0, max_depth=10)
xgb.fit(xTrain, yTrain)
pred = xgb.predict(xTest)
acuracia = accuracy_score(yTest, pred)
print(f'Acurácia: {acuracia}')

Acurácia: 0.5827338129496403
