In [1]:
# import basic modules
import pandas as pd
import numpy as np
import timeit

# import sklearn modules
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score

# Read the preprocessed match-rounds table (UTF-16 CSV, first column used as index).
preprocessed_path_file = "../../data/rounds_preprocessed.csv"
df_matches = pd.read_csv(
    preprocessed_path_file,
    index_col=0,
    encoding='utf_16',
)

# Keep only rows whose round id is above 4: the first 4 rounds do not have
# enough history behind them.
df_matches = df_matches.loc[lambda frame: frame["rodada_id"] > 4]

df_matches.info()

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Feature matrix for classification.
# Columns deliberately left out: 'clube_id', 'placar', 'placar_adv', 'rodada_id'.
X_col_select = ['in_pos', 'in_pos_adv', 'wins', 'losses', 'draws',
                'wins_adv', 'losses_adv', 'draws_adv', 'home']
X = df_matches[X_col_select].copy()

# Min-max scale the two numeric columns one at a time (the scaler is refit for
# each column) and store each result under an 'n_'-prefixed name.
scaler = MinMaxScaler()
X = X.assign(**{'n_' + col: scaler.fit_transform(X[[col]]).ravel()
                for col in ('in_pos', 'in_pos_adv')})

# Swap the raw columns for their scaled versions, keeping the original order.
X_col_select = ['n_in_pos', 'n_in_pos_adv'] + X_col_select[2:]
X = X[X_col_select]

X.describe()

In [None]:
# Build boolean classification targets from the match result and score columns.
y_col_select = ['result', 'placar', 'placar_adv']
y = df_matches[y_col_select].copy()

# team lost the match (result code "d")
y["loss"] = y["result"].values == "d"

# team won the match (result code "v")
y["win"] = y["result"].values == "v"

# team suffered zero goals on the match
y["suffer_zero"] = y["placar_adv"] == 0

# team scored more than 2 goals on the match
y["scor2plus"] = y["placar"] > 2

# remove the original (non-boolean) columns, leaving only the targets
y = y.drop(columns=y_col_select)

# First select a model for the "win" target.
# NOTE: the scorer wraps accuracy_score, so every search that uses it reports
# accuracy (not recall).
ya = y["win"].values.ravel()
scorer = make_scorer(accuracy_score)

# DataFrame.info() prints its summary and returns None, so call it on its own
# instead of passing it to print() (which would emit a stray "None").
X.info()
print(ya.shape)


In [None]:
# Randomized search over KNN hyper-parameters for the "win" target.
# The grid has 4 * 2 * 2 = 16 combinations and n_iter=16, so every combination
# is evaluated.
est = KNeighborsClassifier()
param_dict = {"n_neighbors": [2, 5, 10, 15],
              "algorithm": ['ball_tree', 'kd_tree'],
              "weights": ['uniform', 'distance']}

rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=16, cv=50, random_state=1984)
rs.fit(X, ya)

# `scorer` wraps accuracy_score, so the reported values are accuracies;
# the labels say so instead of "recall".
print("KNN - best accuracy score = %.3f"%rs.best_score_)
print("KNN - best params = ",rs.best_params_)
# average of the per-candidate mean CV scores
print("KNN - mean accuracy score = %.3f"%rs.cv_results_['mean_test_score'].mean())

In [None]:
# Randomized search over DecisionTree hyper-parameters for the "win" target.
# The estimator is seeded: with max_features below the number of columns the
# tree samples features at each split, so an unseeded tree gives different
# scores on every run.
est = DecisionTreeClassifier(random_state=1984)
param_dict = {"max_depth": [5, 10, 20, 30],
              "min_samples_leaf": [1, 3, 5],
              "max_features": [2, 3, 5, 9]}

# 4 * 3 * 4 = 48 combinations and n_iter=48, so every combination is evaluated.
rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=48, cv=50, random_state=1984)
rs.fit(X, ya)

# `scorer` wraps accuracy_score, so the reported values are accuracies;
# the labels say so instead of "recall".
print("DecisionTree - best accuracy score = %.3f"%rs.best_score_)
print("DecisionTree - best params = ",rs.best_params_)
# average of the per-candidate mean CV scores
print("DecisionTree - mean accuracy score = %.3f"%rs.cv_results_['mean_test_score'].mean())

In [None]:
# Randomized search over RandomForest hyper-parameters for the "win" target.
# max_features and max_depth are pinned on the estimator; only the number of
# trees and the minimum leaf size are searched. The forest is seeded so that
# repeated runs produce the same scores.
est = RandomForestClassifier(max_features=9, max_depth=5, random_state=1984)
param_dict = {"n_estimators": [50, 100, 200],
              "min_samples_leaf": [1, 3, 5]}

# 3 * 3 = 9 combinations and n_iter=9, so every combination is evaluated.
rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=9, cv=50, random_state=1984)

# timeit.timeit() with no arguments benchmarks a no-op statement; it is not a
# clock. Take wall-clock readings with default_timer() around the fit and
# report end - start.
fit_start = timeit.default_timer()
rs.fit(X, ya)
fit_seconds = timeit.default_timer() - fit_start
print("RandomForest - fit ellapsed time = %.3f seconds"%fit_seconds)

# `scorer` wraps accuracy_score, so the reported values are accuracies;
# the labels say so instead of "recall".
print("RandomForest - best accuracy score = %.3f"%rs.best_score_)
print("RandomForest - best params = ",rs.best_params_)
# average of the per-candidate mean CV scores
print("RandomForest - mean accuracy score = %.3f"%rs.cv_results_['mean_test_score'].mean())

In [None]:
# Randomized search over the MultinomialNB smoothing parameter for the "win" target.
# NOTE(review): alpha=0 disables smoothing; depending on the scikit-learn
# version it is either clipped to a tiny positive value with a warning or used
# as-is - confirm this candidate is intended.
est = MultinomialNB()
param_dict = {"alpha": [0, 0.5, 1]}

# n_iter=2 samples only 2 of the 3 alpha candidates.
rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=2, cv=5, random_state=1984)

# timeit.timeit() with no arguments benchmarks a no-op statement; it is not a
# clock. Take wall-clock readings with default_timer() around the fit and
# report end - start.
fit_start = timeit.default_timer()
rs.fit(X, ya)
fit_seconds = timeit.default_timer() - fit_start
print("Naive Bayes - fit ellapsed time = %.3f seconds"%fit_seconds)

# `scorer` wraps accuracy_score, so the reported values are accuracies;
# the labels say so instead of "recall".
print("Naive Bayes - best accuracy score = %.3f"%rs.best_score_)
print("Naive Bayes - best params = ",rs.best_params_)
# average of the per-candidate mean CV scores
print("Naive Bayes - mean accuracy score = %.3f"%rs.cv_results_['mean_test_score'].mean())

In [None]:
# Randomized search over the LogisticRegression regularisation parameter C
# for the "win" target. 4 candidates and n_iter=4, so every value is evaluated.
est = LogisticRegression()
param_dict = {"C": [.1, 1, 5, 10]}

rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=4, cv=50, random_state=1984)

# timeit.timeit() with no arguments benchmarks a no-op statement; it is not a
# clock. Take wall-clock readings with default_timer() around the fit and
# report end - start.
fit_start = timeit.default_timer()
rs.fit(X, ya)
fit_seconds = timeit.default_timer() - fit_start
print("LogisticRegression - fit ellapsed time = %.3f seconds"%fit_seconds)

# `scorer` wraps accuracy_score, so the reported values are accuracies;
# the labels say so instead of "recall".
print("LogisticRegression - best accuracy score = %.3f"%rs.best_score_)
print("LogisticRegression - best params = ",rs.best_params_)
# average of the per-candidate mean CV scores
print("LogisticRegression - mean accuracy score = %.3f"%rs.cv_results_['mean_test_score'].mean())