In [2]:
# import basic modules
import pandas as pd
import numpy as np
import timeit

# import sklearn modules
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score

# load preprocessed file of match rounds
preprocessed_path_file = "../../data/rounds_preprocessed.csv"
df_matches = pd.read_csv(preprocessed_path_file, encoding='utf_16', index_col=0)

# remove matches from the first 4 rounds, there is no enough history for them
df_matches = df_matches[df_matches["rodada_id"]>4]

df_matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 472 entries, 40 to 279
Data columns (total 18 columns):
clube_id          472 non-null int64
aproveitamento    472 non-null object
in_pos            472 non-null float64
in_pos_adv        472 non-null float64
local             472 non-null object
partida_data      472 non-null object
partida_id        472 non-null int64
placar            472 non-null float64
placar_adv        472 non-null float64
rodada_id         472 non-null int64
wins              472 non-null int64
losses            472 non-null int64
draws             472 non-null int64
wins_adv          472 non-null int64
losses_adv        472 non-null int64
draws_adv         472 non-null int64
home              472 non-null int64
result            472 non-null object
dtypes: float64(4), int64(10), object(4)
memory usage: 62.7+ KB


In [3]:
# select features and targets and target for classification
# unselected features: 'clube_id', 'placar', 'placar_adv', 'rodada_id',
X_col_select = ['in_pos', 'in_pos_adv', 'wins', 'losses', 'draws', 
                'wins_adv', 'losses_adv', 'draws_adv', 'home']
X = df_matches[X_col_select].copy()


# adjust scale from numeric features
scaler = MinMaxScaler()
X['n_in_pos'] = scaler.fit_transform(X[['in_pos']])
X['n_in_pos_adv'] = scaler.fit_transform(X[['in_pos_adv']])

# remove features adjusted
X_col_select = ['n_in_pos', 'n_in_pos_adv', 'wins', 'losses', 'draws', 
                'wins_adv', 'losses_adv', 'draws_adv', 'home']
X = X[X_col_select]

X.describe()

Unnamed: 0,n_in_pos,n_in_pos_adv,wins,losses,draws,wins_adv,losses_adv,draws_adv,home
count,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0
mean,0.503011,0.503011,1.427966,1.447034,1.125,1.427966,1.447034,1.125,0.5
std,0.303447,0.303447,1.052296,1.018591,0.895406,1.052296,1.018591,0.895406,0.500531
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.263158,0.263158,1.0,1.0,0.0,1.0,1.0,0.0,0.0
50%,0.526316,0.526316,1.0,1.0,1.0,1.0,1.0,1.0,0.5
75%,0.789474,0.789474,2.0,2.0,2.0,2.0,2.0,2.0,1.0
max,1.0,1.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0


In [4]:
# create boolean targets for classification
y_col_select = ['result', 'placar', 'placar_adv']
y = df_matches[y_col_select].copy()

# team lost the match
y["loss"] = y["result"].values == "d" 

# team won the match
y["win"] = y["result"].values == "v"

# team suffered zero goals on the match
y["suffer_zero"] = y["placar_adv"] == 0

# team scored more than 2 goal on the match
y["scor2plus"] = y["placar"] > 2

# remove original values
y = y.drop(['result', 'placar', 'placar_adv'], axis=1)

# first select a model for the win target, using recall_score to minimize false positive
ya = y["win"].values.ravel()
scorer = make_scorer(accuracy_score)

print(X.info(), ya.shape)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 472 entries, 40 to 279
Data columns (total 9 columns):
n_in_pos        472 non-null float64
n_in_pos_adv    472 non-null float64
wins            472 non-null int64
losses          472 non-null int64
draws           472 non-null int64
wins_adv        472 non-null int64
losses_adv      472 non-null int64
draws_adv       472 non-null int64
home            472 non-null int64
dtypes: float64(2), int64(7)
memory usage: 36.9 KB
None (472,)


In [5]:
est = KNeighborsClassifier()
param_dict = {"n_neighbors": [2, 5, 10, 15],
             "algorithm": ['ball_tree', 'kd_tree'],
             "weights": ['uniform', 'distance']}

rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=16, cv=50, random_state=1984)
rs.fit(X, ya)

print("KNN - best recall score = %.3f"%rs.best_score_)
print("KNN - best params = ",rs.best_params_)
print("KNN - mean recall score = %.3f"%rs.cv_results_['mean_test_score'].mean())

KNN - best recall score = 0.659
KNN - best params =  {'weights': 'distance', 'n_neighbors': 15, 'algorithm': 'kd_tree'}
KNN - mean recall score = 0.633


In [6]:
est = DecisionTreeClassifier()
param_dict = {"max_depth": [5, 10, 20, 30],
             "min_samples_leaf": [1, 3, 5],
             "max_features": [2, 3, 5, 9]} #'auto', 'sqrt', 'log2']}

rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=48, cv=50, random_state=1984)
rs.fit(X, ya)

print("DecisionTree - best recall score = %.3f"%rs.best_score_)
print("DecisionTree - best params = ",rs.best_params_)
print("DecisionTree - mean recall score = %.3f"%rs.cv_results_['mean_test_score'].mean())

DecisionTree - best recall score = 0.672
DecisionTree - best params =  {'min_samples_leaf': 5, 'max_features': 2, 'max_depth': 5}
DecisionTree - mean recall score = 0.600


In [7]:
est = RandomForestClassifier(max_features=9, max_depth=5)
param_dict = {"n_estimators": [50, 100, 200],
              #"max_depth": [20, 30, 40],
             "min_samples_leaf": [1, 3, 5]}#,
             #"max_features": [2, 5, 7]} 

rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=9, cv=50, random_state=1984)
time = timeit.timeit()
rs.fit(X, ya)
print("DecisionTree - fit ellapsed time = %.3f seconds"%(time-timeit.timeit()))

print("RandomForest - best recall score = %.3f"%rs.best_score_)
print("RandomForest - best params = ",rs.best_params_)
print("RandomForest - mean recall score = %.3f"%rs.cv_results_['mean_test_score'].mean())

DecisionTree - fit ellapsed time = 0.002 seconds
RandomForest - best recall score = 0.659
RandomForest - best params =  {'n_estimators': 50, 'min_samples_leaf': 3}
RandomForest - mean recall score = 0.641


In [8]:
est = MultinomialNB()
param_dict = {"alpha": [0, 0.5, 1]} 

rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=2, cv=5, random_state=1984)
time = timeit.timeit()
rs.fit(X, ya)
print("Naive Bayes - fit ellapsed time = %.3f seconds"%(time-timeit.timeit()))

print("Naive Bayes - best recall score = %.3f"%rs.best_score_)
print("Naive Bayes - best params = ",rs.best_params_)
print("Naive Bayes - mean recall score = %.3f"%rs.cv_results_['mean_test_score'].mean())

Naive Bayes - fit ellapsed time = 0.003 seconds
Naive Bayes - best recall score = 0.652
Naive Bayes - best params =  {'alpha': 1}
Naive Bayes - mean recall score = 0.652


In [9]:
est = LogisticRegression()
param_dict = {"C": [.1, 1, 5, 10]} 

rs = RandomizedSearchCV(estimator=est, param_distributions=param_dict, scoring=scorer,
                        n_iter=4, cv=50, random_state=1984)
time = timeit.timeit()
rs.fit(X, ya)
print("LogisticRegression - fit ellapsed time = %.3f seconds"%(time-timeit.timeit()))

print("LogisticRegression - best recall score = %.3f"%rs.best_score_)
print("LogisticRegression - best params = ",rs.best_params_)
print("LogisticRegression - mean recall score = %.3f"%rs.cv_results_['mean_test_score'].mean())

LogisticRegression - fit ellapsed time = 0.008 seconds
LogisticRegression - best recall score = 0.670
LogisticRegression - best params =  {'C': 0.1}
LogisticRegression - mean recall score = 0.662
