In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import mytrain_lib as ml
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix as confmat
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV,KFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform, randint
%matplotlib inline
import importlib

import random
# Seed both Python's and NumPy's global RNGs so that any code drawing from either
# generator is reproducible (torch is seeded separately in the MLP section).
random.seed(0)
np.random.seed(0)

In [None]:
# Data / output locations (absolute paths on the author's F: drive).
# NOTE(review): only `path_rawdata` is referenced in the visible cells; the others
# are presumably used by mytrain_lib or by cells not shown here.
path = 'F:/TFG/datasets/nature-dataset/'
path_graphs = 'F://TFG//graphs//'
path_train = 'F://TFG//datasets//data_train//'
path_rawdata = 'F://TFG//datasets/raw_datasets//'

In [None]:
# Load the raw match table; rows are indexed by the `wyId` column.
data = pd.read_csv(path_rawdata + 'matches_wUltPartidos.csv', sep=';', index_col='wyId')

# Encode the match outcome as a 3-class label, vectorised instead of looping over rows:
#   0 -> `winner` is 0 (matches the draw odds placed in column 0 by get_odds_res),
#   1 -> `winner` equals the home team id,
#   2 -> anything else (away win).
# `.tolist()` keeps `target` a plain list of Python ints, as before, so the resulting
# column dtype is inferred by pandas exactly as it was with the loop.
target = np.select(
    [data['winner'] == 0, data['winner'] == data['teamId_home']],
    [0, 1],
    default=2,
).tolist()

data['res'] = target
data['matchId'] = data.index
data.head(3)

In [None]:
# Build the train and test datasets (train first, then test) from mytrain_lib.
train_data, test_data = [
    ml.FootballMatchesDataset(file=split) for split in ('train', 'test')
]

In [None]:
# `.matches` of each split; used below as row labels for `data.loc[...]`.
trainmatches, testmatches = train_data.matches, test_data.matches

In [None]:
def get_odds_res(X):
    """Return the inverse Bet365 odds and the outcome labels of `X`.

    Args:
        X: DataFrame with the columns `B365D`, `B365H`, `B365A` and `res`.

    Returns:
        (odds, res): `odds` has one row per match and three columns holding
        1/B365D, 1/B365H and 1/B365A (in that order, which lines up with the
        0/1/2 encoding of `res`); `res` is a NumPy copy of the `res` column.
    """
    columns = ('B365D', 'B365H', 'B365A')
    # Stack the three reciprocal columns as rows, then transpose to (n_matches, 3).
    inverse = np.array([1 / X[name].to_numpy() for name in columns])
    return inverse.T, np.array(X['res'])

## Baseline I: max odds

In [None]:
# Max-odds baseline on the training matches: predict the class whose inverse
# odd is largest and report the fraction of correct predictions.
odds, res = get_odds_res(data.loc[trainmatches])
pred = odds.argmax(axis=1)
assert pred.shape[0] == res.shape[0]
(pred == res).mean()

In [None]:
# Re-import mytrain_lib so that edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(ml)

In [None]:
# Confusion matrix (rows/columns ordered as classes 0, 1, 2) of the max-odds
# baseline on the training matches.
ml.dispConfusionMatrix(
    confmat(res, pred, labels=[0, 1, 2]),
    'Baseline max odds training confusion matrix',
    filename='confmat_base_maxodds_train',
    save=True,
    size=(6, 5),
)

#### Baseline test

In [None]:
# Same max-odds rule, evaluated on the test matches.
odds, res = get_odds_res(data.loc[testmatches])
pred = odds.argmax(axis=1)
assert pred.shape[0] == res.shape[0]
(pred == res).mean()

In [None]:
# Confusion matrix (classes 0, 1, 2) of the max-odds baseline on the test matches.
ml.dispConfusionMatrix(
    confmat(res, pred, labels=[0, 1, 2]),
    'Baseline max odds test confusion matrix',
    filename='confmat_base_maxodds_test',
    save=True,
    size=(6, 5),
)

## Baseline II: SVM model

In [None]:
# Training features/labels for the SVM baseline.
odds,res = get_odds_res(data.loc[trainmatches])

# Hyper-parameter search space for RandomizedSearchCV.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so C is drawn from
# [2, 12] and gamma from [0.1, 1.1].
# SVC's `degree` must be an integer (recent scikit-learn versions reject floats at
# fit time), so it is drawn with randint, whose upper bound is exclusive:
# randint(2, 13) covers the same 2..12 range the former uniform(2, 10) spanned.
rand_list = {"C": uniform(2, 10),
             "gamma": uniform(0.1, 1),
             "degree": randint(2, 13),
             "kernel":['linear','poly','rbf']}

svm = SVC(random_state=1)

In [None]:
# Randomized search over the SVM space: 50 sampled configurations, 3-fold CV,
# 4 parallel jobs. The per-configuration results are kept as a DataFrame.
rand_search = RandomizedSearchCV(
    svm,
    param_distributions=rand_list,
    n_iter=50,
    n_jobs=4,
    cv=3,
    random_state=0,
)
rand_search.fit(odds, res)
cv_results = pd.DataFrame(rand_search.cv_results_)

In [None]:
# Three best-ranked configurations of the search.
cv_results.sort_values('rank_test_score').head(3)

In [None]:
# Highest mean cross-validated score among the sampled configurations.
cv_results['mean_test_score'].max()

#### Baseline test

In [None]:
# Accuracy of the best SVM found by the search on the test matches.
odds, res = get_odds_res(data.loc[testmatches])
pred = rand_search.best_estimator_.predict(odds)
assert pred.shape[0] == res.shape[0]
(pred == res).mean()

In [None]:
# Confusion matrix (classes 0, 1, 2) of the SVM baseline on the test matches.
ml.dispConfusionMatrix(
    confmat(res, pred, labels=[0, 1, 2]),
    'Baseline svm test confusion matrix',
    filename='confmat_base_svm_test',
    save=True,
    size=(6, 5),
)

## Baseline III: Random Forest

In [None]:
# Training features/labels for the random-forest baseline.
odds, res = get_odds_res(data.loc[trainmatches])

# Search space; scipy's randint(low, high) draws integers in [low, high).
# NOTE: this rebinds `rand_list`, previously the SVM search space.
rand_list = dict(
    n_estimators=randint(5, 200),
    max_depth=randint(2, 10),
    min_samples_split=randint(2, 100),
    min_samples_leaf=randint(1, 100),
    max_leaf_nodes=randint(3, 20),
)

randforest = RandomForestClassifier(random_state=0)

In [None]:
# Randomized search over the random-forest space: 50 sampled configurations,
# 3-fold CV, 4 parallel jobs.
# NOTE: `rand_search` and `cv_results` are rebound here; the SVM search results
# are no longer reachable under these names afterwards.
rand_search = RandomizedSearchCV(
    randforest,
    param_distributions=rand_list,
    n_iter=50,
    n_jobs=4,
    cv=3,
    random_state=0,
)
rand_search.fit(odds, res)
cv_results = pd.DataFrame(rand_search.cv_results_)

In [None]:
# Three best-ranked random-forest configurations.
cv_results.sort_values('rank_test_score').head(3)

In [None]:
# Highest mean cross-validated score among the sampled configurations.
cv_results['mean_test_score'].max()

#### Baseline test

In [None]:
# Accuracy of the best random forest found by the search on the test matches.
odds, res = get_odds_res(data.loc[testmatches])
pred = rand_search.best_estimator_.predict(odds)
assert pred.shape[0] == res.shape[0]
(pred == res).mean()

In [None]:
# Confusion matrix (classes 0, 1, 2) of the random-forest baseline on the test matches.
ml.dispConfusionMatrix(
    confmat(res, pred, labels=[0, 1, 2]),
    'Baseline random forest test confusion matrix',
    filename='confmat_base_randforest_test',
    save=True,
    size=(6, 5),
)

## Baseline IV: Multi-layer Perceptron

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# Fix torch's RNG so weight initialisation and loader shuffling are repeatable.
torch.manual_seed(0)

# Pick the GPU when one is available.
# NOTE(review): `device` is only printed in the visible cells; nothing is moved to it here.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
class BettingDataset(Dataset):
    """Torch dataset pairing feature rows with one-hot encoded outcome labels.

    Args:
        odds: array-like with one row of features per sample; stored as float32.
        res: array-like of integer class ids in [0, num_classes), one per sample.
            Any integer dtype is accepted.
        num_classes: width of the one-hot encoding (defaults to 3).

    Raises:
        ValueError: if `odds` and `res` do not contain the same number of samples.
    """

    def __init__(self, odds, res, num_classes=3):
        self.data = torch.tensor(odds).float()
        # F.one_hot only accepts int64 index tensors; cast explicitly so labels stored
        # as int32 (e.g. NumPy's default integer on some platforms) do not raise.
        indices = torch.as_tensor(res).long()
        self.labels = F.one_hot(indices, num_classes=num_classes).float()
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"odds and res must have the same length, got {len(self.data)} and {len(self.labels)}"
            )

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def shape(self):
        """Shape of the feature tensor (a method, not a property)."""
        return self.data.shape

    def __getitem__(self, idx):
        """Return `(features, one_hot_label, -1)` for sample `idx`.

        The trailing -1 is a constant placeholder; presumably the training helpers
        in mytrain_lib expect a third item per sample — TODO confirm.
        """
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label, -1


class NeuralNetwork(nn.Module):
    """One-hidden-layer MLP: Linear -> BatchNorm1d -> ReLU -> Linear -> softmax.

    Args:
        input_feature: number of input features per sample.
        ouput_classes: number of output classes (parameter name kept as spelled).
        hidden_neurons: width of the hidden layer (defaults to 5).
    """

    def __init__(self, input_feature, ouput_classes, hidden_neurons=5):
        super().__init__()

        # Layers are created in forward order; attribute names are part of the state_dict.
        self.h1 = nn.Linear(input_feature, hidden_neurons)
        self.bn = nn.BatchNorm1d(num_features=hidden_neurons)
        self.out = nn.Linear(in_features=hidden_neurons, out_features=ouput_classes)

    def forward(self, x):
        """Return class probabilities (softmax over dimension 1) for a batch `x`."""
        hidden = F.relu(self.bn(self.h1(x)))
        scores = self.out(hidden)
        return F.softmax(scores, dim=1)

    def reset_weights(self):
        """Re-initialise the parameters of every layer in place."""
        for layer in (self.h1, self.bn, self.out):
            layer.reset_parameters()

In [None]:
# Recompute the features from the TRAIN matches. Without this, `odds`/`res` still hold
# the test split left over from the random-forest test cell, so the MLP would be
# cross-validated on the very data it is evaluated on afterwards.
odds, res = get_odds_res(data.loc[trainmatches])

# 3 inputs (inverse odds), 3 output classes, 3 hidden neurons.
model = NeuralNetwork(3,3,3)
# NOTE: this rebinds `train_data`, which earlier held the ml.FootballMatchesDataset.
train_data = BettingDataset(odds,res)
train_data[0]

In [None]:
# 3-fold cross-validation of the MLP with plain SGD and binary cross-entropy on the
# one-hot targets; the run log kept in mytrain_lib is cleared first and saved afterwards
# under a month-day/time stamp.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
ml.log = {}
error, accuracy_train, accuracy_test, _ = ml.train_wCrossValidation(
    model,
    nn.BCELoss(),
    optimizer,
    train_data,
    kfold,
    epochs=100,
    bat_size=32,
)
ml.save_logging(datetime.now().strftime("_%m%d_%H%M%S"), title='baseline_cv')

In [None]:
# Mean over the rows of `accuracy_test` of their last entry
# (presumably the final-epoch validation accuracy of each fold — TODO confirm).
np.array(accuracy_test)[:, -1].mean()

#### Baseline test

In [None]:
# Features/labels of the held-out test matches for the MLP.
odds,res   = get_odds_res(data.loc[testmatches])
# NOTE: this rebinds `test_data`, which earlier held the ml.FootballMatchesDataset.
test_data  = BettingDataset(odds,res)
# Evaluation gains nothing from shuffling; a fixed batch order keeps the test pass
# deterministic from run to run.
testloader = DataLoader(test_data,32,shuffle=False)

In [None]:
# Re-import mytrain_lib so edits made to the module on disk are picked up before testing.
importlib.reload(ml)
# Evaluate the trained MLP on the test loader; the call's two results are unpacked as
# the test accuracy and the confusion matrix plotted in the next cell.
acc_test, cm = ml.test_model(model,testloader)
acc_test

In [None]:
# Confusion matrix of the MLP baseline on the test matches, as returned by ml.test_model.
ml.dispConfusionMatrix(
    cm,
    'Baseline MLP 1x3 test confusion matrix',
    filename='confmat_base_mlp1x3_test',
    save=True,
    size=(6, 5),
)