In [1]:
import os
import pandas as pd

from preprocessing import data_preprocess
from models import make_ds, train_models, plot_rocs, get_preds, get_probas, plot_bar, hard_voting, soft_voting

In [2]:
data = data_preprocess(pd.read_csv('archive/atp_matches_2004.csv'))
# data = pd.read_csv('train_val_2000_2012.csv')
data.shape

(2738, 23)

In [11]:
dataset = make_ds(data, train_split=0.8, shuffle=True)
dataset[0].shape

(2190, 14)

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.metrics import accuracy_score

## Logistic Regression

In [77]:
paras = {
    'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10]
}

In [78]:
model = LogisticRegression(max_iter=3000)
cv = GridSearchCV(model, paras, cv=5, verbose=2, scoring='accuracy')

In [79]:
cv.fit(dataset[0], dataset[2])

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END .............................................C=0.01; total time=   0.1s
[CV] END .............................................C=0.01; total time=   0.0s
[CV] END .............................................C=0.01; total time=   0.0s
[CV] END .............................................C=0.01; total time=   0.1s
[CV] END .............................................C=0.01; total time=   0.1s
[CV] END .............................................C=0.03; total time=   0.2s
[CV] END .............................................C=0.03; total time=   0.1s
[CV] END .............................................C=0.03; total time=   0.2s
[CV] END .............................................C=0.03; total time=   0.2s
[CV] END .............................................C=0.03; total time=   0.3s
[CV] END ..............................................C=0.1; total time=   0.3s
[CV] END ........................................

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=3000),
             param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10]},
             scoring='accuracy', verbose=2)

In [83]:
print('Best Params:', cv.best_params_, '\n', 'Train acc:', cv.best_score_, '\n', 'test acc:', cv.score(dataset[1], dataset[3]))

Best Params: {'C': 3} 
 Train acc: 0.6424657534246576 
 test acc: 0.6642335766423357


## Random Forest

In [86]:
paras = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [130, 180, 230]
}

In [89]:
model = RandomForestClassifier()
cv = RandomizedSearchCV(model, paras, cv=5, verbose=2, scoring='accuracy', n_iter=10)

In [90]:
cv.fit(dataset[0], dataset[2])

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=180; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=180; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=180; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=180; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=180; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=180; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=18

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [130, 180, 230]},
                   scoring='accuracy', verbose=2)

In [92]:
print('Best Params:', cv.best_params_, '\n', 'Train acc:', cv.best_score_, '\n', 'test acc:', cv.score(dataset[1], dataset[3]))

Best Params: {'n_estimators': 230, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False} 
 Train acc: 0.8301369863013699 
 test acc: 0.8375912408759124


## Decision Tree

In [98]:
paras = {
    'criterion':['gini','entropy'],
    'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]
}

In [100]:
model = DecisionTreeClassifier()
cv = GridSearchCV(model, paras, cv=5, verbose=2, scoring='accuracy')

In [101]:
cv.fit(dataset[0], dataset[2])

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ........................criterion=gini, max_depth=4; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=4; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=4; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=4; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=4; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=6; total time=   0.0s
[CV] END ........................criterion=gini

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30,
                                       40, 50, 70, 90, 120, 150]},
             scoring='accuracy', verbose=2)

In [102]:
print('Best Params:', cv.best_params_, '\n', 'Train acc:', cv.best_score_, '\n', 'test acc:', cv.score(dataset[1], dataset[3]))

Best Params: {'criterion': 'entropy', 'max_depth': 40} 
 Train acc: 0.8228310502283105 
 test acc: 0.8156934306569343


## KNN

In [106]:
paras = {
    'leaf_size': [10, 20, 30, 40, 50],
    'n_neighbors': [3, 5, 7, 9]
}

In [107]:
model = KNeighborsClassifier()
cv = GridSearchCV(model, paras, cv=5, verbose=2, scoring='accuracy')

In [108]:
cv.fit(dataset[0], dataset[2])

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ........................leaf_size=10, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=10, n_neighbors=7; total time=   0.0s
[CV] END ........................leaf_size=10, 

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [10, 20, 30, 40, 50],
                         'n_neighbors': [3, 5, 7, 9]},
             scoring='accuracy', verbose=2)

In [109]:
print('Best Params:', cv.best_params_, '\n', 'Train acc:', cv.best_score_, '\n', 'test acc:', cv.score(dataset[1], dataset[3]))

Best Params: {'leaf_size': 10, 'n_neighbors': 5} 
 Train acc: 0.7470319634703196 
 test acc: 0.7536496350364964


## Naive Bayes

In [None]:
paras = {
    
}

In [110]:
model = GaussianNB()
cv = GridSearchCV(model, paras, cv=5, verbose=2, scoring='accuracy')

In [111]:
model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [112]:
cv.fit(dataset[0], dataset[2])

Fitting 5 folds for each of 20 candidates, totalling 100 fits


ValueError: Invalid parameter leaf_size for estimator GaussianNB(). Check the list of available parameters with `estimator.get_params().keys()`.

In [113]:
print('Best Params:', cv.best_params_, '\n', 'Train acc:', cv.best_score_, '\n', 'test acc:', cv.score(dataset[1], dataset[3]))

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'