In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math

In [3]:
base_modelagem = pd.read_csv("base_delta_cross.csv", index_col = 0)

In [13]:
correlations = abs(base_modelagem.corr()["fl_home_win"]).sort_values(ascending=False)

In [None]:
correlations

In [47]:
to_keep = correlations.head(50).index

In [4]:
# pega apenas as last 5
to_keep = [x for x in base_modelagem.columns if "_L 5" in x]

In [5]:
filtrada = base_modelagem.iloc[len(base_modelagem)-1063:len(base_modelagem)]
len(filtrada)

1063

In [6]:
filtrada.fl_home_win.value_counts()/len(filtrada)

1    0.581373
0    0.418627
Name: fl_home_win, dtype: float64

In [7]:
base_modelo = filtrada.iloc[0:(len(filtrada) - 100)]

In [8]:
out_of_time = filtrada.iloc[(len(filtrada) - 100):len(filtrada)]

In [126]:
import sklearn.base
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

### Forward Feature Selection

In [10]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs

In [11]:
X = base_modelo[to_keep]
y = base_modelo.fl_home_win

#### Sequential FS

In [12]:
from sklearn.model_selection import KFold

In [60]:
k_fold_cv10 = KFold(20, shuffle=False)

In [61]:
clf = GaussianNB()

sfs1 = sfs(clf,
           k_features=5,
           forward=True,
           floating=True,
           verbose=10,
           scoring='accuracy',
           n_jobs=3,
           cv=k_fold_cv10)

In [62]:
sfs1 = sfs1.fit(X.values, y.values, custom_feature_names=X.columns)

[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    1.2s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    1.3s
[Parallel(n_jobs=3)]: Batch computation too fast (0.1768s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    1.4s
[Parallel(n_jobs=3)]: Batch computation too fast (0.0959s.) Setting batch_size=8.
[Parallel(n_jobs=3)]: Done  30 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done  90 tasks      | elapsed:    2.1s
[Parallel(n_jobs=3)]: Done 162 tasks      | elapsed:    2.7s
[Parallel(n_jobs=3)]: Done 250 tasks      | elapsed:    3.7s
[Parallel(n_jobs=3)]: Done 338 tasks      | elapsed:    4.4s
[Parallel(n_jobs=3)]: Done 446 out of 446 | elapsed:    5.3s finished

[2018-10-12 20:29:39] Features: 1/5 -- score: 0.6623511904761903[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    1.2s
[Parallel(n_jobs=3)]: Done  12

In [63]:
top_features = list(sfs1.k_feature_names_)

In [64]:
modelo = GaussianNB()
modelo.fit(X[top_features], y)

GaussianNB(priors=None)

In [45]:
y_pred = modelo.predict(out_of_time[top_features])
y_true = out_of_time.fl_home_win.values

print(top_features)
print("No Holdout: " + str(accuracy_score(y_true, y_pred)))
print("No KFold 10: " + str(sfs1.k_score_))

['C2_PCT_FGA_2PT_L 5', 'D2_PFD_L 5', 'D1_N_GAMES_AWAY_L6_days_L 5', 'C1_PIE_L 5', 'C1_TS_PCT_L 5', 'D2_NET_RATING_L 5', 'C1_REB_PCT_L 5']
No Holdout: 0.65
No KFold 10: 0.6957193702223157


In [66]:
y_pred = modelo.predict(out_of_time[top_features])
y_true = out_of_time.fl_home_win.values

print(top_features)
print("No Holdout: " + str(accuracy_score(y_true, y_pred)))
print("No KFold 20: " + str(sfs1.k_score_))

['C1_PF_misc_L 5', 'D2_WIN_PCT_L 5', 'C1_FTAST_L 5', 'D2_BLKA_L 5', 'D2_UFGA_L 5']
No Holdout: 0.63
No KFold 20: 0.679124149659864


#### Exhaustive FS

In [13]:
clf = GaussianNB()

efs1 = efs(clf,
           min_features=2,
           max_features=2,
           print_progress=True,
           scoring='accuracy',
           n_jobs=3,
           cv=10)

In [14]:
efs1 = efs1.fit(X.values, y.values, custom_feature_names=X.columns)

Features: 99235/99235

In [16]:
efs1.best_score_

0.679116698225682

In [15]:
efs1.best_feature_names_

('D2_PFD_L 5', 'D2_NET_RATING_L 5')

#### SemiForward

In [178]:
class SemiForwardFeatureSelector():
    def __init__(self, model, k_features, pre_features_list = [], floating=False,
                 verbose=True, scoring='accuracy', cv=10, holdout=None, holdout_y=None):
        
        if(k_features <= len(pre_features_list)):
            print("Erro: K_Features <= len(Pre_Features_List)")
            return
        
        self.model = model
        self.k_features = k_features
        self.pre_features_list = pre_features_list
        self.floating = floating
        self.verbose = verbose
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.holdout = holdout
        self.holdout_y = holdout_y
        
        if(isinstance(cv, int)):
            self.cv = KFold(cv)
        else:
            self.cv = cv
        self.is_fitted = False
        self.best_feature_names_ = pre_features_list
        
        self.best_score_ = 0
        
        self.metric_dict = {}
        self.metric_dict["feature_names"] = []
        self.metric_dict["avg_score"] = []
        self.metric_dict["std_dev"] = []
        
        if holdout is not None:
            self.metric_dict["holdout_score"] = []
            
        self.metric_dict["cv_scores"] = []
        self.metric_dict["min_cv_score"] = []
        self.metric_dict["max_cv_score"] = []
        self.metric_dict["trained_model"] = []
        
    def fit(self, X, y):
        i = 0
        for _ in range(self.k_features - len(self.pre_features_list)):
            candidates = X.drop(self.best_feature_names_, axis=1)
            self._add_feature = None
            for new_column in (candidates.columns):
                self.metric_dict["feature_names"].append(self.best_feature_names_ + [new_column])
                
                current_X = X[self.best_feature_names_ + [new_column]]
                
                score_folds = []
                for train_index, test_index in self.cv.split(current_X):
                    test_model = sklearn.base.clone(self.model)
                    test_model.fit(current_X.iloc[train_index], y[train_index])
                    
                    y_test_pred = test_model.predict(current_X.iloc[test_index])
                    y_test_true = y[test_index]
                    score_folds.append(accuracy_score(y_test_true, y_test_pred))
                
                if(self.holdout is not None):
                    test_model = sklearn.base.clone(self.model)
                    test_model.fit(current_X, y)
                    
                    y_pred_holdout = test_model.predict(self.holdout[self.metric_dict["feature_names"][-1]])
                    y_true_holdout = self.holdout_y
                    self.metric_dict["holdout_score"].append(accuracy_score(y_true_holdout, y_pred_holdout))
                    
                
                avg_score = np.mean(score_folds)
                if(self.best_score_ < avg_score):
                    self.best_score_ = avg_score
                    self._add_feature = new_column
                elif(self.best_score_ == avg_score):
                    pass
                
                self.metric_dict["avg_score"].append(avg_score)
                self.metric_dict["std_dev"].append(np.std(score_folds))
                self.metric_dict["cv_scores"].append(score_folds)
                self.metric_dict["min_cv_score"].append(np.min(score_folds))
                self.metric_dict["max_cv_score"].append(np.max(score_folds))
                
                test_model = sklearn.base.clone(self.model)
                test_model.fit(current_X, y)
                self.metric_dict["trained_model"].append(test_model)
                
                if(self.verbose):
                    print(str(i) + ", " + str(self.best_feature_names_) + " + " + new_column + "            ", end="\r")
                    
                i += 1      
            if(self._add_feature is None):
                print("Adicionar mais features não melhorou o modelo")
                return
            else:
                self.best_feature_names_ += [self._add_feature]
        
        print()
        print("Best Combination", self.best_feature_names_)
        print("Best Score", self.best_score_)
                
    def transform(self, X):        
        return X[best_feature_names_]

In [171]:
sffs = SemiForwardFeatureSelector(GaussianNB(), 4, ['D2_PFD_L 5', 'D2_NET_RATING_L 5', 'D2_EFG_PCT_L 5'],
                                  holdout=out_of_time, holdout_y=out_of_time.fl_home_win)

In [172]:
sffs.fit(X, y)

442, ['D2_PFD_L 5', 'D2_NET_RATING_L 5', 'D2_EFG_PCT_L 5'] + C2_OPP_TOV_PCT_L 5                       
Best Combination ['D2_PFD_L 5', 'D2_NET_RATING_L 5', 'D2_EFG_PCT_L 5', 'C1_TO_L 5']
Best Score 0.6718535223367698


In [173]:
sffs.best_score_

0.6718535223367698

In [174]:
df_results = pd.DataFrame(sffs.metric_dict).sort_values("avg_score", ascending=False)
df_results.head(10)

Unnamed: 0,feature_names,avg_score,std_dev,holdout_score,cv_scores,min_cv_score,max_cv_score,trained_model
6,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.671854,0.063385,0.6,"[0.7319587628865979, 0.7010309278350515, 0.587...",0.572917,0.760417,GaussianNB(priors=None)
86,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.666645,0.065323,0.61,"[0.711340206185567, 0.7319587628865979, 0.5773...",0.5625,0.731959,GaussianNB(priors=None)
120,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.666645,0.065323,0.61,"[0.711340206185567, 0.7319587628865979, 0.5773...",0.5625,0.731959,GaussianNB(priors=None)
359,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.665646,0.061719,0.6,"[0.6804123711340206, 0.7010309278350515, 0.597...",0.5625,0.729167,GaussianNB(priors=None)
409,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.665636,0.055693,0.62,"[0.6701030927835051, 0.711340206185567, 0.6082...",0.583333,0.78125,GaussianNB(priors=None)
8,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.665636,0.057535,0.59,"[0.6804123711340206, 0.7216494845360825, 0.587...",0.583333,0.760417,GaussianNB(priors=None)
282,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.665614,0.063806,0.63,"[0.6701030927835051, 0.7216494845360825, 0.618...",0.5625,0.75,GaussianNB(priors=None)
333,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.664594,0.069793,0.6,"[0.711340206185567, 0.711340206185567, 0.56701...",0.552083,0.739583,GaussianNB(priors=None)
260,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.664562,0.071521,0.6,"[0.7216494845360825, 0.711340206185567, 0.5876...",0.53125,0.75,GaussianNB(priors=None)
136,"[D2_PFD_L 5, D2_NET_RATING_L 5, D2_EFG_PCT_L 5...",0.66454,0.074887,0.57,"[0.7216494845360825, 0.7216494845360825, 0.597...",0.510417,0.739583,GaussianNB(priors=None)


In [176]:
df_results.iloc[0,0]

['D2_PFD_L 5', 'D2_NET_RATING_L 5', 'D2_EFG_PCT_L 5', 'C1_TO_L 5']

In [177]:
current_X = X[df_results.iloc[0,0]]
                
cv = KFold(10)    
score_folds = []
for train_index, test_index in cv.split(current_X):
    test_model = sklearn.base.clone(GaussianNB())
    test_model.fit(current_X.iloc[train_index], y[train_index])

    y_test_pred = test_model.predict(current_X.iloc[test_index])
    y_test_true = y[test_index]
    score_folds.append(accuracy_score(y_test_true, y_test_pred))
    
print(score_folds)
print(np.mean(score_folds))

[0.7319587628865979, 0.7010309278350515, 0.5876288659793815, 0.6458333333333334, 0.5729166666666666, 0.59375, 0.7083333333333334, 0.7604166666666666, 0.71875, 0.6979166666666666]
0.6718535223367698


In [155]:
vclf = VotingClassifier(df_results.head(5)["trained_model"], n_jobs=3)

In [156]:
vclf.fit(X, y)

TypeError: zip argument #1 must support iteration

In [151]:
vclf.predict(out_of_time[to_keep])

NotFittedError: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.