# We are going to create a function that takes a type of model as input with other inputs and outputs a model with the best features chosen based off of EV

In [48]:
import pandas as pd
from functions import custom_cv_eval, get_best_features, get_ev_from_df
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.s

In [2]:
df=pd.read_csv("data/owl-with-odds.csv")

In [3]:
#The odds need to be changed to numeric...

subset = ['t1_odds', 't2_odds']
df.dropna(subset=subset ,inplace=True)
df['t1_odds'] = pd.to_numeric(df['t1_odds'], errors='coerce')
df['t2_odds'] = pd.to_numeric(df['t2_odds'], errors='coerce')
df.dropna(subset=subset ,inplace=True)
#Create the odds df
odds_df = df[subset]

#Create a label df
label_df = df['winner_label']

In [4]:
#Split the test set.  We are always(?) going to use the last 60 matches as the test set, so we don't want those around
#as we pick models
df_train = df[:-60]
odds_train = odds_df[:-60]
label_train = label_df[:-60]

df_test = df[-60:]
odds_test = odds_df[-60:]
label_test = label_df[-60:]

print(len(df_test))
print(len(odds_test))
print(len(label_test))

print(len(df_train))
print(len(odds_train))
print(len(label_train))

60
60
60
508
508
508


In [5]:
display(df_train)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,team_one,team_two,stage,winner,date,corona_virus_isolation,t1_wins_season,...,t2_win_percent_last_20,t1_place_last_season,t2_place_last_season,t1_wins_vs_t2,t1_losses_vs_t2,t1_matches_vs_t2,t1_win_percent_vs_t2,winner_label,t1_odds,t2_odds
1,1,1,10224,Los Angeles Gladiators,Shanghai Dragons,Overwatch League - Stage 1,Los Angeles Gladiators,2018-01-11,False,0.0,...,,,,0,0,0,,0,-370.0,276.0
2,2,2,10225,Seoul Dynasty,Dallas Fuel,Overwatch League - Stage 1,Seoul Dynasty,2018-01-11,False,0.0,...,,,,0,0,0,,0,-588.0,413.0
3,3,3,10226,Florida Mayhem,London Spitfire,Overwatch League - Stage 1,London Spitfire,2018-01-11,False,0.0,...,,,,0,0,0,,1,652.0,-1667.0
4,4,4,10227,Houston Outlaws,Philadelphia Fusion,Overwatch League - Stage 1,Philadelphia Fusion,2018-01-11,False,0.0,...,,,,0,0,0,,1,-161.0,143.0
5,5,5,10228,New York Excelsior,Boston Uprising,Overwatch League - Stage 1,New York Excelsior,2018-01-12,False,0.0,...,,,,0,0,0,,0,-769.0,380.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,611,611,31043,Boston Uprising,Atlanta Reign,OWL 2020 Regular Season,Atlanta Reign,2020-03-08,False,1.0,...,0.60,19.0,6.0,1,1,2,0.5,1,525.0,-1000.0
617,617,617,31000,Chengdu Hunters,Shanghai Dragons,OWL 2020 Regular Season,Chengdu Hunters,2020-03-29,True,0.0,...,0.55,12.0,11.0,0,2,2,0.0,0,399.0,-625.0
618,618,618,31058,Hangzhou Spark,Guangzhou Charge,OWL 2020 Regular Season,Guangzhou Charge,2020-03-29,True,1.0,...,0.65,4.0,9.0,1,1,2,0.5,1,-196.0,145.0
619,619,619,31060,Florida Mayhem,Atlanta Reign,OWL 2020 Regular Season,Atlanta Reign,2020-03-29,True,2.0,...,0.60,20.0,6.0,0,2,2,0.0,1,341.0,-526.0


In [6]:
#inclusive features
inc_features = ['corona_virus_isolation', 't1_wins_season',
       't1_losses_season', 't2_wins_season', 't2_losses_season',
       't1_matches_season', 't2_matches_season', 't1_win_percent_season',
       't2_win_percent_season', 't1_wins_alltime', 't1_losses_alltime',
       't2_wins_alltime', 't2_losses_alltime', 't1_matches_alltime',
       't2_matches_alltime', 't1_win_percent_alltime',
       't2_win_percent_alltime', 't1_wins_last_3', 't1_losses_last_3',
       't2_wins_last_3', 't2_losses_last_3', 't1_win_percent_last_3',
       't2_win_percent_last_3', 't1_wins_last_5', 't1_losses_last_5',
       't2_wins_last_5', 't2_losses_last_5', 't1_win_percent_last_5',
       't2_win_percent_last_5', 't1_wins_last_10', 't1_losses_last_10',
       't2_wins_last_10', 't2_losses_last_10', 't1_win_percent_last_10',
       't2_win_percent_last_10', 
        't1_wins_vs_t2', 't1_losses_vs_t2',
       't1_matches_vs_t2', 't1_odds', 't2_odds', 'winner_label']

In [7]:
#Full list of possible features
#Basically inclusive features with the winner_label removed
my_pos_features = ['corona_virus_isolation', 't1_wins_season',
       't1_losses_season', 't2_wins_season', 't2_losses_season',
       't1_matches_season', 't2_matches_season', 't1_win_percent_season',
       't2_win_percent_season', 't1_wins_alltime', 't1_losses_alltime',
       't2_wins_alltime', 't2_losses_alltime', 't1_matches_alltime',
       't2_matches_alltime', 't1_win_percent_alltime',
       't2_win_percent_alltime', 't1_wins_last_3', 't1_losses_last_3',
       't2_wins_last_3', 't2_losses_last_3', 't1_win_percent_last_3',
       't2_win_percent_last_3', 't1_wins_last_5', 't1_losses_last_5',
       't2_wins_last_5', 't2_losses_last_5', 't1_win_percent_last_5',
       't2_win_percent_last_5', 't1_wins_last_10', 't1_losses_last_10',
       't2_wins_last_10', 't2_losses_last_10', 't1_win_percent_last_10',
       't2_win_percent_last_10', 
        't1_wins_vs_t2', 't1_losses_vs_t2',
       't1_matches_vs_t2', 't1_odds', 't2_odds']

In [8]:
#test_model = DecisionTreeClassifier(random_state=75)
test_model = LogisticRegression(random_state=75)
test_current_features = []

In [30]:
#INPUT: 
#df: The df to be evaluated
#m: The model to use
# labels: The labels
#odds: The odds
#min_ev: The minimum EV to place a bet
def custom_cv_eval_v2(df, m, labels, odds, min_ev=0):
    X = np.array(df)
    y = np.array(labels)
    odds = np.array(odds)
    running_total = 0
    count=1
    kf = KFold(n_splits=5, shuffle=True, random_state=75)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        odds_train, odds_test = odds[train_index], odds[test_index]
        #print(test_index)
        m.fit(X_train, y_train)
        probs=m.predict_proba(X_test)
        #We need to prep the dataframe to evaluate....
        #X_odds = X_test[['t1_odds', 't2_odds']]
        #print(X_test)
        #print(X_test[:, -1])
        #print(X_test[:, -2])
        X_odds = list(zip(odds_test[:, -2], odds_test[:, -1], probs[:, 0], probs[:, 1], y_test))
        ev_prepped_df = pd.DataFrame(X_odds, columns=['t1_odds', 't2_odds', 't1_prob', 't2_prob', 'winner'])
        #display(temp_df)
        #print(f"{count}: {get_ev_from_df(ev_prepped_df, print_stats = False)}")
        count=count+1
        running_total = running_total + get_ev_from_df(ev_prepped_df, print_stats = False, min_ev = min_ev)
        #display(ev_prepped_df)
    
    return running_total

In [33]:
#We split off the labels and the odds.  Now we can rewrite the get_best_features function
#INPUT
#pos_features: The list of possible features
#m: The model
#cur_features: The list of current features
#scale: Does the data need to be scaled?  


def get_best_features_v2(pos_features, m, df, cur_features, labels, odds, scale=False):
    best_feature = ''
    
    
    #If there are no current features...
    if len(cur_features) == 0:
        best_score = 0
    else:
        df_sel = df[cur_features]
        df_sel = df_sel.dropna()
        #OK we need to filter the labels and odds based off of the indices
        labels_sel = labels[labels.index.isin(df_sel.index)]
        odds_sel = odds[odds.index.isin(df_sel.index)]        
        best_score = custom_cv_eval_v2(df_sel, m, labels_sel, odds_sel)
        
    best_feature = ""
    
    print(f"Current best score is: {best_score}")
    #Go thru every feature and test it...
    for f in pos_features:
        #If f is not a current feature
        if f not in cur_features:
            #print(f)
            new_features = [f] + cur_features
            df_sel = df[new_features]
            df_sel = df_sel.dropna()
            #OK we need to filter the labels and odds based off of the indices
            labels_sel = labels[labels.index.isin(df_sel.index)]
            odds_sel = odds[odds.index.isin(df_sel.index)]
            new_score = custom_cv_eval_v2(df_sel, m, labels_sel, odds_sel)
            #print(f"{len(df_sel)} {len(labels_sel)} {len(odds_sel)}")
            #print(f"{f}: {new_score}")
            if new_score > best_score:
                best_score = new_score
                best_feature = f
    if best_feature != "":
        print(f"The best feature was {best_feature}.  It scored {best_score}")
        print()
        cur_features = [best_feature] + cur_features
        #Keep running until we don't improve
        return(get_best_features_v2(pos_features, m, df, cur_features, labels, odds, scale))
    else:
        print("NO IMPROVEMENT")
        print(f"FINAL BEST SCORE: {best_score}")
        return cur_features                
                
    return []



In [None]:
print(get_best_features_v2(my_pos_features, test_model, df_train, [], label_train, odds_train, True))

In [11]:
model_2 = DecisionTreeClassifier(random_state=75)

In [12]:
print(get_best_features_v2(my_pos_features, model_2, df_train, [], label_train, odds_train, True))

Current best score is: 0
The best feature was t1_wins_last_10.  It scored 54.63338264072154
Current best score is: 54.63338264072154
The best feature was t1_matches_alltime.  It scored 73.12266477139364
Current best score is: 73.12266477139364
The best feature was t1_losses_last_10.  It scored 81.16891844450438
Current best score is: 81.16891844450438
The best feature was t1_win_percent_last_10.  It scored 81.23869286380679
Current best score is: 81.23869286380679
NO IMPROVEMENT
FINAL BEST SCORE: 81.23869286380679
['t1_win_percent_last_10', 't1_losses_last_10', 't1_matches_alltime', 't1_wins_last_10']


In [14]:
model_3 = RandomForestClassifier(random_state=75)

In [32]:
print(get_best_features_v2(my_pos_features, model_3, df_train, [], label_train, odds_train, True))

Current best score is: 0
corona_virus_isolation: -8.464596946318963
t1_wins_season: -43.43036626458431
t1_losses_season: 23.41723780408045
t2_wins_season: -4.013234592533767
t2_losses_season: -21.667711792012334
t1_matches_season: -11.34926273905964
t2_matches_season: -32.23598511611897
t1_win_percent_season: -10.76024140922403
t2_win_percent_season: -6.820286918107109
t1_wins_alltime: -36.519134172503456
t1_losses_alltime: -3.2299368420917967
t2_wins_alltime: -47.679715360473665
t2_losses_alltime: -12.398569323191632
t1_matches_alltime: -12.62539155351635
t2_matches_alltime: -26.64134895284515
t1_win_percent_alltime: 46.59115870700202
t2_win_percent_alltime: -28.228400673590485
t1_wins_last_3: -20.19029859685173
t1_losses_last_3: -20.19029859685173
t2_wins_last_3: -18.81106963272643
t2_losses_last_3: -18.81106963272643
t1_win_percent_last_3: -20.19029859685173
t2_win_percent_last_3: -18.81106963272643
t1_wins_last_5: 14.33847083221203
t1_losses_last_5: 14.33847083221203
t2_wins_last_5

In [17]:
model_4 = GradientBoostingClassifier(random_state=75)

In [18]:
print(get_best_features_v2(my_pos_features, model_4, df_train, [], label_train, odds_train, True))

Current best score is: 0
The best feature was t1_wins_last_10.  It scored 54.63338264072154
Current best score is: 54.63338264072154
The best feature was t2_losses_last_5.  It scored 86.45533533111917
Current best score is: 86.45533533111917
The best feature was t1_losses_season.  It scored 91.38364936313886
Current best score is: 91.38364936313886
The best feature was t2_win_percent_season.  It scored 123.93529321587049
Current best score is: 123.93529321587049
The best feature was t1_losses_last_10.  It scored 128.4874220169866
Current best score is: 128.4874220169866
NO IMPROVEMENT
FINAL BEST SCORE: 128.4874220169866
['t1_losses_last_10', 't2_win_percent_season', 't1_losses_season', 't2_losses_last_5', 't1_wins_last_10']


In [21]:
model_5 = GaussianNB()

In [23]:
print(get_best_features_v2(my_pos_features, model_5, df_train, [], label_train, odds_train, True))

Current best score is: 0
The best feature was t1_wins_last_10.  It scored 42.313990061062654
Current best score is: 42.313990061062654
The best feature was t1_win_percent_season.  It scored 64.6966294809353
Current best score is: 64.6966294809353
The best feature was t2_matches_season.  It scored 76.64468031565299
Current best score is: 76.64468031565299
The best feature was t1_matches_season.  It scored 77.61190252807359
Current best score is: 77.61190252807359
NO IMPROVEMENT
FINAL BEST SCORE: 77.61190252807359
['t1_matches_season', 't2_matches_season', 't1_win_percent_season', 't1_wins_last_10']


In [38]:
model_6 = LinearDiscriminantAnalysis()

In [39]:
print(get_best_features_v2(my_pos_features, model_6, df_train, [], label_train, odds_train, True))

Current best score is: 0
The best feature was t2_wins_last_10.  It scored 32.91136382461445

Current best score is: 32.91136382461445
The best feature was t1_losses_alltime.  It scored 43.25540144119129

Current best score is: 43.25540144119129
The best feature was t2_losses_season.  It scored 46.01371740035911

Current best score is: 46.01371740035911
The best feature was t1_matches_vs_t2.  It scored 48.723664636431366

Current best score is: 48.723664636431366
The best feature was t2_wins_alltime.  It scored 51.58527781957413

Current best score is: 51.58527781957413
The best feature was corona_virus_isolation.  It scored 56.10929228817335

Current best score is: 56.10929228817335
NO IMPROVEMENT
FINAL BEST SCORE: 56.10929228817335
['corona_virus_isolation', 't2_wins_alltime', 't1_matches_vs_t2', 't2_losses_season', 't1_losses_alltime', 't2_wins_last_10']


In [49]:
model_7 = ElasticNet()

In [50]:
print(get_best_features_v2(my_pos_features, model_7, df_train, [], label_train, odds_train, True))

Current best score is: 0


AttributeError: 'ElasticNet' object has no attribute 'predict_proba'