In [1]:
import sys
sys.path.append('../../automated_model_creation') #We need to access the function file

In [2]:
import pandas as pd
import numpy as np
from functions import *
import random
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#from sklearn.mixture import DPGMM

# Switch read by a later cell: when True, rows whose 'location' is the Abu Dhabi
# venue string are filtered out of the train and test frames.
remove_fight_island = False

In [3]:
# Turn off warnings.
# NOTE(review): "ignore" with no category or module filter silences every warning
# raised from here on, including pandas' SettingWithCopyWarning - keep that in
# mind when debugging the DataFrame assignments in later cells.

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the saved model table. Later cells read rows 0-4 as, in order: model name,
# estimator constructor text, feature list, EV threshold and score; each column
# holds one model.
# REMINDER: every cell comes back from the CSV as text, so the estimator,
# feature and EV cells have to go through `eval` before they are usable.
with open('../../data/models_mov.csv', newline='') as f:
    models = list(csv.reader(f))

### SELECT MODEL TO OPTIMIZE
# `model_num` is the column of `models` that the rest of the notebook tunes.
# It was commented out here, so a fresh "Restart & Run All" stopped with a
# NameError at the first `models[...][model_num]` lookup. Keep a value that is
# already defined in the namespace, otherwise fall back to 15 - the model shown
# in the recorded outputs (model_15).
model_num = globals().get('model_num', 15)

In [5]:
#models

In [6]:
# Read the UFC master data set; every later frame in this notebook is derived from `df`.
df = pd.read_csv("../../data/kaggle_data/ufc-master.csv")

In [7]:
# Convert the 'date' column from its CSV text form into pandas datetimes.
df = df.assign(date=pd.to_datetime(df['date']))

In [8]:
def return_finish_type(winner, finish):
    """Combine the winning corner and the finish code into one outcome label.

    Parameters
    ----------
    winner : value of the 'Winner' column; only 'Red' and 'Blue' produce a label.
    finish : value of the 'finish' column (may be NaN).

    Returns
    -------
    str
        '<corner> - DEC', '<corner> - SUB' or '<corner> - KO/TKO' when the pair
        is recognised (a 'DQ' finish is grouped with KO/TKO); '' when the finish
        is NaN or 'Overturned'; 'error' for anything else.
    """
    # Finish codes grouped by method, in the same order as the label tuples below.
    method_codes = (
        ('U-DEC', 'S-DEC', 'M-DEC'),
        ('SUB',),
        ('KO/TKO', 'DQ'),
    )

    if winner == 'Red':
        corner_labels = ('Red - DEC', 'Red - SUB', 'Red - KO/TKO')
    elif winner == 'Blue':
        corner_labels = ('Blue - DEC', 'Blue - SUB', 'Blue - KO/TKO')
    else:
        corner_labels = None

    if corner_labels is not None:
        for codes, label in zip(method_codes, corner_labels):
            if finish in codes:
                return label

    # NaN is the only value that is not equal to itself.
    finish_is_nan = finish != finish
    if finish_is_nan or finish == 'Overturned':
        return ''

    return 'error'

In [9]:
# Tag every bout with its combined winner/method label, then keep only the
# bouts whose label is not blank (blank = NaN or overturned finish).
df['finish_type'] = [
    return_finish_type(winner, finish)
    for winner, finish in zip(df['Winner'], df['finish'])
]
mask = df['finish_type'].ne('')
df = df.loc[mask]

In [10]:
finish_list = ['Red - DEC', 'Red - SUB', 'Red - KO/TKO', 'Blue - DEC', 'Blue - SUB', 'Blue - KO/TKO']

# Encode each finish type as its position in `finish_list` (0-5).
# This used to be a loop doing `df['label'][mask] = f`. That is chained indexing:
# pandas may perform the write on a temporary object (and `df` is itself a
# filtered slice here), and with copy-on-write enabled the assignment has no
# effect at all. A single `.map` on the source column avoids the problem.
label_by_finish = {name: code for code, name in enumerate(finish_list)}
df['label'] = df['finish_type'].map(label_by_finish)

# Make sure the label is numeric. Rows whose finish type is not one of the six
# outcomes (e.g. 'error') end up as NaN, as they did before.
df['label'] = pd.to_numeric(df['label'], errors='coerce')

In [11]:
# Rename the six method-of-victory odds columns to the outcome labels, so the
# odds for an outcome can be selected by that outcome's label.
odds_column_names = {
    'r_dec_odds': 'Red - DEC',
    'r_sub_odds': 'Red - SUB',
    'r_ko_odds': 'Red - KO/TKO',
    'b_dec_odds': 'Blue - DEC',
    'b_sub_odds': 'Blue - SUB',
    'b_ko_odds': 'Blue - KO/TKO',
}
df = df.rename(columns=odds_column_names)

In [12]:
# Separate the target column and the per-outcome odds columns from the features.
label_df = df.loc[:, 'label']
odds_df = df.loc[:, finish_list]

In [13]:
# Hold out a fixed block of rows as the test set so it is never seen while
# models are being picked. The split is positional: the first `test_size` rows
# of the frame are the test set, everything after them is the training set.
# NOTE(review): the previous comment said "last 200 matches" but the code takes
# 250 rows from the top - presumably the file is sorted newest-first; confirm.
test_size = 250
holdout_rows = slice(None, test_size)
training_rows = slice(test_size, None)

df_test, df_train = df.iloc[holdout_rows], df.iloc[training_rows]
odds_test, odds_train = odds_df.iloc[holdout_rows], odds_df.iloc[training_rows]
label_test, label_train = label_df.iloc[holdout_rows], label_df.iloc[training_rows]

#print(len(df_test))
#print(len(odds_test))
#print(len(label_test))

#print(len(df_train))
#print(len(odds_train))
#print(len(label_train))

In [14]:
# Drop rows with a blank finish type from both splits ...
mask = df_train['finish_type'].ne('')
df_train = df_train.loc[mask]

mask = df_test['finish_type'].ne('')
df_test = df_test.loc[mask]

# ... then keep only the label / odds rows whose index survived in the
# corresponding frame, so the three objects of each split stay row-aligned.
label_train = label_train.loc[label_train.index.isin(df_train.index)]
odds_train = odds_train.loc[odds_train.index.isin(df_train.index)]

label_test = label_test.loc[label_test.index.isin(df_test.index)]
odds_test = odds_test.loc[odds_test.index.isin(df_test.index)]


#print(len(df_train))
#print(len(label_train))
#print(len(odds_train))
#print(len(df_test))
#print(len(label_test))
#print(len(odds_test))



In [15]:
if remove_fight_island:
    # Remove the Fight Island contests (identified by the Abu Dhabi location
    # string) to see how that affects the score.
    fight_island_location = 'Abu Dhabi, Abu Dhabi, United Arab Emirates'
    df_test_no_fight_island = df_test[df_test['location'] != fight_island_location]
    df_train_no_fight_island = df_train[df_train['location'] != fight_island_location]
    df_test = df_test_no_fight_island
    df_train = df_train_no_fight_island

    # The frames just lost rows, so re-align the label and odds splits by index,
    # the same way the cleaning cell does after its own filtering. Without this
    # the label/odds objects keep the removed bouts and no longer match the
    # frames row-for-row.
    label_train = label_train[label_train.index.isin(df_train.index)]
    label_test = label_test[label_test.index.isin(df_test.index)]
    odds_train = odds_train[odds_train.index.isin(df_train.index)]
    odds_test = odds_test[odds_test.index.isin(df_test.index)]



#print(len(df_test))
#print(len(df_train))


In [16]:
#display(df_train)
#display(df_test)

In [17]:
# Set a value for the nulls in the rank columns.
# The list keeps its original order: the two match-level rank columns first,
# then every Red per-division rank, then every Blue per-division rank.
weightclass_list = [
    'B_match_weightclass_rank',
    'R_match_weightclass_rank',
    "R_Women's Flyweight_rank",
    "R_Women's Featherweight_rank",
    "R_Women's Strawweight_rank",
    "R_Women's Bantamweight_rank",
    'R_Heavyweight_rank',
    'R_Light Heavyweight_rank',
    'R_Middleweight_rank',
    'R_Welterweight_rank',
    'R_Lightweight_rank',
    'R_Featherweight_rank',
    'R_Bantamweight_rank',
    'R_Flyweight_rank',
    'R_Pound-for-Pound_rank',
    "B_Women's Flyweight_rank",
    "B_Women's Featherweight_rank",
    "B_Women's Strawweight_rank",
    "B_Women's Bantamweight_rank",
    'B_Heavyweight_rank',
    'B_Light Heavyweight_rank',
    'B_Middleweight_rank',
    'B_Welterweight_rank',
    'B_Lightweight_rank',
    'B_Featherweight_rank',
    'B_Bantamweight_rank',
    'B_Flyweight_rank',
    'B_Pound-for-Pound_rank',
]

# Placeholder written wherever a rank is missing.
missing_rank_value = 17
df_train[weightclass_list] = df_train[weightclass_list].fillna(missing_rank_value)
df_test[weightclass_list] = df_test[weightclass_list].fillna(missing_rank_value)


In [18]:
#df_test
#df_test.to_csv('test.csv')

In [19]:
#1. Set features
#2. Set Hyperparameters
#3. Set EV
#4. Remove Features

In [20]:
# Display the stored estimator cell for the selected model (row 1 of the table).
# At this point it is still the raw text read from the CSV, not an estimator.
models[1][model_num]

"LinearDiscriminantAnalysis(solver='lsqr')"

In [21]:
# Pull the selected model's stored cells out of rows 0-3 of the table
# (name, estimator text, feature-list text, EV text).
name_text, model_text, features_text, ev_text = [
    models[row][model_num] for row in range(4)
]

# NOTE(review): `eval` executes whatever text is stored in models_mov.csv, so
# that file must be treated as trusted code. The estimator text has to resolve
# against the scikit-learn classes imported at the top of the notebook.
test_model_name = name_text
test_model = eval(model_text)
test_model_features = eval(features_text)
test_model_ev = eval(ev_text)

In [22]:
# Remember the starting estimator, feature list and EV threshold; the tuning
# loop compares against these to decide whether another pass is needed.
# These are plain references to the same objects, not copies.
old_test_model, old_test_model_features, old_test_model_ev = (
    test_model,
    test_model_features,
    test_model_ev,
)

In [23]:
# 1. Set features
# Candidate feature pool: the full list of column names that the feature-selection
# step of the tuning loop is allowed to choose from (it is passed as the first
# argument to get_best_features_mov). The six outcome-odds columns use the
# renamed labels ('Red - DEC', ...), not the raw CSV column names.
my_pos_features = ['R_odds', 'B_odds', 'R_ev', 'B_ev',
       'location', 'country', 'title_bout', 'weight_class', 'gender',
       'no_of_rounds', 'B_current_lose_streak', 'B_current_win_streak',
       'B_draw', 'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
       'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
       'R_win_by_Decision_Split', 'R_win_by_Decision_Unanimous',
       'R_win_by_KO/TKO', 'R_win_by_Submission',
       'R_win_by_TKO_Doctor_Stoppage', 'R_wins', 'R_Stance', 'R_Height_cms',
       'R_Reach_cms', 'R_Weight_lbs', 'R_age', 'B_age', 'lose_streak_dif',
       'win_streak_dif', 'longest_win_streak_dif', 'win_dif', 'loss_dif',
       'total_round_dif', 'total_title_bout_dif', 'ko_dif', 'sub_dif',
       'height_dif', 'reach_dif', 'age_dif', 'sig_str_dif', 'avg_sub_att_dif',
       'avg_td_dif', 'empty_arena', 'B_match_weightclass_rank', 'R_match_weightclass_rank', 
        "R_Women's Flyweight_rank", "R_Women's Featherweight_rank", "R_Women's Strawweight_rank",
        "R_Women's Bantamweight_rank", 'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 
        'R_Middleweight_rank', 'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank', 
        'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank', "B_Women's Flyweight_rank", 
        "B_Women's Featherweight_rank", "B_Women's Strawweight_rank", "B_Women's Bantamweight_rank", 
        'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank', 'B_Welterweight_rank', 
        'B_Lightweight_rank', 'B_Featherweight_rank', 'B_Bantamweight_rank', 'B_Flyweight_rank', 
        'B_Pound-for-Pound_rank', 'Red - DEC', 'Blue - DEC', 'Red - SUB', 'Blue - SUB', 'Red - KO/TKO', 'Blue - KO/TKO', 'better_rank']

In [24]:
# Show the starting point: name, estimator, feature list and EV threshold,
# one per line.
print(test_model_name, test_model, test_model_features, test_model_ev, sep='\n')



model_15
LinearDiscriminantAnalysis(solver='lsqr')
["R_Women's Featherweight_rank", 'B_Lightweight_rank', 'R_Featherweight_rank', 'R_Light Heavyweight_rank', 'total_title_bout_dif', 'R_losses', 'B_win_by_Submission', 'Blue - DEC', 'B_draw', 'R_Middleweight_rank', 'B_avg_SIG_STR_pct', "B_Women's Strawweight_rank", 'age_dif', 'B_Middleweight_rank', 'avg_sub_att_dif', 'avg_td_dif', 'B_avg_SUB_ATT', "R_Women's Flyweight_rank", 'R_Pound-for-Pound_rank', 'R_Reach_cms', 'R_wins', 'B_ev', 'R_Weight_lbs', 'sub_dif', 'R_ev', 'B_avg_TD_landed']
0


In [25]:
def save_model():
    """Score the current candidate and write it back to ``models_mov.csv``.

    Works entirely on notebook-level state: it reads ``test_model_name``,
    ``test_model``, ``test_model_features``, ``test_model_ev``, the train/test
    splits and ``finish_list``, scores the candidate with
    ``evaluate_model_mov``, stores the five values in column ``model_num`` of
    rows 0-4 of ``models``, and rewrites the whole CSV.

    The values are stored as text. ``csv.writer`` stringifies non-string cells
    with ``str()`` anyway, so the file content is the same as before, but the
    in-memory ``models`` table now holds exactly what the file holds. Storing
    the live estimator / list / number instead made any later
    ``eval(models[...][model_num])`` raise ``TypeError``, because ``eval`` only
    accepts source text.

    Returns
    -------
    None
    """
    score = evaluate_model_mov(
        test_model, test_model_features, test_model_ev,
        df_train, label_train, odds_train,
        df_test, label_test, odds_test,
        finish_list, verbose=True,
    )

    # Row order of the model table: name, estimator, features, EV, score.
    updated_cells = (test_model_name, test_model, test_model_features, test_model_ev, score)
    for row_index, value in enumerate(updated_cells):
        models[row_index][model_num] = str(value)

    # The `with` block closes the file on exit; no explicit close() is needed.
    with open('../../data/models_mov.csv', 'w', newline='') as outfile:
        csv.writer(outfile).writerows(models)

In [26]:
def print_model():
    """Print the current candidate, framed by a blank line above and below.

    Reads the notebook-level ``test_model_name``, ``test_model``,
    ``test_model_features`` and ``test_model_ev`` and prints each on its own
    line. Returns None.
    """
    # The empty strings at both ends produce the leading and trailing blank lines.
    print('', test_model_name, test_model, test_model_features, test_model_ev, '', sep='\n')

In [27]:
#Test evaluate model
#odds_test

#print(evaluate_model_mov(test_model, test_model_features, test_model_ev, df_train, label_train, odds_train, df_test, label_test, odds_test, finish_list))

In [28]:
# Tuning loop: run the four tuning stages in order (features -> hyperparameters
# -> EV threshold -> feature removal), printing and saving the candidate after
# each stage, and repeat the whole pass for as long as a pass changes the
# estimator, the feature list or the EV threshold.
keep_going = True
#keep_going = False
while(keep_going):

    
    #1. Set Features
    # Helper signature for reference:
    #get_best_features(pos_features, m, df, cur_features, labels, odds, scale=False)
    test_model_features = (get_best_features_mov(my_pos_features, test_model, df_train, test_model_features, label_train, odds_train, finish_list, 
                                             min_ev=test_model_ev))
    print_model()
    save_model()
    #2 Set hyperparameters
    # Helper signature for reference:
    #def tune_hyperparameters(input_model, input_features, input_df, input_labels, odds_input):
    test_model = tune_hyperparameters(test_model, test_model_features, df_train, label_train, odds_train, 
                                      min_ev=test_model_ev)
    
    print_model()
    save_model()    
    
    #3. Set EV
    # Helper signature for reference:
    #def tune_ev(input_model, input_features, input_df, input_labels, odds_input, verbose=False):
    test_model_ev = tune_ev_mov(test_model, test_model_features, df_train, label_train, odds_train, verbose=False)
    # Re-baseline the feature list before step 4 so that the comparison at the
    # bottom of the loop only sees what the removal step changed. This prevents
    # an unnecessary extra pass.
    old_test_model_features = test_model_features
    print_model()
    save_model()
    
    
    #4. Remove Features
    # Helper signature for reference:
    #def remove_to_improve(cur_features, m, df, labels, odds, scale=False, min_ev = 0):
    test_model_features = remove_to_improve_mov(test_model_features, test_model, df_train, label_train, odds_train, min_ev = test_model_ev)    
    # Assume this was the last pass; any change detected below turns it back on.
    keep_going = False
    
    print_model()
    save_model()
    # NOTE(review): `!=` on two estimator objects - whether this compares
    # parameters or object identity depends on the estimator class; confirm it
    # flags real hyperparameter changes and nothing else.
    if old_test_model != test_model:
        print("The hyperparameters are different")
        print("OLD:")
        print(old_test_model)
        print("NEW:")
        print(test_model)
        keep_going = True
        old_test_model = test_model
    # NOTE(review): the `old_*` names reference the same objects as the current
    # values, so a helper that edits a list in place would go undetected here -
    # verify the helpers return new lists.
    if old_test_model_features != test_model_features:
        print("The features are different")
        print("OLD:")
        print(old_test_model_features)
        print("NEW:")
        print(test_model_features)
        keep_going = True
        old_test_model_features = test_model_features
    if old_test_model_ev != test_model_ev:
        print("The EV is different")
        print("OLD:")
        print(old_test_model_ev)
        print("NEW:")
        print(test_model_ev)
        keep_going = True
        old_test_model_ev = test_model_ev
    

    

Current best score is: -5713.7552272536395
NO IMPROVEMENT
FINAL BEST SCORE: -5713.7552272536395

model_15
LinearDiscriminantAnalysis(solver='lsqr')
["R_Women's Featherweight_rank", 'B_Lightweight_rank', 'R_Featherweight_rank', 'R_Light Heavyweight_rank', 'total_title_bout_dif', 'R_losses', 'B_win_by_Submission', 'Blue - DEC', 'B_draw', 'R_Middleweight_rank', 'B_avg_SIG_STR_pct', "B_Women's Strawweight_rank", 'age_dif', 'B_Middleweight_rank', 'avg_sub_att_dif', 'avg_td_dif', 'B_avg_SUB_ATT', "R_Women's Flyweight_rank", 'R_Pound-for-Pound_rank', 'R_Reach_cms', 'R_wins', 'B_ev', 'R_Weight_lbs', 'sub_dif', 'R_ev', 'B_avg_TD_landed']
0



(2782, 26)

(2782,)

(2782, 6)

(239, 26)

(239,)

(239, 6)

250     0
252     5
253     2
254     3
255     0
       ..
4162    0
4170    1
4171    3
4172    5
4173    5
Name: label, Length: 2782, dtype: int64
Real Score: 6180.0
HI
HI
HI
HI
HI


Starting New Run for LinearDiscriminantAnalysis
LinearDiscriminantAnalysis(solver='lsqr')


Previous Best Score: -5713.7552272536395
solver:  svd tol:  0.0001 Score:  -6247.72074449502
solver:  svd tol:  0.00011000000000000002 Score:  -6247.72074449502
solver:  svd tol:  9e-05 Score:  -6247.72074449502
solver:  lsqr tol:  0.0001 Score:  -5713.7552272536395
solver:  lsqr tol:  0.00011000000000000002 Score:  -5713.7552272536395
solver:  lsqr tol:  9e-05 Score:  -5713.7552272536395
output model: LinearDiscriminantAnalysis(solver='lsqr')
pos model: LinearDiscriminantAnalysis(solver='lsqr')
 output_model: LinearDiscriminantAnalysis(solver='lsqr')
Real output model: LinearDiscriminantAnalysis(solver='lsqr')

model_15
LinearDiscriminantAnalysis(solver='lsqr')
["R_Women's Featherweight_rank", 'B_Lightweight_ran

(2782, 26)

(2782,)

(2782, 6)

(239, 26)

(239,)

(239, 6)

250     0
252     5
253     2
254     3
255     0
       ..
4162    0
4170    1
4171    3
4172    5
4173    5
Name: label, Length: 2782, dtype: int64
Real Score: 6180.0
HI
HI
HI
HI
HI

model_15
LinearDiscriminantAnalysis(solver='lsqr')
["R_Women's Featherweight_rank", 'B_Lightweight_rank', 'R_Featherweight_rank', 'R_Light Heavyweight_rank', 'total_title_bout_dif', 'R_losses', 'B_win_by_Submission', 'Blue - DEC', 'B_draw', 'R_Middleweight_rank', 'B_avg_SIG_STR_pct', "B_Women's Strawweight_rank", 'age_dif', 'B_Middleweight_rank', 'avg_sub_att_dif', 'avg_td_dif', 'B_avg_SUB_ATT', "R_Women's Flyweight_rank", 'R_Pound-for-Pound_rank', 'R_Reach_cms', 'R_wins', 'B_ev', 'R_Weight_lbs', 'sub_dif', 'R_ev', 'B_avg_TD_landed']
0



(2782, 26)

(2782,)

(2782, 6)

(239, 26)

(239,)

(239, 6)

250     0
252     5
253     2
254     3
255     0
       ..
4162    0
4170    1
4171    3
4172    5
4173    5
Name: label, Length: 2782, dtype: int64
Real Score: 6180.0
HI
HI
HI
HI
HI

model_15
LinearDiscriminantAnalysis(solver='lsqr')
["R_Women's Featherweight_rank", 'B_Lightweight_rank', 'R_Featherweight_rank', 'R_Light Heavyweight_rank', 'total_title_bout_dif', 'R_losses', 'B_win_by_Submission', 'Blue - DEC', 'B_draw', 'R_Middleweight_rank', 'B_avg_SIG_STR_pct', "B_Women's Strawweight_rank", 'age_dif', 'B_Middleweight_rank', 'avg_sub_att_dif', 'avg_td_dif', 'B_avg_SUB_ATT', "R_Women's Flyweight_rank", 'R_Pound-for-Pound_rank', 'R_Reach_cms', 'R_wins', 'B_ev', 'R_Weight_lbs', 'sub_dif', 'R_ev', 'B_avg_TD_landed']
0



(2782, 26)

(2782,)

(2782, 6)

(239, 26)

(239,)

(239, 6)

250     0
252     5
253     2
254     3
255     0
       ..
4162    0
4170    1
4171    3
4172    5
4173    5
Name: label, Length: 2782, dtype: int64
Real Score: 6180.0
HI
HI
HI
HI
HI


# Sanity check: print each in-memory value next to the matching cell of the
# model table (name first, then estimator, features and EV, each pair preceded
# by a blank line).
print(test_model_name)
print(models[0][model_num])

# NOTE(review): `eval` runs the stored cell as code - see the loading cell.
for current_value, table_row in (
    (test_model, 1),
    (test_model_features, 2),
    (test_model_ev, 3),
):
    print()
    print(current_value)
    print(eval(models[table_row][model_num]))