# 1. The goal here is to trace the MOV Automatic Feature selection to make sure everything is being calculated correctly

In [1]:
from sklearn.dummy import DummyClassifier #We will use the dummy classifier for speed reasons

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [3]:
df = pd.read_csv("../data/kaggle_data/ufc-master.csv")

In [4]:
len(df)

4783

In [5]:
#Let's fix the date
df['date'] = pd.to_datetime(df['date'])

In [6]:
def return_finish_type(winner, finish):
    #print(winner, finish)
    #Why overcomplicate things?  We can just use a few if statements
    if winner == 'Red':
        #print("HI")
        if finish in ['U-DEC', 'S-DEC', 'M-DEC']:
            return ('Red - DEC')
        if finish in ['SUB']:
            return('Red - SUB')
        if finish in ['KO/TKO', 'DQ']:
            return('Red - KO/TKO')
    if winner == 'Blue':
        if finish in ['U-DEC', 'S-DEC', 'M-DEC']:
            return ('Blue - DEC')
        if finish in ['SUB']:
            return('Blue - SUB')
        if finish in ['KO/TKO', 'DQ']:
            return('Blue - KO/TKO')
        
    #Test for NaN
    if finish != finish:
        return('')
    
    if finish == 'Overturned':
        return('')
    
    
    return ('error')

In [7]:
#This calls for the power of lambda!
df['finish_type'] = df.apply(lambda x: return_finish_type(x['Winner'], x['finish']), axis=1)
mask = df['finish_type'] != ''
df = df[mask]

In [8]:
finish_list = ['Red - DEC', 'Red - SUB', 'Red - KO/TKO', 'Blue - DEC', 'Blue - SUB', 'Blue - KO/TKO']

#Let's put all the labels in a dataframe
df['label'] = ''
#If the winner is not Red or Blue we can remove it.

for f in range(len(finish_list)):
    mask = df['finish_type'] == finish_list[f]
    df['label'][mask] = f
    
#df["Winner"] = df["Winner"].astype('category')
#df = df[(df['Winner'] != 'Blue') | (df['Winner'] == 'Red') ]


#Make sure lable is numeric
df['label'] = pd.to_numeric(df['label'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'][mask] = f


In [9]:
df.rename(columns={'r_dec_odds': 'Red - DEC', 'r_sub_odds': 'Red - SUB', 'r_ko_odds': 'Red - KO/TKO',
                'b_dec_odds': 'Blue - DEC', 'b_sub_odds': 'Blue - SUB', 'b_ko_odds': 'Blue - KO/TKO'}, inplace=True)

In [10]:
label_df = df['label']
odds_df = df[finish_list]

In [11]:
#Split the test set.  We are always(?) going to use the last 200 matches as the test set, so we don't want those around
#as we pick models

df_train = df[250:]
odds_train = odds_df[250:]
label_train = label_df[250:]

df_test = df[:250]
odds_test = odds_df[:250]
label_test = label_df[:250]

print(len(df_test))
print(len(odds_test))
print(len(label_test))

print(len(df_train))
print(len(odds_train))
print(len(label_train))

250
250
250
4293
4293
4293


In [12]:
#We need to clean
mask = df_train['finish_type'] != ''
df_train = df_train[mask]
#print(len(df_train))

mask = df_test['finish_type'] != ''
df_test = df_test[mask]
#print(len(df_test))

label_train = label_train[label_train.index.isin(df_train.index)]
label_test = label_test[label_test.index.isin(df_test.index)]

odds_train = odds_train[odds_train.index.isin(df_train.index)]
odds_test = odds_test[odds_test.index.isin(df_test.index)]


print(len(df_train))
print(len(label_train))
print(len(odds_train))
print(len(df_test))
print(len(label_test))
print(len(odds_test))

4293
4293
4293
250
250
250


In [13]:
weightclass_list = ['B_match_weightclass_rank', 'R_match_weightclass_rank', "R_Women's Flyweight_rank", "R_Women's Featherweight_rank", "R_Women's Strawweight_rank", "R_Women's Bantamweight_rank", 'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 'R_Middleweight_rank', 'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank', 'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank', "B_Women's Flyweight_rank", "B_Women's Featherweight_rank", "B_Women's Strawweight_rank", "B_Women's Bantamweight_rank", 'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank', 'B_Welterweight_rank', 'B_Lightweight_rank', 'B_Featherweight_rank', 'B_Bantamweight_rank', 'B_Flyweight_rank', 'B_Pound-for-Pound_rank']
df_train[weightclass_list] = df_train[weightclass_list].fillna(17)
df_test[weightclass_list] = df_test[weightclass_list].fillna(17)

In [14]:
test_model_name = "dummy_model_for_trace"
test_model = DummyClassifier(random_state=75)
test_model_features = []
test_model_ev = 0

In [15]:
old_test_model = test_model
old_test_model_features = test_model_features
old_test_model_ev = test_model_ev

In [16]:
#1. set features
my_pos_features = ['R_odds', 'B_odds', 'R_ev', 'B_ev',
       'location', 'country', 'title_bout', 'weight_class', 'gender',
       'no_of_rounds', 'B_current_lose_streak', 'B_current_win_streak',
       'B_draw', 'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
       'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
       'R_win_by_Decision_Split', 'R_win_by_Decision_Unanimous',
       'R_win_by_KO/TKO', 'R_win_by_Submission',
       'R_win_by_TKO_Doctor_Stoppage', 'R_wins', 'R_Stance', 'R_Height_cms',
       'R_Reach_cms', 'R_Weight_lbs', 'R_age', 'B_age', 'lose_streak_dif',
       'win_streak_dif', 'longest_win_streak_dif', 'win_dif', 'loss_dif',
       'total_round_dif', 'total_title_bout_dif', 'ko_dif', 'sub_dif',
       'height_dif', 'reach_dif', 'age_dif', 'sig_str_dif', 'avg_sub_att_dif',
       'avg_td_dif', 'empty_arena', 'B_match_weightclass_rank', 'R_match_weightclass_rank', 
        "R_Women's Flyweight_rank", "R_Women's Featherweight_rank", "R_Women's Strawweight_rank",
        "R_Women's Bantamweight_rank", 'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 
        'R_Middleweight_rank', 'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank', 
        'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank', "B_Women's Flyweight_rank", 
        "B_Women's Featherweight_rank", "B_Women's Strawweight_rank", "B_Women's Bantamweight_rank", 
        'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank', 'B_Welterweight_rank', 
        'B_Lightweight_rank', 'B_Featherweight_rank', 'B_Bantamweight_rank', 'B_Flyweight_rank', 
        'B_Pound-for-Pound_rank', 'Red - DEC', 'Blue - DEC', 'Red - SUB', 'Blue - SUB', 'Red - KO/TKO', 'Blue - KO/TKO', 'better_rank']

In [17]:
print(test_model_name)
print(test_model)
print(test_model_features)
print(test_model_ev)


dummy_model_for_trace
DummyClassifier(random_state=75)
[]
0


In [18]:
def print_model():
    print()
    print(test_model_name)
    print(test_model)
    print(test_model_features)
    print(test_model_ev)
    print()

In [19]:
def get_bet_return(odds):
    if odds>0:
        return odds
    else:
        return (100 / abs(odds))*100

In [20]:
def get_bet_ev(odds, prob):
    if odds>0:
        return ((odds * prob) - (100 * (1-prob)) )
    else:
        return ((100 / abs(odds))*100*prob - (100 * (1-prob)))


In [90]:
def get_ev_for_optimize_mov(df_odds, probs, labels,  print_stats = False, min_ev = 0, get_total=True):
        
    score = 0
    #print(df_odds)
    for i in range(len(df_odds)):
        #print(i)
        #        df_temp_odds = df_odds.iloc[[i, :]]
        #print()
        #print()
        #print(df_odds[i])
        print()
        print("Winner: " + str(labels[i]))
        print(df_odds[i])
        print(probs[i])

        for l in range(len(probs[i])):
            temp_odds = (df_odds[i][l])
            #print((temp_odds))
            bet_ev = get_bet_ev(temp_odds, probs[i][l])
            #print(bet_ev)
            if bet_ev > min_ev:
                print("Bet " + str(l))
                #print(l)
                if labels[i] == l:
                    #print(f"{int(labels[i])} {l}")
                    score = score + get_bet_return(temp_odds)
                    print(f"Winning Bet. New Score: {score}")
                else:
                    score = score - 100
                    print(f"Losing Bet.  New Score: {score}")
                    
            #print()
            
            
            
        #print(f"Result: {labels[i]}")
    return(score)


In [91]:
get_ev_for_optimize_mov(temp_input[0], temp_input[1], temp_input[2])


Winner: 2
[ 350. 1600.  160.  350. 1800.  265.]
[0.33333333 0.         0.66666667 0.         0.         0.        ]
Bet 0
Losing Bet.  New Score: -100
Bet 2
Winning Bet. New Score: 60.0

Winner: 5
[ 210. 1000.  450.  450.  425.  300.]
[0. 0. 1. 0. 0. 0.]
Bet 2
Losing Bet.  New Score: -40.0

Winner: 1
[ 100.  675. 1400.  225.  600.  650.]
[0. 0. 1. 0. 0. 0.]
Bet 2
Losing Bet.  New Score: -140.0

Winner: 5
[ 350.  375.  650.  350. 1000.  185.]
[0.         0.         0.         0.66666667 0.         0.33333333]
Bet 3
Losing Bet.  New Score: -240.0

Winner: 0
[ 160. 1800.  150.  375.  900.  850.]
[0.5 0.  0.  0.  0.  0.5]
Bet 0
Winning Bet. New Score: -80.0
Bet 5
Losing Bet.  New Score: -180.0

Winner: 4
[ 300.  550.  170.  300. 1100.  600.]
[0. 0. 1. 0. 0. 0.]
Bet 2
Losing Bet.  New Score: -280.0

Winner: 2
[ 650.  425. 1100.  375. 1800. -135.]
[0. 0. 1. 0. 0. 0.]
Bet 2
Winning Bet. New Score: 820.0

Winner: 0
[ 215. 1400.  220.  255. 1200.  450.]
[0.66666667 0.         0.         0.    

-1740.0

In [75]:
def custom_cv_eval_mov(df, m, labels, odds, min_ev=0, verbose=False, get_total=True):
    #If we have less than 5 samples we are going to break the split.
    #print("HI")
    if len(df) < 5:
        return 0
    X = np.array(df)
    y = np.array(labels)
    odds = np.array(odds)
    running_total = 0
    count=1
    kf = KFold(n_splits=5, shuffle=True, random_state=75)
    for train_index, test_index in kf.split(X):
        print()
        print()
        print()
        print()
        print()
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        odds_train, odds_test = odds[train_index], odds[test_index]
        print(y_test)
        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(X_train)
        scaled_test = scaler.transform(X_test)
        
        m.fit(scaled_train, y_train)
        probs=m.predict_proba(scaled_test)
        print(probs)
        #print(probs)
        #We need to prep the dataframe to evaluate....
        #X_odds = X_test[['t1_odds', 't2_odds']]
        #print(X_test)
        #print(X_test[:, -1])
        #print(X_test[:, -2])
        #X_odds = list(zip(odds_test[:, -2], odds_test[:, -1], probs[:, 0], probs[:, 1], y_test))
        #ev_prepped_df = pd.DataFrame(X_odds, columns=['t1_odds', 't2_odds', 't1_prob', 't2_prob', 'winner'])
        #display(ev_prepped_df)
        #display(temp_df)
        #print(f"{count}: {get_ev_from_df(ev_prepped_df, print_stats = False)}")
        count=count+1
        print()
        print("Odds Test, Probs, the y_test")
        print(str(odds_test))
        print()
        print(probs)
        print()
        print(y_test)
        running_total = running_total + get_ev_for_optimize_mov(odds_test, probs, y_test,  min_ev= min_ev, get_total=get_total )
        
        #display(ev_prepped_df)
    
    return running_total

In [76]:
rt = (custom_cv_eval_mov(df_test[['R_odds']], DecisionTreeClassifier(), label_test, odds_test, 0))







[2 5 1 5 0 4 2 0 5 3 0 1 0 0 1 1 0 3 5 3 0 0 0 4 0 0 0 5 2 0 3 2 0 1 1 3 3
 3 0 3 3 0 1 0 3 3 2 0 2 0]
[[0.33333333 0.         0.66666667 0.         0.         0.        ]
 [0.         0.         1.         0.         0.         0.        ]
 [0.         0.         1.         0.         0.         0.        ]
 [0.         0.         0.         0.66666667 0.         0.33333333]
 [0.5        0.         0.         0.         0.         0.5       ]
 [0.         0.         1.         0.         0.         0.        ]
 [0.         0.         1.         0.         0.         0.        ]
 [0.66666667 0.         0.         0.         0.         0.33333333]
 [1.         0.         0.         0.         0.         0.        ]
 [1.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.5        0.25       0.25      ]
 [0.         0.         0.         1.         0.         0.        ]
 [0.5        0.         0.         0.         0.         0.5    

In [77]:
print(rt)

-2738.761995986134


In [25]:
def get_best_features_mov(pos_features, m, df, cur_features, labels, odds, label_list, scale=False, min_ev=0):
    best_feature = ''
        
    #If there are no current features...
    if len(cur_features) == 0:
        best_score = -1000000
    else:
        df_sel = df[cur_features]
        df_sel = df_sel.dropna()
        df_sel = pd.get_dummies(df_sel)
        #OK we need to filter the labels and odds based off of the indices
        labels_sel = labels[labels.index.isin(df_sel.index)]
        odds_sel = odds[odds.index.isin(df_sel.index)]        
        labels_sel = labels_sel.dropna()
        odds_sel = odds_sel[odds_sel.index.isin(labels_sel.index)]     
        df_sel = df_sel[df_sel.index.isin(labels_sel.index)] 
        best_score = custom_cv_eval_mov(df_sel, m, labels_sel, odds_sel, min_ev=min_ev)
        
        
    best_feature = ""
    
    print(f"Current best score is: {best_score}")
    #Go thru every feature and test it...
    for f in pos_features:
        #If f is not a current feature
        if f not in cur_features:
            new_features = [f] + cur_features
            df_sel = df[new_features]
            df_sel = df_sel.dropna()
            df_sel = pd.get_dummies(df_sel)
            #display(df_sel)
            #OK we need to filter the labels and odds based off of the indices
            labels_sel = labels[labels.index.isin(df_sel.index)]
            odds_sel = odds[odds.index.isin(df_sel.index)]
            labels_sel = labels_sel.dropna()
            odds_sel = odds_sel[odds_sel.index.isin(labels_sel.index)]     
            df_sel = df_sel[df_sel.index.isin(labels_sel.index)] 
            
            
            new_score = custom_cv_eval_mov(df_sel, m, labels_sel, odds_sel, min_ev=min_ev)
            #print(f"{len(df_sel)} {len(labels_sel)} {len(odds_sel)}")
            if new_score > best_score:
                print(f"Feature: {f} Score: {new_score}")
                best_score = new_score
                best_feature = f
    if best_feature != "":
        print(f"The best feature was {best_feature}.  It scored {best_score}")
        cur_features = [best_feature] + cur_features
        #Keep running until we don't improve
        return(get_best_features_mov(pos_features, m, df, cur_features, labels, odds, label_list,  scale, min_ev=min_ev))
    else:
        print("NO IMPROVEMENT")
        print(f"FINAL BEST SCORE: {best_score}")
        return cur_features                
                
    return []



In [26]:
keep_going = True

while(keep_going):

    test_model_features = (get_best_features_mov(my_pos_features, test_model, df_train, test_model_features, label_train, odds_train, finish_list, 
                                             min_ev=test_model_ev))
    
    print_model()
    
    keep_going = False

Current best score is: -186119.0
R_odds
B_odds
R_ev
B_ev
location
country
title_bout
weight_class
gender
no_of_rounds
B_current_lose_streak
B_current_win_streak
B_draw
B_avg_SIG_STR_landed
B_avg_SIG_STR_pct
B_avg_SUB_ATT
B_avg_TD_landed
B_avg_TD_pct
B_longest_win_streak
B_losses
B_total_rounds_fought
B_total_title_bouts
B_win_by_Decision_Majority
B_win_by_Decision_Split
B_win_by_Decision_Unanimous
B_win_by_KO/TKO
B_win_by_Submission
B_win_by_TKO_Doctor_Stoppage
B_wins
B_Stance
B_Height_cms
B_Reach_cms
B_Weight_lbs
R_current_lose_streak
R_current_win_streak
R_draw
R_avg_SIG_STR_landed
R_avg_SIG_STR_pct
R_avg_SUB_ATT
R_avg_TD_landed
R_avg_TD_pct
R_longest_win_streak
R_losses
R_total_rounds_fought
R_total_title_bouts
R_win_by_Decision_Majority
R_win_by_Decision_Split
R_win_by_Decision_Unanimous
R_win_by_KO/TKO
R_win_by_Submission
R_win_by_TKO_Doctor_Stoppage
R_wins
R_Stance
R_Height_cms
R_Reach_cms
R_Weight_lbs
R_age
B_age
lose_streak_dif
win_streak_dif
longest_win_streak_dif
win_dif
loss