In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import random


In [2]:
df = pd.read_csv("../data/kaggle_data/ufc-master.csv")

In [3]:
len(df)

4783

In [4]:
#Let's fix the date
df['date'] = pd.to_datetime(df['date'])

In [5]:
def return_finish_type(winner, finish):
    #print(winner, finish)
    #Why overcomplicate things?  We can just use a few if statements
    if winner == 'Red':
        #print("HI")
        if finish in ['U-DEC', 'S-DEC', 'M-DEC']:
            return ('Red - DEC')
        if finish in ['SUB']:
            return('Red - SUB')
        if finish in ['KO/TKO', 'DQ']:
            return('Red - KO/TKO')
    if winner == 'Blue':
        if finish in ['U-DEC', 'S-DEC', 'M-DEC']:
            return ('Blue - DEC')
        if finish in ['SUB']:
            return('Blue - SUB')
        if finish in ['KO/TKO', 'DQ']:
            return('Blue - KO/TKO')
        
    #Test for NaN
    if finish != finish:
        return('')
    
    if finish == 'Overturned':
        return('')
    
    
    return ('error')

In [6]:
#This calls for the power of lambda!
df['finish_type'] = df.apply(lambda x: return_finish_type(x['Winner'], x['finish']), axis=1)
mask = df['finish_type'] != ''
df = df[mask]

In [7]:
finish_list = ['Red - DEC', 'Red - SUB', 'Red - KO/TKO', 'Blue - DEC', 'Blue - SUB', 'Blue - KO/TKO']

#Let's put all the labels in a dataframe
df['label'] = ''
#If the winner is not Red or Blue we can remove it.

for f in range(len(finish_list)):
    mask = df['finish_type'] == finish_list[f]
    df['label'][mask] = f
    
#df["Winner"] = df["Winner"].astype('category')
#df = df[(df['Winner'] != 'Blue') | (df['Winner'] == 'Red') ]


#Make sure lable is numeric
df['label'] = pd.to_numeric(df['label'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'][mask] = f


In [8]:
df.rename(columns={'r_dec_odds': 'Red - DEC', 'r_sub_odds': 'Red - SUB', 'r_ko_odds': 'Red - KO/TKO',
                'b_dec_odds': 'Blue - DEC', 'b_sub_odds': 'Blue - SUB', 'b_ko_odds': 'Blue - KO/TKO'}, inplace=True)

In [9]:
label_df = df['label']
odds_df = df[finish_list]

In [10]:
#Split the test set.  We are always(?) going to use the last 200 matches as the test set, so we don't want those around
#as we pick models

df_train = df[250:]
odds_train = odds_df[250:]
label_train = label_df[250:]

df_test = df[:250]
odds_test = odds_df[:250]
label_test = label_df[:250]

print(len(df_test))
print(len(odds_test))
print(len(label_test))

print(len(df_train))
print(len(odds_train))
print(len(label_train))

250
250
250
4293
4293
4293


In [11]:
#We need to clean
mask = df_train['finish_type'] != ''
df_train = df_train[mask]
#print(len(df_train))

mask = df_test['finish_type'] != ''
df_test = df_test[mask]
#print(len(df_test))

label_train = label_train[label_train.index.isin(df_train.index)]
label_test = label_test[label_test.index.isin(df_test.index)]

odds_train = odds_train[odds_train.index.isin(df_train.index)]
odds_test = odds_test[odds_test.index.isin(df_test.index)]


print(len(df_train))
print(len(label_train))
print(len(odds_train))
print(len(df_test))
print(len(label_test))
print(len(odds_test))

4293
4293
4293
250
250
250


In [12]:
weightclass_list = ['B_match_weightclass_rank', 'R_match_weightclass_rank', "R_Women's Flyweight_rank", "R_Women's Featherweight_rank", "R_Women's Strawweight_rank", "R_Women's Bantamweight_rank", 'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 'R_Middleweight_rank', 'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank', 'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank', "B_Women's Flyweight_rank", "B_Women's Featherweight_rank", "B_Women's Strawweight_rank", "B_Women's Bantamweight_rank", 'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank', 'B_Welterweight_rank', 'B_Lightweight_rank', 'B_Featherweight_rank', 'B_Bantamweight_rank', 'B_Flyweight_rank', 'B_Pound-for-Pound_rank']
df_train[weightclass_list] = df_train[weightclass_list].fillna(17)
df_test[weightclass_list] = df_test[weightclass_list].fillna(17)

In [13]:
test_model_name = "dummy_model_for_trace"
test_model = DecisionTreeClassifier(random_state=75, min_samples_leaf=0.01)
test_model_features = ['R_odds']
test_model_ev = 0

In [14]:
old_test_model = test_model
old_test_model_features = test_model_features
old_test_model_ev = test_model_ev

In [15]:
my_pos_features = ['R_odds', 'B_odds', 'R_ev', 'B_ev',
       'location', 'country', 'title_bout', 'weight_class', 'gender',
       'no_of_rounds', 'B_current_lose_streak', 'B_current_win_streak',
       'B_draw', 'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
       'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
       'R_win_by_Decision_Split', 'R_win_by_Decision_Unanimous',
       'R_win_by_KO/TKO', 'R_win_by_Submission',
       'R_win_by_TKO_Doctor_Stoppage', 'R_wins', 'R_Stance', 'R_Height_cms',
       'R_Reach_cms', 'R_Weight_lbs', 'R_age', 'B_age', 'lose_streak_dif',
       'win_streak_dif', 'longest_win_streak_dif', 'win_dif', 'loss_dif',
       'total_round_dif', 'total_title_bout_dif', 'ko_dif', 'sub_dif',
       'height_dif', 'reach_dif', 'age_dif', 'sig_str_dif', 'avg_sub_att_dif',
       'avg_td_dif', 'empty_arena', 'B_match_weightclass_rank', 'R_match_weightclass_rank', 
        "R_Women's Flyweight_rank", "R_Women's Featherweight_rank", "R_Women's Strawweight_rank",
        "R_Women's Bantamweight_rank", 'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 
        'R_Middleweight_rank', 'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank', 
        'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank', "B_Women's Flyweight_rank", 
        "B_Women's Featherweight_rank", "B_Women's Strawweight_rank", "B_Women's Bantamweight_rank", 
        'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank', 'B_Welterweight_rank', 
        'B_Lightweight_rank', 'B_Featherweight_rank', 'B_Bantamweight_rank', 'B_Flyweight_rank', 
        'B_Pound-for-Pound_rank', 'Red - DEC', 'Blue - DEC', 'Red - SUB', 'Blue - SUB', 'Red - KO/TKO', 'Blue - KO/TKO', 'better_rank']

In [16]:
print(test_model_name)
print(test_model)
print(test_model_features)
print(test_model_ev)


dummy_model_for_trace
DecisionTreeClassifier(min_samples_leaf=0.01, random_state=75)
['R_odds']
0


In [17]:
def print_model():
    print()
    print(test_model_name)
    print(test_model)
    print(test_model_features)
    print(test_model_ev)
    print()

In [18]:

def get_ev(input_df, input_model, input_features, input_labels, odds_input, min_ev = 0, verbose=False, get_total=True):
    df_sel = input_df[input_features]
    df_sel = df_sel.dropna()
    df_sel = pd.get_dummies(df_sel)
    labels_sel = input_labels[input_labels.index.isin(df_sel.index)]
    odds_sel = odds_input[odds_input.index.isin(df_sel.index)] 
    best_score = custom_cv_eval(df_sel, input_model, labels_sel, odds_sel, min_ev = min_ev, verbose=verbose, 
                                get_total=get_total)
    return best_score


#Input: American Odds, and Probability of a Winning Bet
#Output: Bet EV based on a $100 bet
def get_bet_ev(odds, prob):
    if odds>0:
        return ((odds * prob) - (100 * (1-prob)) )
    else:
        return ((100 / abs(odds))*100*prob - (100 * (1-prob)))
    
def get_bet_return(odds):
    if odds>0:
        return odds
    else:
        return (100 / abs(odds))*100
    

In [19]:

def get_ev_for_optimize_mov(df_odds, probs, labels,  print_stats = False, min_ev = 0, get_total=True):
        
    score = 0
    #print(df_odds)
    for i in range(len(df_odds)):
        #print(i)
        #        df_temp_odds = df_odds.iloc[[i, :]]
        #print()
        #print()
        #print(df_odds[i])
        for l in range(len(probs[i])):
            temp_odds = (df_odds[i][l])
            #print((temp_odds))
            bet_ev = get_bet_ev(temp_odds, probs[i][l])
            #print(bet_ev)
            if bet_ev > min_ev:
                #print(l)
                if labels[i] == l:
                    #print(f"{int(labels[i])} {l}")
                    score = score + get_bet_return(temp_odds)
                    #print(f"Winning Bet. New Score: {score}")
                else:
                    score = score - 100
                    #print(f"Losing Bet.  New Score: {score}")
                    
            #print()
            
            
            
        #print(f"Result: {labels[i]}")
    return(score)

In [20]:
def custom_cv_eval_mov(df, m, labels, odds, min_ev=0, verbose=False, get_total=True):
    #If we have less than 5 samples we are going to break the split.
    #print("HI")
    if len(df) < 5:
        return 0
    X = np.array(df)
    y = np.array(labels)
    odds = np.array(odds)
    running_total = 0
    count=1
    kf = KFold(n_splits=5, shuffle=True, random_state=75)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        odds_train, odds_test = odds[train_index], odds[test_index]
        #display(y_train)
        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(X_train)
        scaled_test = scaler.transform(X_test)
        
        m.fit(scaled_train, y_train)
        probs=m.predict_proba(scaled_test)
        #print(probs)
        #We need to prep the dataframe to evaluate....
        #X_odds = X_test[['t1_odds', 't2_odds']]
        #print(X_test)
        #print(X_test[:, -1])
        #print(X_test[:, -2])
        #X_odds = list(zip(odds_test[:, -2], odds_test[:, -1], probs[:, 0], probs[:, 1], y_test))
        #ev_prepped_df = pd.DataFrame(X_odds, columns=['t1_odds', 't2_odds', 't1_prob', 't2_prob', 'winner'])
        #display(ev_prepped_df)
        #display(temp_df)
        #print(f"{count}: {get_ev_from_df(ev_prepped_df, print_stats = False)}")
        count=count+1
        running_total = running_total + get_ev_for_optimize_mov(odds_test, probs, y_test,  min_ev= min_ev, get_total=get_total )

        #display(ev_prepped_df)
    
    return running_total

In [21]:


def get_ev(input_df, input_model, input_features, input_labels, odds_input, min_ev = 0, verbose=False, get_total=True):
    df_sel = input_df[input_features]
    df_sel = df_sel.dropna()
    df_sel = pd.get_dummies(df_sel)
    labels_sel = input_labels[input_labels.index.isin(df_sel.index)]
    odds_sel = odds_input[odds_input.index.isin(df_sel.index)] 
    print(len(odds_sel.columns))
    if len(odds_sel.columns) == 6:
        best_score = custom_cv_eval_mov(df_sel, input_model, labels_sel, odds_sel, min_ev = min_ev, verbose=verbose, 
                                get_total=get_total)        
    else:
        best_score = custom_cv_eval(df_sel, input_model, labels_sel, odds_sel, min_ev = min_ev, verbose=verbose, 
                                get_total=get_total)
    return best_score


In [22]:
def tune_DecisionTreeClassifier(input_model, input_features, input_df, input_labels, odds_input, min_ev=0):
    ###############################################################################################################
    #Parameters we are going to fine-tune:
    #1. criterion ('gini', 'entropy')
    #2. splitter ('random', 'best')
    #3. max_depth ('none', IF A NUMBER EXISTS +1, -1, random, else 2 RANDOM INTS 1->100)
    #4. min_samples_leaf(n-1, 0,  n+1)
    #5. max_leaf_nodes:('none', n+1, n-1, OR 4 random numbers)
    ###############################################################################################################
    print()
    print()
    print("Starting New Run for DecisionTree")
    print()
    print()
    output_model = input_model
    best_score = get_ev(input_df, input_model, input_features, input_labels, odds_input, min_ev=min_ev)
    print("Previous Best Score:", best_score)    

    criterion = ['gini', 'entropy']
    splitter = ['random', 'best']
    if input_model.max_depth == None:
        max_depth = [None, random.randrange(100)+1, random.randrange(100)+1]
    else:
        max_depth = [input_model.max_depth, input_model.max_depth - 1, input_model.max_depth + 1, random.randrange(100)+1]
        max_depth = [i for i in max_depth if i > 0]

    min_samples_leaf = [input_model.min_samples_leaf, input_model.min_samples_leaf *1.01,
                         input_model.min_samples_leaf*0.99]
    min_samples_leaf = [i for i in min_samples_leaf if i > 0]    
    if ((input_model.max_leaf_nodes == None) or (input_model.max_leaf_nodes == 1)):
        max_leaf_nodes = [None, random.randrange(1000)+1, random.randrange(1000)+1]
    else:
        max_leaf_nodes = [input_model.max_leaf_nodes, input_model.max_leaf_nodes - 1, 
                     input_model.max_leaf_nodes + 1, random.randrange(1000)+1]
        max_leaf_nodes = [i for i in max_leaf_nodes if i > 0]
        
    for l in max_leaf_nodes:
        for sam in min_samples_leaf:
            for m in max_depth:
                for c in criterion:
                    for s in splitter:
                        test_model = DecisionTreeClassifier(criterion = c, splitter = s, max_depth = m,
                                                            min_samples_leaf=sam, max_leaf_nodes = l, random_state=75)
                        score = get_ev(input_df, test_model, input_features, input_labels, odds_input, min_ev=min_ev)
                        if score > best_score:
                            best_score = score
                            output_model = test_model
                            print()
                            print("NEW BEST SCORE")
                            
                            print("Criterion:", c, "splitter:", s, "max_depth:", m, 
                                  "min_samples_leaf:", sam, "max_leaf_nodes:", l, best_score)        
                            print()
                        else:
                            pass
                            print("Criterion:", c, "splitter:", s, "max_depth:", m, 
                                  "min_samples_leaf:", sam, "max_leaf_nodes:", l, score)        
                            
                                        
    
    return output_model

In [23]:
def tune_hyperparameters(input_model, input_features, input_df, input_labels, odds_input, min_ev=0):
    best_model = input_model
    keep_going = True
    
    if isinstance(input_model, DecisionTreeClassifier):
        while(keep_going):
            pos_model = (tune_DecisionTreeClassifier(best_model, input_features, input_df, input_labels, odds_input, min_ev=min_ev))
            if str(pos_model) == str(best_model):  #Direct comparisons don't seem to work....
                keep_going = False
                output_model = best_model
            else:
                best_model = pos_model            

In [24]:
tune_hyperparameters(test_model, test_model_features, df_train, label_train, odds_train, 
                                      min_ev=test_model_ev)




Starting New Run for DecisionTree


6
Previous Best Score: -167273.6666666667
6
Criterion: gini splitter: random max_depth: None min_samples_leaf: 0.01 max_leaf_nodes: None -260952.0
6
Criterion: gini splitter: best max_depth: None min_samples_leaf: 0.01 max_leaf_nodes: None -167273.6666666667
6
Criterion: entropy splitter: random max_depth: None min_samples_leaf: 0.01 max_leaf_nodes: None -260952.0
6
Criterion: entropy splitter: best max_depth: None min_samples_leaf: 0.01 max_leaf_nodes: None -174210.0
6
Criterion: gini splitter: random max_depth: 80 min_samples_leaf: 0.01 max_leaf_nodes: None -260952.0
6
Criterion: gini splitter: best max_depth: 80 min_samples_leaf: 0.01 max_leaf_nodes: None -167273.6666666667
6
Criterion: entropy splitter: random max_depth: 80 min_samples_leaf: 0.01 max_leaf_nodes: None -260952.0
6
Criterion: entropy splitter: best max_depth: 80 min_samples_leaf: 0.01 max_leaf_nodes: None -174210.0
6
Criterion: gini splitter: random max_depth: 58 min_samples_leaf:

6
Criterion: gini splitter: best max_depth: 80 min_samples_leaf: 0.01 max_leaf_nodes: 436 -167273.6666666667
6
Criterion: entropy splitter: random max_depth: 80 min_samples_leaf: 0.01 max_leaf_nodes: 436 -260952.0
6
Criterion: entropy splitter: best max_depth: 80 min_samples_leaf: 0.01 max_leaf_nodes: 436 -174210.0
6
Criterion: gini splitter: random max_depth: 58 min_samples_leaf: 0.01 max_leaf_nodes: 436 -260952.0
6
Criterion: gini splitter: best max_depth: 58 min_samples_leaf: 0.01 max_leaf_nodes: 436 -167273.6666666667
6
Criterion: entropy splitter: random max_depth: 58 min_samples_leaf: 0.01 max_leaf_nodes: 436 -260952.0
6
Criterion: entropy splitter: best max_depth: 58 min_samples_leaf: 0.01 max_leaf_nodes: 436 -174210.0
6
Criterion: gini splitter: random max_depth: None min_samples_leaf: 0.0101 max_leaf_nodes: 436 -260952.0
6
Criterion: gini splitter: best max_depth: None min_samples_leaf: 0.0101 max_leaf_nodes: 436 -167273.6666666667
6
Criterion: entropy splitter: random max_dep

Criterion: entropy splitter: best max_depth: 67 min_samples_leaf: 0.0099 max_leaf_nodes: 788 -174130.0
6
Criterion: gini splitter: random max_depth: 79 min_samples_leaf: 0.0099 max_leaf_nodes: 788 -260952.0
6
Criterion: gini splitter: best max_depth: 79 min_samples_leaf: 0.0099 max_leaf_nodes: 788 -166576.66666666666
6
Criterion: entropy splitter: random max_depth: 79 min_samples_leaf: 0.0099 max_leaf_nodes: 788 -260952.0
6
Criterion: entropy splitter: best max_depth: 79 min_samples_leaf: 0.0099 max_leaf_nodes: 788 -174130.0
6
Criterion: gini splitter: random max_depth: None min_samples_leaf: 0.009999000000000001 max_leaf_nodes: 788 -260952.0
6
Criterion: gini splitter: best max_depth: None min_samples_leaf: 0.009999000000000001 max_leaf_nodes: 788 -167273.6666666667
6
Criterion: entropy splitter: random max_depth: None min_samples_leaf: 0.009999000000000001 max_leaf_nodes: 788 -260952.0
6
Criterion: entropy splitter: best max_depth: None min_samples_leaf: 0.009999000000000001 max_leaf

Criterion: gini splitter: best max_depth: 58 min_samples_leaf: 0.009801 max_leaf_nodes: None -164846.66666666666
6
Criterion: entropy splitter: random max_depth: 58 min_samples_leaf: 0.009801 max_leaf_nodes: None -260952.0
6
Criterion: entropy splitter: best max_depth: 58 min_samples_leaf: 0.009801 max_leaf_nodes: None -173090.0
6
Criterion: gini splitter: random max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: None -260952.0
6
Criterion: gini splitter: best max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: None -166576.66666666666
6
Criterion: entropy splitter: random max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: None -260952.0
6
Criterion: entropy splitter: best max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: None -174130.0
6
Criterion: gini splitter: random max_depth: 88 min_samples_leaf: 0.009899010000000001 max_leaf_nodes: None -260952.0
6
Criterion: gini splitter: best max_depth: 88 min_sam

Criterion: entropy splitter: best max_depth: 58 min_samples_leaf: 0.009801 max_leaf_nodes: 865 -173090.0
6
Criterion: gini splitter: random max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: 865 -260952.0
6
Criterion: gini splitter: best max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: 865 -166576.66666666666
6
Criterion: entropy splitter: random max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: 865 -260952.0
6
Criterion: entropy splitter: best max_depth: None min_samples_leaf: 0.009899010000000001 max_leaf_nodes: 865 -174130.0
6
Criterion: gini splitter: random max_depth: 88 min_samples_leaf: 0.009899010000000001 max_leaf_nodes: 865 -260952.0
6
Criterion: gini splitter: best max_depth: 88 min_samples_leaf: 0.009899010000000001 max_leaf_nodes: 865 -166576.66666666666
6
Criterion: entropy splitter: random max_depth: 88 min_samples_leaf: 0.009899010000000001 max_leaf_nodes: 865 -260952.0
6
Criterion: entropy splitter: best ma

In [25]:
print(test_model)

DecisionTreeClassifier(min_samples_leaf=0.01, random_state=75)
