# The goal here is to prep a dataframe with all the probabilities for the different methods of victory with the result and extra info.  This will be used for:
 1. Visualization
 2. Stacking Models

This is based on the update model scores methods

In [1]:
import pandas as pd
import numpy as np
import random
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from datetime import date
#from sklearn.mixture import DPGMM
from tabulate import tabulate

In [2]:
import sys
sys.path.append('../../../automated_model_creation') #We need to access the function file

In [3]:
from functions import *


In [4]:
#THESE ARE THE 2 VARIABLES WE SET
is_prod = True
model_num = 0



if is_prod:
    model_file = '../../../data/production_models_mov.csv'
else: #If not production then test
    model_file = '../../../data/models_mov.csv'

In [5]:
#Load models
with open(model_file, newline='') as f:
    reader = csv.reader(f)
    models = list(reader)
    
print(len(models[0]))


1


In [6]:
model_file

'../../../data/production_models_mov.csv'

In [7]:
df = pd.read_csv("../../../data/kaggle_data/ufc-master.csv")

In [8]:
#Let's fix the date
df['date'] = pd.to_datetime(df['date'])

In [9]:
def return_finish_type(winner, finish):
    #print(winner, finish)
    #Why overcomplicate things?  We can just use a few if statements
    if winner == 'Red':
        #print("HI")
        if finish in ['U-DEC', 'S-DEC', 'M-DEC']:
            return ('Red - DEC')
        if finish in ['SUB']:
            return('Red - SUB')
        if finish in ['KO/TKO', 'DQ']:
            return('Red - KO/TKO')
    if winner == 'Blue':
        if finish in ['U-DEC', 'S-DEC', 'M-DEC']:
            return ('Blue - DEC')
        if finish in ['SUB']:
            return('Blue - SUB')
        if finish in ['KO/TKO', 'DQ']:
            return('Blue - KO/TKO')
        
    #Test for NaN
    if finish != finish:
        return('')
    
    if finish == 'Overturned':
        return('')
    
    
    return ('error')

In [10]:
#This calls for the power of lambda!
df['finish_type'] = df.apply(lambda x: return_finish_type(x['Winner'], x['finish']), axis=1)
mask = df['finish_type'] != ''
df = df[mask]

In [11]:
finish_list = ['Red - DEC', 'Red - SUB', 'Red - KO/TKO', 'Blue - DEC', 'Blue - SUB', 'Blue - KO/TKO']

#Let's put all the labels in a dataframe
df['label'] = ''
#If the winner is not Red or Blue we can remove it.

for f in range(len(finish_list)):
    mask = df['finish_type'] == finish_list[f]
    df['label'][mask] = f
    
#df["Winner"] = df["Winner"].astype('category')
#df = df[(df['Winner'] != 'Blue') | (df['Winner'] == 'Red') ]


#Make sure lable is numeric
df['label'] = pd.to_numeric(df['label'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'][mask] = f


In [12]:
df.rename(columns={'r_dec_odds': 'Red - DEC', 'r_sub_odds': 'Red - SUB', 'r_ko_odds': 'Red - KO/TKO',
                'b_dec_odds': 'Blue - DEC', 'b_sub_odds': 'Blue - SUB', 'b_ko_odds': 'Blue - KO/TKO'}, inplace=True)

In [13]:
label_df = df['label']
odds_df = df[finish_list]

In [14]:
df_train = df[250:]
odds_train = odds_df[250:]
label_train = label_df[250:]

df_test = df[:250]
odds_test = odds_df[:250]
label_test = label_df[:250]

In [15]:
#We need to clean
mask = df_train['finish_type'] != ''
df_train = df_train[mask]
#print(len(df_train))

mask = df_test['finish_type'] != ''
df_test = df_test[mask]
#print(len(df_test))

label_train = label_train[label_train.index.isin(df_train.index)]
label_test = label_test[label_test.index.isin(df_test.index)]

odds_train = odds_train[odds_train.index.isin(df_train.index)]
odds_test = odds_test[odds_test.index.isin(df_test.index)]

In [16]:
#Set a value for the nulls in the ranks

weightclass_list = ['B_match_weightclass_rank', 'R_match_weightclass_rank', "R_Women's Flyweight_rank", "R_Women's Featherweight_rank", "R_Women's Strawweight_rank", "R_Women's Bantamweight_rank", 'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 'R_Middleweight_rank', 'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank', 'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank', "B_Women's Flyweight_rank", "B_Women's Featherweight_rank", "B_Women's Strawweight_rank", "B_Women's Bantamweight_rank", 'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank', 'B_Welterweight_rank', 'B_Lightweight_rank', 'B_Featherweight_rank', 'B_Bantamweight_rank', 'B_Flyweight_rank', 'B_Pound-for-Pound_rank']
df_train[weightclass_list] = df_train[weightclass_list].fillna(17)
df_test[weightclass_list] = df_test[weightclass_list].fillna(17)

In [17]:
score_list = []

In [18]:
#OK WE NEED TO CREATE A NEW VERSION OF evaluate_model_mov that also returns a dataframe of probs

In [19]:
def get_bets_dataframe(df_odds, probs, labels, label_list, probs_label_list, print_stats = False, min_ev = 0, get_total=True):
    probs_label_list = [int(a) for a in probs_label_list]
    #labels = [int(a) for a in labels]
    
    bets_list = []
    df_odds.reset_index(drop=True, inplace=True)
    labels.reset_index(drop=True, inplace=True)
    score = 0
    #print(df_odds)
    for i in range(len(df_odds)):
        #print(i)
        #        df_temp_odds = df_odds.iloc[[i, :]]
        #print(df_odds.iloc[[i]])
        for l in range(len(probs[i])):
            #print(f"{label_list[probs_label_list[l]]}: {probs[i][l]}")
            temp_odds = (df_odds.loc[[i]])[label_list[probs_label_list[l]]][i]
            #print((temp_odds))
            bet_ev = get_bet_ev(temp_odds, probs[i][l])
            #print(bet_ev)
            if bet_ev > min_ev:
                #print(l)
                if labels[i] == probs_label_list[l]:
                    #print(f"{int(labels[i])} {probs_label_list[l]}")
                    score = score + get_bet_return(temp_odds)
                    temp_score = get_bet_return(temp_odds)
                    #print(f"Winning Bet. New Score: {score}")
                else:
                    score = score - 100
                    temp_score = -100
                    #print(f"Losing Bet.  New Score: {score}")
                #print(f"{labels[i]} {probs_label_list[l]} {probs[i][l]} {temp_odds} {temp_score} {label_list[labels[i]]}")
                bets_list.append([labels[i], probs_label_list[l], probs[i][l], temp_odds, temp_score, label_list[labels[i]], label_list[probs_label_list[l]] ],)
            #print()
            
            
            
        #print(f"Result: {label_list[int(labels[i])]} ({int(labels[i])})")
    print("Real Score: " + str(score))
    return(score, bets_list)


In [20]:
def get_model_dataframe(input_model, input_features, input_ev, train_df, train_labels, train_odds, test_df, test_labels,
                  test_odds, label_list, verbose=True):
    model_score = 0
    
    df_train = train_df[input_features].copy()
    df_test = test_df[input_features].copy()
    df_train = df_train.dropna()
    df_test = df_test.dropna()
    
    df_train = pd.get_dummies(df_train)
    df_test = pd.get_dummies(df_test)
    df_train, df_test = df_train.align(df_test, join='left', axis=1)    #Ensures both sets are dummified the same
    df_test = df_test.fillna(0)

    labels_train = train_labels[train_labels.index.isin(df_train.index)]
    odds_train = train_odds[train_odds.index.isin(df_train.index)] 
    labels_test = test_labels[test_labels.index.isin(df_test.index)]
    odds_test = test_odds[test_odds.index.isin(df_test.index)] 
    
    odds_train = odds_train.dropna()
    odds_test = odds_test.dropna()
    
    df_train = df_train[df_train.index.isin(odds_train.index)]
    df_test = df_test[df_test.index.isin(odds_test.index)]
    
    labels_train = labels_train[labels_train.index.isin(odds_train.index)]
    labels_test = labels_test[labels_test.index.isin(odds_test.index)]    
    
    

    if verbose:
        display(df_train.shape)
        display(labels_train.shape)
        display(odds_train.shape)
        display(df_test.shape)
        display(labels_test.shape)
        display(odds_test.shape)

    #print(labels_train)

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(df_train)
    
    input_model.fit(scaled_train, labels_train)

    scaled_test = scaler.transform(df_test)
    
    probs = input_model.predict_proba(scaled_test)
    model_score, bets_list = get_bets_dataframe(odds_test, probs, labels_test, label_list, input_model.classes_, print_stats = True, min_ev = input_ev, get_total=True)

    #print((odds_test))
    #print(probs)
    #print(label_list)
    #print(input_model.classes_)
    #print(label_test)
    #print(bets_list)
    #Let's turn bets list into a dataframe
    #print(f"{labels[i]} {probs_label_list[l]} {probs[i][l]} {temp_odds} {temp_score} {label_list[labels[i]]}")

    df = pd.DataFrame.from_records(bets_list, columns=['label_code', 'bet_code', 'probability', 'odds', 'score', 'result', 'bet'])
    return(df)


In [21]:
test_model_name = models[0][model_num]
test_model = eval(models[1][model_num])
test_model_features = eval(models[2][model_num])
test_model_ev = eval(models[3][model_num])
model_result_df = (get_model_dataframe(test_model, test_model_features, test_model_ev, df_train, label_train, odds_train, df_test, label_test,
                     odds_test, finish_list, verbose = True))

(2834, 26)

(2834,)

(2834, 6)

(242, 26)

(242,)

(242, 6)

Real Score: 8205.0


In [22]:
test_model

LinearDiscriminantAnalysis()

In [23]:
len(model_result_df)

371

In [24]:
d = date.today()
print(d)

2021-08-28


In [25]:
f = "model_results_" + str(d) + ".csv"
print(f)

model_results_2021-08-28.csv


In [26]:
model_result_df.to_csv(f)