In [1]:
# Execute the shared helper notebook inside this kernel; it prints the summary
# of helper functions shown below.
# NOTE(review): later cells use `pd`, `np`, `load_trained_model`,
# `get_stance_wins_ratio` and `get_current_date` without importing/defining them
# here -- presumably they all come from func_def.ipynb; confirm.
%run func_def.ipynb

**imports loaded.

> Pre-Processing func:
    get_striking_efficiency(name, date)
    get_height(item)
    get_weight(item)
    get_reach(item)
    get_std_reach(item, height, reach_list, height_list)
    get_age(item)
    get_std_age(item, age_list)
    get_win_prob(name, date, df)
    get_win_streak(name, date, df) #return count of recent wins
    get_lose_streak(name, date, df)
    get_fighter_stats(name, data )

> Feature Engineering func:
    get_stat(index, data, stat='height', fo='')
    get_swaped_row(index, data) # return fighter & opponent statistics swapped. 
    get_wins(combo, data)
    get_wins_ratio(combo, data)

> outlier_cleaning func:
    set_outlier_detection_model(X_columns, data)

> model_stats_pred_training func:
    set_lower_cases(string)
    set_train_test(X_columns, y_columns, data, test_size=0.3, valid_size=0.3)
    set_xgboost_reg(X_columns, y_columns, data, n=200, learning_rate=0.2, subsample=0.4, max_depth=5)
    set_predictions(model, data, X_columns, y_c

# Card file (get Path from user)

In [2]:
# Input locations, written relative to this notebook.
# DATA_PATH keeps its trailing slash because file names are appended with '+'.
DATA_PATH = '../data/'
# Fight card whose bouts are predicted below.
CARD_PATH = '../cards/ufc_card_06-17-2023.csv'

# main()

In [3]:
# Read the fight card; the cells below use its fighter, opponent and
# weight_class columns.
card = pd.read_csv(filepath_or_buffer=CARD_PATH)
card

Unnamed: 0,fighter,opponent,weight_class
0,Marvin Vettori,Jared Cannonier,Middleweight
1,Arman Tsarukyan,Joaquim Silva,Lightweight
2,Armen Petrosyan,Christian Duncan,Middleweight
3,Pat Sabatini,Lucas Almeida,Featherweight
4,Manuel Torres,Nikolas Motta,Lightweight
5,Nicolas Dalby,Muslim Salikhov,Welterweight
6,Raoni Barcelos,Miles Johns,Bantamweight
7,Jimmy Flick,Alessandro Costa,Flyweight
8,Kyung Ho Kang,Cristian Quinonez,Bantamweight
9,Carlos Hernandez,Denys Bondar,Flyweight


### Load data:

In [4]:
# `stats`: per-fighter table, filtered by name when the matches are built.
stats = pd.read_csv(filepath_or_buffer=DATA_PATH + 'data_fighters_stats_processed.csv')
# `df`: fight-level table handed to the win-probability / streak helpers.
df = pd.read_csv(filepath_or_buffer=DATA_PATH + 'data_features_outliers_detected.csv')

### Load models:

In [5]:
# Load the three fitted models in one pass (same order as before):
# in-fight stat regressor, win/lose classifier, finish-method classifier.
# NOTE(review): these are .pkl files -- presumably unpickled by
# load_trained_model, so only load files from a trusted source.
stat_pred, result_pred, method_pred = (
    load_trained_model(file_path=model_file)
    for model_file in (
        '../global_variables/stats_pred_general_xgboost_reg.pkl',
        '../global_variables/result_pred_general_logistic_reg.pkl',
        '../global_variables/method_pred_knn_classifier.pkl',
    )
)

### Set matches:

In [6]:
# Build one pre-fight feature row per bout on the card.
# Every *_dif feature is "fighter minus opponent". Columns that the models fill
# in later (result, method, kd_dif, ...) start out as None.
# NOTE(review): 'str_efc_dif' is hard-coded to 0 even though func_def lists
# get_striking_efficiency(name, date) -- confirm this is intentional.

# Columns of `stats` that become '<column>_dif' features, in output order.
STAT_DIF_COLUMNS = ['td_avg', 'td_def', 'sub_avg', 'slpm', 'str_acc', 'sapm', 'str_def']


def _latest(stats_rows, column):
    """Return `column` from the last row of `stats_rows`.

    This is the same row the former `.tolist().pop()` idiom selected when a
    fighter name matches more than one row of the stats table.
    """
    return stats_rows[column].iloc[-1]


# Loop-invariant: resolve the reference date once instead of six times per bout.
today = get_current_date()

matches = []
for bout in card.itertuples(index=False):
    fighter, opponent = bout.fighter, bout.opponent
    fighter_stats = stats[stats['name'] == fighter]
    opponent_stats = stats[stats['name'] == opponent]

    if fighter_stats.empty or opponent_stats.empty:
        # A bout cannot be featurised without both fighters; report which side
        # is missing and move on to the next bout.
        print(f"(Match error)Data found: {fighter}={len(fighter_stats)>0} vs {opponent}={len(opponent_stats)>0}")
        continue

    matches.append({
        'result': None,
        'fighter': fighter,
        'opponent': opponent,
        'date': None,
        # Taken from the bout's own card row rather than re-filtering the card.
        'weight_class': bout.weight_class,
        'method': None,
        'type': None,
        'kd_dif': None,
        'str_dif': None,
        'td_dif': None,
        'sub_dif': None,
        **{
            f'{column}_dif': round(_latest(fighter_stats, column) - _latest(opponent_stats, column), 4)
            for column in STAT_DIF_COLUMNS
        },
        'str_efc_dif': 0,
        'stance_win_ratio': (
            get_stance_wins_ratio(fighter=fighter, stats_data=stats)
            - get_stance_wins_ratio(fighter=opponent, stats_data=stats)
        ),
        'win_perc_dif': round(
            get_win_prob(name=fighter, date=today, df=df)
            - get_win_prob(name=opponent, date=today, df=df),
            4,
        ),
        'win_strk_dif': (
            get_win_streak(name=fighter, date=today, df=df)
            - get_win_streak(name=opponent, date=today, df=df)
        ),
        'lose_strk_dif': (
            get_lose_streak(name=fighter, date=today, df=df)
            - get_lose_streak(name=opponent, date=today, df=df)
        ),
        'outlier': None,
        'lof_outlier': None,
    })

# One DataFrame row per bout that had stats for both fighters.
data = pd.DataFrame(matches)
data

(Match error)Data found: Zhalgas Zhumagulov=True vs Felipe Bunes=False


Unnamed: 0,result,fighter,opponent,date,weight_class,method,type,kd_dif,str_dif,td_dif,sub_dif,td_avg_dif,td_def_dif,sub_avg_dif,slpm_dif,str_acc_dif,sapm_dif,str_def_dif,str_efc_dif,stance_win_ratio,win_perc_dif,win_strk_dif,lose_strk_dif,outlier,lof_outlier
0,,Marvin Vettori,Jared Cannonier,,Middleweight,,,,,,,1.6,0.11,0.5,0.38,-0.06,-0.21,-0.02,0,-0.0318,0.2,-1,2,,
1,,Arman Tsarukyan,Joaquim Silva,,Lightweight,,,,,,,3.09,0.1,-0.6,-0.52,0.06,-2.65,-0.01,0,-0.0157,0.4464,6,-1,,
2,,Armen Petrosyan,Christian Duncan,,Middleweight,,,,,,,0.0,0.25,0.0,1.17,-0.13,0.61,0.25,0,-0.0475,0.3333,0,1,,
3,,Pat Sabatini,Lucas Almeida,,Featherweight,,,,,,,3.75,-0.08,1.9,-4.93,0.08,-4.13,0.04,0,0.0,-0.8,-1,2,,
4,,Manuel Torres,Nikolas Motta,,Lightweight,,,,,,,0.0,0.0,0.0,7.13,0.21,-0.8,-0.02,0,0.0,-1.0,-1,1,,
5,,Nicolas Dalby,Muslim Salikhov,,Welterweight,,,,,,,-0.11,-0.12,0.1,0.44,-0.08,0.92,-0.14,0,0.0,0.1964,1,0,,
6,,Raoni Barcelos,Miles Johns,,Bantamweight,,,,,,,0.77,0.01,0.4,2.06,0.04,2.21,-0.08,0,0.0,-0.1,1,-1,,
7,,Jimmy Flick,Alessandro Costa,,Flyweight,,,,,,,2.13,-0.87,5.7,-0.38,0.27,0.25,-0.18,0,0.0,1.0,1,-1,,
8,,Kyung Ho Kang,Cristian Quinonez,,Bantamweight,,,,,,,1.18,-0.3,0.8,-2.22,0.07,-1.73,-0.04,0,0.0,0.5,0,0,,
9,,Carlos Hernandez,Denys Bondar,,Flyweight,,,,,,,-10.08,0.66,0.5,2.0,-0.33,1.92,-0.27,0,0.0,1.0,1,0,,


# Statistics prediction:

In [7]:
# Fill the four in-fight differentials with the stat model's predictions,
# feeding it exactly the columns it was fitted on.
stat_targets = ['kd_dif', 'str_dif', 'td_dif', 'sub_dif']
stat_features = data[stat_pred.feature_names_in_]
data[stat_targets] = stat_pred.predict(stat_features)
data.tail(3)

Unnamed: 0,result,fighter,opponent,date,weight_class,method,type,kd_dif,str_dif,td_dif,sub_dif,td_avg_dif,td_def_dif,sub_avg_dif,slpm_dif,str_acc_dif,sapm_dif,str_def_dif,str_efc_dif,stance_win_ratio,win_perc_dif,win_strk_dif,lose_strk_dif,outlier,lof_outlier
10,,Tereza Bleda,Gabriella Fernandes,,Women's Flyweight,,,0.179804,0.222836,0.157502,1.467955,1.13,0.33,1.1,0.84,0.04,0.44,-0.11,0,-0.0157,0.0,0,-1,,
11,,Dan Argueta,Ronnie Lawrence,,Bantamweight,,,0.487097,-0.775016,-0.3707,0.59386,-4.53,0.03,0.7,-0.41,-0.05,-0.22,-0.01,0,-0.0318,0.0,0,0,,
12,,Zac Pauga,Modestas Bukauskas,,Light Heavyweight,,,-0.232945,3.168872,0.060541,0.207582,0.0,0.29,0.0,1.05,0.29,-2.03,0.15,0,-0.0475,-0.8,-3,2,,


In [8]:
# Outcome label per bout, seen from the `fighter` side ('win' / 'lose' in the
# output below), predicted from the result model's own feature columns.
result_features = data[result_pred.feature_names_in_]
data['result'] = result_pred.predict(result_features)
data.tail(3)

Unnamed: 0,result,fighter,opponent,date,weight_class,method,type,kd_dif,str_dif,td_dif,sub_dif,td_avg_dif,td_def_dif,sub_avg_dif,slpm_dif,str_acc_dif,sapm_dif,str_def_dif,str_efc_dif,stance_win_ratio,win_perc_dif,win_strk_dif,lose_strk_dif,outlier,lof_outlier
10,win,Tereza Bleda,Gabriella Fernandes,,Women's Flyweight,,,0.179804,0.222836,0.157502,1.467955,1.13,0.33,1.1,0.84,0.04,0.44,-0.11,0,-0.0157,0.0,0,-1,,
11,lose,Dan Argueta,Ronnie Lawrence,,Bantamweight,,,0.487097,-0.775016,-0.3707,0.59386,-4.53,0.03,0.7,-0.41,-0.05,-0.22,-0.01,0,-0.0318,0.0,0,0,,
12,win,Zac Pauga,Modestas Bukauskas,,Light Heavyweight,,,-0.232945,3.168872,0.060541,0.207582,0.0,0.29,0.0,1.05,0.29,-2.03,0.15,0,-0.0475,-0.8,-3,2,,


In [9]:
# Encode the predicted label as 1 (fighter wins) / 0 (anything else).
# An existing 1 is also accepted, so re-running this cell leaves the column
# unchanged instead of turning every row into 0.
fighter_wins = data['result'].isin(['win', 1])
# Keep object dtype: a later cell writes fighter names back into this column.
data['result'] = fighter_wins.astype(int).astype(object)

data.tail(3)

Unnamed: 0,result,fighter,opponent,date,weight_class,method,type,kd_dif,str_dif,td_dif,sub_dif,td_avg_dif,td_def_dif,sub_avg_dif,slpm_dif,str_acc_dif,sapm_dif,str_def_dif,str_efc_dif,stance_win_ratio,win_perc_dif,win_strk_dif,lose_strk_dif,outlier,lof_outlier
10,1,Tereza Bleda,Gabriella Fernandes,,Women's Flyweight,,,0.179804,0.222836,0.157502,1.467955,1.13,0.33,1.1,0.84,0.04,0.44,-0.11,0,-0.0157,0.0,0,-1,,
11,0,Dan Argueta,Ronnie Lawrence,,Bantamweight,,,0.487097,-0.775016,-0.3707,0.59386,-4.53,0.03,0.7,-0.41,-0.05,-0.22,-0.01,0,-0.0318,0.0,0,0,,
12,1,Zac Pauga,Modestas Bukauskas,,Light Heavyweight,,,-0.232945,3.168872,0.060541,0.207582,0.0,0.29,0.0,1.05,0.29,-2.03,0.15,0,-0.0475,-0.8,-3,2,,


In [10]:
# Store probability column 1 of the result model as the fighter's win probability.
# NOTE(review): assumes index 1 of result_pred.classes_ is the 'win' class -- confirm.
result_proba = result_pred.predict_proba(data[result_pred.feature_names_in_])
data['win_prob'] = result_proba[:, 1]
data.tail(3)

Unnamed: 0,result,fighter,opponent,date,weight_class,method,type,kd_dif,str_dif,td_dif,sub_dif,td_avg_dif,td_def_dif,sub_avg_dif,slpm_dif,str_acc_dif,sapm_dif,str_def_dif,str_efc_dif,stance_win_ratio,win_perc_dif,win_strk_dif,lose_strk_dif,outlier,lof_outlier,win_prob
10,1,Tereza Bleda,Gabriella Fernandes,,Women's Flyweight,,,0.179804,0.222836,0.157502,1.467955,1.13,0.33,1.1,0.84,0.04,0.44,-0.11,0,-0.0157,0.0,0,-1,,,0.872412
11,0,Dan Argueta,Ronnie Lawrence,,Bantamweight,,,0.487097,-0.775016,-0.3707,0.59386,-4.53,0.03,0.7,-0.41,-0.05,-0.22,-0.01,0,-0.0318,0.0,0,0,,,0.481745
12,1,Zac Pauga,Modestas Bukauskas,,Light Heavyweight,,,-0.232945,3.168872,0.060541,0.207582,0.0,0.29,0.0,1.05,0.29,-2.03,0.15,0,-0.0475,-0.8,-3,2,,,0.996483


In [11]:
# Predicted way the bout ends (labels such as 'decision', 'knockout',
# 'submission' appear in the output below).
method_features = data[method_pred.feature_names_in_]
data['method'] = method_pred.predict(method_features)
data.tail(3)

Unnamed: 0,result,fighter,opponent,date,weight_class,method,type,kd_dif,str_dif,td_dif,sub_dif,td_avg_dif,td_def_dif,sub_avg_dif,slpm_dif,str_acc_dif,sapm_dif,str_def_dif,str_efc_dif,stance_win_ratio,win_perc_dif,win_strk_dif,lose_strk_dif,outlier,lof_outlier,win_prob
10,1,Tereza Bleda,Gabriella Fernandes,,Women's Flyweight,submission,,0.179804,0.222836,0.157502,1.467955,1.13,0.33,1.1,0.84,0.04,0.44,-0.11,0,-0.0157,0.0,0,-1,,,0.872412
11,0,Dan Argueta,Ronnie Lawrence,,Bantamweight,decision,,0.487097,-0.775016,-0.3707,0.59386,-4.53,0.03,0.7,-0.41,-0.05,-0.22,-0.01,0,-0.0318,0.0,0,0,,,0.481745
12,1,Zac Pauga,Modestas Bukauskas,,Light Heavyweight,decision,,-0.232945,3.168872,0.060541,0.207582,0.0,0.29,0.0,1.05,0.29,-2.03,0.15,0,-0.0475,-0.8,-3,2,,,0.996483


In [12]:
# Confidence attached to the method prediction: the largest class probability
# in each row of the method model's predict_proba output.
method_proba = method_pred.predict_proba(data[method_pred.feature_names_in_])
data['method_prob'] = np.max(method_proba, axis=1)
data.tail(3)

Unnamed: 0,result,fighter,opponent,date,weight_class,method,type,kd_dif,str_dif,td_dif,sub_dif,td_avg_dif,td_def_dif,sub_avg_dif,slpm_dif,str_acc_dif,sapm_dif,str_def_dif,str_efc_dif,stance_win_ratio,win_perc_dif,win_strk_dif,lose_strk_dif,outlier,lof_outlier,win_prob,method_prob
10,1,Tereza Bleda,Gabriella Fernandes,,Women's Flyweight,submission,,0.179804,0.222836,0.157502,1.467955,1.13,0.33,1.1,0.84,0.04,0.44,-0.11,0,-0.0157,0.0,0,-1,,,0.872412,0.42
11,0,Dan Argueta,Ronnie Lawrence,,Bantamweight,decision,,0.487097,-0.775016,-0.3707,0.59386,-4.53,0.03,0.7,-0.41,-0.05,-0.22,-0.01,0,-0.0318,0.0,0,0,,,0.481745,0.57
12,1,Zac Pauga,Modestas Bukauskas,,Light Heavyweight,decision,,-0.232945,3.168872,0.060541,0.207582,0.0,0.29,0.0,1.05,0.29,-2.03,0.15,0,-0.0475,-0.8,-3,2,,,0.996483,0.67


# Final results:

In [13]:
# Replace the 1/0 outcome with the predicted winner's name and expose it as 'W'.
# Guarded on the column's presence so that re-running the cell after the rename
# is a no-op instead of a KeyError.
if 'result' in data.columns:
    fighter_wins = data['result'] == 1
    # Vectorised pick: keep the fighter's name where they win, otherwise take
    # the opponent's name.
    data['result'] = data['fighter'].where(fighter_wins, data['opponent'])
    data = data.rename(columns={'result': 'W'})

In [14]:
# Final report: predicted winner, method and stat differentials per bout.
# win_prob was taken from predict_proba column 1 for every row, so on rows
# where W is the opponent it reads below 0.5 (it is not the probability of W).
summary_columns = [
    'fighter', 'opponent', 'weight_class',
    'W', 'win_prob',
    'method', 'method_prob',
    'str_dif', 'kd_dif', 'td_dif', 'sub_dif',
]
data.loc[:, summary_columns]

Unnamed: 0,fighter,opponent,weight_class,W,win_prob,method,method_prob,str_dif,kd_dif,td_dif,sub_dif
0,Marvin Vettori,Jared Cannonier,Middleweight,Jared Cannonier,0.125537,knockout,0.44,-0.367645,-0.713222,0.03703,-0.188585
1,Arman Tsarukyan,Joaquim Silva,Lightweight,Arman Tsarukyan,0.806894,decision,0.52,0.164391,0.152424,1.408623,-0.379105
2,Armen Petrosyan,Christian Duncan,Middleweight,Armen Petrosyan,0.716223,decision,0.54,0.788855,-0.23776,-0.355367,-0.192892
3,Pat Sabatini,Lucas Almeida,Featherweight,Pat Sabatini,0.514079,knockout,0.4,0.866488,-0.646581,0.421967,-0.427852
4,Manuel Torres,Nikolas Motta,Lightweight,Manuel Torres,0.98809,decision,0.51,1.676234,1.141826,0.290099,0.380055
5,Nicolas Dalby,Muslim Salikhov,Welterweight,Muslim Salikhov,0.139823,knockout,0.44,-0.556235,-0.547639,-0.244864,-0.099943
6,Raoni Barcelos,Miles Johns,Bantamweight,Miles Johns,0.315609,decision,0.47,-0.37049,-0.295962,0.131124,0.148933
7,Jimmy Flick,Alessandro Costa,Flyweight,Alessandro Costa,0.329245,submission,0.36,-0.371174,-1.187659,0.338697,1.652924
8,Kyung Ho Kang,Cristian Quinonez,Bantamweight,Kyung Ho Kang,0.954548,decision,0.56,-0.016858,0.338028,1.484957,1.635256
9,Carlos Hernandez,Denys Bondar,Flyweight,Denys Bondar,0.366376,decision,0.59,-0.674925,-0.553429,-1.388573,0.779629
