# Data preprocessing for better usability of UFC fight statistics

In [42]:
import pandas as pd

In [43]:
# Load the raw per-fight statistics; each row describes one bout, with the two
# fighters' stats stored side by side in columns suffixed `_1` and `_2`.
df = pd.read_csv("../../datasets/ufc_fight_stats.csv")
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,fighter_1,fighter_2,knockdowns_1,knockdowns_2,total_strikes_1,total_strikes_2,significant_strikes_1,significant_strikes_2,head_strikes_1,head_strikes_2,...,reversals_1,reversals_2,control_time_1,control_time_2,result,method,round,time,referee,ufc_stats_com_url
0,Alexa Grasso,Valentina Shevchenko,1,0,262 of 408,199 of 301,84 of 203,80 of 179,54 of 151,61 of 158,...,1,0,2:39,8:37,D,Decision - Split,5,5:00,Herb Dean,http://ufcstats.com/fight-details/b395c89e19a3...
1,Kevin Holland,Jack Della Maddalena,0,0,127 of 356,105 of 190,127 of 356,105 of 190,59 of 258,46 of 115,...,0,0,0:00,0:00,L,Decision - Split,3,5:00,Jason Herzog,http://ufcstats.com/fight-details/697efaf0d162...
2,Raul Rosas Jr.,Terrence Mitchell,1,0,19 of 31,6 of 19,18 of 30,6 of 19,17 of 29,5 of 18,...,0,0,0:17,0:00,W,KO/TKO,1,0:54,Mark Smith,http://ufcstats.com/fight-details/c0ca4c201d08...
3,Daniel Zellhuber,Christos Giagos,0,0,36 of 101,38 of 82,36 of 101,38 of 82,18 of 75,25 of 63,...,0,0,0:17,0:11,W,Submission,2,3:26,Jason Herzog,http://ufcstats.com/fight-details/2e1435c160bf...
4,Fernando Padilla,Kyle Nelson,0,0,73 of 209,83 of 185,72 of 208,82 of 184,42 of 168,43 of 143,...,0,0,0:07,0:00,L,Decision - Unanimous,3,5:00,Chris Tognoni,http://ufcstats.com/fight-details/a5d76e93a505...


In [44]:
# (rows, columns) of the raw frame
df.shape

(3561, 34)

In [45]:
# Column dtypes and non-null counts; the "X of Y" strike columns load as object (string) and need parsing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3561 entries, 0 to 3560
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   fighter_1              3561 non-null   object
 1   fighter_2              3561 non-null   object
 2   knockdowns_1           3561 non-null   int64 
 3   knockdowns_2           3561 non-null   int64 
 4   total_strikes_1        3561 non-null   object
 5   total_strikes_2        3561 non-null   object
 6   significant_strikes_1  3561 non-null   object
 7   significant_strikes_2  3561 non-null   object
 8   head_strikes_1         3561 non-null   object
 9   head_strikes_2         3561 non-null   object
 10  body_strikes_1         3561 non-null   object
 11  body_strikes_2         3561 non-null   object
 12  leg_strikes_1          3561 non-null   object
 13  leg_strikes_2          3561 non-null   object
 14  distance_strikes_1     3561 non-null   object
 15  distance_strikes_2   

In [46]:
# checking for null values: number of missing entries per column
df.isna().sum()

fighter_1                 0
fighter_2                 0
knockdowns_1              0
knockdowns_2              0
total_strikes_1           0
total_strikes_2           0
significant_strikes_1     0
significant_strikes_2     0
head_strikes_1            0
head_strikes_2            0
body_strikes_1            0
body_strikes_2            0
leg_strikes_1             0
leg_strikes_2             0
distance_strikes_1        0
distance_strikes_2        0
clinch_strikes_1          0
clinch_strikes_2          0
ground_strikes_1          0
ground_strikes_2          0
takedowns_1               0
takedowns_2               0
submission_attempts_1     0
submission_attempts_2     0
reversals_1               0
reversals_2               0
control_time_1            0
control_time_2            0
result                    0
method                    0
round                     0
time                      0
referee                  17
ufc_stats_com_url         0
dtype: int64

In [47]:
# checking which fight-ending methods occur, to spot results that should be excluded
df["method"].value_counts()

method
Decision - Unanimous       1349
KO/TKO                     1092
Submission                  629
Decision - Split            369
Decision - Majority          51
TKO - Doctor's Stoppage      26
Overturned                   24
Could Not Continue           12
DQ                            8
Other                         1
Name: count, dtype: int64

In [48]:
# Fights ending in a no contest or a disqualification do not reflect the fighters' capabilities.
# Looking up the result announced before a fight was overturned could recover some of them
# (although fighters are usually found cheating in those cases), but doing so by hand is too
# tedious, so overturned fights are dropped as well.

invalid_results = ['Overturned', 'Other', 'DQ', 'Could Not Continue']
# keep only the rows whose method is NOT one of the invalid ones
df = df.loc[lambda fights: ~fights['method'].isin(invalid_results)]
df['method'].value_counts()

method
Decision - Unanimous       1349
KO/TKO                     1092
Submission                  629
Decision - Split            369
Decision - Majority          51
TKO - Doctor's Stoppage      26
Name: count, dtype: int64

In [49]:
# distribution of outcomes (recorded from fighter_1's side) after removing invalid methods
df["result"].value_counts()

result
W    1997
L    1486
D      33
Name: count, dtype: int64

In [50]:
# drop draws: betting lines are usually offered on win/loss outcomes only

df = df.loc[df['result'] != 'D']
df['result'].value_counts()

result
W    1997
L    1486
Name: count, dtype: int64

In [51]:
# new dataframe for refined data: starts with the identifying columns only,
# taken as an independent copy so that adding feature columns never touches `df`
improved_df = df[['fighter_1', 'fighter_2', 'result']].copy()

In [52]:
# Several raw stats are stored as text of the form "<landed> of <attempted>".
# Combining the two numbers gives an accuracy (landed / attempted).

def rate_from_text(s):
  """Convert a "<landed> of <attempted>" string into a landed/attempted ratio.

  Returns 0 when nothing was attempted, so the division is never by zero.
  Raises ValueError if the text does not contain two integer parts.
  """
  landed, attempted = (int(part) for part in s.split(" of ")[:2])
  if attempted == 0:
    return 0
  return landed / attempted


# (raw column stem, accuracy column stem), listed in the order the columns are appended.
# Note the takedown accuracy column uses the singular "takedown".
accuracy_columns = [
  ('significant_strikes', 'significant_strikes_accuracy'),
  ('head_strikes', 'head_strikes_accuracy'),
  ('body_strikes', 'body_strikes_accuracy'),
  ('leg_strikes', 'leg_strikes_accuracy'),
  ('distance_strikes', 'distance_strikes_accuracy'),
  ('clinch_strikes', 'clinch_strikes_accuracy'),
  ('ground_strikes', 'ground_strikes_accuracy'),
  ('takedowns', 'takedown_accuracy'),
]

# one accuracy column per stat and per fighter (suffix _1 / _2)
for source_stem, accuracy_stem in accuracy_columns:
  for corner in ('1', '2'):
    improved_df[accuracy_stem + '_' + corner] = df[source_stem + '_' + corner].apply(rate_from_text)

In [53]:
# strikes by target & position

def landed_from_text(s):
  """Return the landed count from a "<landed> of <attempted>" string, for use as a divisor.

  A count of zero is returned as infinity, so that dividing by the result
  yields 0 instead of raising or producing NaN.
  """
  landed = int(s.split(" of ")[0])
  if landed == 0:
    return float('inf')
  return landed

# significant strikes landed per fighter, used as the denominator below
# (zero is replaced by infinity inside landed_from_text, so the share becomes 0)
sig_strks_1 = df['significant_strikes_1'].apply(landed_from_text)
sig_strks_2 = df['significant_strikes_2'].apply(landed_from_text)

# share of each fighter's landed significant strikes by target (head/body/leg)
# and by position (distance/clinch/ground); values are fractions, not 0-100 percentages
for stem in ('head_strikes', 'body_strikes', 'leg_strikes',
             'distance_strikes', 'clinch_strikes', 'ground_strikes'):
  for corner, total_landed in (('1', sig_strks_1), ('2', sig_strks_2)):
    landed = df[stem + '_' + corner].apply(lambda text: int(text.split(" of ")[0]))
    improved_df[stem + '_percent_' + corner] = landed / total_landed

In [54]:
# Build per-time rates (the kind of stats that are commonly available for athletes),
# so that the model is better suited for future predictions.

# length of one full round, in seconds
SECONDS_PER_ROUND = 300

def seconds_from_time_str(s):
  """Convert a "<minutes>:<seconds>" clock string into a total number of seconds."""
  minutes, seconds = map(int, s.split(":"))
  return 60*minutes + seconds


# get timed stats

# total fight duration in seconds: all completed rounds plus the clock time of the final round
improved_df['fight_time'] = df['time'].apply(seconds_from_time_str) + (df['round'] - 1)*SECONDS_PER_ROUND
fight_seconds = improved_df['fight_time']

# control time per fighter, converted from "M:SS" to seconds
for corner in ('1', '2'):
  improved_df['control_time_' + corner] = df['control_time_' + corner].apply(seconds_from_time_str)

# (raw column stem, output column stem, rate window in seconds, raw value is "X of Y" text)
rate_specs = [
  ('significant_strikes', 'significant_strikes_per_min', 60, True),
  ('takedowns', 'takedowns_per_15min', 15 * 60, True),
  ('submission_attempts', 'submissions_attempted_per_15min', 15 * 60, False),
  ('knockdowns', 'knockdowns_per_15min', 15 * 60, False),
]

# rate = count * window / fight duration, for each stat and each fighter
for source_stem, target_stem, window_seconds, is_text in rate_specs:
  for corner in ('1', '2'):
    counts = df[source_stem + '_' + corner]
    if is_text:
      # keep only the landed number from "<landed> of <attempted>"
      counts = counts.apply(lambda text: int(text.split(" of ")[0]))
    improved_df[target_stem + '_' + corner] = counts * window_seconds / fight_seconds

In [55]:
# preview the engineered features
improved_df.head()

Unnamed: 0,fighter_1,fighter_2,result,significant_strikes_accuracy_1,significant_strikes_accuracy_2,head_strikes_accuracy_1,head_strikes_accuracy_2,body_strikes_accuracy_1,body_strikes_accuracy_2,leg_strikes_accuracy_1,...,control_time_1,control_time_2,significant_strikes_per_min_1,significant_strikes_per_min_2,takedowns_per_15min_1,takedowns_per_15min_2,submissions_attempted_per_15min_1,submissions_attempted_per_15min_2,knockdowns_per_15min_1,knockdowns_per_15min_2
1,Kevin Holland,Jack Della Maddalena,L,0.356742,0.552632,0.228682,0.4,0.6,0.734694,0.813953,...,0,0,8.466667,7.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Raul Rosas Jr.,Terrence Mitchell,W,0.6,0.315789,0.586207,0.277778,1.0,1.0,0.0,...,17,0,20.0,6.666667,0.0,0.0,0.0,0.0,16.666667,0.0
3,Daniel Zellhuber,Christos Giagos,W,0.356436,0.463415,0.24,0.396825,0.578947,0.428571,1.0,...,17,11,4.268775,4.505929,0.0,0.0,1.778656,0.0,0.0,0.0
4,Fernando Padilla,Kyle Nelson,L,0.346154,0.445652,0.25,0.300699,0.772727,0.95,0.722222,...,7,0,4.8,5.466667,0.0,0.0,0.0,0.0,0.0,0.0
5,Loopy Godinez,Elise Reed,W,0.677419,0.230769,0.6,0.0,1.0,0.666667,1.0,...,389,22,2.432432,0.34749,8.687259,0.0,10.42471,0.0,1.737452,0.0


In [56]:
# categorize fights by ending at/under 2.5 rounds or over
# (2.5 rounds = 2 full 300-second rounds + 150 seconds = 750 seconds; exactly 750 counts as 'Under')

improved_df['two_n_half'] = [
  'Over' if seconds > 750 else 'Under'
  for seconds in improved_df['fight_time']
]
improved_df[['fighter_1', 'fighter_2', 'fight_time', 'two_n_half']].head()

Unnamed: 0,fighter_1,fighter_2,fight_time,two_n_half
1,Kevin Holland,Jack Della Maddalena,900,Over
2,Raul Rosas Jr.,Terrence Mitchell,54,Under
3,Daniel Zellhuber,Christos Giagos,506,Under
4,Fernando Padilla,Kyle Nelson,900,Over
5,Loopy Godinez,Elise Reed,518,Under


In [57]:
# To keep prediction models from favoring one fighter over the other purely because of
# column order, every fight is replicated with the two fighters swapped.
#
# The swap is derived from the column names: every column ending in `_1` is exchanged with
# its `_2` counterpart. This covers ALL per-fighter columns, including the
# `*_strikes_percent_*` ones, which a hand-written rename map previously left out (in the
# mirrored rows they stayed attached to the wrong fighter). Columns without a `_1`/`_2`
# suffix (`result`, `fight_time`, `two_n_half`) describe the fight as a whole and keep their names.

swap_suffix = {'_1': '_2', '_2': '_1'}
swapped_names = {
  column: column[:-2] + swap_suffix[column[-2:]]
  for column in improved_df.columns
  if column[-2:] in swap_suffix
}

# rename() returns a new frame, so `improved_df` itself is left untouched
improved_df_copy = improved_df.rename(columns=swapped_names)
# the outcome is recorded from fighter_1's side, so it flips along with the fighters
improved_df_copy['result'] = improved_df_copy['result'].apply(lambda x: 'W' if x=='L' else 'L')
improved_df_copy.head()

Unnamed: 0,fighter_2,fighter_1,result,significant_strikes_accuracy_2,significant_strikes_accuracy_1,head_strikes_accuracy_2,head_strikes_accuracy_1,body_strikes_accuracy_2,body_strikes_accuracy_1,leg_strikes_accuracy_2,...,control_time_1,significant_strikes_per_min_2,significant_strikes_per_min_1,takedowns_per_15min_2,takedowns_per_15min_1,submissions_attempted_per_15min_2,submissions_attempted_per_15min_1,knockdowns_per_15min_2,knockdowns_per_15min_1,two_n_half
1,Kevin Holland,Jack Della Maddalena,W,0.356742,0.552632,0.228682,0.4,0.6,0.734694,0.813953,...,0,8.466667,7.0,0.0,0.0,0.0,0.0,0.0,0.0,Over
2,Raul Rosas Jr.,Terrence Mitchell,L,0.6,0.315789,0.586207,0.277778,1.0,1.0,0.0,...,0,20.0,6.666667,0.0,0.0,0.0,0.0,16.666667,0.0,Under
3,Daniel Zellhuber,Christos Giagos,L,0.356436,0.463415,0.24,0.396825,0.578947,0.428571,1.0,...,11,4.268775,4.505929,0.0,0.0,1.778656,0.0,0.0,0.0,Under
4,Fernando Padilla,Kyle Nelson,W,0.346154,0.445652,0.25,0.300699,0.772727,0.95,0.722222,...,0,4.8,5.466667,0.0,0.0,0.0,0.0,0.0,0.0,Over
5,Loopy Godinez,Elise Reed,L,0.677419,0.230769,0.6,0.0,1.0,0.666667,1.0,...,22,2.432432,0.34749,8.687259,0.0,10.42471,0.0,1.737452,0.0,Under


In [58]:
# stack the original rows and their mirrored copies (columns are matched by name);
# ignore_index=True assigns a fresh 0..n-1 index, removing the duplicate labels
# that concatenation would otherwise carry over
improved_df = pd.concat([improved_df, improved_df_copy], ignore_index=True)
# inspect a slice of rows by position
improved_df.iloc[3511:3519]

Unnamed: 0,fighter_1,fighter_2,result,significant_strikes_accuracy_1,significant_strikes_accuracy_2,head_strikes_accuracy_1,head_strikes_accuracy_2,body_strikes_accuracy_1,body_strikes_accuracy_2,leg_strikes_accuracy_1,...,control_time_2,significant_strikes_per_min_1,significant_strikes_per_min_2,takedowns_per_15min_1,takedowns_per_15min_2,submissions_attempted_per_15min_1,submissions_attempted_per_15min_2,knockdowns_per_15min_1,knockdowns_per_15min_2,two_n_half
3511,Rhys McKee,Ange Loosa,L,0.420765,0.538462,0.331034,0.493056,0.76,0.655172,0.769231,...,367,5.133333,6.533333,0.0,6.0,0.0,0.0,0.0,0.0,Over
3512,Joselyne Edwards,Nora Cornolle,L,0.428571,0.519481,0.3,0.324324,0.888889,0.708333,0.307692,...,131,1.2,2.666667,5.0,0.0,1.0,0.0,0.0,0.0,Over
3513,Kleydson Rodrigues,Farid Basharat,L,0.875,0.666667,0.75,0.666667,1.0,1.0,1.0,...,195,1.647059,0.941176,0.0,7.058824,0.0,3.529412,0.0,0.0,Under
3514,Jacqueline Cavalcanti,Zarah Fairn,W,0.570136,0.150579,0.338583,0.103734,0.909091,0.666667,0.875,...,4,8.4,2.6,0.0,0.0,0.0,0.0,0.0,0.0,Over
3515,Chan Sung Jung,Max Holloway,L,0.274194,0.585938,0.227273,0.441558,0.75,0.885714,0.5,...,57,3.274478,7.223114,0.0,0.0,0.0,1.444623,0.0,2.889246,Under
3516,Ryan Spann,Anthony Smith,L,0.315508,0.619048,0.246835,0.478261,0.608696,0.8,1.0,...,56,3.933333,6.066667,0.0,1.0,0.0,0.0,0.0,0.0,Over
3517,Alex Caceres,Giga Chikadze,L,0.369048,0.460993,0.227273,0.301205,0.568182,0.578947,0.857143,...,0,4.133333,4.333333,0.0,0.0,0.0,0.0,0.0,0.0,Over
3518,Fernie Garcia,Rinya Nakamura,L,0.227273,0.631579,0.169811,0.516129,0.571429,0.769231,0.333333,...,566,1.0,2.4,0.0,4.0,1.0,3.0,0.0,0.0,Over


In [59]:
# final check of column dtypes and non-null counts before export
improved_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6966 entries, 0 to 6965
Data columns (total 43 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   fighter_1                          6966 non-null   object 
 1   fighter_2                          6966 non-null   object 
 2   result                             6966 non-null   object 
 3   significant_strikes_accuracy_1     6966 non-null   float64
 4   significant_strikes_accuracy_2     6966 non-null   float64
 5   head_strikes_accuracy_1            6966 non-null   float64
 6   head_strikes_accuracy_2            6966 non-null   float64
 7   body_strikes_accuracy_1            6966 non-null   float64
 8   body_strikes_accuracy_2            6966 non-null   float64
 9   leg_strikes_accuracy_1             6966 non-null   float64
 10  leg_strikes_accuracy_2             6966 non-null   float64
 11  distance_strikes_accuracy_1        6966 non-null   float

In [60]:
# write the wrangled dataset to disk; index=False leaves the row index out of the file
improved_df.to_csv("../../datasets/wrangled_ufc_stats.csv", index=False)