In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [24]:
# Load the fight data from the CSV file in the working directory.
processed_ufc_data = pd.read_csv(filepath_or_buffer='fight_data.csv')

In [25]:
# Drop the columns that are not used in the rest of this notebook.
# The names are listed one per line so that additions/removals diff cleanly.
irrelevant_columns = [
    # odds / ev columns
    'R_odds',
    'B_odds',
    'R_ev',
    'B_ev',
    # streak columns
    'B_current_lose_streak',
    'B_current_win_streak',
    'B_longest_win_streak',
    'R_current_lose_streak',
    'R_current_win_streak',
    'R_longest_win_streak',
    # miscellaneous columns
    'empty_arena',
    'constant_1',
    'B_match_weightclass_rank',
    'R_match_weightclass_rank',
    # R_* rank columns
    "R_Women's Flyweight_rank",
    "R_Women's Featherweight_rank",
    "R_Women's Strawweight_rank",
    "R_Women's Bantamweight_rank",
    "R_Heavyweight_rank",
    "R_Light Heavyweight_rank",
    'R_Middleweight_rank',
    'R_Welterweight_rank',
    'R_Lightweight_rank',
    'R_Featherweight_rank',
    'R_Bantamweight_rank',
    'R_Flyweight_rank',
    'R_Pound-for-Pound_rank',
    # B_* rank columns
    "B_Women's Flyweight_rank",
    "B_Women's Featherweight_rank",
    "B_Women's Strawweight_rank",
    "B_Women's Bantamweight_rank",
    'B_Heavyweight_rank',
    'B_Light Heavyweight_rank',
    'B_Middleweight_rank',
    'B_Welterweight_rank',
    'B_Lightweight_rank',
    'B_Featherweight_rank',
    'B_Bantamweight_rank',
    'B_Flyweight_rank',
    'B_Pound-for-Pound_rank',
    # per-method odds columns
    'r_dec_odds',
    'b_dec_odds',
    'r_sub_odds',
    'b_sub_odds',
    'r_ko_odds',
    'b_ko_odds',
]

# Remove the listed columns from the frame in place (a missing name raises KeyError).
processed_ufc_data.drop(labels=irrelevant_columns, axis='columns', inplace=True)

In [26]:


# Count the null entries of every column, keyed by column name.
missing_rows = {
    col_name: processed_ufc_data[col_name].isnull().sum()
    for col_name in processed_ufc_data.columns
}

# Report the per-column null counts.
print("Missing rows for each column:")
for col_name, null_total in missing_rows.items():
    print(f"{col_name}: {null_total} missing rows")

Missing rows for each column:
R_fighter: 0 missing rows
B_fighter: 0 missing rows
date: 0 missing rows
location: 0 missing rows
country: 0 missing rows
Winner: 0 missing rows
title_bout: 0 missing rows
weight_class: 0 missing rows
gender: 0 missing rows
no_of_rounds: 0 missing rows
B_draw: 0 missing rows
B_avg_SIG_STR_landed: 930 missing rows
B_avg_SIG_STR_pct: 765 missing rows
B_avg_SUB_ATT: 832 missing rows
B_avg_TD_landed: 833 missing rows
B_avg_TD_pct: 842 missing rows
B_losses: 0 missing rows
B_total_rounds_fought: 0 missing rows
B_total_title_bouts: 0 missing rows
B_win_by_Decision_Majority: 0 missing rows
B_win_by_Decision_Split: 0 missing rows
B_win_by_Decision_Unanimous: 0 missing rows
B_win_by_KO/TKO: 0 missing rows
B_win_by_Submission: 0 missing rows
B_win_by_TKO_Doctor_Stoppage: 0 missing rows
B_wins: 0 missing rows
B_Stance: 2 missing rows
B_Height_cms: 0 missing rows
B_Reach_cms: 0 missing rows
B_Weight_lbs: 0 missing rows
R_draw: 0 missing rows
R_avg_SIG_STR_landed: 455 

In [27]:
# Fill missing values in the averaged-stat columns with the column mean.
columns_to_fill = ['B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 'B_avg_TD_pct',
                   'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct']
for column in columns_to_fill:
    # Assign the filled Series back to the frame. Calling
    # `df[col].fillna(..., inplace=True)` mutates an intermediate object:
    # it raises a FutureWarning in pandas 2.x and leaves the frame untouched
    # once Copy-on-Write is enabled.
    column_mean = processed_ufc_data[column].mean()
    processed_ufc_data[column] = processed_ufc_data[column].fillna(column_mean)

In [28]:
# Fill missing values for 'finish' column based on specified distribution.
# Values are relative weights (percentages); they are normalised by their sum,
# so they do not have to add up to exactly 100.
# NOTE(review): the frame preview in this notebook shows upper-case labels such as
# 'U-DEC' and 'S-DEC', while the keys below are spelled 'U-Dec' / 'S-dec' —
# confirm which spelling the dataset expects before relying on these labels.
finish_distribution = {'DQ': 0.3, 'KO/TKO': 32, 'M-Dec': 0.6, 'Overturned': 0.04, 'S-dec': 10.4, 'Sub': 18.3, 'U-Dec': 38.17}

# Work from the rows that are actually null right now, not from a count
# captured in an earlier cell, so re-running this cell is harmless.
finish_missing_mask = processed_ufc_data['finish'].isnull()
finish_missing_total = int(finish_missing_mask.sum())

if finish_missing_total:
    # Per-label row counts are derived from the ORIGINAL number of missing rows.
    # (Previously every missing row received the first label, because the
    # assignment targeted all null rows at once and ignored the computed count.)
    finish_weight_total = sum(finish_distribution.values())
    finish_counts = {
        finish_type: int(finish_missing_total * percentage / finish_weight_total)
        for finish_type, percentage in finish_distribution.items()
    }

    # Truncation leaves a few rows unassigned; give them to the most common
    # label so that no null value survives this cell.
    most_common_finish = max(finish_distribution, key=finish_distribution.get)
    finish_counts[most_common_finish] += finish_missing_total - sum(finish_counts.values())

    # Build one label per missing row and shuffle with a fixed seed, so the
    # imputed labels are reproducible and not clustered by row order.
    finish_fill_values = np.repeat(list(finish_counts.keys()), list(finish_counts.values()))
    finish_rng = np.random.default_rng(42)
    processed_ufc_data.loc[finish_missing_mask, 'finish'] = finish_rng.permutation(finish_fill_values)

# Keep the bookkeeping dict in sync with the frame.
missing_rows['finish'] = int(processed_ufc_data['finish'].isnull().sum())

In [29]:
# Replace missing values for 'finish_details' with 'blank'.
# The result is assigned back instead of using `inplace=True` on the selected
# column: the in-place form acts on an intermediate Series, warns in pandas 2.x
# and has no effect on the frame under Copy-on-Write.
processed_ufc_data['finish_details'] = processed_ufc_data['finish_details'].fillna('blank')


In [30]:
# Distribute missing values for 'finish_round' based on specified percentages.
# Values are relative weights (percentages); they are normalised by their sum,
# so they do not have to add up to exactly 100.
round_distribution = {1: 25.8, 2: 15.7, 3: 54.1, 4: 0.6, 5: 3.7}

# Work from the rows that are actually null right now, not from a count
# captured in an earlier cell, so re-running this cell is harmless.
round_missing_mask = processed_ufc_data['finish_round'].isnull()
round_missing_total = int(round_missing_mask.sum())

if round_missing_total:
    # Per-round row counts are derived from the ORIGINAL number of missing rows.
    # (Previously every missing row received round 1, because the assignment
    # targeted all null rows at once and ignored the computed count.)
    round_weight_total = sum(round_distribution.values())
    round_counts = {
        round_num: int(round_missing_total * percentage / round_weight_total)
        for round_num, percentage in round_distribution.items()
    }

    # Truncation leaves a few rows unassigned; give them to the most common
    # round so that no null value survives this cell.
    most_common_round = max(round_distribution, key=round_distribution.get)
    round_counts[most_common_round] += round_missing_total - sum(round_counts.values())

    # Build one round number per missing row and shuffle with a fixed seed, so
    # the imputed rounds are reproducible and not clustered by row order.
    round_fill_values = np.repeat(list(round_counts.keys()), list(round_counts.values()))
    round_rng = np.random.default_rng(42)
    processed_ufc_data.loc[round_missing_mask, 'finish_round'] = round_rng.permutation(round_fill_values)

# Keep the bookkeeping dict in sync with the frame.
missing_rows['finish_round'] = int(processed_ufc_data['finish_round'].isnull().sum())

In [31]:
# Replace missing values for 'finish_round_time' with '5:00'.
# The result is assigned back instead of using `inplace=True` on the selected
# column: the in-place form acts on an intermediate Series, warns in pandas 2.x
# and has no effect on the frame under Copy-on-Write.
processed_ufc_data['finish_round_time'] = processed_ufc_data['finish_round_time'].fillna('5:00')

In [32]:
# Calculate and replace missing values for 'total_fight_time_secs' based on (finish round * 5 * 60).
# The replacement Series is aligned on the row index, so each missing entry
# receives the value computed from its own row's 'finish_round'.
# The result is assigned back instead of using `inplace=True` on the selected
# column: the in-place form acts on an intermediate Series, warns in pandas 2.x
# and has no effect on the frame under Copy-on-Write.
full_rounds_secs = processed_ufc_data['finish_round'] * 5 * 60
processed_ufc_data['total_fight_time_secs'] = processed_ufc_data['total_fight_time_secs'].fillna(full_rounds_secs)

In [33]:
# Persist the cleaned frame as a new CSV file; the row index is not written.
processed_ufc_data.to_csv(path_or_buf='fight_data_cleaned.csv', index=False)


In [34]:
# Display the frame as the cell result (the notebook renders a truncated preview of rows and columns).
processed_ufc_data

Unnamed: 0,R_fighter,B_fighter,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,...,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs
0,Thiago Santos,Johnny Walker,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Light Heavyweight,MALE,5,...,-8,-0.530000,0.600000,-0.370000,Red,U-DEC,blank,5.0,5:00,1500.0
1,Alex Oliveira,Niko Price,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Welterweight,MALE,3,...,-1,2.190000,0.300000,-1.480000,neither,U-DEC,blank,3.0,5:00,900.0
2,Misha Cirkunov,Krzysztof Jotko,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Middleweight,MALE,3,...,-2,-0.850000,-1.600000,-3.330000,neither,S-DEC,blank,3.0,5:00,900.0
3,Alexander Hernandez,Mike Breeden,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Lightweight,MALE,3,...,3,0.250000,0.000000,-1.570000,neither,KO/TKO,Punch,1.0,1:20,80.0
4,Joe Solecki,Jared Gordon,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,MALE,3,...,5,2.580000,-0.600000,-0.310000,neither,S-DEC,blank,3.0,5:00,900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,Duane Ludwig,Darren Elkins,3/21/2010,"Broomfield, Colorado, USA",USA,Blue,False,Lightweight,MALE,3,...,6,-13.666667,0.000000,0.000000,neither,KO/TKO,blank,1.0,0:44,44.0
4892,John Howard,Daniel Roberts,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,...,-2,-18.000000,-1.000000,-4.666667,neither,KO/TKO,Punch,1.0,2:01,121.0
4893,Brendan Schaub,Chase Gormley,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Heavyweight,MALE,3,...,0,-4.000000,1.000000,1.000000,neither,KO/TKO,Punches,1.0,0:47,47.0
4894,Mike Pierce,Julio Paulino,3/21/2010,"Broomfield, Colorado, USA",USA,Red,False,Welterweight,MALE,3,...,-5,-40.500000,0.000000,-3.500000,neither,U-DEC,blank,3.0,5:00,900.0
