In [None]:
import pandas as pd
import seaborn
import numpy as np

In [None]:
## Show the full contents of every cell (no column-width truncation) when a DataFrame is displayed
pd.options.display.max_colwidth = None

In [None]:
## Read the raw fight CSV straight from the project's GitHub repository
url = (
    "https://raw.githubusercontent.com/sphill12/MMA_Project/main/"
    "ufc_data_till_UFC_292.csv"
)
data = pd.read_csv(url)

In [None]:
## Preview the first five rows of the raw data
data.head(n=5)

In [None]:
## Goal: a long dataframe of the strike statistics (one row per fighter per bout) for easier analysis.
## To get there, the R_fighter (red fighter) and B_fighter (blue fighter) columns are separated and later stitched together.
## The red fighter's columns are taken from the even column positions 0, 2, ..., 30
## (assumes the red/blue columns alternate in the CSV -- confirm against data.columns).
r_selector = list(range(0, 32, 2))
## .copy() makes r_fighters an independent frame, so the columns added to it below
## cannot trigger SettingWithCopyWarning or depend on pandas' view/copy rules.
r_fighters = data.iloc[:, r_selector].copy()
## Place the bout winner right after the 16 selected columns (position 16)
r_fighters.insert(16, "Winner", data["Winner"])

In [None]:
## For the long dataframe, retain whether the fighter won or lost the bout.
## The fighter name (column 0) is compared with the bout winner (column 16) for every row at once,
## instead of looping over the rows one by one. Any row where the two differ
## (including a missing winner) is labelled "lost".
red_is_winner = r_fighters.iloc[:, 0].eq(r_fighters.iloc[:, 16])
fight_winner = np.where(red_is_winner, "won", "lost").tolist()
r_fighters["fight_result"] = fight_winner


In [None]:
## Preview the red-fighter frame (first rows only, rather than dumping the whole frame)
r_fighters.head()

In [None]:
## Repeat the process for the B (blue) fighter, whose columns are taken from the odd
## column positions 1, 3, ..., 31.
b_selector = list(range(1, 33, 2))
## .copy() makes b_fighters an independent frame, so the columns added to it below
## cannot trigger SettingWithCopyWarning or depend on pandas' view/copy rules.
b_fighters = data.iloc[:, b_selector].copy()
## Place the bout winner right after the 16 selected columns (position 16)
b_fighters.insert(16, "Winner", data["Winner"])


In [None]:
## Same win/loss labelling as for the red fighters: compare the fighter name (column 0)
## with the bout winner (column 16) for every row at once instead of looping row by row.
## Any row where the two differ (including a missing winner) is labelled "lost".
blue_is_winner = b_fighters.iloc[:, 0].eq(b_fighters.iloc[:, 16])
fight_winner_b = np.where(blue_is_winner, "won", "lost").tolist()
b_fighters["fight_result"] = fight_winner_b

In [None]:
## List the new column names that the final dataframe should have.
## The original column names are printed first for reference.
print(data.columns)
col_names = [
    # fighter and per-fighter statistics
    "fighter", "kd", "sig_str", "sig_str_pct", "total_str", "td", "td_pct", "sub_att",
    "rev", "ctrl", "head", "body", "leg", "distance",
    "clinch", "ground",
    # bout outcome
    "winner", "result",
    # bout-level details
    "date", "location", "fight_type", "opponent", "referee", "ended_by", "last_round",
]

In [None]:
## Add the bout-level columns that may be useful later to both frames, then rename all columns.
## Each entry is (new column, source column for the blue frame, source column for the red frame).
## The order of the entries must line up with the tail of col_names.
bout_level_columns = [
    ("date", "date", "date"),
    ("location", "location", "location"),
    ("fight_type", "Fight_type", "Fight_type"),
    # each fighter's opponent is the name from the other corner
    ("opponent", "R_fighter", "B_fighter"),
    ("referee", "Referee", "Referee"),
    ("ended_by", "win_by", "win_by"),
    ("last_round", "last_round", "last_round"),
]
for new_col, blue_source, red_source in bout_level_columns:
    b_fighters[new_col] = data[blue_source]
    r_fighters[new_col] = data[red_source]

for frame in (b_fighters, r_fighters):
    frame.columns = col_names

In [None]:
## Stack the red and blue frames into one long dataframe.
## DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement and,
## like append, keeps each frame's original row labels.
fight_list = pd.concat([r_fighters, b_fighters])


In [None]:
## The strike columns hold strings, so they cannot be analysed as they are. Each one is split into
## a "<stat>_success" column and a "<stat>_attempt" column using two regexes:
##   - the first captures the digits that are followed by a space (the successful count);
##   - the second captures the digits that follow a letter and a whitespace character (the attempted count).
## (presumably the raw values look like "17 of 30" -- confirm against the data)
landed_pattern = r'(\d+) '
attempted_pattern = r'[a-zA-Z]\s(\d+)'
strike_columns = ["sig_str", "total_str", "td", "head", "body", "leg", "distance", "clinch", "ground"]

for stat in strike_columns:
    fight_list[stat + "_success"] = fight_list[stat].str.extract(landed_pattern)
    fight_list[stat + "_attempt"] = fight_list[stat].str.extract(attempted_pattern)

In [None]:
## The counts now live in the *_success / *_attempt columns, so the original string columns can be dropped
drop_list = [
    "sig_str", "total_str", "td",
    "head", "body", "leg",
    "distance", "clinch", "ground",
]
fight_list = fight_list.drop(drop_list, axis="columns")


In [None]:
## Differentiate between the men's and women's divisions.
## na=False treats a missing fight_type as "no match". Without it, str.contains returns NaN for
## that row, NaN is truthy inside np.where, and the row would be labelled "woman".
is_womens_bout = fight_list["fight_type"].str.contains("women", case=False, na=False)
fight_list["division_gender"] = np.where(is_womens_bout, "woman", "man")

In [None]:
## Start cleaning up the divisions: add a column that denotes whether the bout is a championship bout.
## na=False treats a missing fight_type as "no match". Without it, str.contains returns NaN for
## that row, NaN is truthy inside np.where, and the row would be labelled "true".
is_title_bout = fight_list["fight_type"].str.contains("title", case=False, na=False)
fight_list["championship_fight"] = np.where(is_title_bout, "true", "false")

In [None]:
## The fight_type text is unclear, and unneeded now that there are columns for the championship
## status and the division gender. Build a cleaner weight_class label from it.
## np.select takes the first matching name, so "light heavyweight" must stay ahead of "heavyweight"
## (the latter is a substring of the former).
## Bouts that match none of the names (or have a missing fight_type) are labelled "other"
## instead of silently being counted as heavyweight.
weight_classes = [
    "strawweight",
    "flyweight",
    "bantamweight",
    "featherweight",
    "lightweight",
    "welterweight",
    "middleweight",
    "light heavyweight",
    "heavyweight",
]
fight_type_text = fight_list["fight_type"]
class_matches = [
    fight_type_text.str.contains(class_name, case=False, na=False).to_numpy(dtype=bool)
    for class_name in weight_classes
]
fight_list["weight_class"] = np.select(class_matches, weight_classes, default="other")

In [None]:
## Parse the date column into pandas datetimes
fight_list["date"] = fight_list["date"].pipe(pd.to_datetime)

In [None]:
## Inspect each column's dtype to determine which columns still need their data type adjusted
fight_list.dtypes

In [None]:
## Placeholder symbols ("---" in the takedown % column, "---" / "--" in the control-time column)
## need to become NA values, and the percent signs need to be removed from the percentage columns.
fight_list["td_pct"] = fight_list["td_pct"].replace("---", np.nan).str.replace("%", "")
fight_list["sig_str_pct"] = fight_list["sig_str_pct"].str.replace("%", "")
fight_list["ctrl"] = fight_list["ctrl"].replace("---", np.nan).replace("--", np.nan)

In [None]:
## Convert the count and percentage columns to numeric dtypes; values that cannot be parsed become NaN
numeric_cols = [
    "kd", "sig_str_pct", "td_pct", "sub_att", "rev",
    "sig_str_success", "sig_str_attempt",
    "total_str_success", "total_str_attempt",
    "td_success", "td_attempt",
    "head_success", "head_attempt",
    "body_success", "body_attempt",
    "leg_success", "leg_attempt",
    "distance_success", "distance_attempt",
    "clinch_success", "clinch_attempt",
    "ground_success", "ground_attempt",
]
fight_list[numeric_cols] = fight_list[numeric_cols].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

In [None]:
## Convert the control-time column from "minutes:seconds" text to a total number of seconds
ctrl_parts = fight_list["ctrl"].str.split(":", expand=True)
fight_list[["minutes", "seconds"]] = ctrl_parts
whole_minutes = pd.to_numeric(fight_list["minutes"])
leftover_seconds = pd.to_numeric(fight_list["seconds"])
fight_list["seconds_of_control"] = whole_minutes * 60 + leftover_seconds

In [None]:
## Re-check the dtypes to confirm that every column now has the data type it needs
fight_list.dtypes

In [None]:
## Drop the raw control-time column and the two helper columns that were only needed to compute seconds_of_control
drop_list = [
    "ctrl",
    "minutes",
    "seconds",
]
fight_list = fight_list.drop(drop_list, axis="columns")

In [None]:
## Write the cleaned long-format fight statistics to a CSV file in the working directory
fight_list.to_csv(path_or_buf="fight_statistics.csv")