# We'll Start By Cleaning Our Raw Fights Information

In [12]:
import pandas as pd
import numpy as np

In [13]:
df = pd.read_csv('raw_data/fights.csv')
df.head()

Unnamed: 0,W/L,Fighter,Kd,Str,Td,Sub,Weight class,Method,Round,Time,event
0,win,Leon Edwards Kamaru Usman,0 0,120 87,0 4,0 0,Welterweight,M-DEC,5,5:00,UFC 286: Edwards vs. Usman 3
1,win,Justin Gaethje Rafael Fiziev,0 0,103 97,1 0,0 0,Lightweight,M-DEC,3,5:00,UFC 286: Edwards vs. Usman 3
2,win,Gunnar Nelson Bryan Barberena,0 0,10 7,1 0,1 0,Welterweight,SUB Armbar,1,4:51,UFC 286: Edwards vs. Usman 3
3,win,Jennifer Maia Casey O'Neill,0 0,145 137,0 0,0 0,Women's Flyweight,U-DEC,3,5:00,UFC 286: Edwards vs. Usman 3
4,win,Marvin Vettori Roman Dolidze,0 0,106 71,0 0,0 0,Middleweight,U-DEC,3,5:00,UFC 286: Edwards vs. Usman 3


# Right off the bat,
We see that our dataframe has information for both fighters in one column. In order to create any kind of analysis, we'll need to separate those names + stats between a fighter1 and a fighter2.

In [14]:
# create dataframe that separates fighter data
fights = pd.DataFrame()
fights[['fighter1', 'fighter2']] = df['Fighter'].str.split('  ', expand=True)
fights[['fighter1_kd', 'fighter2_kd']] = df['Kd'].str.split('  ', expand=True)
fights[['fighter1_str', 'fighter2_str']] = df['Str'].str.split('  ', expand=True)
fights[['fighter1_td', 'fighter2_td']] = df['Td'].str.split('  ', expand=True)
fights[['fighter1_sub', 'fighter2_sub']] = df['Sub'].str.split('  ', expand=True)

# make the column names more computer-friendly
fights['weight_class'] = df['Weight class']
fights['method'] = df['Method']
fights['round'] = df['Round']
fights['time'] = df['Time']
fights['event'] = df['event']
fights['w/l'] = df['W/L']



*On the webpage, a 'winner' icon always pointed to the first name unless there was a draw. Because there are draws, I want to create an additional column for the 'result' instead of simply making the fighter1 column 'winner' and fighter2 'loser'*

Therefore:

In [15]:
# we create a new result column
fights['result'] = np.where(fights['w/l'] == 'win', fights['fighter1'], 'draw')
fights = fights.drop('w/l', axis=1)

#show the fights df
fights.head()

Unnamed: 0,fighter1,fighter2,fighter1_kd,fighter2_kd,fighter1_str,fighter2_str,fighter1_td,fighter2_td,fighter1_sub,fighter2_sub,weight_class,method,round,time,event,result
0,Leon Edwards,Kamaru Usman,0,0,120,87,0,4,0,0,Welterweight,M-DEC,5,5:00,UFC 286: Edwards vs. Usman 3,Leon Edwards
1,Justin Gaethje,Rafael Fiziev,0,0,103,97,1,0,0,0,Lightweight,M-DEC,3,5:00,UFC 286: Edwards vs. Usman 3,Justin Gaethje
2,Gunnar Nelson,Bryan Barberena,0,0,10,7,1,0,1,0,Welterweight,SUB Armbar,1,4:51,UFC 286: Edwards vs. Usman 3,Gunnar Nelson
3,Jennifer Maia,Casey O'Neill,0,0,145,137,0,0,0,0,Women's Flyweight,U-DEC,3,5:00,UFC 286: Edwards vs. Usman 3,Jennifer Maia
4,Marvin Vettori,Roman Dolidze,0,0,106,71,0,0,0,0,Middleweight,U-DEC,3,5:00,UFC 286: Edwards vs. Usman 3,Marvin Vettori


In [16]:
# optional save data as a new csv file
fights.to_csv('clean_data/fights_cleaned.csv')

# Next, We'll Clean Our Fighters Information:

In [25]:
# now we create a dataframe from the fighters csv file we scraped earlier

fighters = pd.read_csv('raw_data/fighters.csv')
fighters.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,,,,,,,,,,,
1,Tom,Aaron,,--,155 lbs.,--,,5.0,3.0,0.0,
2,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4.0,6.0,0.0,
3,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28.0,4.0,0.0,
4,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10.0,15.0,0.0,


We can see that the first row is completely empty, so we don't need it.
I chose to drop the belt column as well.

In [26]:
# drop empty first row and irrelevant column
fighters = fighters.drop(fighters.index[0])
fighters = fighters.drop('Belt', axis=1)

# fill NaN values in different columns
fighters['Stance'] = fighters['Stance'].fillna('Unknown')
fighters['Nickname'] = fighters['Nickname'].fillna('')

I want to change the height column from strings in the 'feet, inches' format to numeric types showing total inches. 

In order to do that, I'll need to create a function:

In [27]:
# first, ensure all values in the column are strings
fighters['Ht.'] = fighters['Ht.'].astype(str)

# next, create the function
def convert_height(height):
    if height == "--":
        return None
    else:
        feet_inches = height.split("' ")
        if len(feet_inches) == 2:
            feet, inches = feet_inches
            total_inches = int(feet) * 12 + int(inches.strip('"'))
            return total_inches
        else:
            return None

# apply the function to the Height 'Ht.' column
fighters['Ht.'] = fighters['Ht.'].apply(convert_height)

We'll do a little re-formatting:

In [28]:
# fill NaN values with the mean of the column and change type to integer
fighters['Ht.'] = fighters['Ht.'].fillna(fighters['Ht.'].mean())
fighters['Ht.'] = fighters['Ht.'].astype(int)

# reformat Reach column + fill NaN values with column avg
fighters['Reach'] = fighters['Reach'].replace('--', np.nan)
fighters['Reach'] = fighters['Reach'].str.replace('"', '').astype(float)
fighters['Reach'] = fighters['Reach'].fillna(fighters['Reach'].mean())
fighters['Reach'] = fighters['Reach'].astype(int)

# reformat Weight column
fighters['Wt.'] = fighters['Wt.'].replace('--', np.nan)
fighters['Wt.'] = fighters['Wt.'].str.replace(' lbs.', '', regex=False).astype(float)

# drop other empty rows
fighters = fighters.dropna()

I also want to create a new column for weight class:

In [29]:
# first, we create a dictionary of weight class ranges based on UFC standards
weight_classes = {
    'strawweight': (0, 115),
    'flyweight': (115, 125),
    'bantamweight': (125, 135),
    'featherweight': (135, 145),
    'lightweight': (145, 155),
    'welterweight': (155, 170),
    'middleweight': (170, 185),
    'light heavyweight': (185, 205),
    'heavyweight': (205, 265),
    'super heavyweight': (265, float('inf'))
}

# create a new column for weight class
fighters['weight_class'] = ''

# loop through the rows and assign a weight class based on weight
for index, row in fighters.iterrows():
    weight = row['Wt.']
    for wc, range in weight_classes.items():
        if range[0] <= weight < range[1]:
            fighters.at[index, 'weight_class'] = wc
            break


# print the fighters df
fighters.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,weight_class
1,Tom,Aaron,,70,155.0,71,Unknown,5.0,3.0,0.0,welterweight
2,Danny,Abbadi,The Assassin,71,155.0,71,Orthodox,4.0,6.0,0.0,welterweight
3,Nariman,Abbasov,Bayraktar,68,155.0,66,Orthodox,28.0,4.0,0.0,welterweight
4,David,Abbott,Tank,72,265.0,71,Switch,10.0,15.0,0.0,super heavyweight
5,Hamdy,Abdelwahab,The Hammer,74,264.0,72,Southpaw,5.0,0.0,0.0,heavyweight


*Note: while there are fighters on the official UFCSTATS website that are listed as 'super heavyweight', UFC has not officially made a Super Heavyweight class as of yet.*

In [51]:
# optional: save data as a new csv file
fights.to_csv('clean_data/fighters_cleaned.csv')

# Now we have some nice, clean data we can experiment with!

*My goal to create a predictive model won't be met in the time frame for this project, so I will just choose one data-set to analyze.* 

*However, I believe that in order to train a more accurate model, I'm going to need information from both of these data-sets, in addition to information from a different website I did not scrape for this project*