## ufc.ipynb
This notebook loads the fights CSV file, cleans the data, and creates visualizations from it. It is a work in progress: the data cleaning has been finalized, and the analysis/visualization portion is still being developed.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
# Load the fight records from the processed-data CSV into the working frame.
FIGHTS_CSV = "data/processed/fights.csv"
df = pd.read_csv(FIGHTS_CSV)

<h2>Exploratory Info</h2>

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for every column; include='all' also covers the
# non-numeric (text) columns, not just the numeric ones.
df.describe(include='all')

In [None]:
# List every distinct value found in the Result column, one per line.
result_values = df['Result'].unique()
print('\n'.join(str(value) for value in result_values))

In [None]:
# List every distinct value found in the WeightClass column, one per line.
weight_class_values = df['WeightClass'].unique()
print('\n'.join(str(value) for value in weight_class_values))

In [None]:
# List every distinct value found in the Outcome column, one per line.
outcome_values = df['Outcome'].unique()
print('\n'.join(str(value) for value in outcome_values))

In [None]:
#look at specific values in a column
# Show every row whose Outcome text contains the given substring.
# na=False counts a missing Outcome as "no match", so the mask stays purely
# boolean (a mask containing NaN is rejected by .loc); regex=False makes this a
# plain substring search rather than a regular-expression match.
df.loc[df.Outcome.str.contains('Miesha Tate', na=False, regex=False), :]

In [None]:
# List every distinct name found in the Fighter1 column, one per line.
fighter1_names = df['Fighter1'].unique()
print('\n'.join(str(name) for name in fighter1_names))

In [None]:
# Number of rows in which each name appears in the Fighter1 column.
rows_per_fighter1 = df.groupby('Fighter1').size()
print(rows_per_fighter1)

In [None]:
# Build a table with one row per distinct event, identified by its
# (EventType, EventNum, EventNickname) combination, sorted by type and number.
pd.options.display.max_rows = 1000  # allow the whole event list to render below
ndf = (
    df[['EventType', 'EventNum', 'EventNickname']]
    .drop_duplicates(keep='first')
    .sort_values(by=['EventType', 'EventNum'])
)
# After de-duplication each row is an event, so the count reported here is a
# number of events, not a number of individual fights.
print("There have been {0} UFC events.".format(len(ndf))) #416
ndf

<h2>Feature Engineering</h2>

In [52]:
#create isTitle column to indicate championship fights (must be done before setting def. champ)
# A fight is flagged when either fighter's name carries a "(c)" or "(ic)"
# marker (presumably champion / interim champion -- confirm against the data
# source). The pattern is a raw string so the backslashes reach the regex
# engine untouched; a plain "\(" is an invalid Python escape sequence.
TITLE_MARKER_PATTERN = r"\(c\)|\(ic\)"
df['isTitle'] = (
    # na=False: a missing name simply counts as "no marker".
    df.Fighter1.str.contains(TITLE_MARKER_PATTERN, regex=True, na=False)
    | df.Fighter2.str.contains(TITLE_MARKER_PATTERN, regex=True, na=False)
)

In [53]:
def setDefendingChamp(row):
    """Return the marked fighter name(s) of a fight row.

    A name is "marked" when it contains the text "(c)" or "(ic)".

    Parameters
    ----------
    row : object with string attributes ``Fighter1`` and ``Fighter2``
        (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        The stripped Fighter1 and Fighter2 names joined by a comma when both
        are marked, the single stripped marked name when only one is, and an
        empty string when neither is.
    """
    markers = ("(c)", "(ic)")
    first_marked = any(marker in row.Fighter1 for marker in markers)
    second_marked = any(marker in row.Fighter2 for marker in markers)

    marked_names = []
    if first_marked:
        marked_names.append(row.Fighter1.strip())
    if second_marked:
        marked_names.append(row.Fighter2.strip())
    # Zero names -> '', one name -> that name, two names -> "first,second".
    return ','.join(marked_names)

In [54]:
#call setDefendingChamp
# Record the marked fighter(s) of each fight in a new Defending column. This
# has to run before the markers are stripped from the name columns below.
df['Defending'] = df.apply(setDefendingChamp, axis=1)

#remove (c) and (ic) from Fighter1 and Fighter2
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every marker in place.
# NOTE: the pattern removes any parenthesised text, not only "(c)" / "(ic)".
df.Fighter1 = df.Fighter1.str.replace(r"\(.*\)", "", regex=True)
df.Fighter2 = df.Fighter2.str.replace(r"\(.*\)", "", regex=True)

In [55]:
#create a gender column based on weight class text
# Weight classes whose text contains "Women" are labelled 'F'; all others 'M'.
is_womens_class = df.WeightClass.str.contains("Women")
df['Gender'] = np.where(is_womens_class, 'F', 'M')

In [56]:
#clean weight class text
# Any label mentioning "Catchweight" collapses to the single value 'Catchweight'.
is_catchweight = df.WeightClass.str.contains("Catchweight")
df.WeightClass = np.where(is_catchweight, 'Catchweight', df.WeightClass)
# Drop the "Women's " prefix so both genders share the same class labels.
df.WeightClass = df.WeightClass.str.replace("Women's ", '')

In [57]:
#clean result
# Drop the trailing period from "def." and "vs.".
# regex=False makes the match literal on every pandas version: treated as a
# regular expression, the "." would match any character (e.g. "def " too).
df.Result = df.Result.str.replace('def.', 'def', regex=False)
df.Result = df.Result.str.replace('vs.', 'vs', regex=False)

In [58]:
#clean Outcome column (collapse multi-word outcomes so the text can be split on its first space)
# Literal replacements, applied in the listed order.
# NOTE: 'Referee Stoppage' -> 'TKO (Referee Stoppage)' is not idempotent;
# running this cell twice on the same frame nests the text.
OUTCOME_REPLACEMENTS = [
    ('Technical Submission', 'Submission'),
    ('Technical submission', 'Submission'),
    ('Verbal Submission', 'Submission'),
    ('No Contest', 'No_Contest'),
    ('DQ', 'Disqualification'),
    ('Referee Stoppage', 'TKO (Referee Stoppage)'),
    ('Technical Decision', 'Decision'),
]
for outcome_old, outcome_new in OUTCOME_REPLACEMENTS:
    df.Outcome = df.Outcome.str.replace(outcome_old, outcome_new, regex=False)

#split outcome into two columns
# Outcome1 is the first word and Outcome2 the remainder. Unpacking the `.str`
# accessor as a tuple is not supported by current pandas, and `n` must be
# passed by keyword, so each part is taken by position instead. A value with
# no space yields NaN in Outcome2.
outcome_parts = df.Outcome.str.split(' ', n=1)
df['Outcome1'] = outcome_parts.str[0]
df['Outcome2'] = outcome_parts.str[1]

In [59]:
# Tidy the Outcome2 detail text: missing values become '', surrounding
# whitespace is removed, the text is lower-cased and parentheses are dropped.
# regex=False keeps '(' and ')' literal on every pandas version (as regular
# expressions they would be invalid patterns).
df.Outcome2 = (
    df.Outcome2
    .fillna('')
    .str.strip()
    .str.lower()
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [60]:
#clean unanimous / split / majority decisions (run once)
# For each decision type, rows whose Outcome2 contains the keyword get:
#   Judges   <- the text that follows the keyword
#   Outcome2 <- the first word of the original text
# The column check skips the whole step once Judges exists, so re-running the
# cell does not process already-shortened Outcome2 values again.
if 'Judges' not in df.columns:
    df['Judges'] = ''
    for decision_type in ('unanimous', 'split', 'majority'):
        has_type = df.Outcome2.str.contains(decision_type)
        text_after_keyword = df.Outcome2.str.split(decision_type).str[1]
        first_word = df.Outcome2.str.split(' ', n=1).str[0]
        df.Judges = np.where(has_type, text_after_keyword, df.Judges)
        df.Outcome2 = np.where(has_type, first_word, df.Outcome2)

In [61]:
#clean Outcome2
# Canonical label -> the exact spellings that are folded into it. Matching is
# on the whole value, which is what the former anchored '^...$' patterns
# expressed. A lookup table is used because str.replace treats its pattern as
# a literal string by default on pandas >= 2.0, so those patterns never matched.
OUTCOME2_CANONICAL = {
    'punches and elbows': [
        'elbow and punches', 'elbows and punches', 'elbow and punch',
        'punches and elbow', 'punch and elbows',
    ],
    'knees and punches': [
        'knee and punches', 'punches and knee', 'punches and knees',
        'punch and knee', 'knees to the body and punches',
        'knee to the body and punches', 'knees & punches', 'knee & punches',
    ],
    # Plural, in line with the other labels (this was 'knees and elbow').
    'knees and elbows': ['elbows and knees', 'knees to the body and elbows'],
    'knees': ['knee', 'knees to the body', 'knee to the body'],
    'punches': ['punch'],
    'elbows': ['elbow'],
    # chokes
    # 'arm-triangle choke' is kept as its own label; it is no longer remapped
    # to 'armbar', which silently undid this normalisation.
    'arm-triangle choke': ['arm-triangle'],
    'guillotine choke': ['modified guillotine choke', 'guillotine choke / exhaustion'],
    'triangle choke': ['triangle choke with elbows', 'mounted triangle choke'],
    # armbar
    'armbar': ['straight armbar', 'cross-body armbar', 'triangle armbar'],
    # other
    'superman punch': ['superman punch and punches'],
    'flying knee and punches': ['flying knee and punch', 'flying knee and strikes'],
    'spinning back kick and punches': ['spinning back kick to the body and punches'],
    'spinning back kick and knee': ['knee and spinning back kick to the body'],
    'body slam': ['slam', 'slam and punch', 'slam and punches'],
    'head kick and punches': [
        'head kick and punch', 'punch and head kick',
        'head kick and body punch', 'punches and head kick',
    ],
    'flying head kick': ['flying head kick and punches'],
    'leg kicks': ['leg kick'],
    'illegal knees': ['illegal knee'],
    'body kick and punches': ['kick to the body and punch', 'body kicks and punches'],
    'front kick and punches': ['front kick to the body and punches'],
    'suplex': ['suplex and punches'],
    'kimura': ['inverted triangle kimura', 'reverse triangle and kimura', 'modified kimura'],
    'body punches': ['body punch', 'punches to the body', 'punch to the body'],
}
# Invert to a flat {spelling: canonical label} lookup.
outcome2_aliases = {
    alias: canonical
    for canonical, aliases in OUTCOME2_CANONICAL.items()
    for alias in aliases
}

df.Outcome2 = (
    df.Outcome2
    #remove bracketed items first, so values that carried a bracketed suffix
    #can still match an alias exactly
    .str.replace(r"\[.*\]", "", regex=True)
    .str.strip()
    # Values without an alias pass through unchanged.
    .map(lambda value: outcome2_aliases.get(value, value))
)

In [90]:
#validate Gender column for female fights
# Tidy both name columns first: trim whitespace and unify two fighters whose
# names are spelled in more than one way.
for fighter_col in ('Fighter1', 'Fighter2'):
    names = df[fighter_col].str.strip()
    names = names.str.replace('Melinda Fábián', 'Melinda Fabián')
    names = names.str.replace('Mara Romero Borella', 'Mara Borella')
    df[fighter_col] = names

# Gather every name that appears on either side of a fight labelled 'F'.
is_female_fight = df.Gender == 'F'
ff1 = df.loc[is_female_fight, 'Fighter1']
ff2 = df.loc[is_female_fight, 'Fighter2']
fem = pd.concat([ff1, ff2], axis=0)
print("count with duplicates: {0}".format(fem.count())) #468
fem = fem.drop_duplicates(keep='first')
print("count without duplicates: {0}".format(fem.count())) #125
fem = fem.sort_values()
fem

#TODO: update gender column


count with duplicates: 468
count without duplicates: 125


1226              Aisling Daly
1082           Aleksandra Albu
1094             Alex Chambers
1500              Alexa Grasso
122               Alexis Davis
1515             Amanda Cooper
193               Amanda Lemos
686               Amanda Nunes
380                 Andrea Lee
1798               Angela Hill
2241             Angela Magaña
1367               Anna Elmose
2309                Ariel Beck
1323        Ashlee Evans-Smith
2295              Ashley Yoder
247                 Aspen Ladd
2302              Barb Honchak
1092              Bec Rawlings
707              Bethe Correia
177              Carla Esparza
442                Cat Zingano
235               Chan-Mi Jeon
2306           Christina Marks
122              Cindy Dandois
19             Cláudia Gadelha
465              Cortney Casey
3650               Cris Cyborg
1356          Cristina Stanciu
3592          Cynthia Calvillo
34             Danielle Taylor
2304            DeAnna Bennett
1645        Elizabeth Phillips
1287    

<h2>Visualizations</h2>

In [None]:
#WeightClass
# Bar chart of how many rows fall into each weight class.
weight_class_counts = df.WeightClass.value_counts()
weight_class_counts.plot(kind='bar')
plt.show()

In [None]:
#Outcome
# Bar chart of how many rows have each Outcome1 value.
outcome1_counts = df.Outcome1.value_counts()
outcome1_counts.plot(kind='bar')
plt.show()

In [None]:
# Pie chart of the share of rows per Outcome1 value.
outcome1_shares = df.Outcome1.value_counts()
outcome1_shares.plot.pie()
plt.show()

In [None]:
#df.Outcome1.value_counts().plot(kind='box')
#plt.show()

In [None]:
# Pie chart of the share of rows per Outcome2 value.
outcome2_shares = df.Outcome2.value_counts()
outcome2_shares.plot.pie()
plt.show()

<h2>Summary</h2>

In [None]:
#fighters who won a 5-round decision with every round scored for them by at least 1 judge
# Selection rule: the Judges text contains "50" (presumably a perfect
# five-round scorecard -- confirm against the scoring format in the data).
# na=False / regex=False: plain substring test, missing text never matches.
decision_columns = ['isTitle', 'Fighter1', 'Fighter2', 'Outcome1', 'Outcome2', 'Judges', 'EventDate']
decisions = df.loc[
    df.Judges.str.contains('50', na=False, regex=False),
    decision_columns,
].copy()  # independent copy, so the date conversion below does not write into a slice of df
decisions['EventDate'] = pd.to_datetime(decisions['EventDate'])
decisions = decisions.sort_values(by=['EventDate'])
print("{0} decisions".format(len(decisions))) #54
decisions

In [None]:
#Shutout
#fighters who won a perfect 5-round decision, with every round scored for them by all 3 judges
# Selection rule: "50" occurs at least three times in the Judges text
# (presumably once per judge -- confirm against the scoring format in the data).
shutout_columns = ['isTitle', 'Fighter1', 'Fighter2', 'Outcome1', 'Outcome2', 'Judges', 'EventDate']
shutout = df.loc[
    df.Judges.str.count('50') >= 3,
    shutout_columns,
].copy()  # independent copy, so the date conversion below does not write into a slice of df
shutout['EventDate'] = pd.to_datetime(shutout['EventDate'])
shutout = shutout.sort_values(by=['EventDate'])
print("{0} shutout decisions".format(len(shutout))) #24
shutout