# Extracting UFC fight data

In [25]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By

## Web scraping

In [26]:
# Number of "completed events" listing pages to walk through.
pages = 12

# Accumulators that the scraping cells below fill in.
event_links, fight_links = [], []

# Launch Chrome without a visible window. To watch the browser while
# debugging, build the driver with a plain `webdriver.Chrome()` instead.
op = webdriver.ChromeOptions()
op.add_argument('headless')
driver = webdriver.Chrome(options=op)

In [27]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every link to the results of a previous run.
event_links = []

# Walk each listing page of completed events and collect the event URLs.
for p in range(1, pages + 1):
  # driver.get() only navigates and returns None, so nothing is bound here.
  driver.get(f"http://ufcstats.com/statistics/events/completed?page={p}")

  # One element per event entry in the listing table.
  elems = driver.find_elements(By.CSS_SELECTOR, ".b-statistics__table-row .b-statistics__table-content")

  # Each entry holds a link whose href is the event's detail page.
  event_links.extend([el.find_element(By.CSS_SELECTOR, "a.b-link.b-link_style_black").get_attribute('href') for el in elems])

In [28]:
# Sanity check: number of event URLs collected, followed by the first one.
print(f"{len(event_links)} {event_links[0]}")

299 http://ufcstats.com/event-details/8fa2b06572365321


In [29]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every link to the results of a previous run.
fight_links = []

# Visit each event page and collect the URL of every fight listed on it.
for ev in event_links:
  # driver.get() only navigates and returns None, so nothing is bound here.
  driver.get(ev)

  # Clickable fight rows inside the event's fight table body.
  fight_table = driver.find_element(By.CLASS_NAME, "b-fight-details__table-body")
  fight_elems = fight_table.find_elements(By.CSS_SELECTOR, ".b-fight-details__table-row.b-fight-details__table-row__hover.js-fight-details-click")

  # The fight URL is taken from each row's `onclick` attribute by dropping its
  # first 7 and last 2 characters.
  # NOTE(review): presumably the attribute looks like doNav('<url>') -- confirm
  # against the live page before relying on these offsets.
  fight_links.extend([fe.get_attribute('onclick')[7:-2] for fe in fight_elems])

In [30]:
# Sanity check: number of fight URLs collected, followed by the first one.
print(f"{len(fight_links)} {fight_links[0]}")

3561 http://ufcstats.com/fight-details/b395c89e19a3fec4


In [31]:
# Helper for reading stat text out of the ufc-stats fight tables
def row_stat_fetcher(row_elem, col):
  """Return the text of every stat entry inside one column of a stats row.

  Args:
    row_elem: Indexable collection of column elements; each must provide
      `find_elements(by, value)`.
    col: Index of the column to read from `row_elem`.

  Returns:
    A list with the `.text` of each element in that column matched by the
    class name "b-fight-details__table-text", in the order returned.
  """
  column = row_elem[col]
  stat_cells = column.find_elements(By.CLASS_NAME, "b-fight-details__table-text")

  texts = []
  for cell in stat_cells:
    texts.append(cell.text)
  return texts

In [32]:
# Scrape the headline data from each fight page into one record per fight.
# `fights` is rebuilt from scratch on every run of this cell.

fights = []

for fl in fight_links:
  # driver.get() only navigates and returns None, so nothing is bound here.
  driver.get(fl)

  # Fighter names in page order; the unpacking requires exactly two links.
  fighter_elems = driver.find_elements(By.CSS_SELECTOR, ".b-link.b-fight-details__person-link")
  fighter1, fighter2 = [el.text for el in fighter_elems]

  # Text of the first status element on the page.
  # NOTE(review): presumably this is fighter_1's outcome -- confirm against the page layout.
  result = driver.find_element(By.CSS_SELECTOR, ".b-fight-details__person-status").text.strip()

  # Method of victory: second <i> tag inside the first detail item.
  method = driver.find_element(By.CSS_SELECTOR, ".b-fight-details__text-item_first").find_elements(By.TAG_NAME, "i")[1].text.strip()

  detail_elems = driver.find_elements(By.CLASS_NAME, "b-fight-details__text-item")

  # Each detail item's text includes its label; remove the label to keep the value.
  # Stored as `round_num` so the builtin round() is not shadowed for later cells.
  round_elem = detail_elems[0]
  round_text = round_elem.find_element(By.CSS_SELECTOR, ".b-fight-details__label").text
  round_num = int(round_elem.text.replace(round_text, '').strip())

  # Stored as `fight_time` so the name of the stdlib `time` module stays free.
  time_elem = detail_elems[1]
  time_text = time_elem.find_element(By.CSS_SELECTOR, ".b-fight-details__label").text
  fight_time = time_elem.text.replace(time_text, '').strip()

  referee = detail_elems[3].find_element(By.TAG_NAME, "span").text.strip()

  # Columns of the second ".b-fight-details__table-row" on the page; every
  # column read below is unpacked into one value per fighter.
  stat_elems = driver.find_elements(By.CSS_SELECTOR, ".b-fight-details__table-row")[1].find_elements(By.CSS_SELECTOR, ".b-fight-details__table-col")
  kd1, kd2 = map(int, row_stat_fetcher(stat_elems, 1))
  strk1, strk2 = row_stat_fetcher(stat_elems, 4)
  td1, td2 = row_stat_fetcher(stat_elems, 5)
  suba1, suba2 = map(int, row_stat_fetcher(stat_elems, 7))
  rev1, rev2 = map(int, row_stat_fetcher(stat_elems, 8))
  ctrl1, ctrl2 = row_stat_fetcher(stat_elems, 9)

  # Columns of the second row of the first table that is a direct child of
  # ".b-fight-details"; used for the significant-strike breakdown.
  sig_stat_elems = driver.find_element(By.CSS_SELECTOR, ".b-fight-details > table").find_elements(By.CSS_SELECTOR, ".b-fight-details__table-row")[1].find_elements(By.CSS_SELECTOR, ".b-fight-details__table-col")
  sigstrk1, sigstrk2 = row_stat_fetcher(sig_stat_elems, 1)
  head1, head2 = row_stat_fetcher(sig_stat_elems, 3)
  body1, body2 = row_stat_fetcher(sig_stat_elems, 4)
  leg1, leg2 = row_stat_fetcher(sig_stat_elems, 5)
  dis1, dis2 = row_stat_fetcher(sig_stat_elems, 6)
  clinch1, clinch2 = row_stat_fetcher(sig_stat_elems, 7)
  grnd1, grnd2 = row_stat_fetcher(sig_stat_elems, 8)

  # One flat record per fight; the keys become the DataFrame / CSV columns.
  fights.append({
    "fighter_1": fighter1,
    "fighter_2": fighter2,
    "knockdowns_1": kd1,
    "knockdowns_2": kd2,
    "total_strikes_1": strk1,
    "total_strikes_2": strk2,
    "significant_strikes_1": sigstrk1,
    "significant_strikes_2": sigstrk2,
    "head_strikes_1": head1,
    "head_strikes_2": head2,
    "body_strikes_1": body1,
    "body_strikes_2": body2,
    "leg_strikes_1": leg1,
    "leg_strikes_2": leg2,
    "distance_strikes_1": dis1,
    "distance_strikes_2": dis2,
    "clinch_strikes_1": clinch1,
    "clinch_strikes_2": clinch2,
    "ground_strikes_1": grnd1,
    "ground_strikes_2": grnd2,
    "takedowns_1": td1,
    "takedowns_2": td2,
    "submission_attempts_1": suba1,
    "submission_attempts_2": suba2,
    "reversals_1": rev1,
    "reversals_2": rev2,
    "control_time_1": ctrl1,
    "control_time_2": ctrl2,
    "result": result,
    "method": method,
    "round": round_num,
    "time": fight_time,
    "referee": referee,
    "ufc_stats_com_url": fl
  })

## Saving data

In [33]:
# Build the table in one step from the list of per-fight records, then
# preview the first rows.
df = pd.DataFrame(fights)
df.head()

Unnamed: 0,fighter_1,fighter_2,knockdowns_1,knockdowns_2,total_strikes_1,total_strikes_2,significant_strikes_1,significant_strikes_2,head_strikes_1,head_strikes_2,...,reversals_1,reversals_2,control_time_1,control_time_2,result,method,round,time,referee,ufc_stats_com_url
0,Alexa Grasso,Valentina Shevchenko,1,0,262 of 408,199 of 301,84 of 203,80 of 179,54 of 151,61 of 158,...,1,0,2:39,8:37,D,Decision - Split,5,5:00,Herb Dean,http://ufcstats.com/fight-details/b395c89e19a3...
1,Kevin Holland,Jack Della Maddalena,0,0,127 of 356,105 of 190,127 of 356,105 of 190,59 of 258,46 of 115,...,0,0,0:00,0:00,L,Decision - Split,3,5:00,Jason Herzog,http://ufcstats.com/fight-details/697efaf0d162...
2,Raul Rosas Jr.,Terrence Mitchell,1,0,19 of 31,6 of 19,18 of 30,6 of 19,17 of 29,5 of 18,...,0,0,0:17,0:00,W,KO/TKO,1,0:54,Mark Smith,http://ufcstats.com/fight-details/c0ca4c201d08...
3,Daniel Zellhuber,Christos Giagos,0,0,36 of 101,38 of 82,36 of 101,38 of 82,18 of 75,25 of 63,...,0,0,0:17,0:11,W,Submission,2,3:26,Jason Herzog,http://ufcstats.com/fight-details/2e1435c160bf...
4,Fernando Padilla,Kyle Nelson,0,0,73 of 209,83 of 185,72 of 208,82 of 184,42 of 168,43 of 143,...,0,0,0:07,0:00,L,Decision - Unanimous,3,5:00,Chris Tognoni,http://ufcstats.com/fight-details/a5d76e93a505...


In [34]:
# Write the scraped table to CSV, leaving out the positional row index.
df.to_csv(path_or_buf='../../datasets/ufc_fight_stats.csv', index=False)