In [168]:
import requests
import bs4
import re
import csv
from string import ascii_lowercase

1. Get all URLs
2. Define columns
3. Iterate through URLs and scrape into columns
4. Save to CSV

1. event_urls = get_event_urls()
2. fight_urls = get_fight_urls(event_urls)
3. get_event_data(event_urls)
4. get_fight_data(fight_urls)

## 1. Define functions to scrape URLs for each event/fight

In [2]:
#Function to scrape url of each UFC event from ufcstats.com
def get_event_urls():
    main_url = requests.get('http://ufcstats.com/statistics/events/completed?page=all')
    main_soup = bs4.BeautifulSoup(main_url.text, 'lxml')
    
    #Adds href to list if href contains a link and keyword 'event-details'
    all_event_urls = [item.get('href') for item in  main_soup.find_all('a') if type(item.get('href')) == str and 'event-details' in item.get('href')]
    
    return all_event_urls

In [3]:
#Function that takes as input a list of event urls, and outputs a list of urls for each fight in each event
def get_fight_urls(event_urls):
    all_fight_urls = []
    for url in event_urls:
        event_url = requests.get(url)
        event_soup = bs4.BeautifulSoup(event_url.text,'lxml')
        
        for item in event_soup.find_all('a', class_ = 'b-flag b-flag_style_green'):
            all_fight_urls.append(item.get('href'))

    return(all_fight_urls)

In [403]:
def get_fighter_urls():
    main_url_list = [requests.get(f'http://ufcstats.com/statistics/fighters?char={letter}&page=all') for letter in ascii_lowercase]
    main_soup_list = [bs4.BeautifulSoup(url.text,'lxml') for url in main_url_list]
    all_links = []
    
    for item in main_soup_list:
        for item in item.select('a.b-link')[1::3]:
            all_links.append(item.get('href')) 
    
    return all_fighter_urls

In [39]:
event_urls = get_event_urls()

In [40]:
fight_urls = get_fight_urls(event_urls[0:10])

In [405]:
fighter_urls = get_fighter_urls()

In [181]:
main_url_list = [requests.get(f'http://ufcstats.com/statistics/fighters?char={letter}&page=all') for letter in ascii_lowercase]
main_soup_list = [bs4.BeautifulSoup(url.text,'lxml') for url in main_url_list]

In [407]:
main_url_list = [requests.get(f'http://ufcstats.com/statistics/fighters?char={letter}&page=all') for letter in ascii_lowercase]
main_soup_list = [bs4.BeautifulSoup(url.text,'lxml') for url in main_url_list]
all_links = []
    
for item in main_soup_list:
    for item in item.select('a.b-link')[1::3]:
        all_links.append(item.get('href')) 

In [409]:
len(all_links)

52

## 2. Define functions to scrape event/fight data

In [5]:
#Function to scrape details of each UFC event - name, date, and location - and store in a list.
def get_event_data(event_urls):
    csv_rows = []
    x = len(event_urls)
    for event in event_urls:
        event_request = requests.get(event)
        event_soup = bs4.BeautifulSoup(event_request.text,'lxml')
        
        event_name = event_soup.select('h2')[0].text
        event_date = event_soup.select('li')[3].text.split(':')[-1]
        event_full_location = event_soup.select('li')[4].text.split(':')[1].strip().split(',')
        event_city = event_soup.select('li')[4].text.split(':')[1].strip().split(',')[0]
        event_country = event_soup.select('li')[4].text.split(':')[1].strip().split(',')[-1]
        event_url = event
        
        #Check if event address includes state and add to event_state if True
        if len(event_full_location)>2:
               event_state = event_soup.select('li')[4].text.split(':')[1].strip().split(',')[1]
        else:
               event_state = 'NULL'
        event_id = x
        x = x-1
            
        csv_rows.append([event_id, 
                                event_name.strip(), 
                                event_date.strip(), 
                                event_city.strip(), 
                                event_state.strip(), 
                                event_country.strip(), 
                                event_url.strip()])
    
    return csv_rows

In [142]:
##Function to scrape details of each UFC fight and store in a list.
def get_fight_data(fight_urls):

    csv_rows = []
    
    for url in fight_urls:

        fight_url = requests.get(url)
        fight_soup = bs4.BeautifulSoup(fight_url.text,'lxml')

        #Define repeated select statements for column values
        select_overview = fight_soup.select('i.b-fight-details__text-item')
        select_result = fight_soup.select('i.b-fight-details__text-item_first')
        select_result_details = fight_soup.select('p.b-fight-details__text')
        select_details = fight_soup.select('p.b-fight-details__table-text')
        select_weight_class_details = fight_soup.select('i.b-fight-details__fight-title')
        select_win_lose = fight_soup.select('i.b-fight-details__person-status')

        #Scrape fight details
        event_id = fight_soup.h2.text
        referee = select_overview[3].text.split(':')[1]
        f_1 = select_details[0].text
        f_2 = select_details[1].text
        
        #If there is a winner, set 'winner' to winning fighter. If no winner (e.g. NC, DQ) set 'winner' to NULL
        if (select_win_lose[0].text.strip()=='W') | (select_win_lose[1].text.strip()=='W'):
            if (select_win_lose[0].text.strip()=='W'):
                winner = f_1
            else:
                winner = f_2
        else: print('NULL')
        num_rounds = select_overview[2].text.split(':')[1].strip()[0]
        if len(select_weight_class_details[0].text.split()) > 2:
            weight_class = select_weight_class_details[0].text.split()[0] + ' ' + select_weight_class_details[0].text.split()[1]
        else:
            weight_class = select_weight_class_details[0].text.split()[0]
        if 'Decision' in select_result[0].text.split(':')[1]:
            result = select_result[0].text.split(':')[1].split()[0]
            result_details = select_result_details[0].text.split(':')[1].split()[-1]
        else:
            result = select_result[0].text.split(':')[1] 
            result_details = select_result_details[1].text.split(':')[-1]
        finish_round = select_overview[0].text.split(':')[1]
        finish_time = re.findall('\d:\d\d',select_overview[1].text)[0]

        #Scrape fight stats
        f_1_kd = select_details[2].text
        f_2_kd = select_details[3].text
        f_1_sig_strikes_att = select_details[4].text.split(' of ')[1]
        f_1_sig_strikes_succ = select_details[4].text.split(' of ')[0]
        f_2_sig_strikes_att = select_details[5].text.split(' of ')[1]
        f_2_sig_strikes_succ = select_details[5].text.split(' of ')[0]
        f_1_strikes_att = select_details[8].text.split(' of ')[1]
        f_1_strikes_succ = select_details[8].text.split(' of ')[0]
        f_2_strikes_att = select_details[9].text.split(' of ')[1]
        f_2_strikes_succ = select_details[9].text.split(' of ')[0]
        f_1_td_att = select_details[10].text.split(' of ')[1]
        f_1_td_succ = select_details[10].text.split(' of ')[0]
        f_2_td_att = select_details[11].text.split(' of ')[1]
        f_2_td_succ = select_details[11].text.split(' of ')[0]
        f_1_sub_att = select_details[14].text
        f_2_sub_att = select_details[15].text
        f_1_rev = select_details[16].text
        f_2_rev = select_details[17].text
        f_1_ctrl_time = select_details[18].text
        f_2_ctrl_time = select_details[19].text
        fight_url = url

        csv_rows.append([event_id.strip(), referee.strip(), f_1.strip(), f_2.strip(), winner.strip(), num_rounds.strip(), weight_class.strip(), result.strip(), result_details.strip(), finish_round.strip(), finish_time.strip(), fight_url])

    return csv_rows

In [328]:
def get_fighter_data(fighter_urls):
    
    for url in fighter_urls:
        fighter_url = requests.get(url)
        fighter_soup = bs4.BeautifulSoup(fighter_url.text,'lxml')
        
        select_name = fighter_soup.select('span')[0].text.split()
        select_nickname = fighter_soup.select('p.b-content__Nickname')
        select_details = fighter_soup.select('li.b-list__box-list-item')
        
        fighter_id = []
        if len(select_name = fighter_soup.select('span')[0].text.split())>1:
            fighter_f_name = select_name[0]
            fighter_l_name = select_name[-1]
        else:
            fighter_f_name = select_name[0]
            fighter_l_name = 'NULL'
        if select_nickname.text == '\n':
            fighter_nickname = 'NULL'
        else:
            fighter_nickname = select_nickname.text
        fighter_height_ft = int(select_details[0].text.split(':')[1].strip()[0])
        fighter_height_in = int(select_details[0].text.split(':')[1].strip().split("'")[1].strip().strip('"'))
        fighter_height_cm = ((fighter_height_ft*12.0)*2.54)+(fighter_height_in*2.54)
        fighter_weight_lbs = select_details[1].text.split(':')[1].split()[0]
        fighter_reach_cm = round(int(select_details[2].text.split(':')[1].strip().strip('"'))*2.54,2)
        fighter_stance = select_details[3].text.split(':')[1].strip()
        fighter_dob = select_details[4].text.split(':')[1]
        fighter_w = []
        fighter_l = []
        fighter_d = []
        fighter_nc = []

In [329]:
fighter_url = requests.get('http://ufcstats.com/fighter-details/ec421d5de0aea624')
fighter_soup = bs4.BeautifulSoup(fighter_url.text,'lxml')
select_details = fighter_soup.select('li.b-list__box-list-item')
        
select_details[3].text.split(':')[1].strip()

''

In [None]:
feet_cm = (feet * 12) * 2.54
inches_cm = inches * 2.54

height_cm = feet_cm + inches_cm

ufc_fighters['Height'] = height_cm

In [6]:
event_data = get_event_data(event_urls)

In [43]:
fight_urls[0:10]

['http://ufcstats.com/fight-details/c3ef3cb03edde8bb',
 'http://ufcstats.com/fight-details/07cb64236ae7aaea',
 'http://ufcstats.com/fight-details/582806c33ce6dcf6',
 'http://ufcstats.com/fight-details/9124740fe7816d70',
 'http://ufcstats.com/fight-details/ced01368259428f5',
 'http://ufcstats.com/fight-details/42922bab8a3e1828',
 'http://ufcstats.com/fight-details/8b296724a6844865',
 'http://ufcstats.com/fight-details/c22de92b9d9030dc',
 'http://ufcstats.com/fight-details/c2cdeb207cce5ceb',
 'http://ufcstats.com/fight-details/f18d44292036d5de']

In [143]:
fight_data = get_fight_data(fight_urls[0:10])

In [144]:
fight_data

[['UFC Fight Night: Emmett vs. Topuria',
  'Marc Goddard',
  'Josh Emmett',
  'Ilia Topuria',
  'Ilia Topuria',
  '5',
  'Featherweight',
  'Decision',
  'Round',
  '5',
  '5:00',
  'http://ufcstats.com/fight-details/c3ef3cb03edde8bb'],
 ['UFC Fight Night: Emmett vs. Topuria',
  'Keith Peterson',
  'Amanda Ribas',
  'Maycee Barber',
  'Maycee Barber',
  '3',
  "Women's Flyweight",
  'KO/TKO',
  'Elbows to Head From Half Guard',
  '2',
  '3:42',
  'http://ufcstats.com/fight-details/07cb64236ae7aaea'],
 ['UFC Fight Night: Emmett vs. Topuria',
  'Keith Peterson',
  'David Onama',
  'Gabriel Santos',
  'David Onama',
  '3',
  'Featherweight',
  'KO/TKO',
  'Punch to Head At Distance',
  '2',
  '4:13',
  'http://ufcstats.com/fight-details/582806c33ce6dcf6'],
 ['UFC Fight Night: Emmett vs. Topuria',
  'Marc Goddard',
  'Brendan Allen',
  'Bruno Silva',
  'Brendan Allen',
  '3',
  'Middleweight',
  'Submission',
  'Rear Naked Choke',
  '1',
  '4:39',
  'http://ufcstats.com/fight-details/91247

## 3. Normalise data

In [408]:
#Function to normalise the data by setting the primary key 'event_id' as a foreign key in fight_data
def normalise(event_data,fight_data):
    all_event_ids = {}

    for item in event_data:
        all_event_ids[item[1]]=item[0]

    for item in fight_data:
        item[0] = all_event_ids[item[0]]

In [409]:
normalise(event_data,fight_data)

In [410]:
with open('ufc_fights_sample.csv','w',newline='',encoding='UTF8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['event_id', 'referee', 'f_1', 'f_2', 'winner', 'num_rounds', 'weight_class', 'finish_method', 'finish_method_details', 'finish_round', 'finish_time', 'fight_url'])

In [411]:
with open('ufc_fights_sample.csv','a+') as csv_file:
    writer = csv.writer(csv_file)
    for row in fight_data:
        writer.writerow(row)

In [30]:
with open('ufc_events_sample.csv','w',encoding='UTF8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['event_name', 'event_date', 'event_city', 'event_state', 'event_country'])

In [71]:
with open('ufc_events_sample.csv','a') as csv_file:
    writer = csv.writer(csv_file)
    for row in get_event_data(event_urls[600:653]):
        writer.writerow(row)

In [89]:
with open('ufc_events_sample_2.csv','w',encoding='UTF8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['event_id','event_name', 'event_date', 'event_city', 'event_state', 'event_country'])

In [90]:
with open('ufc_events_sample_2.csv','a') as csv_file:
    writer = csv.writer(csv_file)
    for row in get_event_data(get_event_urls()[0:50]):
        writer.writerow(row)

In [285]:
event_urls

['http://ufcstats.com/event-details/e9e1acc96536bb4f',
 'http://ufcstats.com/event-details/a780d16cf7eed44d',
 'http://ufcstats.com/event-details/b9415726dc3ec526',
 'http://ufcstats.com/event-details/b6c6d1731ff00eeb',
 'http://ufcstats.com/event-details/7abe471b61725980',
 'http://ufcstats.com/event-details/6f812143641ceff8',
 'http://ufcstats.com/event-details/901cddcbfa079097',
 'http://ufcstats.com/event-details/3c6976f8182d9527',
 'http://ufcstats.com/event-details/51b1e2fd9872005b',
 'http://ufcstats.com/event-details/6fb1ba67bef41b37',
 'http://ufcstats.com/event-details/15b1b21cd743d652',
 'http://ufcstats.com/event-details/3dc3022232b79c7a',
 'http://ufcstats.com/event-details/aec273fcb765330d',
 'http://ufcstats.com/event-details/e4bb7e483c4ad318',
 'http://ufcstats.com/event-details/35080a7f406f9ab3',
 'http://ufcstats.com/event-details/1ccff7f0cfdf85eb',
 'http://ufcstats.com/event-details/806975e1b4f47b27',
 'http://ufcstats.com/event-details/f21a3d68fb9df387',
 'http://u