In [7]:
import requests
import bs4
import re
import csv
from string import ascii_lowercase
import os

1. Get all URLs
2. Define columns
3. Iterate through URLs and scrape into columns
4. Save to CSV

1. event_urls = get_event_urls()
2. fight_urls = get_fight_urls(event_urls)
3. get_event_data(event_urls)
4. get_fight_data(fight_urls)

## 1. Create CSV files to scrape to

In [213]:
def create_csv_files():
    if 'ufc_event_data.csv' not in os.listdir():
        with open('ufc_event_data.csv','w', newline='',encoding='UTF8') as ufc_event_data:
            writer = csv.writer(ufc_event_data)
            writer.writerow(['event_name',
                             'event_date',
                             'event_city',
                             'event_state',
                             'event_country',
                             'event_url'])
    else:
        pass
    
    if 'ufc_fight_overview_data.csv' not in os.listdir():
        with open('ufc_fight_overview_data.csv','w',newline='',encoding='UTF8') as ufc_fight_data:
            writer = csv.writer(ufc_fight_data)
            writer.writerow(['event_name', 
                             'referee', 
                             'f_1', 
                             'f_2', 
                             'winner', 
                             'num_rounds', 
                             'weight_class', 
                             'finish_method', 
                             'finish_method_details', 
                             'finish_round', 
                             'finish_time', 
                             'fight_url'])
    else:
        pass
        
    if 'ufc_fight_stat_data.csv' not in os.listdir():
        with open ('ufc_fight_stat_data.csv','w',newline="",encoding='UTF8') as ufc_fighter_data:
            writer = csv.writer(ufc_fighter_data)
            writer.writerow(['knockdowns',
                             'total_strikes_att',
                             'total_strikes_succ',
                             'head_strikes_att',
                             'head_strikes_succ',
                             'body_strikes_att',
                             'body_strikes_succ',
                             'leg_strikes_att',
                             'leg_strikes_succ',
                             'distance_strikes_att',
                             'distance_strikes_succ',
                             'clinch_strikes_att',
                             'clinch_strikes_succ',
                             'ground_strikes_att',
                             'ground_strikes_succ',
                             'sig_strikes_att',
                             'sig_strikes_succ',
                             'takedown_att',
                             'takedown_succ',
                             'submission_att',
                             'reversals',
                             'ctrl_time'])
    else:
        pass

In [211]:
create_csv_files()

## 1. Define functions to scrape URLs for each event/fight

In [152]:
#Function to scrape url of each UFC event from ufcstats.com
def get_event_urls():
    main_url = requests.get('http://ufcstats.com/statistics/events/completed?page=all')
    main_soup = bs4.BeautifulSoup(main_url.text, 'lxml')
    
    #Adds href to list if href (1) contains a link, (2) contains keyword 'event-details', and (3) has not been scraped already
    all_event_urls = [item.get('href') for item in  main_soup.find_all('a') 
                      if type(item.get('href')) == str 
                      and 'event-details' in item.get('href')]
    
    return all_event_urls

In [153]:
#Function that takes as input a list of event urls, and outputs a list of urls for each fight in each event
def get_fight_urls(event_urls):
            
    all_fight_urls = []
    for url in event_urls:
        event_url = requests.get(url)
        event_soup = bs4.BeautifulSoup(event_url.text,'lxml')

        for item in event_soup.find_all('a', class_='b-flag b-flag_style_green'):
            all_fight_urls.append(item.get('href'))

    return all_fight_urls

In [413]:
def get_fighter_urls():
    main_url_list = [requests.get(f'http://ufcstats.com/statistics/fighters?char={letter}&page=all') for letter in ascii_lowercase]
    main_soup_list = [bs4.BeautifulSoup(url.text,'lxml') for url in main_url_list]
    all_links = []
    
    for item in main_soup_list:
        for item in item.select('a.b-link')[1::3]:
            all_links.append(item.get('href')) 
    
    return all_links

## 2. Define functions to scrape event/fight data

In [203]:
#Function to scrape details of each UFC event - name, date, and location - and store in a list.
def get_event_data(event_urls):

    with open('ufc_event_data.csv','a+') as csv_file:
        writer = csv.writer(csv_file)
    
        for event in event_urls:
            event_request = requests.get(event)
            event_soup = bs4.BeautifulSoup(event_request.text,'lxml')

            event_name = event_soup.select('h2')[0].text
            event_date = event_soup.select('li')[3].text.split(':')[-1]
            event_full_location = event_soup.select('li')[4].text.split(':')[1].strip().split(',')
            event_city = event_soup.select('li')[4].text.split(':')[1].strip().split(',')[0]
            event_country = event_soup.select('li')[4].text.split(':')[1].strip().split(',')[-1]
            event_url = event

            #Check if event address includes state and add to event_state if True
            if len(event_full_location)>2:
                   event_state = event_soup.select('li')[4].text.split(':')[1].strip().split(',')[1]
            else:
                   event_state = 'NULL'


            writer.writerow([event_name.strip(), 
                             event_date.strip(), 
                             event_city.strip(), 
                             event_state.strip(), 
                             event_country.strip(), 
                             event_url.strip()])

In [179]:
##Function to scrape details of each UFC fight and store in a list.
def get_fight_overview_data(fight_urls):
    
    with open('ufc_fight_overview_data.csv','a+') as csv_file:
        writer = csv.writer(csv_file)
    
        for url in fight_urls:

            fight_url = requests.get(url)
            fight_soup = bs4.BeautifulSoup(fight_url.text,'lxml')

            #Define repeated select statements for column values
            select_overview = fight_soup.select('i.b-fight-details__text-item')
            select_result = fight_soup.select('i.b-fight-details__text-item_first')
            select_result_details = fight_soup.select('p.b-fight-details__text')
            select_details = fight_soup.select('p.b-fight-details__table-text')
            select_weight_class_details = fight_soup.select('i.b-fight-details__fight-title')
            select_win_lose = fight_soup.select('i.b-fight-details__person-status')

            #Scrape fight details
            event_name = fight_soup.h2.text
            referee = select_overview[3].text.split(':')[1]
            f_1 = select_details[0].text
            f_2 = select_details[1].text

            #If there is a winner, set 'winner' to winning fighter. If no winner (e.g. NC, DQ) set 'winner' to NULL
            if (select_win_lose[0].text.strip()=='W') | (select_win_lose[1].text.strip()=='W'):
                if (select_win_lose[0].text.strip()=='W'):
                    winner = f_1
                else:
                    winner = f_2
            else: print('NULL')
            num_rounds = select_overview[2].text.split(':')[1].strip()[0]
            if len(select_weight_class_details[0].text.split()) > 2:
                weight_class = select_weight_class_details[0].text.split()[0] + ' ' + select_weight_class_details[0].text.split()[1]
            else:
                weight_class = select_weight_class_details[0].text.split()[0]
            if 'Decision' in select_result[0].text.split(':')[1]:
                result = select_result[0].text.split(':')[1].split()[0]
                result_details = select_result_details[0].text.split(':')[1].split()[-1]
            else:
                result = select_result[0].text.split(':')[1] 
                result_details = select_result_details[1].text.split(':')[-1]
            finish_round = select_overview[0].text.split(':')[1]
            finish_time = re.findall('\d:\d\d',select_overview[1].text)[0]
            fight_url = url


            writer.writerow([event_name.strip(),
                             referee.strip(), 
                             f_1.strip(), 
                             f_2.strip(), 
                             winner.strip(), 
                             num_rounds.strip(), 
                             weight_class.strip(), 
                             result.strip(), 
                             result_details.strip(), 
                             finish_round.strip(), 
                             finish_time.strip(), 
                             fight_url])


In [252]:
def get_fight_stat_data(fight_urls):
    
    with open('ufc_fight_stat_data.csv','a+') as csv_file:
        writer = csv.writer(csv_file)
    
        for url in fight_urls:

            fight_url = requests.get(url)
            fight_soup = bs4.BeautifulSoup(main_url.text,'lxml')
            
            select_details = fight_soup.select('p.b-fight-details__table-text')

            event_name = fight_soup.h2.text
            fighter_name = select_details[0].text
            knockdowns = select_details[2].text
            total_strikes_att = select_details[8].text.split(' of ')[1]
            total_strikes_succ = select_details[8].text.split(' of ')[0]
            sig_strikes_att = select_details[4].text.split(' of ')[1]
            sig_strikes_succ = select_details[4].text.split(' of ')[0]
            head_strikes_att = select_details[126].text.split(' of ')[1]
            head_strikes_succ = select_details[126].text.split(' of ')[0]
            body_strikes_att = select_details[128].text.split(' of ')[1]
            body_strikes_succ = select_details[128].text.split(' of ')[0]
            leg_strikes_att = select_details[130].text.split(' of ')[1]
            leg_strikes_succ = select_details[130].text.split(' of ')[0]
            distance_strikes_att = select_details[132].text.split(' of ')[1]
            distance_strikes_succ = select_details[132].text.split(' of ')[0]
            clinch_strikes_att = select_details[134].text.split(' of ')[1]
            clinch_strikes_succ = select_details[134].text.split(' of ')[0]
            ground_strikes_att = select_details[136].text.split(' of ')[1]
            ground_strikes_succ = select_details[136].text.split(' of ')[0]
            takedown_att = select_details[10].text.split(' of ')[1]
            takedown_succ = select_details[10].text.split(' of ')[0]
            submission_att = select_details[14].text
            reversals = select_details[16].text
            ctrl_time = select_details[18].text
            
            writer.writerow([event_name.strip(),
                            fighter_name.strip(),
                            knockdowns.strip(),
                            total_strikes_att.strip(),
                            total_strikes_succ.strip(),
                            sig_strikes_att.strip(),
                            sig_strikes_succ.strip(),
                            head_strikes_att.strip(),
                            head_strikes_succ.strip(),
                            body_strikes_att.strip(),
                            body_strikes_succ.strip(),
                            leg_strikes_att.strip(),
                            leg_strikes_succ.strip(),
                            distance_strikes_att.strip(),
                            distance_strikes_succ.strip(),
                            clinch_strikes_att.strip(),
                            clinch_strikes_succ.strip(),
                            ground_strikes_att.strip(),
                            ground_strikes_succ.strip(),
                            takedown_att.strip(),
                            takedown_succ.strip(),
                            submission_att.strip(),
                            reversals.strip(),
                            ctrl_time.strip()])

            event_name = fight_soup.h2.text
            fighter_name = select_details[1].text
            knockdowns = select_details[3].text
            total_strikes_att = select_details[9].text.split(' of ')[1]
            total_strikes_succ = select_details[9].text.split(' of ')[0]
            head_strikes_att = select_details[127].text.split(' of ')[1]
            head_strikes_succ = select_details[127].text.split(' of ')[0]
            body_strikes_att = select_details[129].text.split(' of ')[1]
            body_strikes_succ = select_details[129].text.split(' of ')[0]
            leg_strikes_att = select_details[131].text.split(' of ')[1]
            leg_strikes_succ = select_details[131].text.split(' of ')[0]
            distance_strikes_att = select_details[133].text.split(' of ')[1]
            distance_strikes_succ = select_details[133].text.split(' of ')[0]
            clinch_strikes_att = select_details[135].text.split(' of ')[1]
            clinch_strikes_succ = select_details[135].text.split(' of ')[0]
            ground_strikes_att = select_details[137].text.split(' of ')[1]
            ground_strikes_succ = select_details[137].text.split(' of ')[0]
            sig_strikes_att = select_details[5].text.split(' of ')[1]
            sig_strikes_succ = select_details[5].text.split(' of ')[0]
            takedown_att = select_details[11].text.split(' of ')[1]
            takedown_succ = select_details[11].text.split(' of ')[0]
            submission_att = select_details[15].text
            reversals = select_details[17].text
            ctrl_time = select_details[19].text
            
            writer.writerow([event_name.strip(),
                            fighter_name.strip(),
                            knockdowns.strip(),
                            total_strikes_att.strip(),
                            total_strikes_succ.strip(),
                            sig_strikes_att.strip(),
                            sig_strikes_succ.strip(),
                            head_strikes_att.strip(),
                            head_strikes_succ.strip(),
                            body_strikes_att.strip(),
                            body_strikes_succ.strip(),
                            leg_strikes_att.strip(),
                            leg_strikes_succ.strip(),
                            distance_strikes_att.strip(),
                            distance_strikes_succ.strip(),
                            clinch_strikes_att.strip(),
                            clinch_strikes_succ.strip(),
                            ground_strikes_att.strip(),
                            ground_strikes_succ.strip(),
                            takedown_att.strip(),
                            takedown_succ.strip(),
                            submission_att.strip(),
                            reversals.strip(),
                            ctrl_time.strip()])
        

In [482]:
def get_fighter_data(fighter_urls):
    
    csv_rows = []
    
    for url in fighter_urls:
        fighter_url = requests.get(url)
        fighter_soup = bs4.BeautifulSoup(fighter_url.text,'lxml')
        
        select_name = fighter_soup.select('span')[0].text.split()
        select_nickname = fighter_soup.select('p.b-content__Nickname')
        select_details = fighter_soup.select('li.b-list__box-list-item')
        
        fighter_id = 'NULL'
        if len(select_name)>1:
            fighter_f_name = select_name[0]
            fighter_l_name = select_name[-1]
        else:
            fighter_f_name = select_name[0]
            fighter_l_name = 'NULL'
        if select_nickname[0].text == '\n':
            fighter_nickname = 'NULL'
        else:
            fighter_nickname = select_nickname[0].text
        if '--' in select_details[0].text.split(':')[1].strip().split("'"):
            fighter_height_cm = 'NULL'
        else:
            fighter_height_ft = select_details[0].text.split(':')[1].strip()[0]
            fighter_height_in = select_details[0].text.split(':')[1].strip().split("'")[1].strip().strip('"')
            fighter_height_cm = ((int(fighter_height_ft)*12.0)*2.54)+(int(fighter_height_in)*2.54)
        if '--' in select_details[1].text.split(':')[1]:
            fighter_weight_lbs = 'NULL'
        else:
            fighter_weight_lbs = select_details[1].text.split(':')[1].split()[0]
        if '--' in select_details[2].text.split(':')[1]:
            fighter_reach_cm = 'NULL'
        else:
            fighter_reach_cm = round(int(select_details[2].text.split(':')[1].strip().strip('"'))*2.54,2)
        if select_details[3].text.split(':')[1].strip()=='':
            fighter_stance = 'NULL'
        else:
            fighter_stance = select_details[3].text.split(':')[1].strip()
        fighter_dob = select_details[4].text.split(':')[1]
        fighter_w = 'NULL'
        fighter_l = 'NULL'
        fighter_d = 'NULL'
        fighter_nc = 'NULL'
        fighter_url = url
        
        csv_rows.append([fighter_id, fighter_f_name.strip(), fighter_l_name.strip(), fighter_nickname.strip(),fighter_height_cm,fighter_weight_lbs.strip(),fighter_reach_cm,fighter_stance.strip(),fighter_dob.strip(),fighter_url])
        
    return csv_rows

## 3. Define variables

In [166]:
event_urls = get_event_urls()

In [167]:
fight_urls = get_fight_urls(event_urls)

In [None]:
fighter_urls = get_fighter_urls()

In [175]:
with open('ufc_event_data.csv','r') as csv_file:
    reader = csv.DictReader(csv_file)
    scraped_event_urls = [row['event_url'] for row in reader]
    for url in scraped_event_urls:
        if url in event_urls:
            event_urls.remove(url)

In [188]:
with open('ufc_fight_data.csv','r') as csv_file:
    reader = csv.DictReader(csv_file)
    scraped_fight_urls = [row['fight_url'] for row in reader]
    for url in scraped_fight_urls:
        if url in fight_urls:
            fight_urls.remove(url)

In [204]:
get_event_data(event_urls)

KeyboardInterrupt: 

In [190]:
get_fight_data(fight_urls)

KeyboardInterrupt: 

In [189]:
len(fight_urls)

2506

In [478]:
fighter_data = get_fighter_data(fighter_urls[0:10])

In [466]:
fighter_url = requests.get('http://ufcstats.com/fighter-details/93fe7332d16c6ad9')
fighter_soup = bs4.BeautifulSoup(fighter_url.text,'lxml')
        
select_name = fighter_soup.select('span')[0].text.split()
select_nickname = fighter_soup.select('p.b-content__Nickname')
select_details = fighter_soup.select('li.b-list__box-list-item')

select_details[3].text.split(':')[1].strip()==''

True

In [329]:
fighter_url = requests.get('http://ufcstats.com/fighter-details/ec421d5de0aea624')
fighter_soup = bs4.BeautifulSoup(fighter_url.text,'lxml')
select_details = fighter_soup.select('li.b-list__box-list-item')
        
select_details[3].text.split(':')[1].strip()

''

In [481]:
fighter_data[0][-1]

'Jul 13, 1978'

In [122]:
get_event_data(get_event_urls())

In [150]:
get_fight_urls()

KeyboardInterrupt: 

In [135]:
event_urls = get_event_urls()

## 3. Normalise data

In [408]:
#Function to normalise the data by setting the primary key 'event_id' as a foreign key in fight_data
def normalise(event_data,fight_data):
    all_event_ids = {}

    for item in event_data:
        all_event_ids[item[1]]=item[0]

    for item in fight_data:
        item[0] = all_event_ids[item[0]]

In [409]:
normalise(event_data,fight_data)

In [87]:
with open('ufc_event_data.csv','a+') as csv_file:
    reader = csv.DictReader(csv_file)
    writer = csv.writer(csv_file)
    print(reader['event_url'])

TypeError: 'DictReader' object is not subscriptable

In [411]:
with open('ufc_fight_data.csv','a+') as csv_file:
    writer = csv.writer(csv_file)
    for row in fight_data:
        writer.writerow(row)