In [6]:
import requests
import bs4
import re
import csv
import os

In [90]:
def create_csv_file():
    """Create ``ufc_fight_overview_data.csv`` with its header row if it is missing.

    The check looks at the entries of the current working directory. When an
    entry with that name is already present the function returns without
    touching it, so previously written rows are never overwritten.
    """
    csv_name = 'ufc_fight_overview_data.csv'

    # Guard clause: nothing to do when the file is already there.
    if csv_name in os.listdir():
        return

    header = [
        'event_name', 'referee', 'f_1', 'f_2', 'winner',
        'num_rounds', 'title_fight', 'weight_class', 'gender',
        'result', 'result_details', 'finish_round', 'finish_time',
        'fight_url',
    ]
    # newline='' lets the csv module control line endings itself.
    with open(csv_name, 'w', newline='', encoding='UTF8') as out_file:
        csv.writer(out_file).writerow(header)

In [8]:
#Function to scrape url of each UFC event from ufcstats.com
def get_event_urls(timeout=30):
    """Return the event-page URLs found on the ufcstats.com completed-events listing.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A list of href strings, in page order, for every ``<a>`` element whose
        href contains ``'event-details'``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get('http://ufcstats.com/statistics/events/completed?page=all',
                            timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page into an empty list.
    response.raise_for_status()
    main_soup = bs4.BeautifulSoup(response.text, 'lxml')

    # Keep an anchor only if (1) its href is a string and (2) that string
    # contains the keyword 'event-details'. No filtering of already-scraped
    # items happens in this function.
    hrefs = (anchor.get('href') for anchor in main_soup.find_all('a'))
    all_event_urls = [href for href in hrefs
                      if isinstance(href, str) and 'event-details' in href]

    return all_event_urls

In [9]:
#Function that takes as input a list of event urls, and outputs a list of urls for each fight in each event
def get_fight_urls(event_urls, timeout=30):
    """Collect the fight-detail URLs linked from each event page.

    Args:
        event_urls: Iterable of event-page URLs; one HTTP request is made per entry.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list with the href of every matching anchor, in the order the events
        and their anchors were visited.

    Raises:
        requests.HTTPError: If any event page answers with a 4xx/5xx status.
        requests.RequestException: For connection problems and timeouts.
    """
    all_fight_urls = []
    for url in event_urls:
        response = requests.get(url, timeout=timeout)
        # Stop on an HTTP error rather than silently skipping a whole event.
        response.raise_for_status()
        event_soup = bs4.BeautifulSoup(response.text, 'lxml')

        # NOTE(review): only anchors whose class is exactly
        # 'b-flag b-flag_style_green' are collected; confirm whether fights
        # shown with a different flag style should be scraped as well.
        for item in event_soup.find_all('a', class_='b-flag b-flag_style_green'):
            href = item.get('href')
            # An anchor without an href would put None in the list and break
            # the later requests.get(url) call, so leave it out.
            if href:
                all_fight_urls.append(href)

    return all_fight_urls

In [49]:
def prevent_duplicates(fight_urls):
    """Remove from ``fight_urls``, in place, every URL already stored in the CSV.

    The ``fight_url`` column of ``ufc_fight_overview_data.csv`` (looked up in
    the current working directory) is read and every occurrence of those URLs
    is dropped from the list. The relative order of the remaining URLs is
    kept. When the CSV is not present the list is left untouched.

    Args:
        fight_urls: List of fight URLs. It is modified in place, so the
            caller's list object shrinks.

    Returns:
        None.

    Raises:
        KeyError: If the CSV has data rows but no ``fight_url`` column.
    """
    csv_name = 'ufc_fight_overview_data.csv'
    if csv_name not in os.listdir():
        return

    # newline='' is what the csv module expects for files it reads.
    with open(csv_name, 'r', newline='') as csv_file:
        # A set gives constant-time membership checks.
        scraped_fight_urls = {row['fight_url'] for row in csv.DictReader(csv_file)}

    # Slice assignment keeps the same list object while dropping *all*
    # occurrences of an already-scraped URL (list.remove only drops the first).
    fight_urls[:] = [url for url in fight_urls if url not in scraped_fight_urls]

In [169]:
def _parse_fight_overview(fight_soup):
    """Extract the overview fields of one parsed fight page, in CSV column order (URL excluded)."""
    #Define repeated select statements for column values
    select_overview = fight_soup.select('i.b-fight-details__text-item')
    select_result = fight_soup.select('i.b-fight-details__text-item_first')
    select_result_details = fight_soup.select('p.b-fight-details__text')
    select_details = fight_soup.select('p.b-fight-details__table-text')
    select_fight_type = fight_soup.select('i.b-fight-details__fight-title')
    select_win_lose = fight_soup.select('i.b-fight-details__person-status')

    #Scrape fight details
    event_name = fight_soup.h2.text
    referee = select_overview[3].text.split(':')[1]

    # Fighter names: use the table cells when they exist, otherwise fall back
    # to the person links in the page header.
    try:
        f_1 = select_details[0].text
    except IndexError:
        f_1 = fight_soup.select('a.b-fight-details__person-link')[0].text
    try:
        f_2 = select_details[1].text
    except IndexError:
        f_2 = fight_soup.select('a.b-fight-details__person-link')[1].text

    #If there is a winner, set 'winner' to winning fighter. If no winner (e.g. NC, DQ) set 'winner' to NULL
    status_1 = select_win_lose[0].text.strip()
    status_2 = select_win_lose[1].text.strip()
    if status_1 == 'W':
        winner = f_1
    elif status_2 == 'W':
        winner = f_2
    else:
        # Assign the placeholder explicitly; otherwise the value from the
        # previous fight (or no value at all) would be written.
        winner = 'NULL'

    # First character of the text after the colon in the third overview item.
    num_rounds = select_overview[2].text.split(':')[1].strip()[0]

    fight_type = select_fight_type[0].text
    fight_type_words = fight_type.split()
    title_fight = 'Y' if 'Title' in fight_type else 'N'
    # Titles with more than two words contribute their first two words as the
    # weight class; shorter titles contribute only the first word.
    if len(fight_type_words) > 2:
        weight_class = fight_type_words[0] + ' ' + fight_type_words[1]
    else:
        weight_class = fight_type_words[0]
    gender = 'F' if 'Women' in fight_type else 'M'

    method = select_result[0].text.split(':')[1]
    if 'Decision' in method:
        # First word is the result, last word the kind of decision.
        result = method.split()[0]
        result_details = method.split()[-1]
    else:
        result = method
        result_details = select_result_details[1].text.split(':')[-1]

    finish_round = select_overview[0].text.split(':')[1]
    # \d+ keeps all minute digits, so a time such as 15:00 is not cut to 5:00.
    finish_time = re.findall(r'\d+:\d\d', select_overview[1].text)[0]

    return [event_name.strip(),
            referee.strip(),
            f_1.strip(),
            f_2.strip(),
            winner.strip(),
            num_rounds.strip(),
            title_fight,
            weight_class.strip(),
            gender,
            result.strip(),
            result_details.strip(),
            finish_round.strip(),
            finish_time.strip()]


##Function to scrape details of each UFC fight and append them as rows to the CSV file.
def get_fight_overview_data(fight_urls, timeout=30):
    """Scrape every fight page in ``fight_urls`` and append one CSV row per fight.

    URLs already present in ``ufc_fight_overview_data.csv`` are first removed
    from ``fight_urls`` by ``prevent_duplicates`` (this modifies the caller's
    list in place). Each remaining page is downloaded, parsed and written as a
    row whose last column is the fight URL.

    Args:
        fight_urls: List of fight-detail URLs; shortened in place to the URLs
            that are not yet in the CSV.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        None.

    Raises:
        requests.HTTPError: If a fight page answers with a 4xx/5xx status.
        requests.RequestException: For connection problems and timeouts.
        IndexError, AttributeError: If a page does not contain the expected
            elements. Rows written before the failure stay in the file.
    """
    prevent_duplicates(fight_urls)

    # newline='' stops the csv module's own line endings from being translated
    # again by the file object (which yields blank rows on some platforms).
    # NOTE(review): create_csv_file() writes the header with encoding='UTF8'
    # while this handle uses the platform default - confirm the intended encoding.
    with open('ufc_fight_overview_data.csv', 'a+', newline='') as csv_file:
        writer = csv.writer(csv_file)

        for url in fight_urls:
            response = requests.get(url, timeout=timeout)
            # Stop on an HTTP error rather than trying to parse an error page.
            response.raise_for_status()
            fight_soup = bs4.BeautifulSoup(response.text, 'lxml')

            writer.writerow(_parse_fight_overview(fight_soup) + [url])

In [96]:
# Make sure the output CSV exists with its header row before rows are appended to it.
create_csv_file()

In [36]:
# Fetch the completed-events listing and keep the event-page URLs.
event_urls = get_event_urls()

In [93]:
# Visit every event page (one HTTP request per event) and gather the fight-detail URLs.
fight_urls = get_fight_urls(event_urls)

In [170]:
# Scrape each fight and append a row to the CSV. prevent_duplicates() is applied
# to fight_urls inside this call, so the list is modified in place.
get_fight_overview_data(fight_urls)

In [159]:
# Number of URLs currently in fight_urls.
# NOTE(review): the list is modified in place by get_fight_overview_data(), so
# this count depends on which cells were run before this one.
len(fight_urls)

153

In [160]:
# Inspect the URLs themselves.
fight_urls

['http://ufcstats.com/fight-details/b80872821bc4f6ba',
 'http://ufcstats.com/fight-details/748657d9ba9d7c71',
 'http://ufcstats.com/fight-details/ec1bda9a4c2aab42',
 'http://ufcstats.com/fight-details/5c3542260741c6bc',
 'http://ufcstats.com/fight-details/e0c636beed345e1d',
 'http://ufcstats.com/fight-details/764e492b55c0ec40',
 'http://ufcstats.com/fight-details/0e0629a1503125a6',
 'http://ufcstats.com/fight-details/b3cafb4e6e1b2ad0',
 'http://ufcstats.com/fight-details/1c1f8f281b2065e0',
 'http://ufcstats.com/fight-details/b5e49b35bc25c5a9',
 'http://ufcstats.com/fight-details/3bd789bcfb12368d',
 'http://ufcstats.com/fight-details/6603f1d60bfeefa9',
 'http://ufcstats.com/fight-details/da77ead135cb4f17',
 'http://ufcstats.com/fight-details/c99e302777049f2d',
 'http://ufcstats.com/fight-details/1a5171802bf3bce6',
 'http://ufcstats.com/fight-details/1f561a3e35a9b48a',
 'http://ufcstats.com/fight-details/c92596c3143db25b',
 'http://ufcstats.com/fight-details/3f6c0a76407e1863',
 'http://u

In [168]:
# Scratch check on a single fight page: look at what the
# 'a.b-fight-details__person-link' selector returns. The same selector is the
# fallback source for fighter names in get_fight_overview_data().
fight_url = requests.get('http://ufcstats.com/fight-details/b80872821bc4f6ba')
fight_soup = bs4.BeautifulSoup(fight_url.text,'lxml')

# NOTE: this rebinds the notebook-level names fight_url, fight_soup and select_details.
select_details = fight_soup.select('a.b-fight-details__person-link')

# Text of the second matching link (index 1).
select_details[1].text

'Josh Stuart '