In [13]:
import requests
import bs4
import csv
import os

In [14]:
#Function to create the csv file that scraped data is added to
def create_csv_file():
    """Create ``ufc_event_data.csv`` holding only the column header row.

    The file is looked up in (and written to) the current working directory.
    When a directory entry with that name is already present nothing is
    written, so rows saved by earlier runs are kept.

    Returns:
        None
    """
    csv_name = 'ufc_event_data.csv'

    #Nothing to do when the file is already there
    if csv_name in os.listdir():
        return

    column_headers = ('event_name',
                      'event_date',
                      'event_city',
                      'event_state',
                      'event_country',
                      'event_url')

    #newline='' lets the csv module control the line endings itself
    with open(csv_name, 'w', newline='', encoding='UTF8') as new_csv_file:
        csv.writer(new_csv_file).writerow(column_headers)

In [15]:
#Function to scrape url of each UFC event from ufcstats.com
def get_event_urls():
    """Return the event urls linked from the ufcstats.com completed-events page.

    Downloads the 'completed events' listing and collects the href of every
    <a> tag whose href is a string containing the keyword 'event-details',
    in the order the links appear on the page.

    Urls that are already stored in the csv file are NOT filtered out here;
    that is the job of prevent_duplicates.

    Returns:
        list of str: the matching hrefs.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            within the timeout or the server answers with an HTTP error status.
    """
    #Without a timeout requests would wait forever on an unresponsive server
    main_response = requests.get('http://ufcstats.com/statistics/events/completed?page=all',
                                 timeout=30)
    #Fail loudly on an HTTP error rather than parsing an error page as if it were the event list
    main_response.raise_for_status()
    main_soup = bs4.BeautifulSoup(main_response.text, 'lxml')

    #Adds href to list if href (1) is a string (anchors without href give None) and (2) contains keyword 'event-details'
    all_event_urls = []
    for anchor in main_soup.find_all('a'):
        href = anchor.get('href')
        if isinstance(href, str) and 'event-details' in href:
            all_event_urls.append(href)

    return all_event_urls

In [23]:
#Function to ensure each url is only scraped once when script is run multiple times
def prevent_duplicates(event_urls):
    """Remove, in place, every url that is already recorded in ufc_event_data.csv.

    The 'event_url' column of the csv file in the current working directory
    is read, and every entry of event_urls found there is dropped -- including
    repeated occurrences of the same url. The relative order of the remaining
    urls is kept. When the csv file does not exist the list is left unchanged.

    Args:
        event_urls (list of str): candidate urls; this list object itself is
            modified, which is what callers rely on.

    Returns:
        None

    Raises:
        KeyError: if the csv file has rows but no 'event_url' column header.
    """
    if not os.path.isfile('ufc_event_data.csv'):
        return

    #UTF8 matches how create_csv_file writes the file. Only the url column is
    #used here, so undecodable bytes elsewhere in a row are replaced instead of
    #aborting the whole duplicate check.
    with open('ufc_event_data.csv', 'r', newline='', encoding='UTF8', errors='replace') as csv_file:
        reader = csv.DictReader(csv_file)
        #A set gives constant-time membership tests
        scraped_event_urls = {row['event_url'] for row in reader}

    #Slice assignment keeps the caller's list object while dropping ALL
    #occurrences of an already-scraped url (list.remove only drops the first)
    event_urls[:] = [url for url in event_urls if url not in scraped_event_urls]

In [17]:
#Function to scrape details of each UFC event and append them to the csv file
def get_event_data(event_urls):
    """Scrape every event page in event_urls and append one row per event to ufc_event_data.csv.

    Urls already recorded in the csv file are first removed from event_urls
    by prevent_duplicates, so the list passed in is modified in place.
    For each remaining url the page is downloaded and a row
    (event_name, event_date, event_city, event_state, event_country, event_url)
    is appended. event_state is the string 'NULL' when the location has no
    more than two comma-separated parts.

    Args:
        event_urls (list of str): urls of the event pages to scrape.

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            within the timeout or the server answers with an HTTP error status.
        IndexError: if a page lacks the <h2>/<li> elements indexed below.
    """
    #Drops urls that were scraped on a previous run (mutates event_urls)
    prevent_duplicates(event_urls)

    #newline='' and UTF8 match how create_csv_file writes the header row;
    #without newline='' the csv module produces blank rows on Windows
    with open('ufc_event_data.csv', 'a', newline='', encoding='UTF8') as csv_file:
        writer = csv.writer(csv_file)

        for event_url in event_urls:
            #Without a timeout requests would wait forever on an unresponsive server
            event_request = requests.get(event_url, timeout=30)
            event_request.raise_for_status()
            event_soup = bs4.BeautifulSoup(event_request.text, 'lxml')

            #Assumes the page layout puts the date in the 4th <li> and the
            #location in the 5th <li>, each as 'Label: value' -- verify if the site changes
            list_items = event_soup.select('li')

            event_name = event_soup.select('h2')[0].text
            event_date = list_items[3].text.split(':')[-1]

            #Location is 'City, Country' or 'City, State, Country'
            location_parts = list_items[4].text.split(':')[1].strip().split(',')
            event_city = location_parts[0]
            event_country = location_parts[-1]

            #Check if event address includes state and add to event_state if True
            if len(location_parts) > 2:
                event_state = location_parts[1]
            else:
                event_state = 'NULL'

            writer.writerow([event_name.strip(),
                             event_date.strip(),
                             event_city.strip(),
                             event_state.strip(),
                             event_country.strip(),
                             event_url.strip()])

In [21]:
#Creates ufc_event_data.csv with its column headers if the file is not already present
create_csv_file()

In [None]:
#Collects the event-details urls linked from the ufcstats.com completed-events page
event_urls = get_event_urls()

In [None]:
#Removes, in place, urls already recorded in ufc_event_data.csv
#(get_event_data also calls prevent_duplicates itself, so this cell is optional)
prevent_duplicates(event_urls)

In [None]:
#Scrapes each remaining event page and appends one row per event to ufc_event_data.csv
get_event_data(event_urls)