In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from string import ascii_letters
import re

# <span style='color:green'> **[done]** </span> Functions setup
* <span style='color:green'> **[done]** </span> get_columns(soup)

* <span style='color:green'> **[done]** </span> get_fight_list(soup)
 
* <span style='color:green'> **[done]** </span> get_links(soup) 
 
* <span style='color:green'> **[done]** </span> get_soup(url)
 
* <span style='color:green'> **[done]** </span> get_row(rows_list, date, index = 0)

* <span style='color:green'> **[done]** </span> get_date(soup)

In [2]:
def get_row(rows_list, date, index = 0):
    
    '''Build one flattened fight record from the table row at `index`.

    The stripped text of every <p class="b-fight-details__table-text"> in the
    row is collected and `date` is appended as the last value.  When the first
    cell reads 'win' it is kept on its own; otherwise the first two cells are
    joined with ':'.  The remaining cells follow one fixed layout: five
    ':'-joined pairs, one single value, one ':'-joined pair, then three single
    values (the last being `date`).

    Returns a list of 11 values.  Raises IndexError when the row holds fewer
    cells than the layout requires.
    '''
    
    cell_tags = rows_list[index].find_all('p',{'class':'b-fight-details__table-text'})
    cells = [tag.text.strip() for tag in cell_tags]
    cells.append(date)
    
    # The outcome occupies one cell for a win and two cells otherwise;
    # everything after it is laid out identically in both cases.
    if cells[0] == 'win':
        outcome = cells[0]
        rest = cells[1:]
    else:
        outcome = cells[0]+':'+cells[1]
        rest = cells[2:]
    
    def pair(start):
        '''Join two neighbouring cells as "first:second".'''
        return rest[start]+':'+rest[start+1]
    
    return [outcome, pair(0), pair(2), pair(4), pair(6), pair(8), rest[10], pair(11), rest[13], rest[14], rest[15]]

In [3]:
def get_fight_list(soup):
    
    '''Return every <tr> in `soup` whose class attribute matches the fight-row classes.

    The result is whatever `soup.find_all` yields for that query.
    '''
    
    fight_row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    
    return soup.find_all('tr',{'class':fight_row_class})

In [4]:
def get_columns(soup):
    
    '''Return the stripped header text of every <th class="b-fight-details__table-col"> in `soup`.'''
    
    header_cells = soup.find_all('th',{'class':'b-fight-details__table-col'})
    
    return [cell.text.strip() for cell in header_cells]

In [5]:
def get_links(soup):
    
    '''Return the distinct href values of all <a class="b-link b-link_style_black"> tags.

    The hrefs are passed through `np.unique`, so the result is a sorted
    NumPy array without duplicates.
    '''
    
    anchors = soup.find_all('a', {'class':'b-link b-link_style_black'})
    hrefs = [anchor['href'] for anchor in anchors]
    
    return np.unique(hrefs)

In [6]:
def get_soup(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}, timeout = 30):
    
    '''Download `url` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : address of the page to fetch.
    headers : HTTP headers sent with the request (a browser-like
        User-Agent by default).  The dict is only read, never modified.
    timeout : seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long scraping run.

    Returns
    -------
    A BeautifulSoup object built with the 'html.parser' backend, or None when
    the request fails (connection problem, timeout, invalid URL or an HTTP
    error status).  The failure is reported on stdout.
    '''
    
    try:
        r = requests.get(url, headers = headers, timeout = timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        r.raise_for_status()
    except requests.RequestException as error:
        # Only request-level errors are handled here, so KeyboardInterrupt
        # and programming errors still surface to the user.
        print('Error on: ' + str(url) + ' (' + str(error) + ')')
        return None
    
    return BeautifulSoup(r.text, 'html.parser')

In [7]:
def get_date(soup):
    
    '''Return the event date string from the first <li class="b-list__box-list-item">.

    The item's text is stripped, its first len('DATE:') characters are
    dropped, and the remainder is stripped again.
    '''
    
    date_item = soup.find('li',{'class':'b-list__box-list-item'})
    label_and_date = date_item.get_text().strip()
    
    # Slice by length rather than by content: the leading label is assumed to
    # be exactly as long as 'DATE:'.
    return label_and_date[len('DATE:'):].strip()

# <span style='color:green'> **[done]** </span> Main()

### <span style='color:green'> **[done]** </span> Extract column names

In [8]:
# Download the index of completed events once and reuse the parsed page.
url = 'http://ufcstats.com/statistics/events/completed?page=all'
soup = get_soup(url)
links = get_links(soup)

# Read the table header from a single event page
# (assumes every event page uses the same header -- TODO confirm).
columns = get_columns(get_soup(links[0]))
# The event date is not a table column on the page; it is added to each row separately.
columns.append('date')
print(columns)

['W/L', 'Fighter', 'Kd', 'Str', 'Td', 'Sub', 'Weight class', 'Method', 'Round', 'Time', 'date']


### <span style='color:green'> **[done]** </span> construct Pandas DataFrame

In [9]:
# Start from an empty frame that only carries the scraped column names;
# the rows are added by the scraping cell further down.
df = pd.DataFrame(columns = columns)
# Bare expression so the notebook renders the (still empty) frame.
df

Unnamed: 0,W/L,Fighter,Kd,Str,Td,Sub,Weight class,Method,Round,Time,date


### <span style='color:green'> **[done]** </span> adding data to DataFrame

In [10]:
# Gather all fight rows in a plain list and build the frame once at the end:
# inserting with `df.loc[len(df)] = row` gets slower as the frame grows, and
# re-running the cell would append the same rows a second time.
rows = []
total_pages = len(links)

for current_page, url in enumerate(links, start = 1):
    
    # Fetch and parse each event page once; the same soup provides both the
    # event date and the fight rows.
    event_soup = get_soup(url)
    
    # get_soup returns None (after printing the error) when a page cannot be
    # fetched; skip that event instead of aborting the whole run.
    if event_soup is not None:
        event_date = get_date(event_soup)
        fights_list = get_fight_list(event_soup)
        
        for index in range(len(fights_list)):
            rows.append(get_row(fights_list, event_date, index))
    
    # Report progress once per finished page so the last page shows 100 %.
    percent = round((current_page / total_pages) * 100, 1)
    print('Scraping page %d/%d => %s %% completed ' %(current_page, total_pages, percent), end = '\r')

# Rebuild the frame with the column layout of the existing (empty) frame.
df = pd.DataFrame(rows, columns = df.columns)

Scraping page 633/633 => 99.8 % completed 

In [11]:
# Show the last three scraped rows as a quick sanity check of the result.
df.tail(3)

Unnamed: 0,W/L,Fighter,Kd,Str,Td,Sub,Weight class,Method,Round,Time,date
6971,win,Aleksandra Albu:Kailin Curran,0:0,82:86,5:1,0:0,Women's Strawweight,U-DEC:,3,5:00,"July 29, 2017"
6972,win,Jarred Brooks:Eric Shelton,0:0,27:31,5:1,1:1,Flyweight,S-DEC:,3,5:00,"July 29, 2017"
6973,win,Drew Dober:Joshua Burkman,1:0,15:11,0:0,0:0,Lightweight,KO/TKO:Punch,1,3:04,"July 29, 2017"


### <span style='color:green'> **[done]** </span> saving file

In [13]:
# Write the frame to a CSV file at a path relative to the notebook's working directory.
# The row index is written as well (pandas default), so the file starts with an
# unnamed index column.
df.to_csv('../data/ufc_fights_stats.csv')

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>