In [65]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from string import ascii_letters

# Functions setup
* [done] get_columns(soup)

* [done] get_fight_list(soup)
 
* [done] get_links(soup) 
 
* [done] get_soup(url)
 
* [done] get_row(rows_list, index = 0)

In [232]:
def get_row(rows_list, index = 0):
    
    '''return array of single row data'''
    
    row_data = rows_list[index].find_all('p',{'class':'b-fight-details__table-text'})
    
    row = []

    for index in range(len(single_row_data)):
        row.append(row_data[index].text.strip())
    
    edited_row = [row[0],row[1]+':'+row[2],row[3]+':'+row[4],row[5]+':'+row[6],row[7]+':'+row[8],row[9]+':'+row[10],row[11],row[12],row[14],row[15]]

    return edited_row    

In [297]:
def get_fight_list(soup):
    
    '''return html soup of fights list in certain page'''
    
    fights_list = soup.find_all('tr',{'class':'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})
    
    return fights_list

In [51]:
def get_columns(soup):
    
    '''return list of columns'''
    
    column_items = soup.find_all('th',{'class':'b-fight-details__table-col'})
    columns = []
    for item in column_items:
        columns.append(item.text.strip())
          
    return columns

In [18]:
def get_links(soup):
    
    '''return list of all page links'''
    
    url_links = soup.find_all('a', {'class':'b-link b-link_style_black'})
    links = []

    for item in url_links:
        links.append( item['href'] )
        
    return np.unique(links)    

In [17]:
def get_soup(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}):
    
    '''set url page, 
    return soup as text'''
    
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
    except:
        print('Error on: ' + url) 
        return None
    
    return soup

# Main()

### - [done] Extract column's names

In [295]:
url = 'http://ufcstats.com/statistics/events/completed?page=all'
soup = get_soup(url)
links = get_links(get_soup(url))
columns = get_columns(get_soup(links[0]))
print(columns)

['W/L', 'Fighter', 'Kd', 'Str', 'Td', 'Sub', 'Weight class', 'Method', 'Round', 'Time']


### - [done] construct Pandas DataFrame

In [296]:
df = pd.DataFrame(columns = columns)
print(df)

Empty DataFrame
Columns: [W/L, Fighter, Kd, Str, Td, Sub, Weight class, Method, Round, Time]
Index: []


### -[done] adding data to DataFrame

In [318]:
current_page = 0

for url in links:
    
    current_page += 1
    fights_list = get_fight_list(get_soup(url))
    
    for index in range(0,len(fights_list)):
        
        percent = round(((current_page-1)/len(links)) * 100, 1)
        print('Scraping page %d/%d => %s %% completed ' %(current_page, len(links), percent), end = '\r')
        row = get_row(fights_list,index)
        df.loc[current_page + index] = row

Scraping page 618/618 => 99.8 % completed 

In [319]:
df.tail()

Unnamed: 0,W/L,Fighter,Kd,Str,Td,Sub,Weight class,Method,Round,Time
625,win,Brian Ortega:Renato Moicano,0:0,65:109,0:2,1:0,Featherweight,SUB,3,2:59
626,win,Calvin Kattar:Andre Fili,0:0,75:73,2:0,0:0,Featherweight,U-DEC,3,5:00
627,win,Aleksandra Albu:Kailin Curran,0:0,82:86,5:1,0:0,Women's Strawweight,U-DEC,3,5:00
628,win,Jarred Brooks:Eric Shelton,0:0,27:31,5:1,1:1,Flyweight,S-DEC,3,5:00
629,win,Drew Dober:Joshua Burkman,1:0,15:11,0:0,0:0,Lightweight,KO/TKO,1,3:04


### -[done] saving file

In [320]:
df.to_csv('ufc_fights_stats.csv')