In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Function setup

In [3]:
def get_row(soup):
    
    '''get soup & return np.array of all relevant row details'''
    
    row = get_name(soup)
    row = np.append( row,get_record(soup) )
    row = np.append( row,get_stats(soup) )
    
    return row

In [4]:
def get_record(soup):
    
    '''get soup & return np.array of wins,draws,losses'''
    
    h2 = soup.find('h2',{'class':'b-content__title'})

    record = h2 .contents[3].contents[0].strip()
    record = record[record.find(':')+2:len(record)]
    
    wins = record[:record.find('-')]
    
    losses = record[record.find('-')+1:record.find('-')+record.find('-')]
    
    if record.find(' ') == -1:
        string_end = len(record)
    else:
        string_end = record.find(' ')
    
    draws = record[record.find('-')+1+record.find('-'):string_end]
    
    return np.array([wins,losses,draws])

In [5]:
def get_name(soup):
    
    '''get soup & return fighter's name'''
    
    h2 = soup.find('h2',{'class':'b-content__title'})
    name = h2.contents[1].contents[0].strip()
    
    return np.array([name])

In [6]:
def get_stats(soup):
    
    '''get html data from page & return row with only the relevant values'''
    
    data = get_html_data(soup)
    
    row = []

    for i in range(len(data)):
        for j in range(len(data[i])):
            row.append( data[i][j].contents[2].strip() )
        
    return np.array(row)             

In [7]:
def get_columns(soup):
    
    '''return list of columns'''
    
    data = get_html_data(soup)
    
    columns = ['name','wins','losses','draws']
    
    for i in range(len(data)):
        for j in range(len(data[i])):
            columns.append( data[i][j].contents[1].contents[0].strip() )
     
    return columns

In [8]:
def get_html_data(soup):
    
    '''extracting data from fighter's page to list of size = (3,m)'''
    
    ul_class = soup.find_all('ul',{'class':'b-list__box-list'})
    data = []
    
    for item in ul_class:
        data.append( item.find_all('li',{'class':'b-list__box-list-item b-list__box-list-item_type_block'}))
    
    return data

In [9]:
def get_soup(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}):
    
    '''set url page, 
    return soup as text'''
    
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
    except:
        print('Error on: ' + url) 
    
    return soup

In [10]:
def get_links(soup):
    
    '''return list of all page links'''
    
    url_links = soup.find_all('a', {'class':'b-link b-link_style_black'})
    links = []

    for item in url_links:
        links.append( item['href'] )
        
    return np.unique(links)    

# Main
***

### extracting columns:

In [11]:
url = 'http://ufcstats.com/statistics/fighters?char=a&page=all'
soup = get_soup(url)
links = get_links(soup)
columns = get_columns( get_soup(links[1]) )
print(columns)

['name', 'wins', 'losses', 'draws', 'Height:', 'Weight:', 'Reach:', 'STANCE:', 'DOB:', 'SLpM:', 'Str. Acc.:', 'SApM:', 'Str. Def:', '', 'TD Avg.:', 'TD Acc.:', 'TD Def.:', 'Sub. Avg.:']


### constructing pandas data frame:

In [12]:
df = pd.DataFrame(columns = columns)
df

Unnamed: 0,name,wins,losses,draws,Height:,Weight:,Reach:,STANCE:,DOB:,SLpM:,Str. Acc.:,SApM:,Str. Def:,Unnamed: 14,TD Avg.:,TD Acc.:,TD Def.:,Sub. Avg.:


### extracting fighters statistics:

In [20]:
pages = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

pages_amount = len(pages)
current_page = 0

for page in pages:
    current_page += 1 
    url = 'http://ufcstats.com/statistics/fighters?char=' + page + '&page=all'
    #soup = get_soup(url)
    page_links = get_links(get_soup(url))
    links_amount = len(page_links)
    
    current_link = 0
    for stat_page in page_links:
        current_link += 1
        percent = round((((current_page-1)/pages_amount) + (current_link/links_amount)*(1/pages_amount)) * 100, 2)
        print('Scraping page %d/%d (%d/%d links) => %s %% completed ' %(current_page, pages_amount, current_link, links_amount, percent), end = '\r')
        stat_page_soup = get_soup(stat_page)
        row = get_row(stat_page_soup)
        df.loc[len(df)] = row
        

Error on: http://ufcstats.com/fighter-details/e295b96f135d758a


UnboundLocalError: local variable 'soup' referenced before assignment

In [23]:
df.tail()

Unnamed: 0,name,wins,losses,draws,Height:,Weight:,Reach:,STANCE:,DOB:,SLpM:,Str. Acc.:,SApM:,Str. Def:,Unnamed: 14,TD Avg.:,TD Acc.:,TD Def.:,Sub. Avg.:
1725,Evan Elder,7,,0,"5' 10""",170 lbs.,"72""",Switch,"Apr 11, 1997",1.8,45%,5.0,33%,,0.0,0%,33%,0.0
1726,Kenny Ento,14,1.0,0,"6' 2""",192 lbs.,--,,--,8.41,79%,0.0,100%,,5.49,100%,0%,11.0
1727,Rob Emerson,18,1.0,0,"5' 9""",155 lbs.,"70""",Orthodox,"Jul 30, 1981",2.5,30%,2.83,64%,,2.02,52%,65%,0.4
1728,Dan Evensen,11,4.0,0,"6' 3""",250 lbs.,--,Orthodox,"Jun 01, 1974",0.27,10%,3.86,38%,,0.0,0%,0%,4.0
1729,Jamall Emmers,18,6.0,0,"5' 10""",145 lbs.,"74""",Orthodox,"Jul 24, 1989",5.73,53%,4.44,55%,,2.76,53%,100%,0.4


In [22]:
df.to_csv('ufc_stats.csv')

## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>