In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from string import ascii_letters
import re

# Function setup
* [done] get_row(soup)
* [done] get_record(soup)
* [done] get_name(soup)
* [done] get_stats(soup)
* [done] get_columns(soup)
* [done] get_html_data(soup)
* [done] get_soup(url, headers)
* [done] get_links(soup)

In [2]:
def get_row(soup):
    
    '''get soup & return np.array of all relevant row details'''
    
    row = get_name(soup)
    row = np.append( row,get_record(soup) )
    row = np.append( row,get_stats(soup) )
    
    return row

In [3]:
def get_record(soup):
    
    '''get soup & return np.array of wins,draws,losses'''
    
    h2 = soup.find('h2',{'class':'b-content__title'})

    record = h2 .contents[3].contents[0].strip()
    record = record[record.find(':')+2:len(record)]
    
    wins = record[:record.find('-')]
    
    losses = record[record.find('-')+1:record.find('-')+record.find('-')]
    
    if record.find(' ') == -1:
        string_end = len(record)
    else:
        string_end = record.find(' ')
    
    draws = record[record.find('-')+1+record.find('-'):string_end]
    
    return np.array([wins,losses,draws])

In [4]:
def get_name(soup):
    
    '''get soup & return fighter's name'''
    
    h2 = soup.find('h2',{'class':'b-content__title'})
    name = h2.contents[1].contents[0].strip()
    
    return np.array([name])

In [5]:
def get_stats(soup):
    
    '''get html data from page & return row with only the relevant values'''
    
    data = get_html_data(soup)
    
    row = []

    for i in range(len(data)):
        for j in range(len(data[i])):
            row.append( data[i][j].contents[2].strip() )
        
    return np.array(row)             

In [6]:
def get_columns(soup):
    
    '''return list of columns'''
    
    data = get_html_data(soup)
    
    columns = ['name','wins','losses','draws']
    
    for i in range(len(data)):
        for j in range(len(data[i])):
            columns.append( data[i][j].contents[1].contents[0].strip() )
     
    return columns

In [7]:
def get_html_data(soup):
    
    '''extracting data from fighter's page to list of size = (3,m)'''
    
    ul_class = soup.find_all('ul',{'class':'b-list__box-list'})
    data = []
    
    for item in ul_class:
        data.append( item.find_all('li',{'class':'b-list__box-list-item b-list__box-list-item_type_block'}))
    
    return data

In [8]:
def get_soup(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}):
    
    '''set url page, 
    return soup as text'''
    
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
    except:
        print('Error on: ' + url) 
        return None
    
    return soup

In [9]:
def get_links(soup):
    
    '''return list of all page links'''
    
    url_links = soup.find_all('a', {'class':'b-link b-link_style_black'})
    links = []

    for item in url_links:
        links.append( item['href'] )
        
    return np.unique(links)    

# Main( )
***

### - [done] Extracting columns

In [10]:
url = 'http://ufcstats.com/statistics/fighters?char=a&page=all'
soup = get_soup(url)
links = get_links(soup)
columns = get_columns( get_soup(links[1]) )
print(columns)

['name', 'wins', 'losses', 'draws', 'Height:', 'Weight:', 'Reach:', 'STANCE:', 'DOB:', 'SLpM:', 'Str. Acc.:', 'SApM:', 'Str. Def:', '', 'TD Avg.:', 'TD Acc.:', 'TD Def.:', 'Sub. Avg.:']


### - [done] constructing pandas data frame

In [11]:
df = pd.DataFrame(columns = columns)
df

Unnamed: 0,name,wins,losses,draws,Height:,Weight:,Reach:,STANCE:,DOB:,SLpM:,Str. Acc.:,SApM:,Str. Def:,Unnamed: 14,TD Avg.:,TD Acc.:,TD Def.:,Sub. Avg.:


### - [done] extracting fighters statistics

In [12]:
pages = ascii_letters.lower()

pages_amount = len(pages)
current_page = 0

for page in pages:
    current_page += 1 
    url = 'http://ufcstats.com/statistics/fighters?char=' + page + '&page=all'
    #soup = get_soup(url)
    page_links = get_links(get_soup(url))
    links_amount = len(page_links)
    
    current_link = 0
    for stat_page in page_links:
        current_link += 1
        percent = round((((current_page-1)/pages_amount) + (current_link/links_amount)*(1/pages_amount)) * 100, 2)
        print('Scraping page %d/%d (%d/%d links) => %s %% completed ' %(current_page, pages_amount, current_link, links_amount, percent), end = '\r')
        stat_page_soup = get_soup(stat_page)
        row = get_row(stat_page_soup)
        df.loc[len(df)] = row
        

Scraping page 52/52 (32/32 links) => 100.0 % completed d 

In [13]:
df.tail()

Unnamed: 0,name,wins,losses,draws,Height:,Weight:,Reach:,STANCE:,DOB:,SLpM:,Str. Acc.:,SApM:,Str. Def:,Unnamed: 14,TD Avg.:,TD Acc.:,TD Def.:,Sub. Avg.:
7839,Joao Zeferino,21,9.0,0,"5' 11""",170 lbs.,--,Orthodox,"Jan 15, 1986",0.83,36%,2.6,48%,,0.5,5%,50%,1.0
7840,Zhang Tiequan,15,4.0,0,"5' 8""",155 lbs.,"69""",Orthodox,"Jul 25, 1978",1.23,36%,2.14,51%,,1.95,58%,75%,3.4
7841,Carlos Zevallos,3,,0,"6' 0""",205 lbs.,--,Orthodox,--,4.36,65%,2.28,68%,,0.0,0%,100%,0.0
7842,Zach Zane,10,7.0,0,"5' 7""",145 lbs.,"69""",Southpaw,"Dec 14, 1989",0.87,56%,6.67,20%,,0.0,0%,26%,0.0
7843,Roger Zapata,4,,0,"5' 11""",170 lbs.,--,Southpaw,"May 09, 1986",2.6,51%,2.13,36%,,0.0,0%,81%,1.4


### -[done] saving file

In [15]:
df.to_csv('../data/ufc_fighters_stats.csv')

## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>