In [1]:
import os
import pickle
import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

In [2]:
def save_data(data, name):
    """Pickle ``data`` into the file ``name`` without risking the old copy.

    The object is first written to ``<name>.tmp`` and only moved over
    ``name`` once pickling has finished. If pickling fails or is interrupted,
    the previous contents of ``name`` are left untouched.

    Args:
        data: Any picklable object.
        name: Destination file path.

    Raises:
        Whatever ``open`` / ``pickle.dump`` / ``os.replace`` raise; the
        exception is re-raised after the partial temp file is removed.
    """
    tmp_name = "{}.tmp".format(name)
    try:
        with open(tmp_name, "wb") as fp:
            pickle.dump(data, fp)
    except BaseException:
        # BaseException so a KeyboardInterrupt mid-dump is cleaned up as well.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise
    # Swap the finished file into place in one step.
    os.replace(tmp_name, name)

In [3]:
def open_data(name):
    """Return the object stored in the pickle file ``name``.

    Only use this on files written by this notebook: unpickling data from an
    untrusted source can execute arbitrary code.
    """
    with open(name, "rb") as source:
        payload = pickle.load(source)
    return payload

In [4]:
def get_soup(url, timeout=30):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    Connection failures are retried up to three times with exponential
    backoff (factor 0.5).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session's pooled connections; this
    # function is called once per page, so leaked sessions would pile up.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        response = session.get(url, timeout=timeout)

    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later on a missing element.
    response.raise_for_status()

    return BeautifulSoup(response.text, features='html.parser')

In [5]:
# Page too big to fetch with requests; need to download the webpage manually.
def get_event_urls():
    """Read event URLs from the locally saved ``event_list.html``.

    The text of every ``<a>`` whose class is
    ``html-attribute-value html-external-link`` is taken as a URL. The first
    5 matches are dropped, the list is reversed, and the first 16 entries of
    the reversed list are dropped as well.

    NOTE(review): the 5 and 16 offsets are undocumented; presumably they skip
    non-event links and events that should not be scraped. Confirm.

    Returns:
        list[str]: Link texts in reversed page order.
    """
    with open("event_list.html") as event_list:
        soup = BeautifulSoup(event_list, 'html.parser')

    link_attrs = {'class': 'html-attribute-value html-external-link'}
    links = soup.find_all('a', link_attrs)[5:]

    urls = [link.get_text() for link in links]
    urls.reverse()
    return urls[16:]

In [6]:
def get_fight_urls_from_event(soup):
    """Collect the fight-detail links from a parsed event page.

    Args:
        soup: Parsed event page (BeautifulSoup).

    Returns:
        list: ``href`` of every ``<a>`` whose class attribute is exactly
        ``b-flag b-flag_style_green``, in page order.

    NOTE(review): only green-flag links are collected; if draws or no
    contests are marked with a different flag style they are skipped here.
    Confirm against the site markup.
    """
    # Explicit attrs dict, like every other lookup in this notebook. The
    # original passed a one-element set, which only worked because bs4 treats
    # any non-dict ``attrs`` as a class filter.
    flag_attrs = {'class': 'b-flag b-flag_style_green'}
    return [fight.get('href') for fight in soup.find_all('a', flag_attrs)]

In [15]:
# Determine if winner is in left or right column.
# 0 is left, 1 is right, 2 is no contest or draw.

def get_winner_from_fight(soup):
    """Extract the outcome of a fight from its parsed fight-details page.

    Args:
        soup: Parsed fight page (BeautifulSoup).

    Returns:
        list: ``[winner, method, round, time, referee, weight, bonus, details]``

        * ``winner``: 0 if the first status marker is green, 1 if the second
          one is, 2 otherwise (no contest or draw).
        * ``method``, ``round``, ``time``, ``referee``, ``details``: text
          taken from the ``b-fight-details__content`` block.
        * ``weight``: text of the ``b-fight-details__fight-head`` block.
        * ``bonus``: 1 if the fight head contains an image whose ``src`` does
          not mention ``belt``, else 0.

    Raises:
        AttributeError / IndexError: If the page lacks the expected elements.
    """
    green = 'b-fight-details__person-status_style_green'
    statuses = soup.find('div', {'class': 'b-fight-details'}).find_all('i')

    # ``or []`` keeps a class-less <i> from raising on the membership test.
    if green in (statuses[0].get('class') or []):
        winner = 0
    elif green in (statuses[1].get('class') or []):
        winner = 1
    else:
        winner = 2

    content = soup.find('div', {'class': 'b-fight-details__content'})
    # Looked up once; the positions used below are 0 = round, 1 = time,
    # 3 = referee.
    items = content.find_all('i', {'class': 'b-fight-details__text-item'})

    method = content.find('i', {'style': 'font-style: normal'}).get_text(strip=True)
    rnd = items[0].get_text(strip=True).split(':')[1]
    # maxsplit=1 keeps the colon inside the clock value, e.g. "2:20".
    fight_time = items[1].get_text(strip=True).split(':', 1)[1]
    referee = items[3].get_text(strip=True).split(':')[1]

    head = soup.find('div', {'class': 'b-fight-details__fight-head'})
    weight = head.get_text(strip=True)

    # The original tested ``'belt' not in <img tag>``, which checks the tag's
    # children (always empty for <img>), so every image counted as a bonus.
    # Look at each image's ``src`` instead.
    # NOTE(review): assumes the title-belt icon is the only image whose src
    # contains "belt" -- confirm against the site markup.
    has_bonus = any(
        'belt' not in (img.get('src') or '')
        for img in head.find_all('img')
    )
    bonus = 1 if has_bonus else 0

    paragraphs = content.find_all('p', {'class': 'b-fight-details__text'})
    details = paragraphs[1].get_text(strip=True).split(':', 1)[1]

    return [winner, method, rnd, fight_time, referee, weight, bonus, details]

In [8]:
def get_totals(results):
    """Parse one row of a "totals" table into per-fighter stat lists.

    Each cell in ``results`` holds two ``<p>`` elements, one per fighter.
    Cells at positions 0, 2, 3, 4 and 6 are ignored. The remaining cells are
    parsed by their position among the kept ones:

    * 0, 2, 3 -> ``int``
    * 1       -> ``[landed, attempted]`` parsed from ``"X of Y"``
    * others  -> stripped text (e.g. a ``"m:ss"`` clock value)

    The input list is not modified (the original deleted the skipped cells
    from the caller's list in place).

    Args:
        results: Sequence of table-cell tags for one row.

    Returns:
        list: ``[first_fighter_stats, second_fighter_stats]``.

    Raises:
        ValueError: If a numeric cell does not contain an integer.
        IndexError: If a cell has fewer than two ``<p>`` elements.
    """
    skipped = {0, 2, 3, 4, 6}
    kept = [cell for index, cell in enumerate(results) if index not in skipped]

    stats = [[], []]
    for j, cell in enumerate(kept):
        paragraphs = cell.find_all('p')

        for fighter in (0, 1):
            entry = paragraphs[fighter]
            if j in (0, 2, 3):
                value = int(entry.get_text(strip=True))
            elif j == 1:
                value = [int(x) for x in entry.get_text().split(' of ')]
            else:
                value = entry.get_text(strip=True)
            stats[fighter].append(value)

    return stats

In [9]:
def get_strikes(results):
    """Parse one row of a "strikes" table into per-fighter stat lists.

    Each cell in ``results`` holds two ``<p>`` elements, one per fighter.
    Cells at positions 0 and 2 are ignored; every other cell is parsed from
    ``"X of Y"`` into ``[X, Y]``.

    The input list is not modified (the original deleted the skipped cells
    from the caller's list in place).

    Args:
        results: Sequence of table-cell tags for one row.

    Returns:
        list: ``[first_fighter_stats, second_fighter_stats]``, each a list of
        ``[landed, attempted]`` pairs.

    Raises:
        ValueError: If a cell is not in ``"X of Y"`` integer form.
        IndexError: If a cell has fewer than two ``<p>`` elements.
    """
    skipped = {0, 2}

    stats = [[], []]
    for index, cell in enumerate(results):
        if index in skipped:
            continue
        paragraphs = cell.find_all('p')
        for fighter in (0, 1):
            pair = [int(x) for x in paragraphs[fighter].get_text().split(' of ')]
            stats[fighter].append(pair)

    return stats

In [10]:
def get_fighter_stats_from_fight(soup):
    """Gather per-fighter totals and strike stats from a parsed fight page.

    Every ``<table>`` on the page is walked in order. Rows of the first two
    tables go through ``get_totals``; rows of all later tables go through
    ``get_strikes``. The first ``b-fight-details__table-row`` of each table
    is skipped (presumably a header row -- confirm).

    Returns:
        tuple: ``(totals, strikes)``; each is ``[first_fighter, second_fighter]``
        with one parsed entry per table row.
    """
    totals = [[], []]
    strikes = [[], []]

    for table_index, table in enumerate(soup.find_all('table')):
        is_totals_table = table_index <= 1
        rows = table.find_all('tr', {'class': 'b-fight-details__table-row'})

        for row in rows[1:]:
            cells = row.find_all('td', {'class': 'b-fight-details__table-col'})

            if is_totals_table:
                parsed, bucket = get_totals(cells), totals
            else:
                parsed, bucket = get_strikes(cells), strikes

            bucket[0].append(parsed[0])
            bucket[1].append(parsed[1])

    return totals, strikes

In [11]:
# Fight = [Fighter1, Fighter2, Result]
    # Fighter1, Fighter2 = [Name, Total, Strikes]
        # Totals = [Total, First, Second, Third, Fourth, Fifth]
            # Total, First, etc = [Knockdowns, Takedowns, Reversals, Submission Attempted, Control Time]
                # Takedowns = [Landed, Attempted]
        # Strikes = [Total, First, Second, Third, Fourth, Fifth]
            # Total, First, etc = [Significant Strikes, Head, Body, Leg, Distance, Clinch, Ground]
                # Everything = [Landed, Attempted]
    # Result = [Winner, Method, Round, Time, Referee, Weight, Bonus, Details]

def get_fight_data(soup):
    """Assemble the full record for one fight from its parsed page.

    Returns:
        list: ``[[name, totals, strikes], [name, totals, strikes], result]``,
        one inner list per fighter followed by the outcome list from
        ``get_winner_from_fight``. Names are the raw link text, unstripped.
    """
    fighter_links = soup.find_all('a', {'class': 'b-fight-details__person-link'})
    names = [link.get_text() for link in fighter_links]

    totals, strikes = get_fighter_stats_from_fight(soup)
    outcome = get_winner_from_fight(soup)

    first_fighter = [names[0], totals[0], strikes[0]]
    second_fighter = [names[1], totals[1], strikes[1]]
    return [first_fighter, second_fighter, outcome]
    
# Smoke test: fetch a single fight page and display the parsed record.
url = 'http://ufcstats.com/fight-details/a38648a1c190f9be'
soup = get_soup(url)
get_fight_data(soup)

1


[['Brad Riddell ',
  [[0, [1, 4], 0, 0, '0:14'],
   [0, [0, 0], 0, 0, '0:00'],
   [0, [0, 1], 0, 0, '0:00'],
   [0, [1, 3], 0, 0, '0:14']],
  [[[59, 115], [23, 72], [20, 26], [16, 17], [59, 114], [0, 1], [0, 0]],
   [[21, 41], [9, 28], [7, 8], [5, 5], [21, 41], [0, 0], [0, 0]],
   [[22, 50], [8, 30], [8, 13], [6, 7], [22, 49], [0, 1], [0, 0]],
   [[16, 24], [6, 14], [5, 5], [5, 5], [16, 24], [0, 0], [0, 0]]]],
 ['Rafael Fiziev ',
  [[0, [0, 0], 0, 0, '0:09'],
   [0, [0, 0], 0, 0, '0:00'],
   [0, [0, 0], 0, 0, '0:09'],
   [0, [0, 0], 0, 0, '0:00']],
  [[[66, 123], [36, 78], [22, 30], [8, 15], [64, 121], [2, 2], [0, 0]],
   [[19, 35], [10, 18], [7, 10], [2, 7], [19, 35], [0, 0], [0, 0]],
   [[30, 59], [14, 38], [13, 17], [3, 4], [28, 57], [2, 2], [0, 0]],
   [[17, 29], [12, 22], [2, 3], [3, 4], [17, 29], [0, 0], [0, 0]]]],
 [1,
  'KO/TKO',
  '3',
  '2:20',
  'Herb Dean',
  'Lightweight Bout',
  1,
  'Kick to Head At Distance\n      Wheel kick']]

In [12]:
# all_fight_data = [Event, Event, ...], ordered from old -> new.
# Event = [Fights, Date, Location]

def get_all_fight_data(all_fight_data=None, start=0):
    """Scrape every event from ``start`` onward and checkpoint after each one.

    For each event URL the event page is fetched, its date and location are
    read, every fight on the card is scraped, and ``[fights, date, location]``
    is appended to ``all_fight_data``. The accumulated list is pickled to the
    file ``all_fight_data`` after every event so an interrupted run can be
    resumed by passing the loaded list and a suitable ``start``.

    Args:
        all_fight_data: List to extend in place. A fresh list is created when
            omitted (the original used a shared mutable default, so repeated
            calls silently accumulated into the same list).
        start: Index into ``get_event_urls()`` of the first event to scrape.

    Returns:
        list: ``all_fight_data`` with one entry appended per scraped event.

    Note:
        The ``#`` number printed next to each event title counts from
        ``start``, not from the beginning of the full event list.
    """
    if all_fight_data is None:
        all_fight_data = []

    event_urls = get_event_urls()[start:]
    # enumerate replaces event_urls.index(event_url): no repeated linear
    # search, and duplicate URLs still get the right position.
    for position, event_url in enumerate(event_urls):
        event_soup = get_soup(event_url)
        fight_urls = get_fight_urls_from_event(event_soup)

        info_items = event_soup.find_all('li', {'class': 'b-list__box-list-item'})
        # maxsplit=1 drops only the leading "Label:" prefix.
        event_date = info_items[0].get_text(strip=True).split(':', 1)[1]
        event_location = info_items[1].get_text(strip=True).split(':', 1)[1]

        title = event_soup.find('span', {'class': 'b-content__title-highlight'})
        print(title.get_text(strip=True), ' #', position)

        fights = []
        for fight_url in fight_urls:
            fight_data = get_fight_data(get_soup(fight_url))
            fights.append(fight_data)

            print(fight_data[0][0], 'vs', fight_data[1][0])

        all_fight_data.append([fights, event_date, event_location])

        # Checkpoint first so an interrupt during the pause cannot lose the
        # event that was just scraped.
        save_data(all_fight_data, 'all_fight_data')
        # Pause between events to avoid hammering the site.
        time.sleep(5)

    return all_fight_data

In [20]:
# Offset into the event list at which to resume scraping.
# NOTE(review): presumably the sum of events already scraped in earlier runs
# (294, 283 and 53) -- confirm before re-running.
start = 294 + 283 + 53
# Extend the checkpointed list instead of starting over.
all_fight_data = get_all_fight_data(open_data('all_fight_data'), start)

UFC Fight Night: Santos vs. Walker  # 0
Thiago Santos  vs Johnny Walker 
Alex Oliveira  vs Niko Price 
Misha Cirkunov  vs Krzysztof Jotko 
Alexander Hernandez  vs Mike Breeden 
Joe Solecki  vs Jared Gordon 
Antonina Shevchenko  vs Casey O'Neill 
Bethe Correia  vs Karol Rosa 
Devonte Smith  vs Jamie Mullarkey 
Douglas Silva de Andrade  vs Gaetano Pirrello 
Stephanie Egger  vs Shanna Young 
Alejandro Perez  vs Johnny Eduardo 
UFC Fight Night: Dern vs. Rodriguez  # 1
Mackenzie Dern  vs Marina Rodriguez 
Randy Brown  vs Jared Gooden 
Tim Elliott  vs Matheus Nicolau 
Sabina Mazo  vs Mariya Agapova 
Chris Gutierrez  vs Felipe Colares 
Alexandr Romanov  vs Jared Vanderaa 
Charles Rosa  vs Damon Jackson 
Loopy Godinez  vs Silvana Gomez Juarez 
Steve Garcia  vs Charlie Ontiveros 
UFC Fight Night: Ladd vs. Dumont  # 2
Aspen Ladd  vs Norma Dumont 
Andrei Arlovski  vs Carlos Felipe 
Jim Miller  vs Erick Gonzalez 
Manon Fiorot  vs Mayra Bueno Silva 
Nate Landwehr  vs Ludovit Klein 
Andrew Sanchez  

Thiago Santos  vs Magomed Ankalaev 
Marlon Moraes  vs Song Yadong 
Sodiq Yusuff  vs Alex Caceres 
Khalil Rountree Jr.  vs Karl Roberson 
Drew Dober  vs Terrance McKinney 
Alex Pereira  vs Bruno Silva 
Matthew Semelsberger  vs AJ Fletcher 
JJ Aldrich  vs Gillian Robertson 
Trevin Jones  vs Javid Basharat 
Damon Jackson  vs Kamuela Kirk 
Sabina Mazo  vs Miranda Maverick 
Dalcha Lungiambula  vs Cody Brundage 
Kris Moutinho  vs Guido Cannetti 
Tafon Nchukwi  vs Azamat Murzakanov 
UFC Fight Night: Volkov vs. Aspinall  # 19
Alexander Volkov  vs Tom Aspinall 
Arnold Allen  vs Dan Hooker 
Paddy Pimblett  vs Kazula Vargas 
Gunnar Nelson  vs Takashi Sato 
Molly McCann  vs Luana Carolina 
Jai Herbert  vs Ilia Topuria 
Mike Grundy  vs Makwan Amirkhani 
Shamil Abdurakhimov  vs Sergei Pavlovich 
Nikita Krylov  vs Paul Craig 
Jack Shore  vs Timur Valiev 
Cory McKenna  vs Elise Reed 
Muhammad Mokaev  vs Cody Durden 
UFC Fight Night: Blaydes vs. Daukaus  # 20
Curtis Blaydes  vs Chris Daukaus 
Joanne Wo

Julianna Pena  vs Amanda Nunes 
Brandon Moreno  vs Kai Kara-France 
Derrick Lewis  vs Sergei Pavlovich 
Alexandre Pantoja  vs Alex Perez 
Magomed Ankalaev  vs Anthony Smith 
Alex Morono  vs Matthew Semelsberger 
Drew Dober  vs Rafael Alves 
Don'Tale Mayes  vs Hamdy Abdelwahab 
Drakkar Klose  vs Rafa Garcia 
Michael Morales  vs Adam Fugitt 
Joselyne Edwards  vs Ji Yeon Kim 
Nicolae Negumereanu  vs Ihor Potieria 
Orion Cosce  vs Blood Diamond 
UFC Fight Night: Santos vs. Hill  # 37
Thiago Santos  vs Jamahal Hill 
Vicente Luque  vs Geoff Neal 
Mohammed Usman  vs Zac Pauga 
Brogan Walker  vs Juliana Miller 
Augusto Sakai  vs Serghei Spivac 
Terrance McKinney  vs Erick Gonzalez 
Sam Alvey  vs Michal Oleksiejczuk 
Bryan Battle  vs Takashi Sato 
Cory McKenna  vs Miranda Granger 
Mayra Bueno Silva  vs Stephanie Egger 
UFC Fight Night: Vera vs. Cruz  # 38
Marlon Vera  vs Dominick Cruz 
Nate Landwehr  vs David Onama 
Yazmin Jauregui  vs Iasmin Lucindo 
Devin Clark  vs Azamat Murzakanov 
Priscila

In [19]:
# Display the last event stored in the checkpoint file.
open_data('all_fight_data')[-1]

[[[['Alexander Volkanovski ',
    [[0, [0, 0], 0, 2, '3:51'],
     [0, [0, 0], 0, 0, '0:00'],
     [0, [0, 0], 0, 0, '0:03'],
     [0, [0, 0], 0, 1, '1:13'],
     [0, [0, 0], 0, 1, '2:33'],
     [0, [0, 0], 0, 0, '0:02']],
    [[[214, 353],
      [146, 270],
      [20, 26],
      [48, 57],
      [158, 254],
      [3, 6],
      [53, 93]],
     [[30, 52], [14, 32], [5, 7], [11, 13], [30, 50], [0, 2], [0, 0]],
     [[38, 61], [19, 38], [5, 8], [14, 15], [38, 61], [0, 0], [0, 0]],
     [[61, 99], [46, 81], [3, 3], [12, 15], [30, 42], [0, 0], [31, 57]],
     [[35, 58], [28, 50], [1, 2], [6, 6], [13, 22], [0, 0], [22, 36]],
     [[50, 83], [39, 69], [6, 6], [5, 8], [47, 79], [3, 4], [0, 0]]]],
   ['Brian Ortega ',
    [[0, [2, 5], 3, 1, '0:54'],
     [0, [0, 0], 0, 0, '0:00'],
     [0, [0, 0], 0, 0, '0:00'],
     [0, [1, 1], 2, 1, '0:41'],
     [0, [1, 2], 1, 0, '0:13'],
     [0, [0, 2], 0, 0, '0:00']],
    [[[88, 234], [59, 193], [20, 26], [9, 15], [82, 225], [6, 9], [0, 0]],
     [[16, 51]