In [1]:
import time
import pickle

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

In [2]:
def save_data(data, name):
    """Serialize ``data`` with pickle into the file at path ``name``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(name, mode="wb") as sink:
        pickle.Pickler(sink).dump(data)

In [3]:
def open_data(name):
    """Load and return the pickled object stored in the file at path ``name``.

    Unpickling can execute arbitrary code, so only use this on files that
    this notebook (or another trusted source) wrote.
    """
    with open(name, mode="rb") as source:
        loaded = pickle.Unpickler(source).load()
    return loaded

In [4]:
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed with ``html.parser``.

    Connection failures are retried up to three times with exponential
    back-off (factor 0.5) for both http and https URLs.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests`` as the request timeout (seconds). Without a
        timeout a stalled server would block the scrape forever.

    Returns
    -------
    BeautifulSoup
        Parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here is clearer
        than parsing an error page and hitting ``AttributeError`` later.
    requests.RequestException
        On connection errors (after retries) or timeouts.
    """
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session (and its connection pools) even
    # when the request raises; one session is created per call.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.text

    return BeautifulSoup(html, features='html.parser')

In [5]:
def get_event_urls():
    """Return the ``href`` of every event link on the completed-events page.

    The links are returned in the order they appear on the page.
    """
    listing = get_soup('http://ufcstats.com/statistics/events/completed?page=all')
    anchors = listing.find_all('a', {'class': 'b-link b-link_style_black'})
    return [anchor.get('href') for anchor in anchors]

In [62]:
def get_fight_urls_from_event(soup):
    """Collect the fight-detail URLs linked from a parsed event page.

    Returns the ``href`` of every second "bordered" flag link followed by the
    ``href`` of every "green" flag link, so the result is grouped by flag
    style rather than by position on the page.
    """
    # NOTE(review): the filter is a set literal, not {'class': ...}; bs4 appears
    # to treat a non-dict filter as a CSS-class match -- confirm before changing.
    # Presumably bordered flags come in pairs per fight, hence the [::2] -- verify.
    bordered_links = soup.find_all('a', {'b-flag b-flag_style_bordered'})[::2]
    green_links = soup.find_all('a', {'b-flag b-flag_style_green'})
    return [link.get('href') for link in bordered_links + green_links]

In [7]:
def get_winner_from_fight(soup):
    """Extract the outcome of a fight from a parsed fight-details page.

    Returns
    -------
    list
        ``[winner, method, round, time, referee, weight, bonus, details]``:

        * ``winner`` -- 0 if the winner is in the left column, 1 if in the
          right column, 2 for a no contest or draw (neither status is green).
        * ``method``, ``round``, ``time``, ``referee``, ``details`` -- text
          taken from the fight-details content block (labels stripped).
        * ``weight`` -- text of the fight-head block.
        * ``bonus`` -- 1 if the fight head shows any image other than a
          "belt" image, else 0.
    """
    green = 'b-fight-details__person-status_style_green'
    statuses = soup.find('div', {'class': 'b-fight-details'}).find_all('i')

    if green in statuses[0].get('class'):
        winner = 0
    elif green in statuses[1].get('class'):
        winner = 1
    else:
        winner = 2

    content = soup.find('div', {'class': 'b-fight-details__content'})
    method = content.find('i', {'style': 'font-style: normal'}).get_text(strip=True)

    # One lookup for the labelled items; positions 0, 1 and 3 are the round,
    # time and referee entries respectively.
    items = content.find_all('i', {'class': 'b-fight-details__text-item'})
    rnd = items[0].get_text(strip=True).split(':')[1]
    # maxsplit=1 keeps the "m:ss" value intact after the label is removed.
    fight_time = items[1].get_text(strip=True).split(':', 1)[1]
    referee = items[3].get_text(strip=True).split(':')[1]

    head = soup.find('div', {'class': 'b-fight-details__fight-head'})
    weight = head.get_text(strip=True)

    # The image URL has to be inspected: ``'belt' in <img tag>`` tests the
    # tag's children, which is never true for an <img>, so belt icons used to
    # be counted as bonuses. All images are checked so that a fight showing
    # both a belt and a bonus icon is still flagged.
    bonus = int(any('belt' not in (img.get('src') or '')
                    for img in head.find_all('img')))

    details = content.find_all('p', {'class': 'b-fight-details__text'})[1].get_text(strip=True).split(':', 1)[1]

    return [winner, method, rnd, fight_time, referee, weight, bonus, details]

In [8]:
def get_totals(results):
    """Parse one row of a "totals" table into per-fighter statistics.

    Parameters
    ----------
    results : sequence
        The row's cells. Each cell must expose ``find_all('p')`` returning
        two paragraphs, one per fighter. Cells at positions 0, 2, 3, 4 and 6
        are ignored. The sequence is not modified.

    Returns
    -------
    list
        ``[first_fighter, second_fighter]``. Each entry lists the kept cells
        in order: kept cells 0, 2 and 3 as ``int``, kept cell 1 as
        ``[landed, attempted]`` parsed from ``"x of y"``, and any remaining
        cell as stripped text (e.g. ``'0:14'``).
    """
    # Select instead of ``del``-ing in place: deleting mutated the caller's
    # list, so a second call on the same row parsed the wrong cells.
    skipped = {0, 2, 3, 4, 6}
    kept = [cell for position, cell in enumerate(results) if position not in skipped]

    stats = [[], []]
    for j, cell in enumerate(kept):
        paragraphs = cell.find_all('p')

        for side in (0, 1):
            if j in (0, 2, 3):
                value = int(paragraphs[side].get_text(strip=True))
            elif j == 1:
                value = [int(x) for x in paragraphs[side].get_text().split(' of ')]
            else:
                value = paragraphs[side].get_text(strip=True)
            stats[side].append(value)

    return stats

In [9]:
def get_strikes(results):
    """Parse one row of a "strikes" table into per-fighter statistics.

    Parameters
    ----------
    results : sequence
        The row's cells. Each cell must expose ``find_all('p')`` returning
        two paragraphs, one per fighter. Cells at positions 0 and 2 are
        ignored. The sequence is not modified.

    Returns
    -------
    list
        ``[first_fighter, second_fighter]``; each entry is a list of
        ``[landed, attempted]`` integer pairs parsed from ``"x of y"``, one
        pair per kept cell.
    """
    # Select instead of ``del``-ing in place: deleting mutated the caller's
    # list, so a second call on the same row parsed the wrong cells.
    skipped = {0, 2}
    kept = [cell for position, cell in enumerate(results) if position not in skipped]

    stats = [[], []]
    for cell in kept:
        paragraphs = cell.find_all('p')
        for side in (0, 1):
            stats[side].append([int(x) for x in paragraphs[side].get_text().split(' of ')])

    return stats

In [10]:
def get_fighter_stats_from_fight(soup):
    """Parse every statistics table on a fight page.

    The first two ``<table>`` elements are parsed with ``get_totals`` and all
    later ones with ``get_strikes``. In each table the first row carrying the
    ``b-fight-details__table-row`` class is skipped.

    Returns
    -------
    tuple
        ``(totals, strikes)``; each is ``[first_fighter_rows,
        second_fighter_rows]`` with one parsed entry per table row.
    """
    totals, strikes = [[], []], [[], []]

    for table_index, table in enumerate(soup.find_all('table')):
        if table_index <= 1:
            parse_row, bucket = get_totals, totals
        else:
            parse_row, bucket = get_strikes, strikes

        rows = table.find_all('tr', {'class': 'b-fight-details__table-row'})
        for row in rows[1:]:
            cells = row.find_all('td', {'class': 'b-fight-details__table-col'})
            parsed = parse_row(cells)
            bucket[0].append(parsed[0])
            bucket[1].append(parsed[1])

    return totals, strikes

In [11]:
# Fight = [Fighter1, Fighter2, Result]
    # Fighter1, Fighter2 = [Name, Total, Strikes]
        # Totals = [Total, First, Second, Third, Fourth, Fifth]
            # Total, First, etc = [Knockdowns, Takedowns, Submission Attempted, Reversals, Control Time]
                # Takedowns = [Landed, Attempted]
        # Strikes = [Total, First, Second, Third, Fourth, Fifth]
            # Total, First, etc = [Significant Strikes, Head, Body, Leg, Distance, Clinch, Ground]
                # Everything = [Landed, Attempted]
    # Result = [Winner, Method, Round, Time, Referee, Weight, Bonus, Details]

def get_fight_data(soup):
    """Assemble the full record for one parsed fight-details page.

    Returns ``[fighter1, fighter2, result]`` where each fighter entry is
    ``[name, totals, strikes]`` and ``result`` is the list produced by
    ``get_winner_from_fight``. Names are the raw link text (not stripped).
    """
    fighter_links = soup.find_all('a', {'class': 'b-fight-details__person-link'})
    names = []
    for link in fighter_links:
        names.append(link.get_text())

    totals, strikes = get_fighter_stats_from_fight(soup)
    outcome = get_winner_from_fight(soup)

    first_fighter = [names[0], totals[0], strikes[0]]
    second_fighter = [names[1], totals[1], strikes[1]]
    return [first_fighter, second_fighter, outcome]
    
# Smoke test: download a single fight page and display its parsed record.
url = 'http://ufcstats.com/fight-details/a38648a1c190f9be'
soup = get_soup(url)
get_fight_data(soup)

[['Brad Riddell ',
  [[0, [1, 4], 0, 0, '0:14'],
   [0, [0, 0], 0, 0, '0:00'],
   [0, [0, 1], 0, 0, '0:00'],
   [0, [1, 3], 0, 0, '0:14']],
  [[[59, 115], [23, 72], [20, 26], [16, 17], [59, 114], [0, 1], [0, 0]],
   [[21, 41], [9, 28], [7, 8], [5, 5], [21, 41], [0, 0], [0, 0]],
   [[22, 50], [8, 30], [8, 13], [6, 7], [22, 49], [0, 1], [0, 0]],
   [[16, 24], [6, 14], [5, 5], [5, 5], [16, 24], [0, 0], [0, 0]]]],
 ['Rafael Fiziev ',
  [[0, [0, 0], 0, 0, '0:09'],
   [0, [0, 0], 0, 0, '0:00'],
   [0, [0, 0], 0, 0, '0:09'],
   [0, [0, 0], 0, 0, '0:00']],
  [[[66, 123], [36, 78], [22, 30], [8, 15], [64, 121], [2, 2], [0, 0]],
   [[19, 35], [10, 18], [7, 10], [2, 7], [19, 35], [0, 0], [0, 0]],
   [[30, 59], [14, 38], [13, 17], [3, 4], [28, 57], [2, 2], [0, 0]],
   [[17, 29], [12, 22], [2, 3], [3, 4], [17, 29], [0, 0], [0, 0]]]],
 [1,
  'KO/TKO',
  '3',
  '2:20',
  'Herb Dean',
  'Lightweight Bout',
  1,
  'Kick to Head At Distance\n      Wheel kick']]

In [111]:
# All_fight_data = [Fights, Date, Venue] Ordered from old -> new.

def get_all_fight_data(all_fight_data=None, save_path='all_fight_data', delay=5):
    """Scrape every event not yet present in ``all_fight_data``.

    Event URLs are fetched, reversed, and the first ``len(all_fight_data)``
    of them are skipped, so passing a previously saved list resumes where
    that run stopped. Progress is printed per event and per fight.

    Parameters
    ----------
    all_fight_data : list, optional
        Events collected so far; new events are appended to this same list.
        Defaults to a fresh empty list on every call.
    save_path : str, optional
        File the growing list is pickled to after each event.
    delay : float, optional
        Seconds to pause between events.

    Returns
    -------
    list
        ``all_fight_data``; each element is ``[fights, date, location]``.
    """
    # A literal ``[]`` default is created once and shared by every call, so
    # results of an earlier run would silently leak into the next one.
    if all_fight_data is None:
        all_fight_data = []

    event_urls = get_event_urls()[::-1][len(all_fight_data):]

    # enumerate gives the position directly; list.index() rescanned the list
    # for every event and returns the wrong position if a URL repeats.
    for position, event_url in enumerate(event_urls):
        event_soup = get_soup(event_url)
        fight_urls = get_fight_urls_from_event(event_soup)

        info_items = event_soup.find_all('li', {'class': 'b-list__box-list-item'})
        event_date = info_items[0].get_text(strip=True).split(':', 1)[1]
        event_location = info_items[1].get_text(strip=True).split(':', 1)[1]

        title = event_soup.find('span', {'class': 'b-content__title-highlight'}).get_text(strip=True)
        print(title, ' #', position)

        fights = []
        for fight_url in fight_urls:
            fight_data = get_fight_data(get_soup(fight_url))
            fights.append(fight_data)

            print(fight_data[0][0], 'vs', fight_data[1][0])

        all_fight_data.append([fights, event_date, event_location])

        # Checkpoint before pausing so that interrupting the run during the
        # sleep does not lose the event that was just scraped.
        save_data(all_fight_data, save_path)
        time.sleep(delay)

    return all_fight_data

In [114]:
# Run the scrape starting from an explicitly empty list (nothing to resume from).
all_fight_data = get_all_fight_data([])


UFC 2: No Way Out  # 0
Royce Gracie  vs Patrick Smith 
Royce Gracie  vs Remco Pardoel 
Patrick Smith  vs Johnny Rhodes 
Royce Gracie  vs Jason DeLucia 
Remco Pardoel  vs Orlando Wiet 
Johnny Rhodes  vs Fred Ettish 
Patrick Smith  vs Scott Morris 
Royce Gracie  vs Minoki Ichihara 
Jason DeLucia  vs Scott Baker 
Remco Pardoel  vs Alberta Cerra Leon 
Orlando Wiet  vs Robert Lucarelli 
Frank Hamaker  vs Thaddeus Luster 
Johnny Rhodes  vs David Levicki 
Patrick Smith  vs Ray Wizard 
Scott Morris  vs Sean Daugherty 

UFC 3: The American Dream  # 1
Steve Jennum  vs Harold Howard 
Ken Shamrock  vs Felix Lee Mitchell 
Royce Gracie  vs Kimo Leopoldo 
Harold Howard  vs Roland Payne 
Ken Shamrock  vs Christophe Leninger 
Keith Hackney  vs Emmanuel Yarborough 

UFC 4: Revenge of the Warriors  # 2
Royce Gracie  vs Dan Severn 
Dan Severn  vs Marcus Bossett 
Royce Gracie  vs Keith Hackney 
Dan Severn  vs Anthony Macias 
Steve Jennum  vs Melton Bowen 
Keith Hackney  vs Joe Son 
Royce Gracie  vs Ron va

Shonie Carter  vs Brad Gumm 

UFC 25: Ultimate Japan 3  # 27
Tito Ortiz  vs Wanderlei Silva 
Murilo Bustamante  vs Yoji Anjo 
Sanae Kikuta  vs Eugene Jackson 
Ron Waterman  vs Satoshi Honma 
Ikuhisa Minowa  vs Joe Slick 
Laverne Clark  vs Koji Oishi 

UFC 26: Ultimate Field Of Dreams  # 28
Kevin Randleman  vs Pedro Rizzo 
Tyrone Roberts  vs David Dodd 
Pat Miletich  vs John Alessio 
Amaury Bitetti  vs Alex Andrade 
Matt Hughes  vs Marcelo Aguiar 
Jens Pulver  vs Joao Roque 
Ian Freeman  vs Nate Schroeder 
Shonie Carter  vs Adrian Serrano 

UFC 27: Ultimate Bad Boyz  # 29
Brad Gumm  vs CJ Fernandes 
Pedro Rizzo  vs Dan Severn 
Maurice Smith  vs Bobby Hoffman 
Jeremy Horn  vs Eugene Jackson 
Fabiano Iha  vs Laverne Clark 
Yuki Kondo  vs Alexandre Dantas 
Ian Freeman  vs Tedd Williams 
Jeff Monson  vs Tim Lajcik 

UFC 28: High Stakes  # 30
Randy Couture  vs Kevin Randleman 
Renato Sobral  vs Maurice Smith 
Josh Barnett  vs Gan McGee 
Andrei Arlovski  vs Aaron Brink 
Jens Pulver  vs John L

Ivan Salaverry  vs Joe Riggs 
Joe Doerksen  vs Patrick Cote 
Mike van Arsdale  vs John Marsh 

UFC 53: Heavy Hitters  # 57
Andrei Arlovski  vs Justin Eilers 
Karo Parisyan  vs Matt Serra 
Rich Franklin  vs Evan Tanner 
Forrest Griffin  vs Bill Mahood 
Paul Buentello  vs Kevin Jordan 
Nate Quarry  vs Shonie Carter 
David Loiseau  vs Charles McCarthy 
Nick Diaz  vs Koji Oishi 

UFC Fight Night 1  # 58
Nate Marquardt  vs Ivan Salaverry 
Chris Leben  vs Patrick Cote 
Stephan Bonnar  vs Sam Hoger 
Nate Quarry  vs Pete Sell 
Josh Koscheck  vs Pete Spratt 
Mike Swick  vs Gideon Ray 
Kenny Florian  vs Alex Karalexis 
Drew Fickett  vs Josh Neer 

UFC 54: Boiling Point  # 59
Chuck Liddell  vs Jeremy Horn 
Tim Sylvia  vs Tra Telligman 
Randy Couture  vs Mike van Arsdale 
Diego Sanchez  vs Brian Gassaway 
Georges St-Pierre  vs Frank Trigg 
Matt Lindland  vs Joe Doerksen 
Trevor Prangley  vs Travis Lutter 
James Irvin  vs Terry Martin 

UFC Fight Night 2  # 60
David Loiseau  vs Evan Tanner 
Chris L

Frankie Edgar  vs Tyson Griffin 
Lyoto Machida  vs Sam Hoger 
Dustin Hazelett  vs Diego Saraiva 

UFC 68: The Uprising  # 84
Randy Couture  vs Tim Sylvia 
Martin Kampmann  vs Drew McFedries 
Rich Franklin  vs Jason MacDonald 
Matt Hughes  vs Chris Lytle 
Jason Lambert  vs Renato Sobral 
Matt Hamill  vs Rex Holman 
Jon Fitch  vs Luigi Fioravanti 
Gleison Tibau  vs Jason Dent 
Jamie Varner  vs Jason Gilliam 

UFC Fight Night: Stevenson vs Guillard  # 85
Joe Stevenson  vs Melvin Guillard 
Justin McCully  vs Antoni Hardonk 
Kenny Florian  vs Dokonjonosuke Mishima 
Wilson Gouveia  vs Seth Petruzelli 
Drew Fickett  vs Keita Nakamura 
Kurt Pellegrino  vs Nate Mohr 
Kuniyoshi Hironaka  vs Forrest Petz 
Roan Carneiro  vs Rich Clementi 
Thiago Tavares  vs Naoyuki Kotani 

UFC 69: Shootout  # 86
Matt Serra  vs Georges St-Pierre 
Josh Koscheck  vs Diego Sanchez 
Roger Huerta  vs Leonard Garcia 
Yushin Okami  vs Mike Swick 
Kendall Grove  vs Alan Belcher 
Heath Herring  vs Brad Imes 
Thales Leites 


UFC 85: Bedlam  # 108
Thiago Alves  vs Matt Hughes 
Michael Bisping  vs Jason Day 
Mike Swick  vs Marcus Davis 
Thales Leites  vs Nate Marquardt 
Fabricio Werdum  vs Brandon Vera 
Martin Kampmann  vs Jorge Rivera 
Matt Wiman  vs Thiago Tavares 
Kevin Burns  vs Roan Carneiro 
Luiz Cane  vs Jason Lambert 
Paul Taylor  vs Jess Liaudin 
Antoni Hardonk  vs Eddie Sanchez 

The Ultimate Fighter: Team Rampage vs Team Forrest Finale  # 109
Kendall Grove  vs Evan Tanner 
Amir Sadollah  vs CB Dollaway 
Diego Sanchez  vs Luigi Fioravanti 
Spencer Fisher  vs Jeremy Stephens 
Matthew Riddle  vs Dante Rivera 
Dustin Hazelett  vs Joshua Burkman 
Drew McFedries  vs Marvin Eastman 
Matt Brown  vs Matt Arroyo 
Dean Lister  vs Jeremy Horn 
Rob Kimmons  vs Rob Yundt 

UFC 86: Jackson vs Griffin  # 110
Forrest Griffin  vs Quinton Jackson 
Patrick Cote  vs Ricardo Almeida 
Joe Stevenson  vs Gleison Tibau 
Josh Koscheck  vs Chris Lytle 
Tyson Griffin  vs Marcus Aurelio 
Gabriel Gonzaga  vs Justin McCully 
Co

Joe Stevenson  vs Nate Diaz 
Melvin Guillard  vs Gleison Tibau 
Brad Blackburn  vs Edgar Garcia 
Tomasz Drwal  vs Mike Ciesnolevicz 
Nick Osipczak  vs Frank Lester 
Jason Dent  vs Cameron Dollar 

UFC 100  # 131
Jon Fitch  vs Paulo Thiago 
Brock Lesnar  vs Frank Mir 
Georges St-Pierre  vs Thiago Alves 
Dan Henderson  vs Michael Bisping 
Yoshihiro Akiyama  vs Alan Belcher 
Mark Coleman  vs Stephan Bonnar 
Jim Miller  vs Mac Danzig 
Jon Jones  vs Jake O'Brien 
Dong Hyun Kim  vs TJ Grant 
Tom Lawlor  vs CB Dollaway 
Shannon Gugerty  vs Matt Grice 

UFC 101: Declaration  # 132
BJ Penn  vs Kenny Florian 
Anderson Silva  vs Forrest Griffin 
Aaron Riley  vs Shane Nelson 
Johny Hendricks  vs Amir Sadollah 
Ricardo Almeida  vs Kendall Grove 
Kurt Pellegrino  vs Josh Neer 
John Howard  vs Tamdan McCrory 
Alessio Sakara  vs Thales Leites 
Matthew Riddle  vs Dan Cramer 
George Sotiropoulos  vs George Roop 
Jesse Lennox  vs Danillo Villefort 

UFC 102: Couture vs Nogueira  # 133
Antonio Rodrigo Nog

Keith Jardine  vs Matt Hamill 
Chris Leben  vs Aaron Simpson 
Spencer Fisher  vs Dennis Siver 
Jamie Yager  vs Rich Attonito 
John Gunderson  vs Mark Holst 
Brad Tavares  vs Seth Baczynski 
Josh Bryant  vs Kyle Noke 
Chris Camozzi  vs James Hammortree 
James McSweeney  vs Travis Browne 

UFC 116: Lesnar vs Carwin  # 153
Brock Lesnar  vs Shane Carwin 
Chris Leben  vs Yoshihiro Akiyama 
Chris Lytle  vs Matt Brown 
Krzysztof Soszynski  vs Stephan Bonnar 
George Sotiropoulos  vs Kurt Pellegrino 
Brendan Schaub  vs Chris Tuchscherer 
Seth Petruzelli  vs Ricardo Romero 
Kendall Grove  vs Goran Reljic 
Gerald Harris  vs David Branch 
Forrest Petz  vs Daniel Roberts 
Jon Madsen  vs Karlos Vemola 

UFC Live: Jones vs Matyushenko  # 154
Jon Jones  vs Vladimir Matyushenko 
Mark Munoz  vs Yushin Okami 
John Howard  vs Jake Ellenberger 
Tyson Griffin  vs Takanori Gomi 
Paul Kelly  vs Jacob Volkmann 
DaMarques Johnson  vs Matthew Riddle 
James Irvin  vs Igor Pokrajac 
Brian Stann  vs Mike Massenzio 

Quinton Jackson  vs Matt Hamill 
Frank Mir  vs Roy Nelson 
Stefan Struve  vs Travis Browne 
Thiago Alves  vs Rick Story 
Brian Stann  vs Jorge Santiago 
Miguel Torres  vs Demetrious Johnson 
Kendall Grove  vs Tim Boetsch 
Gleison Tibau  vs Rafaello Oliveira 
Michael McDonald  vs Chris Cariaso 
Cole Escovedo  vs Renan Barao 

The Ultimate Fighter: Team Lesnar vs Team dos Santos Finale  # 174
Ramsey Nijem  vs Tony Ferguson 
Clay Guida  vs Anthony Pettis 
Ed Herman  vs Tim Credeur 
Fabio Maldonado  vs Kyle Kingsbury 
Chuck O'Neil  vs Chris Cope 
Danny Downes  vs Jeremy Stephens 
Josh Grispi  vs George Roop 
Ryan McGillivray  vs Shamar Bailey 
Clay Harvison  vs Justin Edwards 
Scott Jorgensen  vs Ken Stone 
Reuben Duran  vs Francisco Rivera 

UFC 131: Dos Santos vs Carwin  # 175
Shane Carwin  vs Junior Dos Santos 
Kenny Florian  vs Diego Nunes 
Mark Munoz  vs Demian Maia 
Dave Herman  vs Jon Olav Einemo 
Vagner Rocha  vs Donald Cerrone 
Yves Edwards  vs Sam Stout 
Jesse Bongfeldt  vs Chris


UFC on FOX: Evans vs Davis  # 194
Rashad Evans  vs Phil Davis 
Chael Sonnen  vs Michael Bisping 
Demian Maia  vs Chris Weidman 
Evan Dunham  vs Nik Lentz 
Mike Russow  vs Jon Olav Einemo 
Cub Swanson  vs George Roop 
Charles Oliveira  vs Eric Wisely 
Michael Johnson  vs Shane Roller 
Joey Beltran  vs Lavar Johnson 
Chris Camozzi  vs Dustin Jacoby 

UFC 143: Diaz vs Condit  # 195


KeyboardInterrupt: 

In [115]:
# Number of events stored in the pickled checkpoint file.
len(open_data('all_fight_data'))

195

In [118]:
# Uncomment to resume the scrape from the saved checkpoint instead of starting over:
# all_fight_data = get_all_fight_data(open_data('all_fight_data'))

# NOTE(review): all_fighter_names, fighter_data, all_referee_names, referee_data and
# fighter_data_per_fight are not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel (see the recorded output). Define or import
# them in an earlier cell.
all_f_names = all_fighter_names(all_fight_data)
all_f_data = [fighter_data(all_fight_data, name) for name in all_f_names]
# Flatten the per-name lists by one level.
all_f_data = [item for sublist in all_f_data for item in sublist]

all_r_names = all_referee_names(all_fight_data)
all_r_data = [referee_data(all_fight_data, name) for name in all_r_names]
# Flatten the per-name lists by one level.
all_r_data = [item for sublist in all_r_data for item in sublist]

all_f_data_per_fight = [fighter_data_per_fight(all_fight_data, name) for name in all_f_names]
# Flatten the per-name lists by one level.
all_f_data_per_fight = [item for sublist in all_f_data_per_fight for item in sublist]

NameError: name 'all_fighter_names' is not defined

In [None]:
# Persist the scraped events and the lists derived from them as pickle files.
# Each call runs in order, so a NameError on a later variable still leaves the
# earlier files written.
save_data(all_fight_data, 'all_fight_data')
save_data(all_f_data, 'all_f_data')
save_data(all_f_data_per_fight, 'all_f_data_pf')
save_data(all_r_data, 'all_r_data')