## scraper.ipynb
The sole job of this file is to provide the .csv data source. It scrapes over 200 Wikipedia HTML files, each representing a UFC event, extracts the fight data, and combines it into one 21-column .csv file. It does no data cleaning; that is the responsibility of [ufc.ipynb](ufc.ipynb).

In [1]:
import pandas as pd
import numpy as np
import re #for regex
import os.path #for file handling
from bs4 import BeautifulSoup, NavigableString #for web page scraping
from IPython.core.display import display, HTML #display html document inside jupyter notebook
from decimal import Decimal, DecimalException

In [2]:
def get_clean_text(tr):
    """Return the cleaned text of the first <td> found in a table row.

    The direct string children of the cell are scanned and the last non-empty
    one is kept. The keep/discard decisions below are keyed on the LAST string
    child (stripped), which may be empty even when an earlier one was not.

    tr -- row tag (anything whose find(name="td") returns the cell or None)
    returns -- stripped text, "" when there is no <td> or nothing usable
    """
    this_text = ""

    td = tr.find(name="td")
    if td is None:
        return this_text

    #get string items only
    #new_str starts as "" so a cell without any direct string child (for
    #example one holding only a link) falls through to the <a>/<span> lookup
    #instead of raising UnboundLocalError
    new_str = ""
    for item in td.contents:
        if isinstance(item, NavigableString):
            new_str = item.strip()
            if new_str:
                this_text = new_str

    #if length of three or less is probably punctuation only. clear it out.
    if len(new_str) < 4:
        this_text = ""

    #when the last string child is shorter than three characters, append the
    #text of every direct <a> child (this ignores <a> inside <sup> tags) and
    #then of every direct <span> child
    if len(new_str) < 3:
        for item in td.contents:
            if item.name == "a":
                this_text = this_text + item.text.strip() + " "
        for item in td.contents:
            if item.name == "span":
                this_text = this_text + item.text.strip() + " "

    return this_text.strip()

In [3]:
def get_parent_td_text_clean(th):
    """Return the cleaned <td> text of the row that holds the header cell th.

    th -- header tag whose parent row is inspected
    returns -- result of get_clean_text() on the parent, or None when th has
               no parent or the parent has no <td>
    """
    row = th.parent
    if not (row and row.td):
        return None
    return get_clean_text(row)

In [4]:
def strip_extension(filename):
    """Return filename with every ".txt", ".html" and ".htm" occurrence removed.

    The extensions are removed in that order, wherever they appear in the name.
    """
    stem = filename
    for extension in (".txt", ".html", ".htm"):
        stem = stem.replace(extension, "")
    return stem

In [5]:
def isNum(str):
    """Return True when str converts to a finite Decimal, otherwise False.

    str -- candidate value, normally text such as "100" or "37.5"
           (the name shadows the builtin; kept so existing calls still work)
    """
    try:
        value = Decimal(str)
    except (TypeError, ValueError, DecimalException):
        #TypeError covers None and other unsupported types, which used to
        #escape this function as an exception
        return False
    #Decimal also accepts "NaN" and "Infinity"; neither is a usable number
    return value.is_finite()

In [6]:
def get_event_info(ps, filename):
    """Collect the event-level fields for one parsed event page.

    ps -- parsed page (BeautifulSoup object)
    filename -- name of the raw file, e.g. "UFC_100.html"; the event name,
                type and number are derived from it
    returns -- dict of strings with the keys venue_name, venue_location,
               event_name, event_type, event_number, event_nickname,
               event_date, event_attendance, event_gate, event_buyrate and
               event_purse; fields that were not found stay ""
    """
    event_info = {
        "venue_name" : "",
        "venue_location" : "",
        "event_name" : "",
        "event_type" : "",
        "event_number" : "",
        "event_nickname" : "",
        "event_date" : "",
        "event_attendance" : "",
        "event_gate" : "",
        "event_buyrate" : "",
        "event_purse" : ""
    }

    #event name and number
    event_code = strip_extension(filename)
    event_info["event_name"] = event_code.replace("_", " ")
    #split on the first underscore only: a name with no underscore, or with
    #several (e.g. "UFC_228_TODO"), used to raise ValueError when unpacked
    event_type, _, event_number = event_code.partition("_")
    if not isNum(event_number):
        #a non-numeric event number is replaced by a random negative placeholder
        event_number = str(np.random.randint(1, 300) * -1)
    event_info["event_type"] = event_type
    event_info["event_number"] = event_number

    #parse for table and row; a page without <body> or without an infobox
    #table yields no rows here and is left to the <th> scan further down
    body = ps.find(name="body")
    mtable = None
    if body is not None:
        mtable = body.find(name="table", attrs={"class":"infobox"})
    trs = mtable.findAll(name="tr") if mtable is not None else []

    #first th of the page is event nick name
    th1 = ps.find(name="th")
    if th1 is not None:
        event_info["event_nickname"] = th1.text

    #find rest of event data: lower-cased row header -> field it fills
    row_fields = {
        "venue" : "venue_name",
        "date" : "event_date",
        "city" : "venue_location",
        "attendance" : "event_attendance",
        "total gate" : "event_gate",
        "total purse" : "event_purse",
        "buyrate" : "event_buyrate"
    }
    for tr in trs[1:]:
        th = tr.find(name="th")
        if th is None:
            continue
        field = row_fields.get(th.text.lower())
        if field is not None:
            event_info[field] = get_clean_text(tr)

    #if venue_name and venue_location blank, get all th to loop thru
    #(this pass does not look for total purse or buyrate)
    if not event_info["venue_name"] and not event_info["venue_location"]:
        fallback_fields = {
            "venue" : "venue_name",
            "city" : "venue_location",
            "attendance" : "event_attendance",
            "total gate" : "event_gate",
            "date" : "event_date"
        }
        for th in ps.findAll(name="th"):
            if not th:
                continue
            field = fallback_fields.get(th.text.lower())
            if field is not None:
                #the helper returns None when the row has no <td>; store ""
                event_info[field] = get_parent_td_text_clean(th) or ""

    return event_info

In [7]:
def get_td_text(td):
    """Return the stripped text of a table cell.

    td -- cell tag exposing .text and .find()
    returns -- the cell's stripped text; when that is empty, the stripped
               text of the cell's first <a>, or "" if the cell has no <a>
    """
    this_text = td.text.strip()

    #if this_text empty, try <A> text
    if not this_text:
        link = td.find(name="a")
        #an empty cell without a link used to raise AttributeError on None
        if link is not None:
            this_text = link.text.strip()

    return this_text

In [8]:
def get_fights(ps, event_info):
    """Extract one tuple per fight row from the page's "toccolours" table.

    ps -- parsed page (BeautifulSoup object)
    event_info -- dict as built by get_event_info(); its values are copied
                  onto every fight
    returns -- list of 21-item tuples: fight order, isMainCard, isMainEvent,
               the text of the first seven cells of the row (weight class,
               fighter 1, result, fighter 2, outcome, round, time) and then
               the event fields
    raises -- IndexError when a row with <td> cells has fewer than seven
    """
    fights = []
    fight_order = 0
    #stays None until a single-cell header row names the card; a fight row
    #placed before any such header used to raise UnboundLocalError. None also
    #fails is_valid(), so such a page gets reported instead of crashing the run
    isMainCard = None
    event_columns = tuple(event_info[key] for key in (
        'venue_name', 'venue_location', 'event_date', 'event_name',
        'event_nickname', 'event_attendance', 'event_gate', 'event_buyrate',
        'event_purse', 'event_type', 'event_number'))

    mtable = ps.find(name="body").find(name="table", attrs={"class":"toccolours"})
    for tr in mtable.findAll(name="tr"):
        th = tr.findAll(name="th")
        if th:
            #a row with exactly one header cell is a card title; rows with
            #more header cells are skipped without changing the card
            if len(th) == 1:
                isMainCard = "main card" in th[0].text.lower()
            continue

        td = tr.findAll(name="td")
        if not td:
            continue

        fight_order = fight_order + 1
        #the first fight row of the table is flagged as the main event
        isMainEvent = fight_order == 1
        cell_texts = tuple(get_td_text(td[i]) for i in range(7))
        fights.append((fight_order, isMainCard, isMainEvent) + cell_texts + event_columns)

    return fights

In [9]:
def write_csv(df, filename):
    """Write df, with a header row and no index, to data/processed/<name>.csv.

    <name> is filename after strip_extension().
    """
    target = "data/processed/{0}.csv".format(strip_extension(filename))
    df.to_csv(target, index=False, header=True)

In [10]:
def append_csv(df):
    """Append the rows of df to the master file data/processed/fights.csv.

    The header row is written only when the file does not exist yet.
    """
    filename = "data/processed/fights.csv"
    write_header = not os.path.exists(filename)
    #newline="" leaves line endings to pandas (otherwise Windows doubles them)
    #and utf-8 matches the encoding the raw html is read with, instead of
    #depending on the platform default
    with open(filename, "a", newline="", encoding="utf-8") as file:
        df.to_csv(file, index=False, header=write_header)

In [11]:
def delete_csv():
    """Remove data/processed/fights.csv; do nothing when it does not exist."""
    filename = "data/processed/fights.csv"
    if not os.path.exists(filename):
        return
    os.remove(filename)

In [12]:
def write_excel(df, filename):
    """Write df, without the index, to data/processed/<name>.xlsx.

    <name> is filename after strip_extension().
    """
    target = "data/processed/{0}.xlsx".format(strip_extension(filename))
    df.to_excel(target, index=False)

In [13]:
#untested against a real workbook
def append_excel(df):
    """Append the rows of df to data/processed/fights.xlsx, creating it if needed.

    The previous version never saved its ExcelWriter, so nothing reached the
    disk, and it indexed the writer's sheets (a dict keyed by sheet name) with
    0. This version re-reads the rows already in the workbook, puts the new
    rows after them and rewrites the file.

    NOTE(review): values read back by pandas may come back with a different
    dtype than they were written with (empty cells, for instance) -- confirm
    this is acceptable before relying on the workbook.
    """
    filename = "data/processed/fights.xlsx"
    if os.path.exists(filename):
        existing = pd.read_excel(filename)
        df = pd.concat([existing, df], ignore_index=True)
    df.to_excel(filename, index=False)

In [14]:
#check that all required columns contain data
def is_valid(series):
    """Return True only when every required field of one fight row is filled.

    series -- one row of the fights table, indexable by column name
    """
    #one True/False entry per check
    checks = [bool(series['FightNum'])]

    #the two flags only have to be present; False is a legitimate value
    for flag in ('isMainCard', 'isMainEvent'):
        checks.append(series[flag] is not None)

    #these fields must hold a truthy value
    for column in ('WeightClass', 'Fighter1', 'Fighter2', 'Result', 'Outcome',
                   'Round', 'Time', 'VenueName', 'VenueLocation', 'EventDate',
                   'EventName', 'EventType', 'EventNickname'):
        checks.append(bool(series[column]))

    #Attendance not req for TV events: only demanded for "UFC" events
    #numbered above 31
    needs_attendance = series['EventType'] == "UFC" and Decimal(series['EventNum']) > 31.0
    if needs_attendance:
        checks.append(bool(series['EventAtt']))

    #all returns true ONLY if all values are true
    return all(checks)

In [15]:
def process_html(filename, write_file):
    """Parse one raw event page and either append its fights to the master
    csv file or print them.

    filename -- name of a file inside data/raw/
    write_file -- when true, valid data is appended through append_csv() and
                  invalid data is reported with a message; when false, the
                  validation result and the DataFrame are printed
    """
    #the with-block closes the handle, which the bare open() never did
    with open("data/raw/" + filename, "r", encoding="utf8") as file:
        html_string = file.read()

    #pass the html string to BeautifulSoup
    ps = BeautifulSoup(html_string, "lxml")

    #event info
    event_info = get_event_info(ps, filename)

    #get fights
    fights = get_fights(ps, event_info)

    pd.set_option('display.max_columns', None)
    cols = ['FightNum', 'isMainCard', 'isMainEvent', 'WeightClass', 'Fighter1', 'Result', 'Fighter2', 'Outcome', 'Round', 'Time', 'VenueName', 'VenueLocation', 'EventDate', 'EventName', 'EventNickname', 'EventAtt', 'EventGate', 'EventBuyrate', 'EventPurse', 'EventType', 'EventNum']
    df = pd.DataFrame(fights, columns=cols)

    #validate data in each row of DataFrame by sending it to is_valid() method
    #results will be a Series of boolean values. all(results) will return true ONLY if all values are true
    results = df.apply(is_valid, axis=1)
    #a page that produced no fight rows would otherwise pass vacuously
    frame_ok = len(df) > 0 and all(results)

    #write file or print to screen
    if write_file:
        if frame_ok:
            append_csv(df)
            #write_csv(df, filename)
        else:
            print(filename + " data frame is False")
    else:
        print("{0} is Valid: {1}".format(filename, frame_ok))
        print(df)

In [16]:
def process_all():
    """Run process_html(file, True) for every .html file directly inside data/raw/."""
    path = "data/raw/"
    #os.listdir returns names in arbitrary order; sorting makes the row order
    #of the master csv file the same on every run
    for file in sorted(os.listdir(path)):
        #only process files, not directories
        if not os.path.isfile(os.path.join(path, file)):
            continue
        #only process files with html extension
        if os.path.splitext(file)[1] != ".html":
            continue
        #process the raw source file
        process_html(file, True)

<h2></h2>

<h3><font color="#880000">debug: uses IPython.core.display display, HTML</font></h3>

In [None]:
def displayhtml(html_string):
    """Render html_string inside the notebook."""
    #wrap the string in an HTML object and hand it straight to display
    display(HTML(html_string))

<h3><font color="#880000">debug: find and print the body tag</font></h3>

In [None]:
def printBodyContent(ps):
    """Print the <body> tag of the parsed page ps."""
    #the def line was missing its colon, which made the cell a SyntaxError
    body = ps.find(name="body")
    print(body)

<h3><font color="#880000">debug: get ALL the table tags</font></h3>

In [None]:
def printAllTables(ps):
    """Print every <table> tag found inside the <body> of the parsed page ps."""
    #body was never defined in this function; look it up from ps
    body = ps.find(name="body")
    alltable = body.findAll(name="table")
    print(alltable)

<h3><font color="#880000">debug: print table html???</font></h3>

In [None]:
def printTableContent(ps):
    """Print the word "table" once for every <table> inside the page's <body>."""
    #body was never defined in this function; look it up from ps
    body = ps.find(name="body")
    alltable = body.findAll(name="table")
    for table in alltable:
        print("table")

<h3><font color="#880000">debug: print table content</font></h3>

In [None]:
def printTableRows(ps):
    """Print, for every row of every <table> in the page's <body>, a "tr"
    marker followed by the text of the row's first <td> when it has one.
    """
    #body was never defined in this function; look it up from ps
    body = ps.find(name="body")
    alltable = body.findAll(name="table")
    for table in alltable:
        print("table-----------------------------------")
        trs = table.findAll(name="tr")
        for tr in trs:
            print("tr")
            td1 = tr.findAll(name="td")
            if len(td1) > 0:
                print(td1[0].text)

<h3><font color="#880000">debug test method: get alt venue</font></h3>

In [365]:
#load one raw page for trying out alternative venue lookups; the with-block
#closes the handle, which the bare open() never did
with open("data/raw/TUF_12.html", "r", encoding="utf8") as file:
    html_string = file.read()
ps = BeautifulSoup(html_string, "lxml")

#for td in ps.find('th', text='Venue').parent.find_all('td'):
#    print(td.text)

#for td in ps.find('th', text='Venue'):
#    print(td.parent.parent.td.text)

#get single th GOOD
#th = ps.find("th", text="Venue")
#if th and th.parent and th.parent.td:
#    print(th.parent.td.text)

<h3><font color="#880000">debug test method: get_event_info</font></h3>

In [None]:
#get_event_info needs the parsed page and the file name it was read from;
#ps is the TUF_12.html page parsed in the debug cell above
event_info = get_event_info(ps, "TUF_12.html")
print(event_info['venue_name'])

<h2></h2>

In [17]:
#delete the master csv file (data/processed/fights.csv) if it exists, so the
#run below does not append to rows left over from an earlier run
delete_csv()

In [18]:
#process every .html file in data/raw/ and append each event that passes
#validation to the master csv file
process_all()

<p></p>

In [None]:
#test a single file (because of its unique html format); with False the
#validation result and the DataFrame are printed and nothing is written
process_html("FN_58.html", False)

In [None]:
#test a single file (because of its unique html format); with False the
#validation result and the DataFrame are printed and nothing is written
process_html("UFC_37.5.html", False)

In [None]:
#test a single file (because of its unique html format); with False the
#validation result and the DataFrame are printed and nothing is written
process_html("FOX_3.html", False)

In [None]:
#test a single file (because of its unique html format); with False the
#validation result and the DataFrame are printed and nothing is written
process_html("FUEL_10.html", False)

In [None]:
#test a single file (because of its unique html format); with False the
#validation result and the DataFrame are printed and nothing is written
process_html("TUF_26.html", False)

In [None]:
#test a single file (because of its unique html format)
#the function name was missing, which left a bare tuple that did nothing
process_html("FX_7.html", False)

<p></p>
TODO: 
UFC_228_TODO.html