# Summary
2019-03-28

This notebook gathers the data for the 2019 NCAA men's basketball tournament analysis. The data source is https://www.sports-reference.com/cbb/

 

Thanks to these web scraping tutorials:
 - https://docs.python-guide.org/scenarios/scrape/
 - https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059

In [1]:
#load need libraries
import requests
import time
import pandas as pd
from lxml import html
from os import listdir
from os.path import join

### Download HTML files from website

In [2]:
#Download html file from website
#Set live = True to (re)download the schedule pages; when False this cell
#does nothing and the files already in HTML_files are reused.
live = False
if live:
    #One team per line; format each name into the slug used in the URL
    #(lower case, spaces replaced by dashes). Blank lines are dropped so they
    #cannot produce a malformed URL and a file named '.txt'.
    with open('ListOfTeams.txt') as f:
        teams = [line.strip().lower().replace(' ', '-')
                 for line in f if line.strip()]
    url_template ='https://www.sports-reference.com/cbb/schools/XXX/2019-schedule.html'
    for t in teams:
        url = url_template.replace('XXX', t)
        try:
            #timeout keeps one stalled connection from hanging the whole run
            r = requests.get(url, allow_redirects=True, timeout=30)
            #an error page (404, 429, ...) must not be saved as a schedule
            r.raise_for_status()
        except requests.RequestException as err:
            print('failed... {} ({})'.format(t, err))
        else:
            with open('HTML_files/{}.txt'.format(t), 'wb') as f:
                f.write(r.content) #save file locally
            print('downloaded... {}'.format(t))
        time.sleep(5) #pause between requests

### Process downloaded files

In [3]:
#Each entry of processed_data is one game:
#[team, opponent, game type, team score, opponent score, w1, w2]
#where w1/w2 are 1/0 flags for whether the team / the opponent scored more.
processed_data = []
folder = 'HTML_files'
filenames = listdir(folder)
#Loop through each file and process HTML
for fn in filenames:
    #use the file name for team name; set before parsing so the skip message
    #below always names the file that actually failed
    team = fn.split('.txt')[0].lower()
    with open(join(folder, fn), 'rb') as f:
        raw_html = f.read()
    try:
        tree = html.fromstring(raw_html)

        #loop through schedule table by the td_elements
        #there are 14 columns in the table
        td_elements = tree.xpath('//*[@id="schedule"]//td')
        for start in range(0, len(td_elements), 14):
            ty = td_elements[start + 2].text_content()
            if ty == 'NCAA':  #ignore NCAA tournament games
                continue

            #If link is available use href attribute else text for opp. name
            opp_cell = td_elements[start + 4]
            a_element = opp_cell.find('a')
            #read href by name instead of relying on it being the first attribute
            href = a_element.get('href') if a_element is not None else None
            if href:
                opp = href.split('/')[3].lower()
            else:
                opp = opp_cell.text_content().lower().replace(' ','-')
                opp = opp.split('\xa0')[0] #take only team name ignore rank

            s1 = td_elements[start + 7].text_content()
            s2 = td_elements[start + 8].text_content()
            try:
                pts1, pts2 = int(s1), int(s2)
            except ValueError:
                #no numeric score in this row: skip just this game instead
                #of abandoning the rest of the file
                continue
            w1 = 1 if pts1 > pts2 else 0
            w2 = 1 if pts1 < pts2 else 0
            processed_data.append([team, opp, ty, s1, s2, w1, w2])
    except Exception as err:
        #best effort: report the file and move on. Games already parsed from
        #this file stay in processed_data.
        print('skipping {} ({})'.format(team, err))

#collect the processed game rows into a DataFrame with one named column per field
data = pd.DataFrame(
    data=processed_data,
    columns='tm1 tm2 gm_type s1 s2 w1 w2'.split(),
)

#### Checking the DataFrame

In [4]:
#show every row where auburn is tm1
data.loc[data['tm1'].eq('auburn')]

Unnamed: 0,tm1,tm2,gm_type,s1,s2,w1,w2
65,auburn,south-alabama,REG,101,58,1,0
66,auburn,washington,REG,88,66,1,0
67,auburn,mississippi-college,REG,103,52,1,0
68,auburn,xavier,REG,88,79,1,0
69,auburn,duke,REG,72,78,0,1
70,auburn,arizona,REG,73,57,1,0
71,auburn,saint-peters,REG,99,49,1,0
72,auburn,north-carolina-asheville,REG,67,41,1,0
73,auburn,dayton,REG,82,72,1,0
74,auburn,alabama-birmingham,REG,75,71,1,0


In [5]:
#column totals of w1 and w2 over the rows where auburn is tm1
data.loc[data['tm1'].eq('auburn'), ['w1', 'w2']].sum()

w1    26
w2     9
dtype: int64

### Save DataFrame as CSV
Save the DataFrame for later use, so the HTML files do not have to be reprocessed.

In [6]:
#write the DataFrame to disk without the row index column
data.to_csv(path_or_buf='NCAA_dataset.csv', index=False)