In [1]:
#%pip install requests
#%pip install lxml
#%pip install openpyxl
import os
from io import StringIO

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
#Gets the table from one year of data
def get_table(url, attribute_name, table_name):
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"})
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find("table", {attribute_name: table_name})
    return table

#Tidys table into usable DataFrame
def get_pomeroy_df(url):
    table = get_table(url, "id", "ratings-table")    
    df = pd.read_html(str(table))[0]
    col_names = ["Rank", "TeamName", "Conf", "W-L", "AdjEM", "AdjO", "AdjO Rank", "AdjD", "AdjD Rank", "AdjT", "AdjT Rank", "Luck", "Luck Rank", "AdjEM", "AdjEM Rank", "OppO", "OppO Rank", "OppD", "OppD Rank", "AdjEM", "AdjEM Rank"]
    df.columns = col_names
    
    #Remove teams without seeding
    df = df[df['TeamName'].str[-1].str.isdigit() == True].reset_index(drop=True)
    
    df[['TeamName','Seed']] = df["TeamName"].str.rsplit(" ", n=1, expand=True)
    
    return df

#Parse dataframes from range of years, excluding 2020
def pomeroy_get_multiple_years(start, end):
    dfs = []
    for year in range(start, end+1):
        #Exclude 2020 data since there was no tournament this year
        if year != 2020:
            df = get_pomeroy_df(f"https://kenpom.com/index.php?y={year}")
            df["Year"] = year
            dfs.append(df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [2]:
def get_betting_odds_table_html(url):
    table = get_table(url, "class", "soh1")
    return table

def write_to_html(table, year):
    path = os.getcwd()
    new_path = os.path.relpath(f'../data/betting_data/html/{year}_betting_odds_table.html', path)
    html_writer = open(new_path,"w")
    html_writer.write(str(table))
    html_writer.close()

In [3]:
def write_to_html_years(start, end):
    for year in range(start, end+1):
        #Exclude 2020 data since there was no tournament this year
        if year != 2020:
            table = get_betting_odds_table_html(f"https://www.sportsoddshistory.com/cbb-main/?y={year-1}-{year}&sa=cbb&a=nc&o=r")
            write_to_html(table, year)

In [4]:
def excel_to_dataframe(year):
    path = os.getcwd()
    new_path = os.path.relpath(f'../data/betting_data/excel/{year}_betting_odds_table.xlsx', path)
    df = pd.read_excel(new_path)
    df.at[1,"Team"] = "Team"
    new_header = df.iloc[1]
    df = df[2:] #take the data less the header row
    df.columns = new_header
    df = df[["Team", "Round 1"]].replace(r'^\s*$', np.nan, regex=True).dropna()
    df.columns = ["Team", "Round 1 Odds"]
    return df

In [5]:
def get_betting_df(start, end):
    dfs = []
    for year in range(start, end+1):
        #Exclude 2020 data since there was no tournament this year
        if year != 2020:
            df = excel_to_dataframe(year)
            df["Year"] = year
            dfs.append(df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [6]:
pomeroy_data = pomeroy_get_multiple_years(2002, 2022)
#Pomeroy referes to the same school differently in different years, fixing this manually for the join to work
pomeroy_data.at[119,"TeamName"] = "Troy"

In [7]:
pomeroy_data.to_csv("../data/pomeroy_data.csv")

In [8]:
write_to_html_years(2002, 2022)

In [9]:
betting_data = get_betting_df(2002, 2022)

In [10]:
betting_data.to_csv("../data/betting_data/betting_data.csv")