# March Madness ML Algorithm

## Setup

We import the libraries used throughout the notebook. The optional `.env` configuration and the logging-level toggle are currently commented out.

In [32]:
from bs4 import BeautifulSoup
# from dotenv import dotenv_values
import logging
import pandas as pd
import os

# config = dotenv_values(".env")
# logging.getLogger().setLevel(logging.INFO) # toggle logging level

## Load Data

### March Madness results

We import data from every tournament game from 1985 - 2019. This data is stored locally in `data/mm-results.csv`.

In [33]:
# Tournament results; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='data/mm-results.csv')

### Yearly team stats

We load team data for every season (1985 - 2019). This data is scraped from Sports Reference and read from the HTML files in `data/yearly`, using the table with id `schedule` on each page. If the data has not been scraped yet, run `helper/pull-sports-reference.py` first.

In [34]:
def parse_html_table(table, context=None):
    """Convert a parsed HTML table into a list of per-row dictionaries.

    Column names come from the ``data-stat`` attribute of the ``<th>`` cells
    in the first ``<tr>`` of the table's first ``<thead>``. Every ``<tr>`` in
    the first ``<tbody>`` becomes one dict mapping column name to cell text;
    rows whose ``class`` is exactly ``['thead']`` are skipped.

    Args:
        table: Parsed table element exposing ``find_all``; its descendants
            must expose ``get`` and ``get_text`` (a BeautifulSoup tag in this
            notebook).
        context: Optional label used in the error log messages. When omitted,
            the module-level ``year`` and ``team`` variables are used if they
            exist, so existing ``parse_html_table(table)`` calls log the same
            text as before.

    Returns:
        A list of dicts (one per data row), or ``None`` when the headers or
        the rows cannot be parsed. Failures are reported via ``logging.error``.
    """
    if context is None:
        # Look the loop variables up defensively: referencing them directly
        # raised NameError inside the error handlers when they were undefined.
        scope = globals()
        context = f"{scope.get('year', '?')} {scope.get('team', '?')}"

    # parse the column headers
    try:
        header_cells = table.find_all('thead')[0].find_all('tr')[0].find_all('th')
        headers = [cell.get('data-stat') for cell in header_cells]
    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit still propagate
        logging.error(f'No headers found for {context}')
        return None

    # parse the rows
    try:
        games = []
        for row in table.find_all('tbody')[0].find_all('tr'):
            if row.get('class') == ['thead']:
                continue
            # Only <td> cells are collected, so data cell i is labelled with
            # header i + 1 (the first header is skipped).
            # NOTE(review): presumably the first cell of each row is a <th> -- confirm.
            game = {
                headers[i + 1]: cell.get_text()
                for i, cell in enumerate(row.find_all('td'))
            }
            games.append(game)
    except Exception:  # keep the best-effort behaviour without swallowing interrupts
        logging.error(f'Error parsing rows for {context}')
        return None

    return games

# Build one DataFrame per scraped season page, keyed by '<year>-<team>'.
seasonal_dfs = {}
directory = 'data/yearly'
# Sorted so the processing (and log) order is reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory)):
    path = os.path.join(directory, filename)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as pages.
        continue

    # Split on the FIRST hyphen only: split('-')[1] kept just the text between
    # the first and second hyphen, truncating hyphenated team names and letting
    # different teams collide on the same dictionary key.
    # `year` and `team` stay module-level names because parse_html_table's
    # log messages read them.
    year, separator, remainder = filename.partition('-')
    if not separator:
        # Files without a hyphen previously crashed the loop with IndexError.
        logging.warning(f'Skipping unexpected file {filename}')
        continue
    team = os.path.splitext(remainder)[0]

    # find table (the soup is fully built inside the with block, so the file
    # can be closed before parsing continues)
    with open(path) as f:
        soup = BeautifulSoup(f, 'html.parser')
    table = soup.find(id='schedule')
    if table is None:
        logging.error(f'No schedule found for {year} {team}')
        continue

    # parse table
    games = parse_html_table(table)
    if games is None:
        continue

    # add games to dataframe
    seasonal_dfs[f'{year}-{team}'] = pd.DataFrame(games)


        

## Data Preparation

### Clean up main dataframe

We add, remove, and reorder columns in the main dataframe.

In [35]:
# remove columns that are not needed
columns_to_remove = [
    'Region Number',
    'Region Name',
]
# drop() returns a new frame, so the result has to be assigned (it was
# previously discarded). errors='ignore' keeps this step from raising KeyError
# when the cell is re-run after the columns are already gone.
df = df.drop(columns=columns_to_remove, errors='ignore')

# reorder columns; .copy() makes the result an independent frame, so later
# column insertions do not operate on a selection of the original
df = df[['Year', 'Round', 'Team 1', 'Seed 1', 'Team 2', 'Seed 2', 'Score 1', 'Score 2']].copy()

# add column for winner
def get_winner(score1, score2):
    """Return which side won: 1 if ``score1`` is higher, 2 if ``score2`` is higher.

    When neither score is strictly greater than the other, the scores are
    logged as an error and 0 is returned.
    """
    if score1 > score2:
        return 1
    if score1 < score2:
        return 2

    # Neither comparison held, so there is no winner to report.
    logging.error(f"Score 1: {score1}, Score 2: {score2}, no winner found")
    return 0
# Walk the two score columns in parallel instead of building a Series per row
# with iterrows().
winners = [get_winner(score1, score2) for score1, score2 in zip(df['Score 1'], df['Score 2'])]
# insert() raises ValueError when the column already exists, so remove a
# previous 'Winner' column first to keep this step re-runnable.
if 'Winner' in df.columns:
    df = df.drop(columns='Winner')
# Append as the last column (position 8 in the reordered eight-column frame).
df.insert(len(df.columns), 'Winner', winners)


AttributeError: module 'numpy' has no attribute 'matrix'