# March Madness ML Algorithm

## Setup

We import the required libraries and load configuration values from the local `.env` file.

In [127]:
from dotenv import dotenv_values
from io import StringIO
import json
import logging
import pandas as pd
import requests

# Parse the local .env file and keep its values in `config`.
config = dotenv_values(dotenv_path=".env")

## Load Data

### March Madness results

We load the results of every tournament game from 1985 to 2019. This data is stored locally in `data/mm-results.csv`.

In [128]:
# Main dataframe: the tournament results CSV stored alongside the notebook.
df = pd.read_csv(filepath_or_buffer='data/mm-results.csv')

### Yearly team stats

We import data on team stats for every season (1985 - 2019). This data comes from Sports Reference.

In [129]:
# Per-season team stats pulled from Sports Reference.
# NOTE: the responses fetched below are not parsed or stored yet, so this
# dict is still empty once the loop finishes.
yearly_team_stats = {}

for year in range(1985, 2020):
    # get all March Madness teams for the year: every name that appears in
    # either team column of that year's games
    games = df.loc[df['Year'] == year]
    teams = set(games['Team 1']) | set(games['Team 2'])

    # pull data for each March Madness team from Sports Reference
    for team in teams:
        sr_team = team  # TODO: map team name to Sports Reference team name
        # School schedule pages live under /cbb/schools/<school>/men/, so the
        # school name is the variable path segment (it was hard-coded before).
        schedule_url = f"https://www.sports-reference.com/cbb/schools/{sr_team}/men/{year}-schedule.html"
        try:
            # A timeout keeps one stalled connection from hanging the whole cell.
            schedule_response = requests.get(schedule_url, timeout=10)
        except requests.RequestException as exc:
            # Best effort: report the failed fetch and move on to the next team.
            logging.warning("Could not fetch %s: %s", schedule_url, exc)
            continue

KeyboardInterrupt: 

## Data Preparation

### Clean up main dataframe

We add, remove, and reorder columns in the main dataframe.

In [None]:
# remove columns that are not needed
columns_to_remove = [
    'Region Number',
    'Region Name',
]
# DataFrame.drop returns a new frame instead of modifying df in place, so the
# result has to be assigned back for the removal to take effect.
df = df.drop(columns_to_remove, axis='columns')

# reorder columns
df = df[['Year', 'Round', 'Team 1', 'Seed 1', 'Team 2', 'Seed 2', 'Score 1', 'Score 2']]

# add column for winner
def get_winner(score1, score2):
    """Return which side has the higher score.

    Returns 1 when ``score1`` is greater, 2 when ``score2`` is greater, and
    0 when neither comparison holds (for example, equal scores); the 0 case
    is also reported through ``logging.error``.
    """
    if score1 > score2:
        return 1
    if score1 < score2:
        return 2

    # Neither score is strictly greater, so no winner can be determined.
    logging.error(f"Score 1: {score1}, Score 2: {score2}, no winner found")
    return 0
# Walk the two score columns in lockstep instead of materializing every row
# with iterrows(); only these two columns are needed to decide the winner.
winners = [get_winner(score1, score2) for score1, score2 in zip(df['Score 1'], df['Score 2'])]
# Append 'Winner' as the last column, whatever the current column count is.
df.insert(len(df.columns), 'Winner', winners)
