# March Madness ML Algorithm

## Setup

We download libraries and import configurables.

In [109]:
from dotenv import dotenv_values
from io import StringIO
import json
import logging
import pandas as pd
import requests

config = dotenv_values(".env")

## Load Data

### March Madness results

We import data from every tournament game from 1985 - 2019. This data is stored locally in `data/mm-results.csv`.

In [110]:
df = pd.read_csv('data/mm-results.csv')

### Yearly team stats

We import data on team stats for every season (1985 - 2019). This data comes from the SportsDataIO API.

In [111]:
api_key = config['API_KEY']

yearly_dfs = {}
for year in range(1985, 2020):
    response = requests.get(f"https://api.sportsdata.io/v3/cbb/scores/json/TeamSeasonStats/{year}?key={api_key}")
    print(response.text)
    yearly_dfs[year] = pd.read_json(StringIO(response.text))

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[{"StatID":61839,"TeamID":296,"SeasonType":1,"Season":2016,"Name":"Abilene Christian Wildcats","Team":"ABCHR","Wins":8,"Losses":10,"ConferenceWins":6,"ConferenceLosses":5,"GlobalTeamID":60000296,"Possessions":null,"Updated":"2021-02-02T16:05:37","Games":18,"FantasyPoints":1533.9,"Minutes":2320,"FieldGoalsMade":280,"FieldGoalsAttempted":618,"FieldGoalsPercentage":30.2,"EffectiveFieldGoalsPercentage":34.2,"TwoPointersMade":207,"TwoPointersAttempted":413,"TwoPointersPercentage":33.5,"ThreePointersMade":73,"ThreePointersAttempted":205,"ThreePointersPercentage":23.7,"FreeThrowsMade":191,"FreeThrowsAttempted":268,"FreeThrowsPercentage":47.7,"OffensiveRebounds":84,"DefensiveRebounds":270,"Rebounds":354,"OffensiveReboundsPercentage":null,"DefensiveReboundsPercentage":null,"TotalReboundsPercentage":null,"Assists":149,"Steals":80,"BlockedShots":29,"Turnovers":155,"PersonalFouls":274,"Points":824,"TrueSho

## Data Preparation

### Clean up main dataframe

We add, remove, and reorder columns in the main dataframe.

In [112]:
# remove columns that are not needed
columns_to_remove = [
    'Region Number',
    'Region Name',
]
df.drop(columns_to_remove, axis='columns')

# reorder columns
df = df[['Year', 'Round', 'Team 1', 'Seed 1', 'Team 2', 'Seed 2', 'Score 1', 'Score 2']]

# add column for winner
def get_winner(score1, score2):
    if score1 > score2:
        return 1
    elif score1 < score2:
        return 2
    else:
        logging.error(f"Score 1: {score1}, Score 2: {score2}, no winner found")
        return 0
df.insert(8, 'Winner', [get_winner(row['Score 1'], row['Score 2']) for index, row in df.iterrows()])


### Record IDs of teams in main dataframe

The teams from `data/mm-results.csv` must be attached to an ID in order to connect yearly seasonal stats with tournament teams. Here, for each tournament game on record, we store the IDs of `Team 1` and `Team 2` in the new columns `TeamID 1` and `TeamID 2`, respectively. The mapping comes from `data/team-ids.json`.

In [113]:
with open('data/team-ids.json') as f:
    team_ids = json.load(f)

def get_team_id(team_name):
    if team_name in team_ids.keys():
        return int(team_ids[team_name])
    else:
        logging.warning(f"Team {team_name} not found. {team_name} will be chosen to lose. To fix this, add the team to data/team-ids.json")
        return -1

df.insert(2, 'TeamID 1', [get_team_id(x) for x in df['Team 1']], True)
df.insert(5, 'TeamID 2', [get_team_id(x) for x in df['Team 2']], True)


### Join main dataframe with relevant seasonal results

We create a dictionary, `team_stats`, to hold potentially relevant stats for each March Madness team in each year.

In [114]:
team_stats = {}

for year in range(1985, 2020):
    teams = set()
    games = df.loc[(df['Year'] == year) & (df['Round'] == 1)]
    for index, game in games.iterrows():
        teams.add(game['TeamID 1'])
        teams.add(game['TeamID 2'])
    for team in teams:
        print(yearly_dfs[year])
        team_stats[(team, year)] = yearly_dfs[year].loc[yearly_dfs[year]['TeamID'] == team].iloc[0]

print(team_stats)

Empty DataFrame
Columns: []
Index: []


KeyError: 'TeamID'