In [1]:
import numpy
import pandas as pd
import random
from scipy import stats
import matplotlib.mlab as mlab
import math

In [76]:
# Create id to team name mappings
# Load the team table once; its TeamID and TeamName columns feed the mapping built below.
teamNames = pd.read_csv('./ncaa-data/Teams.csv')
def teamIDNameMapping(team_names=None):
    """Build a ``{TeamID: TeamName}`` lookup dictionary.

    Parameters
    ----------
    team_names : pandas.DataFrame, optional
        Frame with ``TeamID`` and ``TeamName`` columns.  When omitted, the
        notebook-level ``teamNames`` frame is used, so existing
        zero-argument calls keep working unchanged.

    Returns
    -------
    dict
        Maps every ``TeamID`` to its ``TeamName``.  If an id appears more
        than once, the name from the last row wins.
    """
    if team_names is None:
        team_names = teamNames

    # Zipping the two columns avoids materialising a Series per row,
    # which is what iterrows() does.
    return dict(zip(team_names['TeamID'], team_names['TeamName']))

# Notebook-level lookup read by getWinnersList and calcBracketScore.
team_id_map = teamIDNameMapping()

In [77]:
# Find tournament data of given year
def filterTourneyDataByYear(year, results_path='./ncaa-data/NCAATourneyDetailedResults.csv'):
    """Load the tournament results CSV and keep only one season's games.

    Parameters
    ----------
    year : int
        Value to match against the ``Season`` column.
    results_path : str, optional
        Location of the detailed-results CSV.  Defaults to the path the
        notebook has always used, so ``filterTourneyDataByYear(2017)``
        behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``Season`` equals ``year``, in file order and with their
        original index labels (the index is not reset).
    """
    all_results = pd.read_csv(results_path)
    return all_results[all_results['Season'] == year]

# Collect, round by round, the names of the teams that actually won
def getWinnersList(tourney_data):
    """Group the real winners by how many bracket games they won.

    The first four rows of ``tourney_data`` are skipped; the remaining rows
    are counted per ``WTeamID``.

    Returns a list of lists in which entry ``k`` holds the names (looked up
    in the notebook-level ``team_id_map``) of every team with more than
    ``k`` wins.  Entries are created only as far as some team reached, up
    to at most six, and names appear in ascending ``WTeamID`` order.
    """
    max_rounds = 6
    win_counts = tourney_data[4:].groupby('WTeamID').size()

    winners = []
    for team_id, num_wins in win_counts.items():
        name = team_id_map[team_id]
        # A team with n wins belongs in the first n round lists.
        for round_idx in range(min(num_wins, max_rounds)):
            if round_idx == len(winners):
                winners.append([])
            winners[round_idx].append(name)

    return winners

In [78]:
# Score one randomly filled bracket against a season's real tournament results
def calcBracketScore(tourney_data):
    """Simulate a coin-flip bracket and score it against the real results.

    Each row of ``tourney_data`` is replayed in order with a 50/50 winner.
    When a real game involves a team that was already knocked out in the
    simulation, the team that replaced it in the simulated bracket plays
    instead.  A simulated win earns points when the same team really won at
    least that many bracket games.

    Parameters
    ----------
    tourney_data : pandas.DataFrame
        One season of games with ``WTeamID`` and ``LTeamID`` columns.  The
        first four rows are treated as the 'first four' games: they decide
        who enters the bracket but never score.

    Returns
    -------
    int
        Total score; a correct pick for a team's ``r``-th bracket win
        (1-based) is worth ``10 * 2**(r - 1)``.

    Raises
    ------
    ValueError
        If both sides of a game resolve to the same team (for example a
        duplicated row), since the bracket would then be inconsistent.

    Notes
    -----
    Reads the notebook-level ``team_id_map`` and calls ``getWinnersList``.
    Draws one ``random.randint(0, 1)`` per game, in row order.
    """
    winners = getWinnersList(tourney_data)

    # First four rows is 'first four' and not in actual bracket
    firstFour = tourney_data[:4]
    mainTourney = tourney_data[4:]

    # Simulation state is keyed by team id.  Dicts accept any id, whereas a
    # list sized to the largest id cannot be indexed by that id itself, and
    # sizing it required re-reading a seeds CSV on every call.
    eliminated_by = {}  # knocked-out team id -> id of the team that beat it
    sim_wins = {}       # team id -> simulated bracket wins so far

    def occupant(team_id):
        # Follow the chain of simulated upsets to whoever holds this slot now.
        while team_id in eliminated_by:
            team_id = eliminated_by[team_id]
        return team_id

    def eliminate(loser, victor):
        # A self-referencing entry would make occupant() loop forever.
        if loser == victor:
            raise ValueError('game pairs team %s with itself' % victor)
        eliminated_by[loser] = victor

    # Look at first four teams
    for _, row in firstFour.iterrows():
        real_winner = int(row['WTeamID'])
        real_loser = int(row['LTeamID'])
        if random.randint(0, 1):  # the real winner advances
            eliminate(real_loser, real_winner)
        else:
            eliminate(real_winner, real_loser)

    # MAIN TOURNAMENT
    score = 0
    for _, row in mainTourney.iterrows():
        team1 = occupant(int(row['WTeamID']))
        team2 = occupant(int(row['LTeamID']))

        if random.randint(0, 1):  # team1 randomly wins
            victor, loser = team1, team2
        else:  # team2 would win
            victor, loser = team2, team1
        eliminate(loser, victor)

        # Wins before this game double as the 0-based index of this round.
        round_idx = sim_wins.get(victor, 0)
        sim_wins[victor] = round_idx + 1

        # If no real team won this many games the pick cannot be correct.
        if round_idx < len(winners) and team_id_map[victor] in winners[round_idx]:
            score += 2 ** round_idx * 10

    return score

In [79]:
# Score many coin-flip brackets against one season's real results.
# The season is an arbitrary choice (2017), but note that calcBracketScore
# treats the first four rows as play-in games.
# NOTE(review): confirm that assumption holds before switching seasons.
SEASON = 2017
N_SIMULATIONS = 100000
SEED = 42

# calcBracketScore draws from the global `random` generator; seeding it makes
# a Restart & Run All reproduce the same scores.
random.seed(SEED)

tourney_data = filterTourneyDataByYear(SEASON)
scores = [calcBracketScore(tourney_data) for _ in range(N_SIMULATIONS)]

In [80]:
# Sanity check: one score per simulated bracket.
len(scores)

100000

In [86]:
# Average score of a randomly filled bracket.
numpy.mean(scores)

312.85000000000002

In [87]:
# Best score reached by any simulated bracket.
max(scores)

1400

In [98]:
# One-shot summary: count, min/max, mean, variance, skewness and kurtosis.
stats.describe(scores)

DescribeResult(nobs=100000, minmax=(50, 1400), mean=312.85000000000002, variance=18087.084370843706, skewness=1.999202728530008, kurtosis=6.409950833783562)

In [92]:
# Spread of the scores. numpy.std defaults to ddof=0 (population form), so it
# comes out marginally smaller than a sample (ddof=1) standard deviation.
numpy.std(scores)

134.48755890416035

In [93]:
# Band of two standard deviations either side of the mean score.
score_mean = numpy.mean(scores)
score_std = numpy.std(scores)
print(score_mean - 2 * score_std, score_mean + 2 * score_std)

43.8748821917 581.825117808


In [100]:
# Wrap the scores in a Series to get quartiles alongside the mean and spread.
x = pd.Series(scores)
x.describe()

count    100000.000000
mean        312.850000
std         134.488231
min          50.000000
25%         230.000000
50%         280.000000
75%         360.000000
max        1400.000000
dtype: float64