In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import io
from scipy.stats import norm
from plotnine import *

# This first part was written as a python script so it's all in one block


# lookup table holding the different spellings used for each women's team
spellings_path = '../input/2022-taylor-merry-womens-march-madness-data/March Madness - WSpellings.csv'
spellings = pd.read_csv(spellings_path)


# pull in ratings from Talismanred, commented out because we don't want to scrape here, want to use the written csv of pre-tourney ratings
#tal_url = 'http://talismanred.com/ratings/whoops/'
#tal_page = requests.get(tal_url)
#tal_soup = BeautifulSoup(tal_page.content, 'lxml')
#teams_list = []
#ratings_list = []
#rows = tal_soup.select('#data')[0].get_text().split('\n')
#for row in rows:
#    if row != '' and row.lstrip()[0].isnumeric():
#        items = row.split(' ')
#        items = [x for x in items if (x != '' and ('.' in x or not x.isnumeric()))]
#        num_items = len(items)
#        teams_list.append(' '.join(items[0:num_items - 2]))
#        ratings_list.append(items[-1])
#tal_df = pd.DataFrame({'Team': teams_list, 'TalRating': ratings_list})
#tal_df.to_csv('mydata/tal.csv', index = False)

# Talisman Red ratings come from the saved csv instead of a live scrape
tal_path = '../input/2022-taylor-merry-womens-march-madness-data/tal.csv'
tal_df = pd.read_csv(tal_path)

# pull in ratings from Sonny Moore. No scraping allowed so copy pasted to a txt file (source: http://sonnymoorepowerratings.com/w-basket.htm)
# The file is read with plain Python rather than pd.read_csv(sep = "\n"): newer pandas
# releases reject a newline separator with a ValueError.
moore_path = '../input/2022-taylor-merry-womens-march-madness-data/moore.txt'
with open(moore_path, encoding = 'utf-8') as moore_file:
    # split() with no argument collapses the runs of padding spaces between columns
    moore_rows = [line.split() for line in moore_file]

# one (team, rating) pair per usable row of the text file
moore_teams = []
moore_ratings = []
for tokens in moore_rows:
    # skip blank lines and rows too short to hold a leading token, a team name
    # and the five trailing columns
    if len(tokens) < 7:
        continue
    # the team name is every word between the first token and the last five columns
    moore_teams.append(' '.join(tokens[1:-5]))
    # rating is always the last element (kept as text here; cast to float after the merge)
    moore_ratings.append(tokens[-1])

# Sonny Moore ratings keyed by the spelling used in the text file
sm_df = pd.DataFrame({'Team': moore_teams, 'MooreRating': moore_ratings})



# merge all of the ratings datasets together using the spellings data.
# Both joins are inner joins, so a team missing from either ratings source is dropped.
# The float casts happen inside the chain: assigning into the column-subset frame
# afterwards would write to a slice and trigger pandas' SettingWithCopyWarning.
teams = (
    spellings
    .merge(tal_df, how = 'inner', left_on = ['TalisManred'], right_on = ['Team'])
    .merge(sm_df, how = 'inner', left_on = ['SonnyMoore'], right_on = ['Team'])
    [['TeamID', 'TalRating', 'MooreRating']]
    # turn ratings into floats
    .astype({'TalRating': float, 'MooreRating': float})
)


# Kaggle seeds, restricted to the 2022 season, joined onto the rated teams
seeds = pd.read_csv('../input/womens-march-mania-2022/WDataFiles_Stage2/WNCAATourneySeeds.csv')
seeds = seeds.loc[seeds['Season'] == 2022]
teams = pd.merge(teams, seeds, how = 'inner', on = ['TeamID'])


# my hypothesized injury/COVID/Conf/Play Style adjustments, one row per TeamID
adjustments = pd.read_csv('../input/2022-taylor-merry-womens-march-madness-data/March Madness - WAdjustments.csv')


# inner join: only teams that have an adjustment row are kept
teams = pd.merge(teams, adjustments, how = 'inner', on = ['TeamID'])


# the columns the prediction step needs
rating_cols = ['TeamID', 'TalRating', 'MooreRating', 'PointsAdjustment']
ratings = teams[rating_cols]


# Kaggle provided matchup/submissions file
matchups = pd.read_csv('../input/womens-march-mania-2022/WDataFiles_Stage2/WSampleSubmissionStage2.csv')


# the ID is split on '_'; pieces 1 and 2 are the lower and higher team IDs
matchup_id_parts = matchups['ID'].str.split('_', expand = True)
matchups['LowerTeamID'] = matchup_id_parts[1].astype(int)
matchups['HigherTeamID'] = matchup_id_parts[2].astype(int)


# attach ratings for both sides of every matchup; the lower-ID team's columns end up
# with the _x suffix and the higher-ID team's with _y
preds = matchups.merge(ratings, how = 'left', left_on = ['LowerTeamID'], right_on = ['TeamID'])
preds = preds.merge(ratings, how = 'left', left_on = ['HigherTeamID'], right_on = ['TeamID'])


# weights to apply to each rating difference, I've used the moore system before so I'm giving slightly higher weight
weights = {
    'TalRating': 0.4,
    'MooreRating': 0.6
}



# function to predict win probability for team x against team y
def predict_win_prob(row, typical_std = 11.0):
    """Return the probability that team x beats team y.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'TalRating', 'MooreRating' and 'PointsAdjustment', each with
        an '_x' and a '_y' suffix.
    typical_std : float, default 11.0
        Standard deviation of the final points difference. Kenpom uses 11 points
        for men's games; the same value is used here by default.

    Returns
    -------
    float
        P(points difference > 0) under Normal(pred_diff, typical_std).
    """
    # rating gap under each system, plus the gap in manual adjustments
    tal_diff = row['TalRating_x'] - row['TalRating_y']
    moore_diff = row['MooreRating_x'] - row['MooreRating_y']
    adj_diff = row['PointsAdjustment_x'] - row['PointsAdjustment_y']

    # predicted points difference: weighted blend of the two systems plus the adjustments
    pred_diff = weights['TalRating'] * tal_diff + weights['MooreRating'] * moore_diff + adj_diff

    # survival function = 1 - cdf, evaluated directly so that tiny win probabilities
    # are not lost to rounding when cdf(0) is very close to 1
    return norm.sf(0, loc = pred_diff, scale = typical_std)


# predict the win probability for each game
preds['Pred'] = preds.apply(predict_win_prob, axis = 1)


# only take needed columns; the explicit copy makes preds an independent frame, so
# adding columns to it later does not raise pandas' SettingWithCopyWarning
preds = preds[['ID', 'Pred']].copy()

In [None]:
# pieces 1 and 2 of the '_'-separated game ID are the two team IDs
pred_id_parts = preds['ID'].str.split('_', expand = True)
preds['TeamID_x'] = pred_id_parts[1].astype(int)
preds['TeamID_y'] = pred_id_parts[2].astype(int)
preds.head()

In [None]:
# 2022 seeds only, reduced to the team id and its seed string
all_seeds = pd.read_csv('../input/womens-march-mania-2022/WDataFiles_Stage2/WNCAATourneySeeds.csv')
seeds = all_seeds.loc[all_seeds['Season'] == 2022, ['TeamID', 'Seed']]
seeds.head()

In [None]:
# team id -> team name lookup
teams_file = pd.read_csv('../input/womens-march-mania-2022/WDataFiles_Stage1/WTeams.csv')
team_names = teams_file[['TeamID', 'TeamName']]
team_names.head()

In [None]:
# names for every seeded team (default inner join on TeamID)
team_names_seeds = team_names.merge(seeds, on = 'TeamID')
team_names_seeds.head()

### Individual Team Adjustments

In this section, I decided to individually decrease Baylor's win probabilities because I thought they would be strongly picked by the field as a 2 seed, but I didn't think they were very good.

In [None]:
# preview the predictions before the seed columns are merged in
preds.head()

In [None]:
# number of matchups before the seeds are joined
len(preds)

In [None]:
# Attach each side's seed as Seed_x / Seed_y.
# The seed table is renamed to the exact target column names before each merge.
# Merging the raw (TeamID, Seed) table twice makes pandas suffix the overlapping
# 'TeamID' column into a second 'TeamID_x' / 'TeamID_y', which is deprecated on
# pandas 1.x and raises MergeError on pandas 2.x.
seed_lookup = seeds[['TeamID', 'Seed']]

# dropping any seed columns from a previous run keeps this cell safe to re-run
preds = preds.drop(columns = ['Seed_x', 'Seed_y'], errors = 'ignore')

for side in ['x', 'y']:
    side_seeds = seed_lookup.rename(columns = {'TeamID': 'TeamID_' + side, 'Seed': 'Seed_' + side})
    # inner join: matchups involving a team without a seed are dropped
    preds = pd.merge(preds, side_seeds, how = 'inner', on = ['TeamID_' + side])
preds.head()

In [None]:
# number of matchups left after the inner joins on seeds
len(preds)

In [None]:
# lower / higher team id, taken from pieces 1 and 2 of the '_'-separated game ID
game_id_parts = preds['ID'].str.split('_', expand = True)
preds['TeamID_Lower'] = game_id_parts[1].astype(int)
preds['TeamID_Higher'] = game_id_parts[2].astype(int)

In [None]:
# seed numbers split into the two groups used to tell 'First3' from 'Elite8'
upper = ['01', '04', '05', '08', '09', '12', '13', '16']
lower = ['02', '03', '06', '07', '10', '11', '14', '15']
def game_type(row):
    """Label a matchup as 'First3', 'Elite8', 'Final4' or 'Final' from its two seeds.

    The first character of each seed string is compared as the region and
    characters 1-2 as the seed number.

    - same region, both seed numbers in `upper` or both in `lower`: 'First3'
    - same region otherwise: 'Elite8'
    - different regions that are both in W/X or both in Y/Z: 'Final4'
    - anything else: 'Final'
    """
    region_x, region_y = row['Seed_x'][0], row['Seed_y'][0]

    if region_x == region_y:
        number_x, number_y = row['Seed_x'][1:3], row['Seed_y'][1:3]
        both_upper = number_x in upper and number_y in upper
        both_lower = number_x in lower and number_y in lower
        return 'First3' if (both_upper or both_lower) else 'Elite8'

    # the regions are known to differ from here on
    both_yz = region_x in ['Y', 'Z'] and region_y in ['Y', 'Z']
    both_wx = region_x in ['W', 'X'] and region_y in ['W', 'X']
    if both_yz or both_wx:
        return 'Final4'
    return 'Final'
    
# label every matchup with its round type
round_labels = preds.apply(game_type, axis = 1)
preds['Round'] = round_labels
preds.head()

In [None]:
# every game starts with no alteration
preds['Alteration'] = np.zeros(len(preds))

In [None]:
def alter_matchup(game_id, alteration):
    """Add `alteration` to the running 'Alteration' total of one matchup in `preds`.

    game_id (str): the game id of the matchup to be altered
    alteration: the amount of win prob that should be added to the lower team ID's win prob
    """
    is_game = preds['ID'] == game_id
    preds.loc[is_game, 'Alteration'] += alteration

In [None]:
# see plot below for what potential new probs look like
def get_alteration(prob, adj):
    """Return the amount to add to `prob` for a target 50/50 win probability `adj`.

    adj == 0.0 returns -prob and adj == 1.0 returns 1 - prob, so the altered
    probability becomes exactly 0 or 1. Any other adj shifts a 50/50 game to
    `adj`, and the shift shrinks as `prob` moves away from 0.5.
    """
    if adj == 0.0:
        return prob * -1
    if adj == 1.0:
        return (1 - prob)

    shift_at_even = adj - 0.5
    underdog_prob = min(prob, 1 - prob)
    distance_from_edge = 0.5 - abs(0.5 - prob)
    return shift_at_even * underdog_prob * 4 * distance_from_edge

In [None]:
# original prob vs. altered prob on a 0..1 grid, one line per sample adj
sample_adjs = [0.0, 0.3, 0.5, 0.6, 1.0]
adjs = np.repeat(sample_adjs, 101)
probs = [i / 100 for i in range(101)] * len(sample_adjs)
new_probs = [prob + get_alteration(prob, adj) for prob, adj in zip(probs, adjs)]
plot_df = pd.DataFrame({'prob': probs, 'new_prob': new_probs, 'adj': adjs})
(ggplot(plot_df, aes(x = 'prob', y = 'new_prob', group = 'adj', color = 'adj')) + geom_line())

In [None]:
# position of each round type inside the `adjs` list passed to alter_team
round_indexes = {
    'First3': 0,
    'Elite8': 1,
    'Final4': 2,
    'Final': 3
}



# inputs:
# team_id (int): the team id to alter matchups
# adjs (list of floats, min = 0.0, max = 1.0, size = 4): for each round type of the tournament, what win prob should the team have for its 50/50 games

# DO NOT USE ADJS GREATER THAN 0.75 OR LESS THAN 0.25, USE 0.0 OR 1.0 TO SET THE PROB AT 0 OR 1
def alter_team(team_id, adjs):
    """Accumulate alterations for every game in `preds` that involves `team_id`.

    For each such game the alteration is computed from the game's current 'Pred'
    and the entry of `adjs` for the game's 'Round', then added to the game's
    'Alteration' column through alter_matchup.

    Raises
    ------
    ValueError
        If `adjs` does not hold exactly 4 values, or any value is outside [0, 1].
        Raising stops the notebook instead of silently leaving the predictions
        unaltered after a mistyped call.
    """
    if len(adjs) != 4 or max(adjs) > 1 or min(adjs) < 0:
        raise ValueError('adjs must hold exactly 4 values between 0 and 1, got ' + repr(adjs))

    involves_team = (preds['TeamID_Higher'] == team_id) | (preds['TeamID_Lower'] == team_id)
    # boolean indexing returns a copy, so updating preds inside the loop is safe
    team_games = preds.loc[involves_team, ['ID', 'Round', 'Pred', 'TeamID_Higher']]

    for game_id, rnd, pred, higher_id in zip(team_games['ID'], team_games['Round'],
                                             team_games['Pred'], team_games['TeamID_Higher']):
        target = adjs[round_indexes[rnd]]

        # 'Pred' belongs to the lower team id, so the target is mirrored around 0.5
        # when the team being altered is the higher id
        sign = -1 if team_id == higher_id else 1

        alteration = get_alteration(pred, sign * (target - 0.5) + 0.5)
        alter_matchup(game_id, alteration)

In [None]:
# team alterations here
# fading Baylor: one target per round type, in the order First3, Elite8, Final4, Final
baylor_id = 3124
alter_team(baylor_id, [0.5, 0.45, 0.25, 0])

In [None]:
# preview the predictions together with the accumulated 'Alteration' column
preds.head()

In [None]:
# the 20 games with the largest alterations in absolute value, biggest first
biggest_first = preds['Alteration'].abs().sort_values(ascending = False).index
preds.reindex(biggest_first).head(20)

In [None]:
# fold the accumulated alterations into the prediction. Clipping keeps every value a
# valid probability even when several stacked alterations push a game below 0 or above 1.
preds['Pred'] = (preds['Pred'] + preds['Alteration']).clip(lower = 0.0, upper = 1.0)
preds.head()

### Make all probabilities more aggressive

Overfitting slightly to increase the variance of my predictions.

In [None]:
# function to move prediction to a more aggressive number
# 50.0001% goes to ~55%
# 49.9999% goes to ~45%
# 60% goes to ~62%
# 40% goes to ~38%
# 70% goes to ~71%
# 80% goes to ~80%
def aggressive_all(row):
    """Push row['Pred'] away from 0.5 by shrinking the less likely side's probability."""
    pred = row['Pred']

    # probability of the less likely side, e.g. the 10% in a 90/10 prediction
    lower_prob = 0.5 - abs(pred - 0.5)

    # shrink factor for that side: 0.9 at 0.5, 1 at 0, quadratic in between
    multiplier = (1 - 0.4 * lower_prob ** 2)

    # below 50% the prediction itself is the less likely side
    if pred < 0.5:
        return pred * multiplier

    # otherwise shrink the complement and convert back
    return 1 - ((1 - pred) * multiplier)
    
# sharpen every prediction
sharpened = preds.apply(aggressive_all, axis = 1)
preds['Pred'] = sharpened

In [None]:
# preview the predictions after aggressive_all has been applied
preds.head()

### Kaggle Submissions

Finally, Kaggle allows two different submissions. We can take advantage of that by completely altering a game or a few to make sure that the two submissions can lead to a large decrease in log loss.

Here, I will make the first submission give UConn 100% probability to reach the finals and I will make the second submission give UConn a 0% probability to win a game in the final 4.

Therefore, I gain a large advantage in log loss as long as UConn reaches the final 4.

No final 4 = 0 advantage in log loss (doesn't matter, my system is high on UConn anyways so I already need UConn to do well)

Loss in semifinal = Large advantage in log loss for submission 2

Win in semifinal = Large advantage in log loss for submission 1

In [None]:
# two independent copies of the predictions, one per Kaggle submission
submission_cols = ['ID', 'Pred', 'Round']
submission1 = preds.loc[:, submission_cols].copy()
submission2 = preds.loc[:, submission_cols].copy()

In [None]:
# alteration 1 is moving all of UConn's probabilities before the title game to 1
def alteration1(row, team_id = '3163'):
    """Return the prediction for `row`, forcing `team_id` to win every non-'Final' game.

    Parameters
    ----------
    row : mapping with 'ID' ('<season>_<lower id>_<higher id>'), 'Round' and 'Pred'
    team_id : str or int, default '3163' (UConn)
        Team whose games before the 'Final' round are forced.

    Returns
    -------
    1 when the team is the lower id, 0 when it is the higher id (the prediction is
    the lower id's win probability), otherwise the unchanged row['Pred'].
    """
    team_id = str(team_id)
    id_list = row['ID'].split('_')
    lower_id = id_list[1]
    higher_id = id_list[2]

    if row['Round'] != 'Final':
        if lower_id == team_id:
            return 1
        if higher_id == team_id:
            return 0
    return row['Pred']
    
# submission 1: predictions with alteration1 applied to every game
altered_preds1 = submission1.apply(alteration1, axis = 1)
submission1['Pred'] = altered_preds1

In [None]:
# alteration 2 is moving all of UConn's probabilities in final four to zero
def alteration2(row, team_id = '3163'):
    """Return the prediction for `row`, forcing `team_id` to lose every 'Final4' game.

    Parameters
    ----------
    row : mapping with 'ID' ('<season>_<lower id>_<higher id>'), 'Round' and 'Pred'
    team_id : str or int, default '3163' (UConn)
        Team whose 'Final4' games are forced.

    Returns
    -------
    0 when the team is the lower id, 1 when it is the higher id (the prediction is
    the lower id's win probability), otherwise the unchanged row['Pred'].
    """
    team_id = str(team_id)
    id_list = row['ID'].split('_')
    lower_id = id_list[1]
    higher_id = id_list[2]

    if row['Round'] == 'Final4':
        if lower_id == team_id:
            return 0
        if higher_id == team_id:
            return 1
    return row['Pred']
    
# submission 2: predictions with alteration2 applied to every game
altered_preds2 = submission2.apply(alteration2, axis = 1)
submission2['Pred'] = altered_preds2

Since I am reliant on UConn to make the final, I want to be extra risky if UConn ends up losing before then. Therefore, I will make each underdog that could realistically beat UConn less of a favorite or less of an underdog in its next game.

In [None]:
# submission 1 doesn't matter because it's auto last if UConn loses early

# hand-set predictions for submission 2, keyed by game id
manual_preds = {
    # Indiana and Kentucky bigger favorites over NC State
    '2022_3246_3301': 0.55,
    '2022_3231_3301': 0.55,
    # NC State less of favorite over Texas and less of underdog over Stanford
    '2022_3301_3400': 0.45,
    '2022_3301_3390': 0.55,
}
for game_id, forced_pred in manual_preds.items():
    submission2.loc[submission2['ID'] == game_id, 'Pred'] = forced_pred

### Check to make sure probabilities are how we want them

In [None]:
# returns the predicted probabilities for all games involving the given team id
def print_games(team_id, df, names = None):
    """Return every game of `team_id` in `df`, seen from that team's side.

    Despite the name nothing is printed; the result is returned so the notebook
    can display it.

    Parameters
    ----------
    team_id : int
        Team to look up.
    df : DataFrame with 'ID' ('<season>_<lower id>_<higher id>'), 'Pred' and 'Round'
        'Pred' is read as the win probability of the lower team id. Any index is
        accepted; rows are read by position, not by label.
    names : DataFrame with 'TeamID' and 'TeamName', optional
        Name lookup; defaults to the notebook-level `team_names`.

    Returns
    -------
    DataFrame with columns 'Pred' (win probability of `team_id`), 'Opp' (opponent
    name) and 'Round', sorted by 'Pred' ascending.
    """
    if names is None:
        names = team_names

    # build the id -> name lookup once instead of filtering the table for every game;
    # the first name wins if an id is ever listed twice
    name_by_id = names.drop_duplicates('TeamID').set_index('TeamID')['TeamName'].to_dict()

    games = []
    for game_id, prob, rnd in zip(df['ID'], df['Pred'], df['Round']):
        id_list = game_id.split('_')
        id_x = int(id_list[1])
        id_y = int(id_list[2])
        if id_x == team_id:
            games.append((prob, name_by_id[id_y], rnd))
        elif id_y == team_id:
            # the stored prob is the opponent's, so flip it
            games.append((1 - prob, name_by_id[id_x], rnd))

    return pd.DataFrame(games, columns = ['Pred', 'Opp', 'Round']).sort_values('Pred')

In [None]:
# UConn, going to final
# allow up to 70 rows to be displayed so a full game list is shown
pd.set_option('display.max_rows', 70)
print_games(team_id = 3163, df = submission1)

In [None]:
# UConn, losing in final 4
print_games(team_id = 3163, df = submission2)

In [None]:
# NC State, underdog vs Indiana, Kentucky, and Texas but favorite vs Stanford
print_games(team_id = 3301, df = submission2)

In [None]:
# Baylor, faded
print_games(team_id = 3124, df = submission2)

In [None]:
# write both submissions with only the ID and Pred columns
for out_name, submission in [('Wsubmission1.csv', submission1), ('Wsubmission2.csv', submission2)]:
    submission[['ID', 'Pred']].to_csv(out_name, index = False)