# EDA of Scores

## Table of Contents
* [Regular Season](#1)
* [Tourney](#2)
* [Secondary Tourney](#3)
* [Compare Score Differences by Phase](#4)
* [All Phases combined](#5)
* [Baseline Submission](#6)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plot
import matplotlib.pyplot as plt
import seaborn as sns

# files
import os
# print the full path of every file found below the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

<a id='1'></a>
# Regular Season

In [None]:
# load the regular season results and append the winning margin
# (winner's score minus loser's score) as ScoreDiff
df1 = pd.read_csv(
    '../input/ncaam-march-mania-2021-spread/MRegularSeasonCompactResults.csv'
).assign(ScoreDiff=lambda games: games['WScore'] - games['LScore'])
df1.head()

In [None]:
# Work on a copy bound to the generic name `df`, so the cells below can add
# columns (e.g. Result) without modifying df1.
df = df1.copy()

### Season and DayNum - Frequencies

In [None]:
# one bar chart of value frequencies per column; DayNum gets a wider figure
for column, size in (('Season', (10, 4)), ('DayNum', (18, 4))):
    plt.figure(figsize=size)
    df[column].value_counts().plot.bar()
    plt.title(column)
    plt.grid()
    plt.show()

In [None]:
# frequency of each winner's location code: print the table, then plot it
wloc_counts = df['WLoc'].value_counts()
print(wloc_counts)
wloc_counts.plot.bar()
plt.title("Winner's location")
plt.grid()
plt.show()

In [None]:
# how often each number of overtime periods occurs
df['NumOT'].value_counts()

In [None]:
# numeric columns analysed in every phase: both scores and their difference
features_num = [
    'WScore',
    'LScore',
    'ScoreDiff',
]

In [None]:
# summary statistics including the tails (1% / 99%) of each distribution
percentile_levels = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
df[features_num].describe(percentiles=percentile_levels)

In [None]:
# pairwise 2D histograms of the scores, 1D histograms on the diagonal
sns.pairplot(
    df[features_num],
    kind='hist',
    diag_kws=dict(bins=25),
    plot_kws=dict(bins=40),
)
plt.show()

### Include visualization of overtime periods

In [None]:
# pairwise score plots, coloured by the number of overtime periods
overtime_view = df[[*features_num, 'NumOT']]
sns.pairplot(overtime_view, hue='NumOT', plot_kws=dict(alpha=0.9))
plt.show()

### Include visualization of winner's location

In [None]:
# pairwise score plots, coloured by the winner's location;
# small, mostly transparent markers
location_view = df[[*features_num, 'WLoc']]
sns.pairplot(location_view, hue='WLoc', plot_kws=dict(alpha=0.2, s=20))
plt.show()

### Do we see changes in score difference over time?

In [None]:
# one box per season showing the distribution of the winning margin
plt.figure(figsize=(12, 4))
sns.boxplot(x='Season', y='ScoreDiff', data=df)
plt.xticks(rotation=90)  # vertical season labels
plt.grid()
plt.show()

#### No real trend visible...

### Most frequent score differences

In [None]:
# bar chart of the 20 most frequent score differences
top_diffs = df['ScoreDiff'].value_counts().head(20)
top_diffs.plot.bar()
plt.title('Top 20 Score Differences')
plt.grid()
plt.show()

#### Interesting: The most frequent value is 3, which is quite far away from the mean value of 12.1.

### Most frequent results

In [None]:
# encode every game as a "winner score:loser score" string
winner_points = df['WScore'].astype(str)
loser_points = df['LScore'].astype(str)
df['Result'] = winner_points.str.cat(loser_points, sep=':')

# the ten most frequent results:
df['Result'].value_counts().head(10)

<a id='2'></a>
# Tourney

In [None]:
# load the NCAA tourney results and append the winning margin
# (winner's score minus loser's score) as ScoreDiff
df2 = pd.read_csv(
    '../input/ncaam-march-mania-2021-spread/MNCAATourneyCompactResults.csv'
).assign(ScoreDiff=lambda games: games['WScore'] - games['LScore'])
df2.head()

In [None]:
# Rebind the generic name `df` to a copy of the tourney data, so the cells
# below can add columns (e.g. Result) without modifying df2.
df = df2.copy()

### Season and DayNum - Frequencies

In [None]:
# one bar chart of value frequencies per column
for column in ('Season', 'DayNum'):
    plt.figure(figsize=(10, 4))
    df[column].value_counts().plot.bar()
    plt.title(column)
    plt.grid()
    plt.show()

In [None]:
# WLoc is always 'N' here:
df['WLoc'].value_counts()

In [None]:
# how often each number of overtime periods occurs
df['NumOT'].value_counts()

In [None]:
# summary statistics including the tails (1% / 99%) of each distribution
percentile_levels = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
df[features_num].describe(percentiles=percentile_levels)

In [None]:
# pairwise histograms of the scores with seaborn's default binning
score_view = df[features_num]
sns.pairplot(score_view, kind='hist')
plt.show()

### Include visualization of overtime periods

In [None]:
# pairwise score plots, coloured by the number of overtime periods
overtime_view = df[[*features_num, 'NumOT']]
sns.pairplot(overtime_view, hue='NumOT', plot_kws=dict(alpha=0.9))
plt.show()

### Do we see changes in score difference over time?

In [None]:
# one box per season showing the distribution of the winning margin
plt.figure(figsize=(12, 4))
sns.boxplot(x='Season', y='ScoreDiff', data=df)
plt.xticks(rotation=90)  # vertical season labels
plt.grid()
plt.show()

#### No real trend visible...

### Most frequent score differences

In [None]:
# bar chart of the 20 most frequent score differences
top_diffs = df['ScoreDiff'].value_counts().head(20)
top_diffs.plot.bar()
plt.title('Top 20 Score Differences')
plt.grid()
plt.show()

#### Most frequent score difference is now 2, mean value is 11.7.

### Most frequent results

In [None]:
# encode every game as a "winner score:loser score" string
winner_points = df['WScore'].astype(str)
loser_points = df['LScore'].astype(str)
df['Result'] = winner_points.str.cat(loser_points, sep=':')

# the ten most frequent results:
df['Result'].value_counts().head(10)

<a id='3'></a>
# Secondary Tourney

In [None]:
# load the secondary tourney results and append the winning margin
# (winner's score minus loser's score) as ScoreDiff
df3 = pd.read_csv(
    '../input/ncaam-march-mania-2021-spread/MSecondaryTourneyCompactResults.csv'
).assign(ScoreDiff=lambda games: games['WScore'] - games['LScore'])
df3.head()

In [None]:
# Rebind the generic name `df` to a copy of the secondary tourney data, so
# the cells below can add columns (e.g. Result) without modifying df3.
df = df3.copy()

### Season and DayNum - Frequencies

In [None]:
# one bar chart of value frequencies per column
for column in ('Season', 'DayNum'):
    plt.figure(figsize=(10, 4))
    df[column].value_counts().plot.bar()
    plt.title(column)
    plt.grid()
    plt.show()

In [None]:
# frequency of each winner's location code: print the table, then plot it
wloc_counts = df['WLoc'].value_counts()
print(wloc_counts)
wloc_counts.plot.bar()
plt.title("Winner's location")
plt.grid()
plt.show()

In [None]:
# how often each number of overtime periods occurs
df['NumOT'].value_counts()

In [None]:
# summary statistics including the tails (1% / 99%) of each distribution
percentile_levels = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
df[features_num].describe(percentiles=percentile_levels)

In [None]:
# pairwise histograms of the scores with seaborn's default binning
score_view = df[features_num]
sns.pairplot(score_view, kind='hist')
plt.show()

### Include visualization of overtime periods

In [None]:
# pairwise score plots, coloured by the number of overtime periods
overtime_view = df[[*features_num, 'NumOT']]
sns.pairplot(overtime_view, hue='NumOT', plot_kws=dict(alpha=0.9))
plt.show()

### Include visualization of winner's location

In [None]:
# pairwise score plots, coloured by the winner's location;
# small, mostly transparent markers
location_view = df[[*features_num, 'WLoc']]
sns.pairplot(location_view, hue='WLoc', plot_kws=dict(alpha=0.2, s=20))
plt.show()

### Do we see changes in score difference over time?

In [None]:
# one box per season showing the distribution of the winning margin
plt.figure(figsize=(12, 4))
sns.boxplot(x='Season', y='ScoreDiff', data=df)
plt.xticks(rotation=90)  # vertical season labels
plt.grid()
plt.show()

#### No real trend visible...

### Most frequent score differences

In [None]:
# bar chart of the 20 most frequent score differences
top_diffs = df['ScoreDiff'].value_counts().head(20)
top_diffs.plot.bar()
plt.title('Top 20 Score Differences')
plt.grid()
plt.show()

In [None]:
# the leading bars are nearly equal, so show the actual counts of the top five
df['ScoreDiff'].value_counts().head(5)

#### Most frequent score difference is again 2 (but 3 is as close as can be), mean value is 9.6.

### Most frequent results

In [None]:
# encode every game as a "winner score:loser score" string
winner_points = df['WScore'].astype(str)
loser_points = df['LScore'].astype(str)
df['Result'] = winner_points.str.cat(loser_points, sep=':')

# the ten most frequent results:
df['Result'].value_counts().head(10)

<a id='4'></a>
# Compare Score Differences by Phase

In [None]:
# compare histograms: one row per phase on a shared x axis
phases = (
    ('Regular Season', df1),
    ('Tourney', df2),
    ('Secondary Tourney', df3),
)
fig, axes = plt.subplots(nrows=3, sharex=True, figsize=(10, 10))
for axis, (label, games) in zip(axes, phases):
    axis.grid()
    axis.hist(games.ScoreDiff, bins=20)
    axis.set_title(label)
plt.show()

In [None]:
# compare boxplots: one column per phase on a shared y axis
phases = (
    ('Regular Season', df1),
    ('Tourney', df2),
    ('Secondary Tourney', df3),
)
fig, axes = plt.subplots(ncols=3, sharey=True, figsize=(12, 5))
for axis, (label, games) in zip(axes, phases):
    axis.grid()
    axis.boxplot(games.ScoreDiff)
    axis.set_title(label)
plt.show()

#### More wins with high difference in regular season.

<a id='5'></a>
# All phases combined

In [None]:
# combine all three phases into one data frame;
# ignore_index=True gives the result a fresh 0..n-1 index instead of
# repeating the row labels of each input frame
df_all = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
df_all.head()

In [None]:
# summary statistics of the combined data, including the 1% / 99% tails
percentile_levels = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
df_all[features_num].describe(percentiles=percentile_levels)

In [None]:
# most frequent values: plot the 20 most common score differences so the
# chart matches its title and the per-phase charts
df_all.ScoreDiff.value_counts().head(20).plot(kind='bar')
plt.title('Top 20 Score Differences')
plt.grid()
plt.show()

In [None]:
# eval score difference by winning team
def _most_frequent(values):
    """Return the most frequent value of a Series (first entry of value_counts)."""
    return values.value_counts().index[0]


# One row per winning team: number of wins plus mean, median and mode of the
# winning margin. String aggregation names are used for mean/median instead of
# the NumPy callables; the mode is passed as a callable in the same agg call.
df_all_grouped = df_all.groupby('WTeamID', as_index=False).agg(
    n=pd.NamedAgg(column='ScoreDiff', aggfunc='count'),
    mean_diff=pd.NamedAgg(column='ScoreDiff', aggfunc='mean'),
    median_diff=pd.NamedAgg(column='ScoreDiff', aggfunc='median'),
    mode_diff=pd.NamedAgg(column='ScoreDiff', aggfunc=_most_frequent),
)
df_all_grouped.head()

In [None]:
# scatter of the mean winning margin against the team id
plt.figure(figsize=(14, 5))
plt.scatter(x=df_all_grouped['WTeamID'], y=df_all_grouped['mean_diff'])
plt.xlabel('Team')
plt.ylabel('Mean of score differences')
plt.grid()
plt.show()

In [None]:
# scatter of the most frequent winning margin against the team id
plt.figure(figsize=(14, 5))
plt.scatter(x=df_all_grouped['WTeamID'], y=df_all_grouped['mode_diff'])
plt.xlabel('Team')
plt.ylabel('Mode of score differences (most frequent value)')
plt.grid()
plt.show()

### So far we have explored the absolute score difference for the WINNING team. However, this is unfortunately not enough to submit something useful: we also need to provide a sign (+1/-1) depending on which of the two teams wins.

### Evaluate (historical) winning percentage

In [None]:
# number of games won / lost per team, each as a two-column frame
wins_per_team = df_all['WTeamID'].value_counts()
losses_per_team = df_all['LTeamID'].value_counts()
count_win = wins_per_team.rename_axis('TeamID').reset_index(name='counts_W')
count_lose = losses_per_team.rename_axis('TeamID').reset_index(name='counts_L')

# join both tables on the team id; merge defaults to an inner join, so only
# teams present in both tables are kept
df_win_lose = pd.merge(count_win, count_lose, on='TeamID')

# total number of games and share of games won
df_win_lose['n'] = df_win_lose['counts_W'] + df_win_lose['counts_L']
df_win_lose['win_perc'] = df_win_lose['counts_W'] / df_win_lose['n']

# preview
df_win_lose.head()

In [None]:
# write each result table to a CSV file (without the index) for download
exports = (
    (df_all, 'df_all.csv'),
    (df_all_grouped, 'df_all_grouped.csv'),
    (df_win_lose, 'df_win_lose.csv'),
)
for frame, target in exports:
    frame.to_csv(target, index=False)

<a id='6'></a>
# Baseline Submission 

In [None]:
# load the stage 1 sample submission, which serves as the template
sub = pd.read_csv(
    '../input/ncaam-march-mania-2021-spread/MSampleSubmissionStage1.csv'
)

In [None]:
# number of rows in the submission template
n_sub = len(sub)

In [None]:
# parse ID and build extended submission data frame
sub_x = sub.copy()
# Split the ID on '_' into its three parts instead of relying on fixed
# character positions; the parts stay strings, as before.
# NOTE(review): assumes IDs look like '<year>_<team>_<team>' - confirm.
id_parts = sub['ID'].str.split('_', expand=True)
sub_x['year'] = id_parts[0]
sub_x['team_left'] = id_parts[1]
sub_x['team_right'] = id_parts[2]
# add a few columns for the following; they are initialised as floats
# because they are filled with float values later on
for column in ('p_left', 'p_right', 'diff_left', 'diff_right'):
    sub_x[column] = 0.0
# preview
sub_x.head()

### Let's try to use the winning percentages and mean differences per team to put together a first simple model

In [None]:
# calculate prediction for all rows at once (vectorised instead of a
# row-by-row loop)

# lookup tables indexed by team id
win_perc_by_team = df_win_lose.set_index('TeamID')['win_perc']
mean_diff_by_team = df_all_grouped.set_index('WTeamID')['mean_diff']

# team ids are stored as strings in sub_x, the lookup tables use numbers
team_left = pd.to_numeric(sub_x['team_left'])
team_right = pd.to_numeric(sub_x['team_right'])

# winning percentage for both teams (independently)
perc_win_left = team_left.map(win_perc_by_team)
perc_win_right = team_right.map(win_perc_by_team)
# expected differences for each team
diff_left = team_left.map(mean_diff_by_team)
diff_right = team_right.map(mean_diff_by_team)

# a team without an entry in the lookup tables yields NaN; fail loudly
# instead of writing NaN predictions
lookups = pd.concat(
    [perc_win_left, perc_win_right, diff_left, diff_right], axis=1
)
rows_missing = lookups.isna().any(axis=1)
if rows_missing.any():
    raise KeyError(
        f'no historical statistics for a team in {int(rows_missing.sum())} '
        'submission rows'
    )

# translate winning percentages into probabilities,
# this is actually extremely simplified, but ok for a first baseline
perc_sum = perc_win_left + perc_win_right
p_left = perc_win_left / perc_sum
p_right = perc_win_right / perc_sum

# combine to overall "prediction" and add to extended submission table
sub_x['Pred'] = p_left * diff_left - p_right * diff_right
# add intermediate values as well
sub_x['p_left'] = p_left
sub_x['p_right'] = p_right
sub_x['diff_left'] = diff_left
sub_x['diff_right'] = diff_right

In [None]:
# preview the extended submission table
sub_x

In [None]:
# save submission file
# Item assignment always writes the column; attribute-style assignment
# (sub.Pred = ...) only does so when the column already exists.
sub['Pred'] = sub_x['Pred']
sub.to_csv('submission.csv', index=False)