In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for full_path in (os.path.join(dirname, entry) for entry in filenames):
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
from dateutil import parser
from collections import Counter

import matplotlib.pyplot as plt
import altair as alt

%matplotlib inline

from IPython.display import display, Markdown, display_html, clear_output

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, Box, Layout, Button, Label

In [None]:
# Shell out to the Jupyter CLI to enable the `widgetsnbextension` notebook extension
# (presumably needed so the ipywidgets used in later cells can render - confirm for your environment).
!jupyter nbextension enable --py --sys-prefix widgetsnbextension;

In [None]:
# Read the Super Bowl history CSV from the Kaggle input directory into a DataFrame.
SUPERBOWL_CSV = '/kaggle/input/superbowl-history-1967-2020/superbowl.csv'
df_orig = pd.read_csv(SUPERBOWL_CSV)

Let's preprocess the data:
 - convert `Date` to a date type
 - set the index to `Date`, since the Super Bowl is only played on one day a year; this also helps when slicing by date

In [None]:
# Parse the textual `Date` column into `datetime.date` objects, derive the
# abbreviated weekday name (`day`), and index the frame by `Date`.
#
# The guard makes this cell safe to re-run: once `Date` has been moved into the
# index there is no `Date` column left, and the unguarded version would raise
# AttributeError on `df_orig.Date` the second time it is executed.
if 'Date' in df_orig.columns:
    game_dates = df_orig['Date'].map(lambda raw: parser.parse(raw).date())
    df_orig = (
        df_orig
        .assign(
            Date=game_dates,
            day=game_dates.map(lambda game_date: game_date.strftime("%a")),
        )
        .set_index('Date')
    )

> The Super Bowl has always been played on a `Sunday` as far as I can remember, but let's test it...

In [None]:
# Actually test the "always on the same weekday" claim: only print the "always"
# message when exactly one distinct weekday occurs in the data; otherwise list
# every weekday that was found instead of silently reporting the first one.
days_played = df_orig.day.unique()
first_game, last_game = df_orig.index.min(), df_orig.index.max()
if len(days_played) == 1:
    print(f"since {first_game} till {last_game} superbowl has always been played on {days_played[0]}.. isnt that crazy..")
else:
    print(f"since {first_game} till {last_game} superbowl has been played on more than one day of the week: {', '.join(days_played)}")

# teams
Let's look at the teams that have been in the Super Bowl the most:
- how many times each team has won / lost
- win rate

In [None]:
def get_group_count(data, field_name):
    """Count how often each value occurs in `data`.

    Parameters
    ----------
    data : list-like
        Values to tally (one entry per occurrence).
    field_name : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values (index name ``'teams'``), with a single
        column `field_name` holding the counts, sorted in descending order.
    """
    occurrences = pd.DataFrame(data, columns=['teams'])
    tallies = occurrences.groupby('teams').agg({'teams': 'count'})
    tallies = tallies.rename(columns={'teams': field_name})
    return tallies.sort_values([field_name], ascending=False)

In [None]:
# Tally, per team, the number of Super Bowl appearances, wins and losses.
dfpresence = get_group_count(list(df_orig.Winner) + list(df_orig.Loser), 'presence_count')
dfwinners = get_group_count(list(df_orig.Winner), 'win_count')
dflosers = get_group_count(list(df_orig.Loser), 'loss_count')

# The outer joins leave NaN for teams that never won (or never lost). Those are
# genuine zeros, so fill them; otherwise the win rate of such teams would be
# NaN instead of 0 and the counts would be displayed as floats.
dfbowl = (
    dfpresence
    .join(dfwinners, how='outer')
    .join(dflosers, how='outer')
    .fillna({'win_count': 0, 'loss_count': 0})
    .astype({'presence_count': int, 'win_count': int, 'loss_count': int})
    .sort_values(by=['presence_count'], ascending=False)
)


def _format_pct_change(counts):
    """Format the change versus the previous row of `counts` as a percent string.

    `Series.pct_change` returns a fraction (0.25 means +25%), so it is scaled by
    100 before the '%' suffix is attached. Rows with no meaningful change (the
    first row, or a previous value of 0) are rendered as 'n/a'.
    """
    def _fmt(change):
        return f"{round(change * 100, 2)}%" if np.isfinite(change) else "n/a"

    return counts.pct_change().map(_fmt)


# Percent difference of each team relative to the previous (higher-ranked) team.
dfbowl['presence_pct_diff'] = _format_pct_change(dfbowl.presence_count)
dfbowl['win_pct_diff'] = _format_pct_change(dfbowl.win_count)
dfbowl['loss_pct_diff'] = _format_pct_change(dfbowl.loss_count)

# Share of appearances that ended in a win, as a percentage with two decimals.
dfbowl['win_rate'] = (dfbowl.win_count / dfbowl.presence_count * 100).round(2)

In [None]:
# Bar chart per team: the three count columns are folded into key/value pairs
# and the bars are colored by which count column each value came from.
team_chart_data = dfbowl.reset_index()
bars = (
    alt.Chart(team_chart_data)
    .transform_fold(['presence_count','loss_count', 'win_count'])
    .mark_bar()
    .encode(
        y=alt.Y('teams:N', sort='-x', stack='zero'),
        x='value:Q',
        color='key:N',
        tooltip=['presence_count','win_count', 'loss_count', 'win_rate'],
    )
    .properties(title='team with most presence')
)

# Bar chart of the summed `Winner Pts` per `State`.
bars_state = (
    alt.Chart(df_orig)
    .mark_bar()
    .encode(
        y=alt.Y('State', sort='-x'),
        x='sum(Winner Pts)',
    )
)

# Layout: charts on the left; styled table above the win-rate table on the right.
bar_out, table_out, by_win_rate = widgets.Output(), widgets.Output(), widgets.Output()
right_column = widgets.VBox([table_out, by_win_rate])
display(widgets.HBox([bar_out, right_column]))

# Teams with more than five appearances, ordered by win rate (highest first).
with by_win_rate:
    frequent_teams = dfbowl[dfbowl.presence_count > 5]
    display(frequent_teams.sort_values(by=['win_rate'], ascending=False))

with bar_out:
    display(bars)
    display(bars_state)

# Full table with in-cell bars: green for wins / win rate, blue for presence, red for losses.
with table_out:
    styled_table = dfbowl.style
    styled_table = styled_table.bar(subset=['win_rate', 'win_count'], color='#9fdfbf')
    styled_table = styled_table.bar(subset=['presence_count'], color='#cce6ff')
    styled_table = styled_table.bar(subset=['loss_count'], color='#ffb3b3')
    display(styled_table)



* Even though the Patriots have been in 11 Super Bowls, their win rate is only 54%.
* The Steelers have a better win rate at 75%; they have lost only 2 of their 8 Super Bowls.
* The 49ers, on the other hand, also have a better win rate at 71%.

# MVPs

In [None]:
# Preview the first five rows of the preprocessed frame.
df_orig.head(n=5)