___
this notebook gets the coach stats

In [1]:
import pandas as pd
import numpy as np

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

pd.set_option("display.max_columns",50)

## Summary of notebook data manipulation
1. years/season of experience
1. number of playoff made
1. number of championship won
1. win rate
    - regular
    - post
    - overall

## Read data

In [2]:
raw_data_regularseason = pd.read_csv("data/DataFiles/RegularSeasonDetailedResults.csv")
raw_data_coach = pd.read_csv('data/DataFiles/TeamCoaches.csv')
raw_data_postseason = pd.read_csv('data/DataFiles/NCAATourneyCompactResults.csv')


## Get number of season experience

In [3]:
# get season max numbner of days
season_max_days = (
    raw_data_coach
    .groupby(['Season'])
    .agg({"LastDayNum":"max"})
    .reset_index()
    .rename(columns={"LastDayNum":"season_max_days"})
)

# get number of season for each coach for each year
num_days_coach_for_season = (
    raw_data_coach
    .pipe(lambda x:x.assign(daysexp = x.LastDayNum-x.FirstDayNum))
    .merge(season_max_days, how='left',on=['Season'])
    .pipe(lambda x:x.assign(num_season = x.daysexp/x.season_max_days))
    .sort_values(['CoachName','Season'])
)
num_days_coach_for_season.head()

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName,daysexp,season_max_days,num_season
95,1985,1224,0,154,a_b_williamson,154,154,1.0
381,1986,1224,0,154,a_b_williamson,154,154,1.0
669,1987,1224,0,154,a_b_williamson,154,154,1.0
963,1988,1224,0,154,a_b_williamson,154,154,1.0
1257,1989,1224,0,154,a_b_williamson,154,154,1.0


## Get cumulative number of seasons experience

In [4]:
# get cumulative years of experience
cum_num_days_coach_for_season = (
    num_days_coach_for_season
    .sort_values(['CoachName','Season'])
    .groupby(['CoachName'])
    .cumsum()
    .pipe(lambda x:x.assign(Season = num_days_coach_for_season.Season.values))
    .pipe(lambda x:x.assign(TeamID = num_days_coach_for_season.TeamID.values))
)

cum_num_days_coach_for_season.head()

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,daysexp,season_max_days,num_season
95,1985,1224,0,154,154,154,1.0
381,1986,1224,0,308,308,308,2.0
669,1987,1224,0,462,462,462,3.0
963,1988,1224,0,616,616,616,4.0
1257,1989,1224,0,770,770,770,5.0


## Assign one coach to one season
- check which teams have more than one coach in one season
    - the coach with more days of coaching will be credited for the season

In [5]:
final_coach_for_season = (
    num_days_coach_for_season
    .groupby(['Season','TeamID'])
    .agg({"CoachName":"count"})
    .reset_index()
#     .query("CoachName > 1")
    .rename(columns={"CoachName":"coach_counts"})
    .merge(num_days_coach_for_season,how='left',on=['Season','TeamID'])
    .pipe(lambda x:x.assign(final_coach = np.where(x.num_season >= 0.5, x.CoachName, "ignore")))
    [['Season','TeamID','final_coach']]
)

final_coach_for_season.head()

Unnamed: 0,Season,TeamID,final_coach
0,1985,1102,reggie_minton
1,1985,1103,bob_huggins
2,1985,1104,wimp_sanderson
3,1985,1106,james_oliver
4,1985,1108,davey_whitney


## Get number of playoffs made for coaches
- check if team made to playoff to season
- final coach gets the credit

In [6]:
# get teams for post season
teams_for_postseason = (
    raw_data_postseason
    .groupby(['Season','WTeamID'])
    .agg({"NumOT":"count"})
    .reset_index()
    .append(raw_data_postseason[['Season','LTeamID','NumOT']].rename(columns={"LTeamID":"WTeamID"}))
    .groupby(['Season','WTeamID'])
    .agg({"NumOT":"count"})
    .reset_index()
    .rename(columns={"NumOT":"is_playoff"})
    .pipe(lambda x:x.assign(is_playoff = 1))
    .rename(columns={"WTeamID":"TeamID"})
)

teams_for_postseason.head()

Unnamed: 0,Season,TeamID,is_playoff
0,1985,1104,1
1,1985,1112,1
2,1985,1116,1
3,1985,1120,1
4,1985,1130,1


In [7]:
# join postseason to final coach
final_coach_with_postseason_each_year = (
    final_coach_for_season
    .merge(teams_for_postseason,how='left',on=['Season','TeamID'])
    .fillna(0)
)

## Get number of championships won for coaches
- check which team won championship
- final coach gets the credit

In [8]:
championship_team = (
    raw_data_postseason
    .merge(season_max_days,how='left',on=['Season'])
    .query("DayNum == season_max_days")
    .groupby(['Season','WTeamID'])
    .agg({"NumOT":"count"})
    .reset_index()
    .rename(columns={"NumOT":"is_champion","WTeamID":"TeamID"})
#     .merge(final_coach_with_season_each_year,how='left',on=['Season','TeamID'])
)


final_coach_with_postseason_champion_each_year = (
    final_coach_with_postseason_each_year
    .merge(championship_team,how='left',on=['Season','TeamID'])
    .fillna(0)
)

final_coach_with_postseason_champion_each_year.head()

Unnamed: 0,Season,TeamID,final_coach,is_playoff,is_champion
0,1985,1102,reggie_minton,0.0,0.0
1,1985,1103,bob_huggins,0.0,0.0
2,1985,1104,wimp_sanderson,1.0,0.0
3,1985,1106,james_oliver,0.0,0.0
4,1985,1108,davey_whitney,0.0,0.0


## Get win rate for coach during regular season
- get up till daynum of the coach in the team of the season
- get number of games won and lost, so that reconciling with cumulative table will be okay

In [9]:
# get winning games for coaches
games_won_for_coaches = (
    raw_data_regularseason
    [['Season','DayNum','WTeamID']]
    # merge for winning team
    .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
           how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
    .rename(columns={"FirstDayNum":"FirstDayNum_win","LastDayNum":"LastDayNum_win","CoachName":"CoachName_win","TeamID":"TeamID_win"})
#     # merge for losing team
#     .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
#            how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
#     .rename(columns={"FirstDayNum":"FirstDayNum_lose","LastDayNum":"LastDayNum_lose","CoachName":"CoachName_lose","TeamID":"TeamID_lose"})
    .pipe(lambda x:x.assign(which_coach_for_win = np.where((x.FirstDayNum_win <= x.DayNum) & (x.LastDayNum_win >= x.DayNum),1,0)))
    .query("which_coach_for_win != 0")
    .groupby(['Season','CoachName_win','WTeamID'])
    .agg({"which_coach_for_win":"sum"})
    .reset_index()
)

games_won_for_coaches.head()

Unnamed: 0,Season,CoachName_win,WTeamID,which_coach_for_win
0,2003,al_skinner,1130,18
1,2003,al_walker,1127,14
2,2003,andy_stoglin,1238,10
3,2003,armond_hill,1162,2
4,2003,barry_collier,1304,11


In [10]:
# get losing games for coaches
games_lose_for_coaches = (
    raw_data_regularseason
    [['Season','DayNum','LTeamID']]
#     # merge for winning team
#     .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
#            how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
#     .rename(columns={"FirstDayNum":"FirstDayNum_win","LastDayNum":"LastDayNum_win","CoachName":"CoachName_win","TeamID":"TeamID_win"})
    # merge for losing team
    .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
           how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
    .rename(columns={"FirstDayNum":"FirstDayNum_lose","LastDayNum":"LastDayNum_lose","CoachName":"CoachName_lose","TeamID":"TeamID_lose"})
    .pipe(lambda x:x.assign(which_coach_for_lose = np.where((x.FirstDayNum_lose <= x.DayNum) & (x.LastDayNum_lose >= x.DayNum),1,0)))
    .query("which_coach_for_lose != 0")
    .groupby(['Season','CoachName_lose','LTeamID'])
    .agg({"which_coach_for_lose":"sum"})
    .reset_index()
)

games_lose_for_coaches.head()

Unnamed: 0,Season,CoachName_lose,LTeamID,which_coach_for_lose
0,2003,al_skinner,1130,11
1,2003,al_walker,1127,13
2,2003,andy_stoglin,1238,18
3,2003,armond_hill,1162,25
4,2003,barry_collier,1304,18


In [11]:
# combine both losing and winning games
combine_regular_games_won_lose = (
    games_lose_for_coaches
    .merge(games_won_for_coaches,how='left',left_on=['Season','LTeamID','CoachName_lose'],right_on=['Season','WTeamID','CoachName_win'])
    .pipe(lambda x:x.assign(win_rate_regular = x.which_coach_for_win/(x.which_coach_for_win + x.which_coach_for_lose)))
    .drop(['CoachName_win','WTeamID'],1)
    .rename(columns={"CoachName_lose":"CoachName","LTeamID":"TeamID","which_coach_for_lose":"games_lost","which_coach_for_win":"games_won"})
)

combine_regular_games_won_lose.head()

Unnamed: 0,Season,CoachName,TeamID,games_lost,games_won,win_rate_regular
0,2003,al_skinner,1130,11,18.0,0.62069
1,2003,al_walker,1127,13,14.0,0.518519
2,2003,andy_stoglin,1238,18,10.0,0.357143
3,2003,armond_hill,1162,25,2.0,0.074074
4,2003,barry_collier,1304,18,11.0,0.37931


## Get win rate for coach during post season

In [12]:
# get winning games for coaches
post_games_won_for_coaches = (
    raw_data_postseason
    [['Season','DayNum','WTeamID']]
    # merge for winning team
    .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
           how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
    .rename(columns={"FirstDayNum":"FirstDayNum_win","LastDayNum":"LastDayNum_win","CoachName":"CoachName_win","TeamID":"TeamID_win"})
#     # merge for losing team
#     .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
#            how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
#     .rename(columns={"FirstDayNum":"FirstDayNum_lose","LastDayNum":"LastDayNum_lose","CoachName":"CoachName_lose","TeamID":"TeamID_lose"})
    .pipe(lambda x:x.assign(which_coach_for_win = np.where((x.FirstDayNum_win <= x.DayNum) & (x.LastDayNum_win >= x.DayNum),1,0)))
    .query("which_coach_for_win != 0")
    .groupby(['Season','CoachName_win','WTeamID'])
    .agg({"which_coach_for_win":"sum"})
    .reset_index()
)

post_games_won_for_coaches.head()

Unnamed: 0,Season,CoachName_win,WTeamID,which_coach_for_win
0,1985,andy_russo,1256,2
1,1985,bill_frieder,1276,1
2,1985,billy_tubbs,1328,3
3,1985,bob_donewald,1229,1
4,1985,bobby_cremins,1210,3


In [13]:
# get losing games for coaches
post_games_lose_for_coaches = (
    raw_data_postseason
    [['Season','DayNum','LTeamID']]
#     # merge for winning team
#     .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
#            how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
#     .rename(columns={"FirstDayNum":"FirstDayNum_win","LastDayNum":"LastDayNum_win","CoachName":"CoachName_win","TeamID":"TeamID_win"})
    # merge for losing team
    .merge(num_days_coach_for_season[['Season','TeamID','FirstDayNum','LastDayNum','CoachName']],
           how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
    .rename(columns={"FirstDayNum":"FirstDayNum_lose","LastDayNum":"LastDayNum_lose","CoachName":"CoachName_lose","TeamID":"TeamID_lose"})
    .pipe(lambda x:x.assign(which_coach_for_lose = np.where((x.FirstDayNum_lose <= x.DayNum) & (x.LastDayNum_lose >= x.DayNum),1,0)))
    .query("which_coach_for_lose != 0")
    .groupby(['Season','CoachName_lose','LTeamID'])
    .agg({"which_coach_for_lose":"sum"})
    .reset_index()
)

post_games_lose_for_coaches.head()

Unnamed: 0,Season,CoachName_lose,LTeamID,which_coach_for_lose
0,1985,andy_russo,1256,1
1,1985,bill_bibb,1273,1
2,1985,bill_frieder,1276,1
3,1985,billy_tubbs,1328,1
4,1985,bob_donewald,1229,1


In [14]:
# combine both losing and winning post games
combine_post_games_won_lose = (
    post_games_lose_for_coaches
    .merge(post_games_won_for_coaches,how='left',left_on=['Season','LTeamID','CoachName_lose'],right_on=['Season','WTeamID','CoachName_win'])
    .pipe(lambda x:x.assign(win_rate_post = x.which_coach_for_win/(x.which_coach_for_win + x.which_coach_for_lose)))
    .drop(['CoachName_win','WTeamID'],1)
    .rename(columns={"CoachName_lose":"CoachName","LTeamID":"TeamID","which_coach_for_lose":"post_games_lost","which_coach_for_win":"post_games_won"})
    .fillna(0)
)

combine_post_games_won_lose.head()

Unnamed: 0,Season,CoachName,TeamID,post_games_lost,post_games_won,win_rate_post
0,1985,andy_russo,1256,1,2.0,0.666667
1,1985,bill_bibb,1273,1,0.0,0.0
2,1985,bill_frieder,1276,1,1.0,0.5
3,1985,billy_tubbs,1328,1,3.0,0.75
4,1985,bob_donewald,1229,1,1.0,0.5


## Get overall win rate for coaches

In [15]:
overall_win_rate_for_coaches = (
    combine_post_games_won_lose
    .merge(combine_regular_games_won_lose,how='left',on=['Season','CoachName','TeamID'])
    .fillna(0)
    .pipe(lambda x:x.assign(overall_games_won = x.post_games_won + x.games_won))
    .pipe(lambda x:x.assign(overall_games_lost = x.post_games_lost + x.games_lost))
    .pipe(lambda x:x.assign(win_rate_overall = x.overall_games_won/(x.overall_games_won + x.overall_games_lost)))
)

overall_win_rate_for_coaches.tail()

Unnamed: 0,Season,CoachName,TeamID,post_games_lost,post_games_won,win_rate_post,games_lost,games_won,win_rate_regular,overall_games_won,overall_games_lost,win_rate_overall
2112,2017,tim_cluess,1233,1,0.0,0.0,12.0,22.0,0.647059,22.0,13.0,0.628571
2113,2017,tim_jankovich,1374,1,0.0,0.0,4.0,30.0,0.882353,30.0,5.0,0.857143
2114,2017,tom_izzo,1277,1,1.0,0.5,14.0,19.0,0.575758,20.0,15.0,0.571429
2115,2017,tony_bennett,1438,1,1.0,0.5,10.0,22.0,0.6875,23.0,11.0,0.676471
2116,2017,will_wade,1433,1,0.0,0.0,8.0,26.0,0.764706,26.0,9.0,0.742857


## Combine all coach stats into one master table

In [16]:
final_coach_stats_table = (
    num_days_coach_for_season
    .merge(final_coach_with_postseason_champion_each_year,how='left',left_on=['Season','TeamID','CoachName'],right_on=['Season','TeamID','final_coach'])
    .fillna(0)
    .merge(overall_win_rate_for_coaches,how='left',on=['Season','TeamID','CoachName'])
    .fillna(0)
    .drop(['final_coach','FirstDayNum','LastDayNum'],1)
    .sort_values(['CoachName','Season'])
)

final_coach_stats_table.tail()

Unnamed: 0,Season,TeamID,CoachName,daysexp,season_max_days,num_season,is_playoff,is_champion,post_games_lost,post_games_won,win_rate_post,games_lost,games_won,win_rate_regular,overall_games_won,overall_games_lost,win_rate_overall
10987,2014,1119,zach_spiker,154,154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10988,2015,1119,zach_spiker,154,154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10989,2016,1119,zach_spiker,154,154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10990,2017,1180,zach_spiker,154,154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10991,2018,1180,zach_spiker,77,77,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cumulative coach stats for master table

In [17]:
cumulative_final_coach_stats_table = (
    final_coach_stats_table
    .groupby(['CoachName'])
    .cumsum()
    .pipe(lambda x:x.assign(Season = final_coach_stats_table.Season.values))
    .pipe(lambda x:x.assign(TeamID = final_coach_stats_table.TeamID.values))
    .pipe(lambda x:x.assign(CoachName = final_coach_stats_table.CoachName.values))
    .pipe(lambda x:x.assign(win_rate_post = x.post_games_won/(x.post_games_won + x.post_games_lost)))
    .fillna(0)
    .pipe(lambda x:x.assign(win_rate_regular = x.games_won/(x.games_won + x.games_lost)))
    .fillna(0)
    .pipe(lambda x:x.assign(win_rate_overall = x.overall_games_won/(x.overall_games_won + x.overall_games_lost)))
    .fillna(0)
)
cumulative_final_coach_stats_table.head()

Unnamed: 0,Season,TeamID,daysexp,season_max_days,num_season,is_playoff,is_champion,post_games_lost,post_games_won,win_rate_post,games_lost,games_won,win_rate_regular,overall_games_won,overall_games_lost,win_rate_overall,CoachName
0,1985,1224,154,154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
1,1986,1224,308,308,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
2,1987,1224,462,462,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
3,1988,1224,616,616,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
4,1989,1224,770,770,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson


In [18]:
coach_file = 'data/DataFiles/TeamCoaches.csv'
regularseason_file = 'data/DataFiles/RegularSeasonDetailedResults.csv'
postseason_file = 'data/DataFiles/NCAATourneyCompactResults.csv'

In [19]:
from aggregate_function import coach_stats

In [20]:
testing_df = coach_stats.CoachStats(coach_file,regularseason_file,postseason_file)

In [22]:
testing_df.cumulative_final_coach_stats_table.head()

Unnamed: 0,Season,TeamID,daysexp,season_max_days,num_season,is_playoff,is_champion,post_games_lost,post_games_won,win_rate_post,games_lost,games_won,win_rate_regular,overall_games_won,overall_games_lost,win_rate_overall,CoachName
0,1985,1224,154,154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
1,1986,1224,308,308,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
2,1987,1224,462,462,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
3,1988,1224,616,616,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
4,1989,1224,770,770,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson


## Concluding remarks
- need to clean up the way class file is written, at the moment, its a direct copy and paste into the class file to save time