STATS 101C Final Project Notebook

Goal is to construct a new dataset with better, more "informational features" that we can then plug into a model for predicting whether a team would win over another

In [16]:
# reading data
import pandas as pd

In [53]:
# need openpyxl package installed
df = pd.read_excel("Dataset.xlsx")
df.head()

Unnamed: 0,Team,Match Up,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,GSW,GSW vs. PHX,10/24/2023,L,240,104,36,101,35.6,10,...,78.6,18,31,49,19,11,6,11,23,-4
1,PHX,PHX @ GSW,10/24/2023,W,240,108,42,95,44.2,11,...,76.5,17,43,60,23,5,7,19,22,4
2,LAL,LAL @ DEN,10/24/2023,L,240,107,41,90,45.6,10,...,75.0,13,31,44,23,5,4,12,18,-12
3,DEN,DEN vs. LAL,10/24/2023,W,240,119,48,91,52.7,14,...,75.0,9,33,42,29,9,6,12,15,12
4,MEM,MEM vs. NOP,10/25/2023,L,240,104,38,91,41.8,12,...,80.0,8,29,37,23,8,7,13,19,-7


Replacing "Match Up" Column - Column for Opposing Team, Column for Home Game (Yes/No)

In [54]:
# need to provide a feature for indicating whether it was a home game for the team, or not
home_game = df["Match Up"].apply(lambda x: 0 if "@" in x else 1)

# print to see if it works (1 for home game, 0 for away game)
pd.concat([df["Match Up"], home_game], axis=1)

Unnamed: 0,Match Up,Match Up.1
0,GSW vs. PHX,1
1,PHX @ GSW,0
2,LAL @ DEN,0
3,DEN vs. LAL,1
4,MEM vs. NOP,1
...,...,...
2455,CLE vs. CHA,1
2456,NYK vs. CHI,1
2457,LAC vs. HOU,1
2458,LAL @ NOP,0


In [None]:
# need column for just the name of the opposing team (last 3 characters)
opposing_team = df.iloc[:, 1].str[-3:]

# check for equality to make sure didn't mess it up
all(opposing_team == (df["Match Up"].str[-3:]))

0       PHX
1       GSW
2       DEN
3       LAL
4       NOP
       ... 
2455    CHA
2456    CHI
2457    HOU
2458    NOP
2459    SAS
Name: Match Up, Length: 2460, dtype: object

In [60]:
# remove "Match Up" column
df.pop("Match Up")

0       GSW vs. PHX
1         PHX @ GSW
2         LAL @ DEN
3       DEN vs. LAL
4       MEM vs. NOP
           ...     
2455    CLE vs. CHA
2456    NYK vs. CHI
2457    LAC vs. HOU
2458      LAL @ NOP
2459      DET @ SAS
Name: Match Up, Length: 2460, dtype: object

In [59]:
# replace w/ new columns
df.insert(1, "opposing_team", opposing_team)
df.insert(2, "home_game", home_game)
df.head()

Unnamed: 0,Team,opposing_team,home_game,Match Up,Game Date,W/L,MIN,PTS,FGM,FGA,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,GSW,PHX,1,GSW vs. PHX,10/24/2023,L,240,104,36,101,...,78.6,18,31,49,19,11,6,11,23,-4
1,PHX,GSW,0,PHX @ GSW,10/24/2023,W,240,108,42,95,...,76.5,17,43,60,23,5,7,19,22,4
2,LAL,DEN,0,LAL @ DEN,10/24/2023,L,240,107,41,90,...,75.0,13,31,44,23,5,4,12,18,-12
3,DEN,LAL,1,DEN vs. LAL,10/24/2023,W,240,119,48,91,...,75.0,9,33,42,29,9,6,12,15,12
4,MEM,NOP,1,MEM vs. NOP,10/25/2023,L,240,104,38,91,...,80.0,8,29,37,23,8,7,13,19,-7


Working with Game Date - Need To Work With Data Before Nov 13, 2023

In [79]:
# view the earliest date and latest date in the entire dataset
earliest_date = pd.to_datetime(df["Game Date"]).min()
print("Earliest Date:" , earliest_date.strftime('%B %d, %Y'))
latest_date = pd.to_datetime(df["Game Date"]).max()
print("Latest Date:", latest_date.strftime('%B %d, %Y'))

Earliest Date: October 24, 2023
Latest Date: April 14, 2024


In [81]:
# create a new dataset with data before Nov 13, 2023
cutoff_date = "2023-11-13"
before = df[pd.to_datetime(df["Game Date"]) < pd.to_datetime(cutoff_date)]

earliest_date = pd.to_datetime(before["Game Date"]).min()
latest_date = pd.to_datetime(before["Game Date"]).max()
print("Closest Date:" , earliest_date.strftime('%B %d, %Y'))
print("Latest Date:", latest_date.strftime('%B %d, %Y'))

Closest Date: October 24, 2023
Latest Date: November 12, 2023


Creating Better Features - Home Advantage Feature (Done)

Creating Better Features - Stability of Key Game Statistics

In [82]:
# let's work with only a single team for now
gsw = before[before["Team"] == "GSW"]
gsw

Unnamed: 0,Team,opposing_team,home_game,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,GSW,PHX,1,10/24/2023,L,240,104,36,101,35.6,...,78.6,18,31,49,19,11,6,11,23,-4
35,GSW,SAC,0,10/27/2023,W,240,122,48,87,55.2,...,85.7,7,32,39,32,14,3,19,24,8
69,GSW,HOU,0,10/29/2023,W,240,106,35,81,43.2,...,78.3,8,41,49,27,6,6,14,18,11
94,GSW,NOP,0,10/30/2023,W,240,130,51,98,52.0,...,65.0,21,43,64,28,6,7,15,21,28
116,GSW,SAC,1,11/01/2023,W,240,102,39,81,48.1,...,86.7,7,29,36,32,5,6,18,17,1
154,GSW,OKC,0,11/03/2023,W,240,141,49,92,53.3,...,83.3,13,25,38,36,9,2,15,25,2
180,GSW,CLE,0,11/05/2023,L,240,104,34,94,36.2,...,66.7,17,27,44,26,10,4,13,18,-11
195,GSW,DET,0,11/06/2023,W,240,120,46,97,47.4,...,76.2,17,27,44,26,8,4,11,14,11
214,GSW,DEN,0,11/08/2023,L,240,105,37,86,43.0,...,76.9,11,38,49,21,5,0,9,20,-3
262,GSW,CLE,1,11/11/2023,L,240,110,37,90,41.1,...,76.7,15,40,55,26,5,3,20,29,-8


In [None]:
# get a dataframe of only the game stats
gsw_stats = gsw.iloc[:, 5:]

# calculate the stability of the game stats

# get the average
avg_gsw_stats = gsw_stats.mean()

# get the std dev
sd_gsw_stats = gsw_stats.std()

# get the coefficient of variation
coeff_of_var = sd_gsw_stats / avg_gsw_stats

# take the average of the coefficients of variation
avg_variation = coeff_of_var.mean()

# subtract from 1 to get a measure of "stability", closer to 1 = more stable, closer to 0 = less stable
stability_score = 1 - avg_variation
stability_score

np.float64(0.5879039783343367)

In [96]:
# creating a running stability score, that gives us the stability score of the team up until that date
gsw_stats["stability_score"] = gsw_stats.expanding().apply(lambda x: 1 - (x.std() / x.mean())).mean(axis = 1)
gsw_stats

Unnamed: 0,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-,stability_score
0,240,104,36,101,35.6,10,43,23.3,22,28,...,18,31,49,19,11,6,11,23,-4,
35,240,122,48,87,55.2,14,32,43.8,12,14,...,7,32,39,32,14,3,19,24,8,0.549712
69,240,106,35,81,43.2,18,45,40.0,18,23,...,8,41,49,27,6,6,14,18,11,0.70588
94,240,130,51,98,52.0,15,41,36.6,13,20,...,21,43,64,28,6,7,15,21,28,0.734672
116,240,102,39,81,48.1,11,31,35.5,13,15,...,7,29,36,32,5,6,18,17,1,0.72555
154,240,141,49,92,53.3,18,40,45.0,25,30,...,13,25,38,36,9,2,15,25,2,0.718677
180,240,104,34,94,36.2,16,41,39.0,20,30,...,17,27,44,26,10,4,13,18,-11,0.676464
195,240,120,46,97,47.4,12,41,29.3,16,21,...,17,27,44,26,8,4,11,14,11,0.70549
214,240,105,37,86,43.0,11,35,31.4,20,26,...,11,38,49,21,5,0,9,20,-3,0.681562
262,240,110,37,90,41.1,13,38,34.2,23,30,...,15,40,55,26,5,3,20,29,-8,0.639058


Final Dataset for Training Model, including the features: Team Name, Home Game, 