In [1]:
from nba_api.stats.endpoints import boxscoretraditionalv2 as boxscore, leagueleaders as ll, leaguegamelog, boxscorefourfactorsv2 as ff, playerdashptreb as rebounds
from nba_api.stats.endpoints import playercareerstats as player_career, draftcombinestats as combine, leaguehustlestatsplayer as hustle, commonteamroster as roster
from nba_api.stats.static import teams, players
import pandas as pd
from nba_api.stats.endpoints import commonplayerinfo, leaguedashplayershotlocations as shot_chart, leaguedashteamptshot as teamshot, leaguedashptdefend as defence
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc, plot_roc_curve, accuracy_score, recall_score, precision_score, silhouette_samples, silhouette_score
import requests
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import AgglomerativeClustering, KMeans
import seaborn as sns
import warnings
import plotly.express as px
import plotly.graph_objects as go
import itertools


## Get Game Stats (Can be skipped if you use .csv files)

In [3]:
def format_data(data):
    data['Oreb Diff'] = ''
    data['3 Point Diff'] = ''
    data['TOV Margin'] = ''
    data['FGA Diff'] = ''
    data ['Home Team'] = ''
    data ['FTA Diff'] = ''

    for i in range(0,len(data)):
        if data['MATCHUP'][i][4] == '@':
            home_team = data['MATCHUP'][i][6:9]
        else:
            home_team = data['MATCHUP'][i][0:3]
        data.at[i, 'Home Team'] = home_team
        
        
        if i % 2 == 0:
            data.at[i,'Oreb Diff'] = data['OREB'][i] - data['OREB'][i+1]
            data.at[i,'3 Point Diff'] = data['FG3M'][i] - data['FG3M'][i+1]
            data.at[i,'TOV Margin'] = -1*(data['TOV'][i] - data['TOV'][i+1])
            data.at[i, 'FGA Diff'] = data['FGA'][i] - data['FGA'][i+1]
            data.at[i, 'FTA Diff'] = data['FTA'][i] - data['FTA'][i+1]
            
        if i % 2 !=0 :
            data.at[i,'Oreb Diff'] = -1*data['Oreb Diff'][i-1]
            data.at[i,'3 Point Diff'] = -1*data['3 Point Diff'][i-1] 
            data.at[i,'TOV Margin'] = -1*data['TOV Margin'][i-1] 
            data.at[i, 'FGA Diff'] = -1*data['FGA Diff'][i-1]
            data.at[i, 'FTA Diff'] = -1*data['FTA Diff'][i-1]
    
    data['Home Team'] = (data['Home Team'] == data['TEAM_ABBREVIATION'])*1
    return data


In [4]:
def get_gamelogs(start_year, end_year, season_type = 'Regular Season'):
    log = leaguegamelog.LeagueGameLog(season_type_all_star = season_type, season=start_year)
    log_df = log.get_data_frames()[0][['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION','GAME_ID',
       'GAME_DATE','WL','OREB','FG3M','TOV', 'FGA','FTA','FG_PCT','PLUS_MINUS', 'MATCHUP']]
    log_df = format_data(log_df)
    for i in range(start_year+1,end_year +1):
        szn = str(i)
        print(szn)
        temp_log = leaguegamelog.LeagueGameLog(season_type_all_star = season_type, season=szn)
        temp_df = temp_log.get_data_frames()[0][['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION','GAME_ID',
           'GAME_DATE','WL','OREB','FG3M','TOV','FGA','FTA','FG_PCT', 'PLUS_MINUS', 'MATCHUP']]
        temp_df = format_data(temp_df)
        log_df = pd.concat([log_df,temp_df], axis=0, ignore_index=True)
    return log_df
    

In [5]:
log_df = get_gamelogs(2017,2021)

log_df = log_df.dropna()
len(log_df)

2018
2019
2020
2021


11658

In [6]:
log_df = log_df.sort_values('GAME_ID')

In [7]:
def merge_data(new_data, old_data = None):
    new_game_data =  pd.merge(left=log_df, right=new_data, how='inner', on = ['GAME_ID','TEAM_ABBREVIATION'])
    if old_data is None:
        updated_df = new_game_data
    else:
        updated_df =  pd.concat([old_data, new_game_data])
    updated_df = updated_df.sort_values('GAME_ID')
    updated_df.to_html('game_data.html', index=False)
    return updated_df 

In [8]:
four_fact_df = pd.DataFrame(columns=['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'MIN', 'EFG_PCT', 'FTA_RATE', 'TM_TOV_PCT', 'OREB_PCT', 'OPP_EFG_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT'])
def get_ff(data, id_list):
    for i,j in enumerate(id_list):
        try:
            data = pd.concat([data,ff.BoxScoreFourFactorsV2(game_id=j).get_data_frames()[1]], ignore_index=True)
        except (ConnectionResetError, ConnectionError):
            print('ConnectionError')
            print(len(data))
            return data
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError) :
            print('req error')
            print(len(data))
            return data
             

    return data

In [417]:
game_id = log_df.GAME_ID.values
last_game_id = game_data.GAME_ID.values[-1]
last_game_id 
last_idx = np.where(log_df.GAME_ID == last_game_id)
remaining_ids = log_df['GAME_ID'].values[last_idx[0][0]:]
four_fact_df = pd.DataFrame(columns=['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'MIN', 'EFG_PCT', 'FTA_RATE', 'TM_TOV_PCT', 'OREB_PCT', 'OPP_EFG_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT'])
four_fact_df = get_ff(four_fact_df, remaining_ids)



In [418]:
game_data = merge_data(four_fact_df, game_data)


In [419]:

game_data

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,...,FG_PCT,PLUS_MINUS,MATCHUP,Oreb Diff,3 Point Diff,TOV Margin,FGA Diff,Home Team,FTA Diff,TEAM_ID_y
0,0021700001,1610612739,Cavaliers,CLE,Cleveland,240:00,0.488,0.301,0.167,0.188,...,,,,,,,,,,
1,0021700001,1610612738,Celtics,BOS,Boston,240:00,0.455,0.284,0.118,0.161,...,,,,,,,,,,
2,0021700001,1610612739,Cavaliers,CLE,Cleveland,240:00,0.488,0.301,0.167,0.188,...,,,,,,,,,,
3,0021700001,1610612738,Celtics,BOS,Boston,240:00,0.455,0.284,0.118,0.161,...,,,,,,,,,,
4,0021700002,1610612745,Rockets,HOU,Houston,240:00,0.562,0.196,0.120,0.185,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11041,0022101229,,Suns,PHX,Phoenix,240:00,0.476,0.146,0.107,0.286,...,0.408,-7.0,PHX vs. SAC,16,0,4,27,1.0,-15,1610612756
11045,0022101230,,Trail Blazers,POR,Portland,240:00,0.428,0.145,0.161,0.094,...,0.373,-31.0,POR vs. UTA,-10,0,1,1,1.0,-26,1610612757
11046,0022101230,,Jazz,UTA,Utah,240:00,0.506,0.463,0.169,0.300,...,0.451,31.0,UTA @ POR,10,0,-1,-1,0.0,26,1610612762
11044,0022101230,,Trail Blazers,POR,Portland,240:00,0.428,0.145,0.161,0.094,...,0.373,-31.0,POR vs. UTA,-10,0,1,1,1.0,-26,1610612757


In [421]:
game_data = game_data.drop_duplicates(subset=['GAME_ID', 'TEAM_ABBREVIATION'])
game_data

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,...,FG_PCT,PLUS_MINUS,MATCHUP,Oreb Diff,3 Point Diff,TOV Margin,FGA Diff,Home Team,FTA Diff,TEAM_ID_y
0,0021700001,1610612739,Cavaliers,CLE,Cleveland,240:00,0.488,0.301,0.167,0.188,...,,,,,,,,,,
1,0021700001,1610612738,Celtics,BOS,Boston,240:00,0.455,0.284,0.118,0.161,...,,,,,,,,,,
4,0021700002,1610612745,Rockets,HOU,Houston,240:00,0.562,0.196,0.120,0.185,...,,,,,,,,,,
5,0021700002,1610612744,Warriors,GSW,Golden State,240:00,0.638,0.263,0.170,0.154,...,,,,,,,,,,
11,0021700003,1610612766,Hornets,CHA,Charlotte,240:00,0.459,0.397,0.170,0.065,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11036,0022101228,,76ers,PHI,Philadelphia,240:00,0.551,0.261,0.111,0.233,...,0.523,12.0,PHI vs. DET,-5,-6,9,5,1.0,-6,1610612755
11042,0022101229,,Kings,SAC,Sacramento,240:00,0.618,0.395,0.147,0.051,...,0.526,7.0,SAC @ PHX,-16,0,-4,-27,0.0,15,1610612758
11040,0022101229,,Suns,PHX,Phoenix,240:00,0.476,0.146,0.107,0.286,...,0.408,-7.0,PHX vs. SAC,16,0,4,27,1.0,-15,1610612756
11045,0022101230,,Trail Blazers,POR,Portland,240:00,0.428,0.145,0.161,0.094,...,0.373,-31.0,POR vs. UTA,-10,0,1,1,1.0,-26,1610612757


## Create Features

We are going to make some predictors from the data we got:
1. EFG Difference: The Difference in EFG for the teams in the game

2. Turnover Percent Difference: The difference in % turnovers for one team vs. their opponent

3. Offensive Rebound Percent Difference: The difference in OREB % for a team and their opponent

4. FT Rate Difference: Difference in free throw rates for the team and their opponent



In [3]:
game_data = pd.read_csv('game_data.csv', converters={'GAME_ID': str})[0]



game_data = game_data.drop_duplicates(subset=['GAME_ID', 'TEAM_ABBREVIATION'])


In [10]:
game_data = game_data.drop(['TEAM_ID'], axis=1)

In [11]:
game_data['EFG_Diff'] = game_data.EFG_PCT - game_data.OPP_EFG_PCT
game_data['TOV_PCT_Diff'] = -1*(game_data.TM_TOV_PCT - game_data.OPP_TOV_PCT)
game_data['OREB_PCT_Diff'] = game_data.OREB_PCT - game_data.OPP_OREB_PCT
game_data['FT_RATE_Diff'] = game_data.FTA_RATE - game_data.OPP_FTA_RATE
game_data['OREB_TOV'] = game_data.OREB_PCT_Diff + game_data.TOV_PCT_Diff


In [12]:
teams_list = log_df['TEAM_ABBREVIATION'].unique()

In [13]:
df_no_2021 = game_data[game_data['SEASON_ID']!=22021]
df_2021 = game_data[game_data['SEASON_ID']==22021]


log_df_raps_2021 = df_2021[(df_2021['TEAM_ABBREVIATION']=='TOR')].copy().reset_index()



## Model Fitting

Let's make some models! The model framework, in general, is as follows:

1. Define a population from which we will collect our sample data
   
    * Here, our population is all of the NBA Games and we sample the games from 2017-18 to 2020-2021

2. Define the set of inputs you want the model to consider when making a prediction:
    
    * We will use the 4 feautres we created above as our inputs to the model. These inputs will give the model the information it needs to make a prediction. We will consider multiple models that use different combinations of these 4 inputs and compare the results.

3. Define a response variable. What is the thing we want the model to predict?

    * We will use the game outcome, win or loss, to be our response variable, denoted as 'W' for win, and 'L' for loss. The model itself will give a probability of a win, that will be between 0 and 1 (i.e., a probability of 1 would be a 100% chance of victory, a probability of 0.5 reflects a 50% chance of victory and so on...). We will define a win as a probability of victory of 0.5 (50% chance) or higher.




In [14]:
def log_reg_games(model, test_data, features):
    df = pd.DataFrame(columns = ['Team', 'Accuracy', 'Correct Predictions', 'Missed W\'s', 'Missed L\'s'])
    df_stats_w = pd.DataFrame(columns = ['Team', 'Oreb Diff Avg.', 'EFG Diff Avg.', 'TOV Margin Avg.', 'FTR Diff Avg.', 'OREB_TOV AVG.','Sample Size'])
    df_stats_l = pd.DataFrame(columns = ['Team', 'Oreb Diff Avg.', 'EFG Diff Avg.', 'TOV Margin Avg.', 'FTR Diff Avg.', 'OREB_TOV AVG.','Sample Size'])
    
    np.seterr(all="ignore")
    warnings.filterwarnings('ignore')
    for i,j in enumerate(teams_list): 
        
        test = test_data[test_data['TEAM_ABBREVIATION'] == j].copy()
        
        ytest = (test['WL']=='W')*1
        
        xtest = test[features]
        

    
        pred = model.predict(xtest)
        
        test['Prediction Acc'] = (ytest == pred)
        test['Win Prob'] = model.predict_proba(xtest)[:,1]
        acc = accuracy_score(ytest, pred)
       


        false_w=test[(test['Prediction Acc']==False)& (test['WL']=='W')]
        oreb_avg = false_w['OREB_PCT_Diff'].values.mean()*100
        three_avg = false_w['EFG_Diff'].values.mean()*100
        tov_avg = false_w['TOV_PCT_Diff'].values.mean()*100
        ftr_avg = false_w['FT_RATE_Diff'].values.mean()*100
        oreb_tov = false_w['OREB_TOV'].values.mean()*100
        n = len(false_w)
        

        df.loc[i] = [j, acc, int(acc*82), n, 82- int(acc*82)-n]
        df_stats_w.loc[i] = [j, oreb_avg, three_avg, tov_avg, ftr_avg, oreb_tov,n]

        false_l=test[(test['Prediction Acc']==False)& (test['WL']=='W')]
        oreb_avg = false_l['OREB_PCT_Diff'].values.mean()*100
        three_avg = false_l['EFG_Diff'].values.mean()*100
        tov_avg = false_l['TOV_PCT_Diff'].values.mean()*100
        ftr_avg = false_l['FT_RATE_Diff'].values.mean()*100
        oreb_tov = false_l['OREB_TOV'].values.mean()*100
        n = len(false_l)
        

        df_stats_l.loc[i] = [j, oreb_avg, three_avg, tov_avg, ftr_avg, oreb_tov,n]
    df = df.sort_values('Accuracy', ascending=False).reset_index(drop=True)   
    df_stats_w = df_stats_w[df_stats_w['Sample Size'] > 5]
    return (df, df_stats_w, df_stats_l)

### Model 1 : Use EFG Difference, TOV Percent Difference, and Free Throw Rate Difference as our inputs

* We will train this model by using the above data from the games in the  2017-2018 season to the 2020-2021 season

In [15]:
ytrain = (game_data['WL']=='W')*1
xtrain = game_data[[ 'EFG_Diff','FT_RATE_Diff']]
log_reg = sm.Logit(ytrain,sm.add_constant(xtrain).astype(float))
result = log_reg.fit_regularized(method='l1', alpha=1)
print(result.summary())
lr = LogisticRegression(penalty='l1', solver='saga')
lr.fit(xtrain, ytrain)
lr.coef_


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.41493398509643886
            Iterations: 53
            Function evaluations: 54
            Gradient evaluations: 53
                           Logit Regression Results                           
Dep. Variable:                     WL   No. Observations:                11658
Model:                          Logit   Df Residuals:                    11655
Method:                           MLE   Df Model:                            2
Date:                Tue, 02 Aug 2022   Pseudo R-squ.:                  0.4042
Time:                        16:28:21   Log-Likelihood:                -4807.2
converged:                       True   LL-Null:                       -8068.2
Covariance Type:            nonrobust   LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

array([[25.331214  ,  4.56502391]])

We can get a general idea of the model's ability to make prediction by measuring its accuracy. Accuracy is the number of correct predictions divided by the total number of predictions:

$Accuracy = \frac{ Number Of Correct Predictions}{Total Number Of Predictions}$


We can use a technique called Cross-validation to see the average accuracy rate of this model over the set of data we gave the model to learn. Essentially, we partition the data into equal and distinct segments, train the model over all of these segments except for one, which we use to predict on and calculate accuracy. Each segment gets a turn to be the segment on which we  make a prediction. Then, we take the average of all of the accuracies.

In [16]:
skf = StratifiedKFold(10)
cv = cross_val_score(lr,xtrain, ytrain, cv=skf, scoring='accuracy').mean()
print(f'The average accuracy over the training data is {cv}')

The average accuracy over the training data is 0.8170410559559478


So, the model is performing quite well (as expected) over the training data (2017-2021). Now, let's train the model over the entire training dataset without leaving out any segments for cross-validation. Then, we will test this model by predicting every game, from every team, in the 2021-2022 season.

In [17]:
team_pred_df = log_reg_games(lr, df_2021, ['EFG_Diff',  'FT_RATE_Diff'])


### Model Performance:

In [18]:
team_pred_df[0]

Unnamed: 0,Team,Accuracy,Correct Predictions,Missed W's,Missed L's
0,DAL,0.926829,76,5,1
1,MIN,0.926829,76,6,0
2,CHA,0.926829,76,4,2
3,POR,0.914634,75,4,3
4,OKC,0.902439,74,4,4
5,SAC,0.890244,73,4,5
6,ATL,0.890244,73,5,4
7,MIA,0.890244,73,7,2
8,BOS,0.878049,72,4,6
9,BKN,0.865854,71,5,6


In [19]:
import plotly.graph_objects as go
import pandas as pd
import datapane as dp
!datapane login --token=3ef0fb283073ac0369b8ab4b222796f69253b2d7




Connected successfully to https://datapane.com as rpandeya29@gmail.com


In [20]:
from msilib.schema import PublishComponent


fig = go.Figure(data=[go.Table(
    header=dict(values=list(team_pred_df[0].columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[team_pred_df[0].Team, team_pred_df[0].Accuracy, team_pred_df[0]['Correct Predictions'], team_pred_df[0]['Missed W\'s'], team_pred_df[0]['Missed L\'s']],
               fill_color='lavender',
               align='left'))
])

fig.update_layout(title_text = "Model Performance for Each Team's 2021-2022 Season")
fig.show()
dp.Report(
    dp.Plot(fig)).upload(name='Model Performance Table', publicly_visible=True)

Uploading report and associated data - *please wait...*

Your report only contains a single element - did you know you can include additional plots, tables and text in a single report? More info <a href='https://docs.datapane.com/reports/blocks/layout-pages-and-selects' target='_blank'>here</a>

Report successfully uploaded. View and share your report <a href='https://datapane.com/reports/8AV6LaA/model-performance-table/' target='_blank'>here</a>, or edit your report <a href='https://datapane.com/reports/8AV6LaA/model-performance-table/edit/' target='_blank'>here</a>.

Some teams had their season's predicted nearly perfectly, like the Mavericks, Hornets, Trail Blazers, and Timberwolves. For others, like the Bulls and Knicks, the model had below average performances. The model performed worst of all on the Raptors, with a 70% accuracy. This is still fairly good in absolute terms, but relative to the average from the cross validation of 82% and the prediction of the other teams' games in 2021-22, it is subpar. Let's look at why that is:

* The model primarily failed to detect wins from the Raptors. The model misclassified 19 games from the Raptors' season, and 15 of those games were wins.

* Let's take a look at some numbers from the misclassified wins that the model made for every team:

In [21]:
def get_team_rank(data, categories, asc ):
    
    rank_df = pd.DataFrame(columns=['Team'])
    team_list = data.Team.unique().tolist()
  
    for i in range(len(categories)):
        df_cat = data.sort_values(categories[i], ascending = asc[i]).reset_index()
       
       
        rank_df[categories[i] + 'Rank'] = ''
        
        

        for count, team in enumerate(team_list):
           rank_df.loc[count, 'Team'] = team
           rank =  df_cat.index[df_cat['Team']==team][0] +1
           
           team_idx = rank_df.index[rank_df['Team']==team]
           rank_df.loc[team_idx, categories[i] + 'Rank'] = rank


    rank_df['Avg. Rank'] = rank_df.iloc[:,1:].sum(axis=1)/len(categories)
    return rank_df.sort_values('Avg. Rank')
           


        

### Sort by Average OREB % Difference

In [22]:
team_pred_df[1].sort_values('Oreb Diff Avg.', ascending = False).reset_index(drop = True)

Unnamed: 0,Team,Oreb Diff Avg.,EFG Diff Avg.,TOV Margin Avg.,FTR Diff Avg.,OREB_TOV AVG.,Sample Size
0,NYK,8.84,-3.2,1.41,2.16,10.25,10
1,NOP,8.1,-3.963636,5.618182,5.718182,13.718182,11
2,MEM,3.5,-1.84,3.386667,-2.653333,6.886667,15
3,CHI,3.4875,-3.0125,3.525,-2.875,7.0125,8
4,TOR,1.966667,-3.708333,5.325,-2.708333,7.291667,24
5,MIL,1.725,-1.9,2.2125,2.1125,3.9375,8
6,DEN,-0.19,-3.15,0.69,-0.44,0.5,10
7,GSW,-0.257143,-3.385714,3.085714,-1.871429,2.828571,7
8,MIA,-0.757143,-2.514286,6.742857,1.842857,5.985714,7
9,DET,-0.89,-2.69,2.1,-8.38,1.21,10


The raptors were second in average difference in their OREB% and their Opponent's OREB% in the games that the model incorrectly classified as losses throughout the league. 

### Sort by Average Turnover % Difference:

In [23]:
team_pred_df[1].sort_values('TOV Margin Avg.', ascending = False).reset_index(drop = True)

Unnamed: 0,Team,Oreb Diff Avg.,EFG Diff Avg.,TOV Margin Avg.,FTR Diff Avg.,OREB_TOV AVG.,Sample Size
0,MIA,-0.757143,-2.514286,6.742857,1.842857,5.985714,7
1,CLE,-4.733333,-4.45,5.95,11.466667,1.216667,6
2,NOP,8.1,-3.963636,5.618182,5.718182,13.718182,11
3,PHX,-6.02,-0.65,5.49,-6.22,-0.53,10
4,TOR,1.966667,-3.708333,5.325,-2.708333,7.291667,24
5,MIN,-2.45,-1.233333,3.916667,-8.4,1.466667,6
6,CHI,3.4875,-3.0125,3.525,-2.875,7.0125,8
7,SAS,-4.171429,-3.057143,3.5,1.0,-0.671429,7
8,MEM,3.5,-1.84,3.386667,-2.653333,6.886667,15
9,ORL,-2.283333,-2.4,3.233333,-1.933333,0.95,6


The raptors were third in average difference in their opponent's TOV% and their TOV% in the games that the model incorrectly classified as losses throughout the league. 

### Lets Look at the Average EFG Difference:

In [24]:
team_pred_df[1].sort_values('EFG Diff Avg.', ascending = True).reset_index(drop = True)

Unnamed: 0,Team,Oreb Diff Avg.,EFG Diff Avg.,TOV Margin Avg.,FTR Diff Avg.,OREB_TOV AVG.,Sample Size
0,CLE,-4.733333,-4.45,5.95,11.466667,1.216667,6
1,NOP,8.1,-3.963636,5.618182,5.718182,13.718182,11
2,TOR,1.966667,-3.708333,5.325,-2.708333,7.291667,24
3,GSW,-0.257143,-3.385714,3.085714,-1.871429,2.828571,7
4,NYK,8.84,-3.2,1.41,2.16,10.25,10
5,DEN,-0.19,-3.15,0.69,-0.44,0.5,10
6,SAS,-4.171429,-3.057143,3.5,1.0,-0.671429,7
7,CHI,3.4875,-3.0125,3.525,-2.875,7.0125,8
8,DET,-0.89,-2.69,2.1,-8.38,1.21,10
9,MIA,-0.757143,-2.514286,6.742857,1.842857,5.985714,7


The Raptors were outshot by the widest margin, on average, in games that the model incorrectly classified as losses.

### Let's Look at the Average Free Throw Rate Difference:

Let's extend this to also look at the average turnover percent difference and the average EFG difference. Here, Turnover Percent Difference rank will be high (i.e, 1 is the highest) when teams force more turnovers on average than they surrender. EFG Rank will be high in this case if the team shoots worse than their opponent on average. 

In [25]:
team_pred_df[1].sort_values('FTR Diff Avg.', ascending = True).reset_index(drop = True)

Unnamed: 0,Team,Oreb Diff Avg.,EFG Diff Avg.,TOV Margin Avg.,FTR Diff Avg.,OREB_TOV AVG.,Sample Size
0,MIN,-2.45,-1.233333,3.916667,-8.4,1.466667,6
1,DET,-0.89,-2.69,2.1,-8.38,1.21,10
2,PHX,-6.02,-0.65,5.49,-6.22,-0.53,10
3,CHI,3.4875,-3.0125,3.525,-2.875,7.0125,8
4,TOR,1.966667,-3.708333,5.325,-2.708333,7.291667,24
5,MEM,3.5,-1.84,3.386667,-2.653333,6.886667,15
6,ORL,-2.283333,-2.4,3.233333,-1.933333,0.95,6
7,GSW,-0.257143,-3.385714,3.085714,-1.871429,2.828571,7
8,DEN,-0.19,-3.15,0.69,-0.44,0.5,10
9,SAS,-4.171429,-3.057143,3.5,1.0,-0.671429,7


The Raptors, on average, had the 4th worst free throw rate disparity in games that the model incorrectly predicted as losses

### Let's Summaraize this

We can take these rankings and calculate an average ranking of the 4 categories.

In [26]:
rank_df = get_team_rank(team_pred_df[1], ['Oreb Diff Avg.','TOV Margin Avg.', 'EFG Diff Avg.', 'FTR Diff Avg.'], [False,  False, True, True])
rank_df

Unnamed: 0,Team,Oreb Diff Avg.Rank,TOV Margin Avg.Rank,EFG Diff Avg.Rank,FTR Diff Avg.Rank,Avg. Rank
13,TOR,5,5,3,5,4.5
8,NOP,2,3,2,15,5.5
14,CHI,4,7,8,4,5.75
1,GSW,8,11,4,8,7.75
3,MIA,9,1,10,12,8.0
7,MEM,3,9,14,6,8.0
0,CLE,15,2,1,16,8.5
2,DET,10,14,9,2,8.75
10,MIN,13,6,15,1,8.75
15,NYK,1,15,5,14,8.75


In [27]:
fig_rank = go.Figure(data=[go.Table(
    header=dict(values=list(rank_df.columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[rank_df['Team'], rank_df['Oreb Diff Avg.Rank'], rank_df['TOV Margin Avg.Rank'], rank_df['EFG Diff Avg.Rank'], rank_df['FTR Diff Avg.Rank'], rank_df['Avg. Rank']],
               fill_color='lavender',
               align='left'))
])

fig_rank.update_layout(title_text = "Team Rankings in 4 Factor Categories in Missed Wins")
fig_rank.show()
dp.Report(
    dp.Plot(fig_rank)).upload(name='Missed Wins Ranks', publicly_visible=True)

Uploading report and associated data - *please wait...*

Your report only contains a single element - did you know you can include additional plots, tables and text in a single report? More info <a href='https://docs.datapane.com/reports/blocks/layout-pages-and-selects' target='_blank'>here</a>

Report successfully uploaded. View and share your report <a href='https://datapane.com/reports/n3RV513/missed-wins-ranks/' target='_blank'>here</a>, or edit your report <a href='https://datapane.com/reports/n3RV513/missed-wins-ranks/edit/' target='_blank'>here</a>.

 This leads to the Raptors highest total rank, which is an average of the four ranked categories.

### Let's Summaraize What we Know So Far

* This model with EFG Difference, Turnover % Difference, and Free Throw Rate Difference is generally very good at explaining who would win a basketball game

* While the model's performance on the Raptors' 2021-2022 season has a solid prediction accuracy of 76%, this is generally underwhelming in comparison to the model's performance on other teams' seasons across the league.

* We saw that the model failed mostly in predicting wins from the Raptors, with 15 of the 19 incorrect predictions being games that the Raptors won.


* In these games, the Raptors were outshot and out free-throwed, while forcing more turnovers and offensive rebounds than the rest of the league on average.

## Examining the Missed Wins in Depth

In [28]:
xtst = log_df_raps_2021[['EFG_Diff',   'FT_RATE_Diff']]
ytst = (log_df_raps_2021.WL == 'W')*1
ypred = lr.predict(xtst)
yproba = lr.predict_proba(xtst)
log_df_raps_2021['Win Probability'] = yproba[:,1]
pred_acc = ytst == ypred
log_df_raps_2021['Prediction Accuracy'] = pred_acc
wrong_pred  = log_df_raps_2021[log_df_raps_2021['Prediction Accuracy']== False]

wrong_predicted_w = wrong_pred[wrong_pred["WL"] == 'W']


wins_df = wrong_predicted_w[['GAME_ID','GAME_DATE','MATCHUP','PLUS_MINUS','EFG_PCT','OPP_EFG_PCT', 'EFG_Diff', 'TOV_PCT_Diff','TOV Margin', 'OREB_PCT_Diff','Oreb Diff','FT_RATE_Diff', 'Win Probability']].sort_values('FT_RATE_Diff')


In [29]:
len(wins_df)

24

In [30]:
games_no_shot = log_df_raps_2021[(log_df_raps_2021['WL']=='W')& (log_df_raps_2021['EFG_Diff']<0)& (log_df_raps_2021['FT_RATE_Diff']<=0)]['GAME_ID']

In [31]:
games_no_shot.isin(wins_df.GAME_ID).sum()/len(games_no_shot)

1.0

In [109]:
not_efg = wins_df[(wins_df['EFG_Diff']<=0)]




In [93]:
not_efg_report = not_efg[['GAME_DATE','MATCHUP','EFG_Diff','TOV_PCT_Diff','OREB_PCT_Diff','FT_RATE_Diff', 'Win Probability'  ]].reset_index(drop=True)
not_efg_report = not_efg_report.rename({'GAME_DATE': 'Date','MATCHUP': 'Matchup','EFG_Diff': 'EFG% Diff','TOV_PCT_Diff': 'TOV% Diff','OREB_PCT_Diff': 'OREB% Diff','FT_RATE_Diff': 'FTR% Diff', 'Win Probability': 'Probability' }, axis='columns')

In [116]:
raps_avg = pd.DataFrame(np.round(not_efg_report.mean(axis=0),3)*100, columns = ['Mean Value  (%)'])

report = dp.Report(
    dp.Text('# Raptors Average Stats for Missed Wins with Lost EFG and FTR Differentials'),
    
    dp.Table(raps_avg)
    )



report.upload(formatting = dp.ReportFormatting( text_alignment=dp.TextAlignment.LEFT),name='Raptors Averages', publicly_visible=True)

Uploading report and associated data - *please wait...*

Your report only contains a single element - did you know you can include additional plots, tables and text in a single report? More info <a href='https://docs.datapane.com/reports/blocks/layout-pages-and-selects' target='_blank'>here</a>

Report successfully uploaded. View and share your report <a href='https://datapane.com/reports/vAqvwM3/raptors-averages/' target='_blank'>here</a>, or edit your report <a href='https://datapane.com/reports/vAqvwM3/raptors-averages/edit/' target='_blank'>here</a>.

In [100]:

report = dp.Report(
    dp.Text('# Missed Wins Where the Raptors Lost the EFG Differential'),
    dp.Text('Game Statistics'),
    dp.Table(not_efg_report[['Date', 'Matchup','EFG% Diff', 'TOV% Diff', 'OREB% Diff', 'FTR% Diff']]),
    dp.Text('Win Probabilities') ,
    dp.Table(not_efg_report[['Date', 'Matchup', 'Probability']])
    )



report.upload(formatting = dp.ReportFormatting( text_alignment=dp.TextAlignment.LEFT),name='Raptors Poor shooting Wins Stats', publicly_visible=True)

Uploading report and associated data - *please wait...*

Report successfully uploaded. View and share your report <a href='https://datapane.com/reports/dA6R8MA/raptors-poor-shooting-wins-stats/' target='_blank'>here</a>, or edit your report <a href='https://datapane.com/reports/dA6R8MA/raptors-poor-shooting-wins-stats/edit/' target='_blank'>here</a>.

In [70]:
lost_efg = go.Figure(data=[go.Table(
    header=dict(values=['Date','Game', 'EFG% Diff', 'TOV% Diff', 'OREB% Diff','FTR Diff', 'Probability'],
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[not_efg['GAME_DATE'],not_efg['MATCHUP'], np.round(not_efg['EFG_Diff'],4), np.round(not_efg['TOV_PCT_Diff'],4),np.round(not_efg['OREB_PCT_Diff'],4),np.round(not_efg['FT_RATE_Diff'],4), np.round(not_efg['Win Probability'],4)],
               fill_color='lavender',
               align='left'))
])

lost_efg.update_layout(title_text = "Missed Wins from the Raptors where they Lost the EFG Differential")
lost_efg.show()
report = dp.Report(
    dp.Plot(lost_efg)
    )



report.upload(formatting = dp.ReportFormatting(width=dp.ReportWidth.FULL, text_alignment=dp.TextAlignment.LEFT),name='Missed Wins EFG Games', publicly_visible=True)

Uploading report and associated data - *please wait...*

Your report only contains a single element - did you know you can include additional plots, tables and text in a single report? More info <a href='https://docs.datapane.com/reports/blocks/layout-pages-and-selects' target='_blank'>here</a>

Report successfully uploaded. View and share your report <a href='https://datapane.com/reports/0keDR57/missed-wins-efg-games/' target='_blank'>here</a>, or edit your report <a href='https://datapane.com/reports/0keDR57/missed-wins-efg-games/edit/' target='_blank'>here</a>.

In [34]:
not_efg

Unnamed: 0,GAME_ID,GAME_DATE,MATCHUP,PLUS_MINUS,EFG_PCT,OPP_EFG_PCT,EFG_Diff,TOV_PCT_Diff,TOV Margin,OREB_PCT_Diff,Oreb Diff,FT_RATE_Diff,Win Probability
7,22100099,2021-11-01,TOR @ NYK,9.0,0.505,0.566,-0.061,0.087,8.0,0.032,6.0,-0.175,0.07542
70,22101069,2022-03-20,TOR @ PHI,5.0,0.414,0.459,-0.045,0.013,1.0,0.176,15.0,-0.137,0.127028
79,22101197,2022-04-07,TOR vs. PHI,5.0,0.577,0.62,-0.043,0.063,6.0,0.02,6.0,-0.135,0.133808
28,22100430,2022-02-03,TOR vs. CHI,7.0,0.487,0.544,-0.057,0.05,5.0,0.068,12.0,-0.099,0.113247
39,22100599,2022-01-09,TOR vs. NOP,4.0,0.506,0.533,-0.027,-0.001,0.0,0.062,8.0,-0.098,0.215261
6,22100081,2021-10-30,TOR @ IND,3.0,0.484,0.512,-0.028,0.026,2.0,-0.022,4.0,-0.068,0.234715
66,22101009,2022-03-12,TOR @ DEN,12.0,0.529,0.622,-0.093,0.111,11.0,0.086,14.0,-0.063,0.057023
80,22101206,2022-04-08,TOR vs. HOU,2.0,0.495,0.593,-0.098,0.086,9.0,0.025,6.0,-0.056,0.05214
12,22100171,2021-11-11,TOR @ PHI,6.0,0.51,0.517,-0.007,-0.0,0.0,0.025,7.0,-0.039,0.373428
25,22100385,2021-12-10,TOR vs. NYK,3.0,0.414,0.414,0.0,0.076,7.0,-0.124,-2.0,-0.032,0.423539


In [27]:
len(wins_df[(wins_df['EFG_Diff']<0)& (wins_df['OREB_PCT_Diff']>0) & (wins_df['TOV_PCT_Diff']>0)])

10

In [28]:
len(wins_df[(wins_df['EFG_Diff']<0)& (wins_df['OREB_PCT_Diff']<=0) & (wins_df['TOV_PCT_Diff']>0)])

5

In [29]:
len(wins_df[(wins_df['EFG_Diff']<0)& (wins_df['OREB_PCT_Diff']>0) & (wins_df['TOV_PCT_Diff']<=0)])

5

* In 14 of the 15 wins that the model did not predict, the Raptors won the offensive rebounding percent differential. In the one game that they did not against the Pacers on October 30th, they still won the raw offensive rebounding battle by 4 rebounds. 

* In every single one of these games, the Raptors were outshot in terms of effective field goal percentage. 

* In all of these games, the Raptors either won the Turnover percentage differential, or lost it by less than 2%. Further, they never lost the raw turnover battle by more than 1 turnover.

* Typically, in games when the Raptors are far behind in the free throw rate differential, they manage to win both the turnover percentage and offensive rebounding differential. It doesn't seem however, that free throw rate has a huge impact in either direction on prediction as there's many games where the Raptors had a higher and lower free throw rate than their opponent. 

* So what we see is that when the Raptors are winning these games where they get outshot by forcing turnovers and hitting the offensive boards. These wins show that maybe this model overvalues shooting and undervalues turnover percentage difference for the Raptors, specifically. In other words, it seems like under these parameters, the 2021-2022 Raptors are not representative of the general NBA population relative to other teams.  

