In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

Raw Data Initial Cleaning

In [None]:
#Central research question: how many outlier players does a team need in order to be an outlier team?

# Read the Excel workbook into the `nba` DataFrame.
nba = pd.read_excel(io="NBA_Stats_71_Years_Updated.xlsx")

In [None]:
# Missing-value count for every column of the raw frame.
print("Total Dataframe null counts\n\n", nba.isna().sum())

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so wrapping it
# in print() appended a stray "None". Print the header first, then call info() on its own.
print("Total Dataframe info\n")
nba.info()

In [None]:
#Lookup helper in case we need to find a player's rows later
def findPlayer(s):
    """Return the rows of the global `nba` frame whose "Player" value equals `s`."""
    isThisPlayer = nba["Player"] == s
    return nba.loc[isThisPlayer]

In [None]:
#Raw Data + turning it into regular season / playoff
nba = pd.read_excel("NBA_Stats_71_Years_Updated.xlsx")

# Positional row numbers of each season type (np.where returns a 1-tuple of arrays).
regInd = np.where(nba["Season Type"] == "Regular Season")
playInd = np.where(nba["Season Type"] == "Playoffs")

# .copy() gives each season type its own independent frame. Later cells add and overwrite
# columns on nbaRegular / nbaPlayoff; doing that on a bare slice of `nba` triggers
# SettingWithCopyWarning and leaves it ambiguous whether the write lands on a view or a copy.
nbaRegular = nba.iloc[regInd[0], :].copy()
nbaPlayoff = nba.iloc[playInd[0], :].copy()

In [None]:
# Missing-value count for every column of the regular-season frame.
print("Null counts from regular season df\n", nbaRegular.isna().sum())

In [None]:
# Print a header, then the regular-season frame's column / dtype / non-null summary.
# DataFrame.info() writes its own report to stdout, so it is called bare rather than inside print().
print("Regular season Info:")
nbaRegular.info()

In [None]:
# Missing-value count for every column of the playoff frame.
# (The original reported nbaRegular here, so the playoff frame was never actually inspected.)
print("Null counts from Playoff df:\n", nbaPlayoff.isnull().sum())

In [None]:
# Column / dtype / non-null summary of the playoff frame.
# (The original called nbaRegular.info() under the "Playoff Info:" header.)
print("Playoff Info:")
nbaPlayoff.info()

Adding columns - and removing nan Rows - from the two dataframes

In [None]:
#Regular Season: derive the shooting columns that feed the PPS calculation

# Two-point attempts / makes are what remains after removing the three-point counts.
nbaRegular["2-Pt FG Attempts"] = nbaRegular["FG Attempts"].sub(nbaRegular["3-Pt FG Attempts"])
nbaRegular["2-Pt FG Made"] = nbaRegular["FG Made"].sub(nbaRegular["3-Pt FG Made"])
# EFG: a made three counts 1.5x a made two, divided by all field-goal attempts.
nbaRegular["EFG"] = (
    nbaRegular["2-Pt FG Made"]
    .add(nbaRegular["3-Pt FG Made"].mul(1.5))
    .div(nbaRegular["FG Attempts"])
)

#Points minus made free throws (missed attempts score nothing). Dividing free throw attempts by 2 was considered but felt unintuitive.
nbaRegular["Points-FT"] = nbaRegular["Points Scored"].sub(nbaRegular["FT Made"])
#Points per shot
nbaRegular["PPS"] = nbaRegular["Points-FT"].div(nbaRegular["FG Attempts"])

nbaRegular["FGA/Game"] = nbaRegular["FG Attempts"].div(nbaRegular["Games Played"])

# Year and Team ID are stored with the pandas "string" dtype from here on.
nbaRegular["Year"] = nbaRegular["Year"].astype("string")
nbaRegular["Team ID"] = nbaRegular["Team ID"].astype("string")

nbaRegular["FT/Game"] = nbaRegular["FT Attempts"].div(nbaRegular["Games Played"])

In [None]:
#Playoff: derive the shooting columns that feed the PPS calculation

# Two-point attempts / makes are what remains after removing the three-point counts.
nbaPlayoff["2-Pt FG Attempts"] = nbaPlayoff["FG Attempts"].sub(nbaPlayoff["3-Pt FG Attempts"])
nbaPlayoff["2-Pt FG Made"] = nbaPlayoff["FG Made"].sub(nbaPlayoff["3-Pt FG Made"])
# EFG: a made three counts 1.5x a made two, divided by all field-goal attempts.
nbaPlayoff["EFG"] = (
    nbaPlayoff["2-Pt FG Made"]
    .add(nbaPlayoff["3-Pt FG Made"].mul(1.5))
    .div(nbaPlayoff["FG Attempts"])
)

#Points minus made free throws (missed attempts score nothing). Dividing free throw attempts by 2 was considered but felt unintuitive.
nbaPlayoff["Points-FT"] = nbaPlayoff["Points Scored"].sub(nbaPlayoff["FT Made"])
#Points per shot
nbaPlayoff["PPS"] = nbaPlayoff["Points-FT"].div(nbaPlayoff["FG Attempts"])

nbaPlayoff["FGA/Game"] = nbaPlayoff["FG Attempts"].div(nbaPlayoff["Games Played"])

# Year and Team ID are stored with the pandas "string" dtype from here on.
nbaPlayoff["Year"] = nbaPlayoff["Year"].astype("string")
nbaPlayoff["Team ID"] = nbaPlayoff["Team ID"].astype("string")

nbaPlayoff["FT/Game"] = nbaPlayoff["FT Attempts"].div(nbaPlayoff["Games Played"])

In [None]:
# Disabled: an earlier attempt at dropping rows whose PPS is NaN. Nothing in this cell runs.
# NOTE(review): even if re-enabled it would drop nothing, because `x == np.nan` is False
# for every value (NaN never compares equal to anything, including itself).
# rows_to_remove = np.where(nbaRegular["PPS"] == np.nan)
# # Removing the specified rows from the DataFrame
# nbaRegular = nbaRegular.drop(rows_to_remove[0])

# rows_to_remove = np.where(nbaPlayoff["PPS"] == np.nan)
# # Removing the specified rows from the DataFrame
# nbaPlayoff = nbaPlayoff.drop(rows_to_remove[0])

Initial EDA on Regular Season, Exploring Correlations with our Features

In [None]:
fig, ax = plt.subplots(figsize=(14, 8))

# Correlate numeric columns only: the frame also holds text columns (e.g. "Year" and
# "Team ID" were cast to the string dtype), and DataFrame.corr() raises on those in pandas >= 2.0.
C = nbaRegular.select_dtypes(include="number").corr()

ax.set_title("Correlation of Regular Season Metrics", fontsize=14, weight="bold")

# Draw onto the axes created above so the figsize and title apply to the heatmap.
ax = sns.heatmap(C, ax=ax)
# plt.show must be *called*; a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
# Summary statistics for regular-season PPS, then PPS's column of the correlation matrix C.
print("Regular season Point per shot description:")
print(nbaRegular["PPS"].describe(), end="\n\n")
print("-----------------------------", end="\n\n")
print("Raw Correlation Value of all columns with Points per Shot:")
print(C.loc[:, "PPS"])
print("Notably")

In [None]:
# Summary statistics for regular-season free-throw attempts.
print("Regular season Free Throw Attempts description:")
print(nbaRegular["FT Attempts"].describe(), end="\n\n")
print("-----------------------------", end="\n\n")

print("Raw Correlation Value of all columns with FT Attempts:")
# Last expression of the cell, so the notebook displays this column of C.
C.loc[:, "FT Attempts"]

In [None]:
# Summary statistics for regular-season field-goal attempts.
print("Regular season Field Goal Attempts description:")
print(nbaRegular["FG Attempts"].describe(), end="\n\n")
print("-----------------------------", end="\n\n")
print("Raw Correlation Value of all columns with FG Attempts:")

# Last expression of the cell, so the notebook displays this column of C.
C.loc[:, "FG Attempts"]

Initial EDA on Playoff Data, Exploring Correlations with our Features

In [None]:
fig, ax = plt.subplots(figsize=(14, 8))

# Correlate numeric columns only: the frame also holds text columns (e.g. "Year" and
# "Team ID" were cast to the string dtype), and DataFrame.corr() raises on those in pandas >= 2.0.
C2 = nbaPlayoff.select_dtypes(include="number").corr()

ax.set_title("Correlation of Playoff Metrics", fontsize=14, weight="bold")

# Plot the PLAYOFF matrix (C2). The original passed C, so this figure silently repeated
# the regular-season heatmap under a playoff title.
ax = sns.heatmap(C2, ax=ax)
# plt.show must be *called*; a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
# Summary statistics for playoff PPS.
print("Playoff Point per shot description:")
print(nbaPlayoff["PPS"].describe(), end="\n\n")
print("-----------------------------", end="\n\n")
print("Raw Correlation Value of all columns with Points per Shot:")
# Last expression of the cell, so the notebook displays this column of C2.
C2.loc[:, "PPS"]

In [None]:
# Summary statistics for playoff free-throw attempts.
print("Playoff Free Throw Attempts description:")
print(nbaPlayoff["FT Attempts"].describe(), end="\n\n")
print("-----------------------------", end="\n\n")
print("Raw Correlation Value of all columns with FT Attempts:")
# Last expression of the cell, so the notebook displays this column of C2.
C2.loc[:, "FT Attempts"]

In [None]:
# Summary statistics for playoff field-goal attempts.
print("Playoff Field Goal Attempts description:")
print(nbaPlayoff["FG Attempts"].describe(), end="\n\n")
print("-----------------------------", end="\n\n")
print("Raw Correlation Value of all columns with FG Attempts:")

# Last expression of the cell, so the notebook displays this column of C2.
C2.loc[:, "FG Attempts"]

Removing nan rows

In [None]:
# Drop every row whose PPS is NaN (PPS = "Points-FT" / "FG Attempts", so it is NaN when
# that division is 0/0 or an input is missing).
#
# The original tested `nbaRegular["PPS"] == np.nan`, which is False for every row because
# NaN never compares equal to anything, so no rows were ever removed. It also passed the
# positional row numbers from np.where to DataFrame.drop, which expects index labels.
nbaRegular = nbaRegular.dropna(subset=["PPS"])
nbaPlayoff = nbaPlayoff.dropna(subset=["PPS"])

Why are we considering PPS a feature?
To answer this, let's start with the definition of the given "efficiency" metric, from the kaggle dataframe... 

"Efficency: (Points Scored + Rebounds + Assists + Steals + Blocks - Missed FG - Missed FT - Turnovers) / Games Played"

Reason 1. The provided efficiency metric measures many different types of events, as rebounds, steals, blocks, turnovers, and games played are completely separate events in the sport.
Reason 2. In basketball, you win by scoring more points. Every other type of event (rebound, steal, block, etc.) is an event that ends up leading to a shot - FG or FT attempt - later.
Reason 3. The given formula is unnecessarily long and unintuitive!

So this leads us to, why use points per shot (PPS) as a measure of efficiency?
Well, going back to reason 2, if the purpose of the sport is to score more points than the opponent, we want to focus on events that involve scoring points - field goal and free throw attempts.

PPS = (Total Points Scored - Free Throw Made) / Field Goal Attempts

So why remove free throws from our total points scored, if our goal is to measure events that involve scoring points? 
This gets into the rules of the sport itself, but free throws are not taken during the regular game. Free Throws are a penalty shot, and the game and time stops when a free throw is taken. Thus, by removing free throws, we can focus on only the events that occur during the regular game/time.
Ultimately, by removing made free throws from the numerator of points per shot, we can end up with a metric that only measures events from when a player is shooting the ball during regular play, i.e. the game of basketball itself. 

In the end, points per shot is a simple formula: it is essentially the expected value (in points) for a player every time they shoot the ball. It is normalized to one attempt, so the PPS formula holds regardless of whether a player takes 1000 or 100 attempts. There is an edge case where a player takes a single 3-pointer and makes it, giving a PPS of 3 (very high); this led us to define player outliers the way we did (requiring a minimum shot volume and a minimum contribution to team scoring; see the outlier-player definition later in this notebook).

Free throws we will measure as a completely separate metric. 

Notes/Outline for our presentation/plots:

How do we make our team better? in order to answer that question, --> Does a good player make a good team? 
BECAUSE BETTER TEAM = MORE TICKET SALES, SPONSORS, ETC.

Slide 1 = Introduce the market size, profit, etc. of the nba (Could be like Lec 5 slide 54)
Slide 2 = Then prove why making a team better = more profit. (Better team = more app engagement, views, sponsors, etc.)

##ONLY PLOT DIFFERENCES FROM THE MEDIAN, NOT THE RAW VALUES, BECAUSE THE MEDIAN CHANGES EVERY YEAR

Hypothesis is that outlier players make teams better.

#           1. What does an average player do? What does an average team do? (Per year) [Scatter plot of years and teams ; Subplot next to eachother]
#           Next step. Quantile those PPS average per player and team, look at only look at above median PPS players
#            2. Do top outlier teams always have top outlier players? (This answers how do we make our team better) (Table of all top outlier teams, and then a yes or no on whether they contain top outlier players) (Sum/Aggregate of all year) [Count barplot of Teams with top player (Yes), versus without a star player (No)]
#           For example, if only 50% of top outlier teams have outlier players, that means theres not a strong correlation between having a top player and being a top team (Would expect there to be top players on top teams)
#           Versus, if 90% of top outlier teams have top outlier players, that means in order to be a top team, you need a top player.
#           2b. Similarly, we check the same yes or no for having 2 top outlier players. (Sum/Aggregate of all years) [Subplot or not, but similar just a barplot of counts. Top teams with 2 top players (yes), versus top teams without 2 top players (no)] (Put 2 players and 1 player count histogram next to eachother)

#           3. Average free throw attempts (order may change). Plot free throw attempts of top players, because there is a clear trend that top players take significantly more free throws than other teams. (Relate free throws back to points per shot)


Team Data: Team PPS Per Year (Regular Season)

In [None]:
#Regular Season: groupings and placeholder columns for team PPS per year
# Columns are selected with a list ([[...]]). The original `[...]["Points-FT","FG Attempts"]`
# passes a tuple, which pandas deprecated and which raises in pandas >= 2.0.
# TODO (from the original note): team PPS needs fixing because this only takes the median of players.
teamPPS_perYear = nbaRegular.groupby(['Year', 'Team'])[["Points-FT", "FG Attempts"]]
teamPPS_perTeam = nbaRegular.groupby(['Team', 'Year'])[["Points-FT", "FG Attempts"]]

# Seasons in order of first appearance in the frame.
uniqueYear = nbaRegular["Year"].unique()

# Placeholders; both columns are filled by later cells.
nbaRegular["Player Median PPS this Year"] = np.nan
nbaRegular["This Teams PPS this Year"] = np.nan

In [None]:
#Regular Season: Gets team PPS and assigns to a new column

# Team PPS for one (Team, Year) = sum of "Points-FT" over that team's rows divided by the
# sum of "FG Attempts" over the same rows. transform("sum") broadcasts each group's totals
# back onto its rows, replacing the original groups x rows Python loop of scalar .loc writes
# with the same arithmetic done once per group.
regTeamTotals = nbaRegular.groupby(["Team", "Year"])[["Points-FT", "FG Attempts"]].transform("sum")
nbaRegular["This Teams PPS this Year"] = regTeamTotals["Points-FT"] / regTeamTotals["FG Attempts"]

Team Data: Team PPS Per Year (Playoff)

In [None]:
#Playoff: groupings and placeholder columns for team PPS per year
# Columns are selected with a list ([[...]]). The original `[...]["Points-FT","FG Attempts"]`
# passes a tuple, which pandas deprecated and which raises in pandas >= 2.0.
teamPPS_perYear_playoff = nbaPlayoff.groupby(['Year', 'Team'])[["Points-FT", "FG Attempts"]]
teamPPS_perTeam_playoff = nbaPlayoff.groupby(['Team', 'Year'])[["Points-FT", "FG Attempts"]]

# Seasons in order of first appearance in the frame.
uniqueYear_playoff = nbaPlayoff["Year"].unique()

# Placeholders; both columns are filled by later cells.
nbaPlayoff["Player Median PPS this Year"] = np.nan
nbaPlayoff["This Teams PPS this Year"] = np.nan

In [None]:
# Playoff: Gets team PPS and assigns to a new column

# Team PPS for one (Team, Year) = sum of "Points-FT" over that team's rows divided by the
# sum of "FG Attempts" over the same rows. transform("sum") broadcasts each group's totals
# back onto its rows, replacing the original groups x rows Python loop of scalar .loc writes
# with the same arithmetic done once per group.
yoffTeamTotals = nbaPlayoff.groupby(["Team", "Year"])[["Points-FT", "FG Attempts"]].transform("sum")
nbaPlayoff["This Teams PPS this Year"] = yoffTeamTotals["Points-FT"] / yoffTeamTotals["FG Attempts"]

Defining Outlier Teams (75th percentile in TEAM PPS within their respective year) (Playoff + reg)

In [None]:
# Per-season median of "This Teams PPS this Year", written onto every row of that season.
# transform("median") replaces the original years x rows loop, which also recomputed the
# same median for every row.
# NOTE(review): the median is taken over rows of nbaPlayoff, so a team contributes one value
# per row it has (presumably one per player) rather than one value per team - confirm this
# weighting is intended.
teamPPS_groupPlayoff = nbaPlayoff.groupby("Year")["This Teams PPS this Year"]
nbaPlayoff["This Year Median Team PPS"] = teamPPS_groupPlayoff.transform("median")

In [None]:
# Per-season median of "This Teams PPS this Year", written onto every row of that season.
# transform("median") replaces the original years x rows loop, which also recomputed the
# same median for every row.
# NOTE(review): the median is taken over rows of nbaRegular, so a team contributes one value
# per row it has (presumably one per player) rather than one value per team - confirm this
# weighting is intended.
teamPPS_groupReg = nbaRegular.groupby("Year")["This Teams PPS this Year"]
nbaRegular["This Year Median Team PPS"] = teamPPS_groupReg.transform("median")

Shot Attempts percentile (Playoff+Reg)

In [None]:
#75th percentile of field-goal attempts per game, per year (playoffs)
# List selection ([[...]]): the tuple form used originally raises in pandas >= 2.0.
perYear_FGAplayoff = nbaPlayoff.groupby("Year")[["FG Attempts", "Games Played"]]

# {year: one-element Series indexed by 0.75}, the same shape the original dictionary had.
FGA75_perGame_playoff = {}
FGAmed_perGame_playoff = {}
for i in perYear_FGAplayoff.groups:
    thisYearShots = perYear_FGAplayoff.get_group(i)
    FGA75_perGame_playoff[i] = (thisYearShots["FG Attempts"] / thisYearShots["Games Played"]).quantile([.75])

# Broadcast each season's 75th percentile onto that season's rows in one pass, instead of
# the original years x rows loop of scalar .loc writes.
yoffFGA_perGame = nbaPlayoff["FG Attempts"] / nbaPlayoff["Games Played"]
nbaPlayoff["This Year 75% FGA/Game"] = (
    yoffFGA_perGame.groupby(nbaPlayoff["Year"]).transform(lambda s: s.quantile(.75))
)

In [None]:
#75th percentile of field-goal attempts per game, per year (regular season)
# List selection ([[...]]): the tuple form used originally raises in pandas >= 2.0.
perYear_FGA = nbaRegular.groupby("Year")[["FG Attempts", "Games Played"]]

# {year: one-element Series indexed by 0.75}, the same shape the original dictionary had.
FGA75_perGame = {}
for i in perYear_FGA.groups:
    thisYearShots = perYear_FGA.get_group(i)
    FGA75_perGame[i] = (thisYearShots["FG Attempts"] / thisYearShots["Games Played"]).quantile([.75])

# Broadcast each season's 75th percentile onto that season's rows in one pass, instead of
# the original years x rows loop of scalar .loc writes.
regFGA_perGame = nbaRegular["FG Attempts"] / nbaRegular["Games Played"]
nbaRegular["This Year 75% FGA/Game"] = (
    regFGA_perGame.groupby(nbaRegular["Year"]).transform(lambda s: s.quantile(.75))
)

Player PPS Quantiles (Playoff)

In [None]:
#Gets the per-year player PPS median and 25/75/99% quantiles and assigns them to new columns

playerPPS_75quant_PerYear_playoff = {}
playerPPS_99quant_PerYear_playoff = {}
playerPPS_25quant_PerYear_playoff = {}

perYear_allPPS_playoff = nbaPlayoff.groupby("Year")["PPS"]
perYear_medPPS_playoff = nbaPlayoff.groupby("Year")["PPS"].median() #Using median to be robust to outliers

# Per-year lookup dictionaries ({year: float}), computed once per year. The original
# recomputed all three quantiles for every single row of the year.
for i in perYear_allPPS_playoff.groups:
    thisYearPPS = perYear_allPPS_playoff.get_group(i)
    playerPPS_75quant_PerYear_playoff[i] = thisYearPPS.quantile(.75)
    playerPPS_99quant_PerYear_playoff[i] = thisYearPPS.quantile(.99)
    playerPPS_25quant_PerYear_playoff[i] = thisYearPPS.quantile(.25)

# Broadcast each season's statistic onto that season's rows.
nbaPlayoff["League-Wide Player 75% PPS this Year"] = perYear_allPPS_playoff.transform(lambda s: s.quantile(.75))
nbaPlayoff["League-Wide Player 99% PPS this Year"] = perYear_allPPS_playoff.transform(lambda s: s.quantile(.99))
nbaPlayoff["League-Wide Player 25% PPS this Year"] = perYear_allPPS_playoff.transform(lambda s: s.quantile(.25))
nbaPlayoff["Player Median PPS this Year"] = perYear_allPPS_playoff.transform("median")

In [None]:
# Percent by which each row's PPS sits above (+) or below (-) that season's median PPS.
#
# The per-season medians are computed here directly rather than read from
# "Player Median PPS this Year": when this cell runs top-to-bottom, the regular-season copy
# of that column has only been initialised to NaN (it is filled by a later cell), which made
# the regular-season result all-NaN. The value used is the same per-"Year" median of "PPS".
yoffYearMedianPPS = nbaPlayoff.groupby("Year")["PPS"].transform("median")
regYearMedianPPS = nbaRegular.groupby("Year")["PPS"].transform("median")

nbaPlayoff["Player PPS % above Median"] = 100*((nbaPlayoff["PPS"]-yoffYearMedianPPS) / yoffYearMedianPPS)
nbaRegular["Player PPS % above Median"] = 100*((nbaRegular["PPS"]-regYearMedianPPS) / regYearMedianPPS)

In [None]:
# Per-season 90th percentile of "Player PPS % above Median", written onto every row of
# that season. One grouped transform replaces the original years x rows loop.
perYear_allPoints_playoff = nbaPlayoff.groupby("Year")["Player PPS % above Median"]
nbaPlayoff["Player 90% PPS This Year"] = perYear_allPoints_playoff.transform(lambda s: s.quantile(.90))

Points and PPS Quantile (Regular)

In [None]:
#Gets the per-year player PPS median and 25/75/99% quantiles and assigns them to new columns
playerPPS_75quant_PerYear = {}
playerPPS_99quant_PerYear = {}
playerPPS_25quant_PerYear = {}

perYear_allPPS = nbaRegular.groupby("Year")["PPS"]
perYear_medPPS = nbaRegular.groupby("Year")["PPS"].median() #Using median to be robust to outliers

# Per-year lookup dictionaries ({year: float}), computed once per year. The original
# recomputed all three quantiles for every single row of the year.
for i in perYear_allPPS.groups:
    thisYearPPS = perYear_allPPS.get_group(i)
    playerPPS_75quant_PerYear[i] = thisYearPPS.quantile(.75)
    playerPPS_99quant_PerYear[i] = thisYearPPS.quantile(.99)
    playerPPS_25quant_PerYear[i] = thisYearPPS.quantile(.25)

# Broadcast each season's statistic onto that season's rows.
nbaRegular["League-Wide Player 75% PPS this Year"] = perYear_allPPS.transform(lambda s: s.quantile(.75))
nbaRegular["League-Wide Player 99% PPS this Year"] = perYear_allPPS.transform(lambda s: s.quantile(.99))
nbaRegular["League-Wide Player 25% PPS this Year"] = perYear_allPPS.transform(lambda s: s.quantile(.25))
nbaRegular["Player Median PPS this Year"] = perYear_allPPS.transform("median")

In [None]:
# Per-season 90th percentile of "Player PPS % above Median", written onto every row of
# that season. One grouped transform replaces the original years x rows loop.
# NOTE(review): this reads nbaRegular["Player PPS % above Median"] as it stands when the cell
# runs; verify that column already holds real values at this point in the notebook.
# Note that `perYear_allPPS` is rebound here to a grouping of a different column.
perYear_allPPS = nbaRegular.groupby("Year")["Player PPS % above Median"]
nbaRegular["Player 90% PPS This Year"] = perYear_allPPS.transform(lambda s: s.quantile(.90))

EDA Plot for PPS Across Years

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Both plotted columns hold one constant value per season (repeated on every row of that
# season), so keep a single row per season, in order of first appearance. Plotting every row,
# as the original did, redraws the same points thousands of times and zig-zags if rows of
# one season are not contiguous.
regSeasons = nbaRegular.drop_duplicates(subset="Year")
regSeasonNames = regSeasons["Year"].tolist()

ax.plot(regSeasonNames, regSeasons["This Year Median Team PPS"].to_numpy(), label="Team PPS", color="#17408B")
title_font = {
    'fontsize': 12,        # Font size
    'fontfamily': 'Century Gothic', # Font family
}
ax.set_title("Regular Season Points per Shot vs Year", fontdict=title_font)
# x is a categorical axis, so x=28 is the 29th season in plotting order.
# NOTE(review): confirm that position corresponds to the season the 3-point line was introduced.
ax.vlines(x=28, ymin=.7, ymax=1.0999999999, color='#B8B2B2', alpha=.8, linewidth=1.4, linestyles='dashed', label="3pt Line Introduction")

# Label every 5th season. Labels come from the REGULAR-season seasons being plotted (the
# original used uniqueYear_playoff), and the tick positions are derived from the same list
# so the number of ticks always matches the number of labels.
label = regSeasonNames[0::5]
ax.set_xticks(range(0, len(regSeasonNames), 5))
ax.set_xticklabels(label, rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)

ax.plot(regSeasonNames, regSeasons["Player Median PPS this Year"].to_numpy(), color="#C9082A", label="Player PPS")

ax.grid(color='grey', axis='y', linestyle='solid', linewidth=.35, alpha=.3)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

plt.legend()
plt.show()

#Zone defemce was imnammed  (NOTE(review): original note is garbled - presumably about the zone-defense rule change; confirm)

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Both plotted columns hold one constant value per season (repeated on every row of that
# season), so keep a single row per season, in order of first appearance. Plotting every row,
# as the original did, redraws the same points many times and zig-zags if rows of one season
# are not contiguous.
yoffSeasons = nbaPlayoff.drop_duplicates(subset="Year")
yoffSeasonNames = yoffSeasons["Year"].tolist()

ax.plot(yoffSeasonNames, yoffSeasons["This Year Median Team PPS"].to_numpy(), label="Team PPS", color="#17408B", linewidth=1.8)
ax.set_title("Playoff Points per Shot vs Year")
# x is a categorical axis, so x=28 is the 29th season in plotting order.
# NOTE(review): confirm that position corresponds to the season the 3-point line was introduced.
ax.vlines(x=28, ymin=.7, ymax=1.0999999999, color='#B8B2B2', alpha=.8, linewidth=1.4, linestyles='dashed', label="3 Pt line Introduction")

ax.plot(yoffSeasonNames, yoffSeasons["Player Median PPS this Year"].to_numpy(), color="#C9082A", label="Player PPS", linewidth=1.8)

# Label every 5th season; tick positions are derived from the same list as the labels so
# their counts always match (the original hard-coded range(0, 72, 5)).
label = yoffSeasonNames[0::5]
ax.set_xticks(range(0, len(yoffSeasonNames), 5))
ax.set_xticklabels(label, rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)

ax.grid(color='grey', axis='y', linestyle='solid', linewidth=.35, alpha=.3)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

plt.legend()
plt.show()

Calculating Player % above median and Quantifying Outlier

In [None]:
# Percent by which each row's PPS sits above (+) or below (-) its season's median player PPS.
nbaPlayoff["Player PPS % above Median"] = (
    nbaPlayoff["PPS"]
    .sub(nbaPlayoff["Player Median PPS this Year"])
    .div(nbaPlayoff["Player Median PPS this Year"])
    .mul(100)
)
nbaRegular["Player PPS % above Median"] = (
    nbaRegular["PPS"]
    .sub(nbaRegular["Player Median PPS this Year"])
    .div(nbaRegular["Player Median PPS this Year"])
    .mul(100)
)

In [None]:
# Total "Points Scored" over all rows of each (Team, Year), written onto every row of that
# team-season. transform("sum") replaces the original groups x rows loop of scalar .loc writes.
teamPoints_perTeam = nbaRegular.groupby(['Team', 'Year'])["Points Scored"]
# Cast to float64 to keep the dtype the original NaN-initialised column had.
nbaRegular["This Team Total Pts"] = teamPoints_perTeam.transform("sum").astype("float64")

In [None]:
# Team points per game. The original divided every season's total by a hard-coded 82, which
# is only right for seasons that actually had 82 games. Use the same proxy the playoff cells
# use instead: the most games any one row of that (Team, Year) appeared in.
# NOTE(review): this is an approximation of the team's games played - confirm it is acceptable,
# or replace it with a real games-per-season lookup.
regTeamGames = nbaRegular.groupby(["Team", "Year"])["Games Played"].transform("max")
nbaRegular["This Team PPG"] = nbaRegular["This Team Total Pts"] / regTeamGames
nbaRegular["This Player PPG"] = nbaRegular["Points Scored"] / nbaRegular["Games Played"]

In [None]:
# Per (Team, Year): total "Points Scored" over the team's rows, and the largest
# "Games Played" among them (used as the team's games played). Both are written onto every
# row of that team-season; grouped transforms replace the original groups x rows loop.
# Note that `teamPoints_perTeam` is rebound here to the playoff grouping.
teamPoints_perTeam = nbaPlayoff.groupby(['Team', 'Year'])["Points Scored"]
teamGames_perTeam = nbaPlayoff.groupby(['Team', 'Year'])["Games Played"]

# Cast to float64 to keep the dtype the original NaN-initialised columns had.
nbaPlayoff["This Team Total Pts"] = teamPoints_perTeam.transform("sum").astype("float64")
nbaPlayoff["This Team Games Played"] = teamGames_perTeam.transform("max").astype("float64")

In [None]:
# Points per game for the team (season total / team games) and for the individual row.
nbaPlayoff["This Team PPG"] = nbaPlayoff["This Team Total Pts"].div(nbaPlayoff["This Team Games Played"])
nbaPlayoff["This Player PPG"] = nbaPlayoff["Points Scored"].div(nbaPlayoff["Games Played"])

In [None]:
# Player PPG as a percentage of the team's PPG.
nbaRegular["Player % of Team Pts"] = nbaRegular["This Player PPG"].div(nbaRegular["This Team PPG"]).mul(100)
nbaPlayoff["Player % of Team Pts"] = nbaPlayoff["This Player PPG"].div(nbaPlayoff["This Team PPG"]).mul(100)

Outlier players are defined as 
1. 90th percentile in "player % of team points" = 18%. This ensures that an outlier player significantly contributes to their team performance. 
2. 75th percentile in "FG attempts" within that respective year. This helps avoid the problem of: if a player takes only 1 FG attempt and makes a 3 pointer, their points per shot would be 3, when the median hovers around 1.
3. Player PPS is above median in the regular season, and no less than 2% below the median in the playoffs. In the playoffs, competition gets stronger, as only 16/32 teams make the playoffs. Thus, when strong players play strong players, their strengths are neutralized, so we lower the PPS cutoff to account for that. Generally speaking, we decided to keep the cutoff for PPS as at least median, because to us an outlier is a player who contributes significantly to their team (% of team points). For example, if an opponent team knows one player is by far the best on our team, the opponent team will use a strategy to minimize the best player's contribution, pulling the best player closer to the median. With our definition, we are aiming to capture those "best players" from the example who can still perform at median or above.

In [None]:
##Definition of an outlier Player
# A row is an outlier player when ALL three hold:
#   1. its share of team points is strictly above the frame-wide 90th percentile,
#   2. its PPS is at least -2% (playoffs) / 0% (regular season) relative to the season median,
#   3. its FGA/Game is at least that season's 75th percentile.

yoffOutlier = nbaPlayoff["Player % of Team Pts"].quantile(.9)
nbaPlayoff["Outlier Player"] = (
    nbaPlayoff["Player % of Team Pts"].gt(yoffOutlier)
    & nbaPlayoff["Player PPS % above Median"].ge(-2)
    & nbaPlayoff["FGA/Game"].ge(nbaPlayoff['This Year 75% FGA/Game'])
)

regOutlier = nbaRegular["Player % of Team Pts"].quantile(.9)
nbaRegular["Outlier Player"] = (
    nbaRegular["Player % of Team Pts"].gt(regOutlier)
    & nbaRegular["Player PPS % above Median"].ge(0)
    & nbaRegular["FGA/Game"].ge(nbaRegular['This Year 75% FGA/Game'])
)

Quantifying Team Outliers

In [None]:
# Spot check: positional row numbers of 1954-55 outlier players whose PPS is below the median.
weirdVal = np.where(
    nbaPlayoff["Outlier Player"].eq(True)
    & nbaPlayoff["Player PPS % above Median"].lt(0)
    & nbaPlayoff["Year"].eq("1954-55")
)
wVal = np.where(
    nbaRegular["Outlier Player"].eq(True)
    & nbaRegular["Player PPS % above Median"].lt(0)
    & nbaRegular["Year"].eq("1954-55")
)

In [None]:
# Percent by which each team's PPS sits above (+) or below (-) that season's median team PPS.
nbaPlayoff["Team PPS % above Median"] = (
    nbaPlayoff["This Teams PPS this Year"]
    .sub(nbaPlayoff["This Year Median Team PPS"])
    .div(nbaPlayoff["This Year Median Team PPS"])
    .mul(100)
)
nbaRegular["Team PPS % above Median"] = (
    nbaRegular["This Teams PPS this Year"]
    .sub(nbaRegular["This Year Median Team PPS"])
    .div(nbaRegular["This Year Median Team PPS"])
    .mul(100)
)

In [None]:
def _playoffTeamCutoff(teamPctAboveMed):
    """Return one season's cutoff for a playoff outlier team.

    The distinct "Team PPS % above Median" values of the season are sorted ascending and the
    cutoff is the largest value (one distinct value), the 2nd largest (2-10 distinct values)
    or the 4th largest (more than 10). NaN values are ignored; np.sort would otherwise place
    NaN last and let it be picked as, or shift, the cutoff. Returns NaN when nothing is left.
    """
    ranked = np.sort(teamPctAboveMed.dropna().unique())
    if len(ranked) == 0:
        return np.nan
    if len(ranked) == 1:
        return ranked[-1] #Could be 3
    if len(ranked) <= 10:
        return ranked[-2] #Could be 3
    return ranked[-4] #Could be 5


teamPPS_outlierPlayoff = nbaPlayoff.groupby(["Year"])["Team PPS % above Median"]

# Placeholder column; nothing in this cell fills it.
nbaPlayoff["This Year 75% Team PPS Above Med"] = np.nan
# Broadcast each season's cutoff onto that season's rows (replaces the years x rows loop).
nbaPlayoff["This Year Top 8 Team PPS Above Med"] = teamPPS_outlierPlayoff.transform(_playoffTeamCutoff)


In [None]:
def _regularTeamCutoff(teamPctAboveMed):
    """Return one season's cutoff for a regular-season outlier team.

    The distinct "Team PPS % above Median" values of the season are sorted ascending and the
    cutoff is the 4th largest (at most 10 distinct values) or the 8th largest (more than 10).
    NaN values are ignored, and the depth is capped at the number of values available so a
    season with fewer than 4 distinct values no longer raises IndexError. Returns NaN when
    nothing is left.
    """
    ranked = np.sort(teamPctAboveMed.dropna().unique())
    if len(ranked) == 0:
        return np.nan
    depth = 4 if len(ranked) <= 10 else 8 #could be 5 / 10
    return ranked[-min(depth, len(ranked))]


teamPPS_outlierReg = nbaRegular.groupby(["Year"])["Team PPS % above Median"]

# Placeholder column; nothing in this cell fills it.
nbaRegular["This Year 75% Team PPS Above Med"] = np.nan
# Broadcast each season's cutoff onto that season's rows (replaces the years x rows loop).
nbaRegular["This Year Top 8 Team PPS Above Med"] = teamPPS_outlierReg.transform(_regularTeamCutoff)

# Flag the REGULAR-season rows at or above their season's cutoff. The original initialised
# this flag on nbaPlayoff by mistake and then set only the True rows on nbaRegular, leaving
# NaN everywhere else.
nbaRegular["Team Outlier in PPS"] = (
    nbaRegular["Team PPS % above Median"] >= nbaRegular["This Year Top 8 Team PPS Above Med"]
)

# One zero-initialised counter per season (the original never incremented these either).
teamOut_counter = {i: 0 for i in teamPPS_outlierReg.groups}

For a team outlier, we simply define them as a team that is top 8 in PPS, or top 4 if there are fewer teams that year. (8/32 = 1/4, thus 75th percentile.) 
In the playoff, outliers are top 4 or 2 respectively, keeping the percentile the same (4/16)

In [None]:
# A row belongs to an outlier team when its team's "% above median" PPS reaches the season cutoff.
nbaPlayoff["Team Outlier in PPS"] = nbaPlayoff["Team PPS % above Median"].ge(nbaPlayoff["This Year Top 8 Team PPS Above Med"])
nbaRegular["Team Outlier in PPS"] = nbaRegular["Team PPS % above Median"].ge(nbaRegular["This Year Top 8 Team PPS Above Med"])


Creating New DF for players meeting minimum FGA

In [None]:
# Positional row numbers of the rows that belong to outlier teams (np.where returns a 1-tuple).
regPlayer_fgaInd = np.where(nbaRegular["Team Outlier in PPS"].eq(True))
playoffPlayer_fgaInd = np.where(nbaPlayoff["Team Outlier in PPS"].eq(True))

# Keep only those rows, with every column.
playoffFGA_playerDF = nbaPlayoff.iloc[playoffPlayer_fgaInd[0]]
regularFGA_playerDF = nbaRegular.iloc[regPlayer_fgaInd[0]]

In [None]:
#Positional row numbers inside regularFGA_playerDF (not index labels of the frame)

tInd = np.where(regularFGA_playerDF["Team Outlier in PPS"].eq(True)) #rows on outlier teams
pTrueInd = np.where(regularFGA_playerDF["Outlier Player"].eq(True)) #rows that are outlier players

Counting Outlier per Team (Regular)

In [None]:
# Renumber rows 0..n-1 so index labels match the positional row numbers taken above;
# the old index is kept as a column.
regularFGA_playerDF = regularFGA_playerDF.reset_index(drop=False)

In [None]:
# Tally, for every outlier team-season, how many of its rows are outlier players.
#   count    - outlier-player rows found on outlier teams
#   tCount   - distinct (year, team) outlier team-seasons
#   yearDict - {year: {team: number of outlier players}}
count = 0
tCount = 0
yearDict = {}
outlierPlayerRows = set(pTrueInd[0].tolist())
for i in tInd[0]: #Loop through outlier-team row numbers
    thisYear = regularFGA_playerDF.loc[i, "Year"]
    thisTeam = regularFGA_playerDF.loc[i, "Team"]
    teamsThisYear = yearDict.setdefault(thisYear, {}) #nested dict for this year, keyed by team
    if thisTeam not in teamsThisYear:
        teamsThisYear[thisTeam] = 0
        tCount += 1
    if i in outlierPlayerRows: #this outlier-team row is also an outlier player
        count += 1
        teamsThisYear[thisTeam] += 1
    #Half the time, there is no outlier team

print(count)
print(tCount)


In [None]:
# Display the nested {year: {team: number of outlier players}} tally for regular-season outlier teams.
yearDict

Counting Outlier per Team (Playoffs)

In [None]:
# 75th percentile and median of "% above median" PPS among rows on playoff outlier teams.
yoffThresh = playoffFGA_playerDF.loc[:, "Player PPS % above Median"].quantile([.75])
yoffMedThresh = playoffFGA_playerDF.loc[:, "Player PPS % above Median"].median()

#Positional row numbers inside playoffFGA_playerDF (not index labels of the frame)
tpInd = np.where(playoffFGA_playerDF["Team Outlier in PPS"].eq(True))
yoffTrueInd = np.where(playoffFGA_playerDF["Outlier Player"].eq(True))
# Renumber rows 0..n-1 so index labels match those row numbers; the old index is kept as a column.
playoffFGA_playerDF = playoffFGA_playerDF.reset_index(drop=False)

In [None]:
# Number of outlier-player rows on playoff outlier teams.
print(yoffTrueInd[0].shape[0])

In [None]:
# Tally, for every playoff outlier team-season, how many of its rows are outlier players.
#   ct               - outlier-player rows found on outlier teams
#   ptCT             - distinct (year, team) outlier team-seasons
#   yearDict_playoff - {year: {team: number of outlier players}}
ct = 0
ptCT = 0
yearDict_playoff = {}
yoffOutlierPlayerRows = set(yoffTrueInd[0].tolist())
for i in tpInd[0]: #Loop through outlier-team row numbers
    thisYear = playoffFGA_playerDF.loc[i, "Year"]
    thisTeam = playoffFGA_playerDF.loc[i, "Team"]
    teamsThisYear = yearDict_playoff.setdefault(thisYear, {}) #nested dict for this year, keyed by team
    if thisTeam not in teamsThisYear:
        ptCT += 1
        teamsThisYear[thisTeam] = 0
    if i in yoffOutlierPlayerRows: #this outlier-team row is also an outlier player
        ct += 1
        teamsThisYear[thisTeam] += 1

print(ct)
print(ptCT)

In [None]:
# Display the nested {year: {team: number of outlier players}} tally for playoff outlier teams.
yearDict_playoff

Adding team outlier count dictionary values to another dictionary lol

In [None]:
# {number of outlier players on a team: how many playoff outlier team-seasons have that many}
finalYoff_dict = {}

for teamsThisYear in yearDict_playoff.values():
    for nOutliers in teamsThisYear.values():
        finalYoff_dict[nOutliers] = finalYoff_dict.get(nOutliers, 0) + 1

In [None]:
# Number of distinct playoff outlier team-seasons, then the
# {outlier-player count: number of team-seasons} tally.
print(ptCT)
finalYoff_dict

In [None]:
# {number of outlier players on a team: how many regular-season outlier team-seasons have that many}
# `counter` accumulates the total number of outlier players across all of those team-seasons.
finalReg_dict = {}
counter = 0
for teamsThisYear in yearDict.values():
    for nOutliers in teamsThisYear.values():
        finalReg_dict[nOutliers] = finalReg_dict.get(nOutliers, 0) + 1
        counter += nOutliers

In [None]:
# Regular-season summary. `count` and `tCount` come from an earlier cell
# (presumably the regular-season analogues of ct / ptCT — TODO confirm).
print(count)
print(tCount)
finalReg_dict

In [None]:
# finalReg_dict.pop(5) #For plot, drop 1 instance team


In [None]:
def _share_and_cumulative(countDict):
    """Turn {outlier count: number of teams} into share and cumulative-share dicts.

    Returns (sortedKeys, pctDict, cumDict), where pctDict[k] is the fraction of
    tallied teams with exactly k outlier players and cumDict[k] is the running sum
    of those fractions over the sorted keys.

    The running sum is carried in a local variable rather than read back from
    cumDict[k - 1]: that lookup raised KeyError whenever the counts were not
    contiguous (e.g. 0, 1, 2, 3, 5) or when no team had zero outliers.
    The denominator is the number of teams actually tallied, replacing the
    previously hard-coded totals (512 regular season / 238 playoffs), so the
    cumulative share always ends at 1 even if the data changes.
    """
    sortedKeys = sorted(countDict)
    total = sum(countDict.values())

    pctDict = {}
    cumDict = {}
    running = 0
    for k in sortedKeys:
        pctDict[k] = countDict[k] / total
        running += pctDict[k]
        cumDict[k] = running
    return sortedKeys, pctDict, cumDict


# Regular season
sortKeys, regPCT_dict, cumReg_dict = _share_and_cumulative(finalReg_dict)
print(sortKeys)

# Playoffs
sortKeys_yoff, yoffPCT_dict, cumYoff_dict = _share_and_cumulative(finalYoff_dict)



Raw distribution (PDF): share of teams with each number of outlier players

In [None]:
# Bar chart of the (non-cumulative) share of regular-season teams at each outlier-player count.
plt.bar(regPCT_dict.keys(),regPCT_dict.values()) #PDF

Presentation CDF: cumulative share of teams by number of outlier players (Regular Season)

In [None]:
# Cumulative bar chart: share of regular-season teams with at most k outlier players.
fig, ax = plt.subplots(figsize=(12, 8))

alphas = [.65, .65, 1, .65, .65]  # defined but not applied in this cell
colors = ["#717e97", "#717e97", "#c00900", "#717e97", "#717e97"]  # third bar highlighted in red

bars = ax.bar(cumReg_dict.keys(), cumReg_dict.values(), width=0.4, color=colors)

ax.grid(axis='y', color='grey', linestyle='solid', linewidth=.35, alpha=.3)

ax.set_yticks(np.arange(0, 1.01, .1))
ax.set_xticks([0, 1, 2, 3, 4, 5])
ax.set_xticklabels(["Zero", "One", "Two", "Three", "Four", "Five"], rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# Faint line tracing the bar tops (ax is the current axes, so this matches plt.plot).
ax.plot(cumReg_dict.keys(), cumReg_dict.values(), color="grey", alpha=.15, linewidth=4)

plt.show()


In [None]:
# print(cumYoff_dict)
# cumYoff_dict.pop(4) #Drop value with 1 occurance for simplicity of CDF
# print(cumYoff_dict)

Presentation CDF: cumulative share of teams by number of outlier players (Playoffs)

In [None]:
# Cumulative bar chart: share of playoff teams with at most k outlier players.
fig, ax = plt.subplots(figsize=(12, 8))

alphas = [.65, .65, 1, .65, .65]  # only used by the commented-out loop below
colors = ["#717e97", "#717e97", "#c00900", "#717e97", "#717e97"]  # third bar highlighted in red

bars = ax.bar(cumYoff_dict.keys(), cumYoff_dict.values(), width=.4, color=colors)

ax.grid(axis='y', color='grey', linestyle='solid', linewidth=.35, alpha=.3)

ax.set_yticks(np.arange(0, 1.01, .1))
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(["Zero", "One", "Two", "Three"], rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# for bar, alpha in zip(bars, alphas):
#     bar.set_alpha(alpha)

# Faint line tracing the bar tops (ax is the current axes, so this matches plt.plot).
ax.plot(cumYoff_dict.keys(), cumYoff_dict.values(), color="grey", alpha=.15, linewidth=4)

plt.show()

Regular Season Free throw plot:

In [None]:
# Keep only the rows flagged as outlier players, separately for each season type.
isOutlierReg = nbaRegular["Outlier Player"] == True
isOutlierYoff = nbaPlayoff["Outlier Player"] == True

outlierDf_reg = nbaRegular.loc[isOutlierReg]
outlierDf_yoff = nbaPlayoff.loc[isOutlierYoff]

In [None]:
# Median of the "FT/Game" column per year, for outlier players only
# (regular season, then playoffs).
ftYear_group_outlier = outlierDf_reg.groupby("Year")["FT/Game"].median()
ftYear_groupPlayoff_outlier = outlierDf_yoff.groupby("Year")["FT/Game"].median()

# Same median over every regular-season row (outliers included).
ftYear_group = nbaRegular.groupby("Year")["FT/Game"].median()


In [None]:
#Regular FT Plot
fig,ax = plt.subplots(figsize=(10,8),)

ax.vlines(x=28,ymin=.7,ymax=1.0999999999,color='#B8B2B2',alpha=.8,linewidth=1.4,linestyles='dashed',label="3 Pt line Introduction")
xx = range(0,72)

ax.plot(ftYear_group_outlier.index,ftYear_group_outlier.values,color="#C9082A",label="Outlier Median FT Attempts",linewidth=1.8,)
ax.fill_between(x=xx,y1=0,y2=ftYear_group_outlier.values,color="#C9082A",alpha=.4)

ax.plot(ftYear_group.index,ftYear_group.values,label="Median FT Attempts",color="#17408B",linewidth=1.8) #Need to fix the indices for now and make it look nice
ax.fill_between(x=xx,y1=0,y2=ftYear_group.values,color="#17408B",alpha=.4)


label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0,72,5))
ax.set_xticklabels(label,rotation=45)
ax.tick_params(axis='x',bottom=False)
ax.tick_params(axis='y',left=False)

ax.grid(color='grey', axis = 'y', linestyle='solid', linewidth=.35,alpha=.3)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

# plt.legend()
plt.show()

Regular Season Field Goal Plot

In [None]:
# Median of the "FGA/Game" column per year for the regular season.
# NOTE: the ft-prefixed names are reused from the free-throw cell; from this cell on
# they hold field-goal-attempt medians. The names are kept because the plotting cell
# that follows reads them.
ftYear_group_outlier = outlierDf_reg.groupby("Year")["FGA/Game"].median()
# ftYear_groupPlayoff_outlier = outlierDf_yoff.groupby("Year")["FGA/Game"].median()

ftYear_group = nbaRegular.groupby("Year")["FGA/Game"].median()
# ftYear_groupPlayoff = nbaPlayoff.groupby("Year")["FGA/Game"].median()

# ftpYear_group_outlier = outlierDf_reg.groupby("Year")["FG %"].median()

# ftpYear_group = nbaRegular.groupby("Year")["FG %"].median()
# ftpYear_groupPlayoff = nbaPlayoff.groupby("Year")["FG %"].median()

In [None]:
#Regular FG Plot
fig,ax = plt.subplots(figsize=(10,8),)

xx = range(0,72)
gg="#998179"
dark="#105F73"
light = "#9DBCC9"

ax.plot(ftYear_group_outlier.index,ftYear_group_outlier.values,color=light,label="Outlier Median FT Attempts",linewidth=1.8,)
ax.fill_between(x=xx,y1=ftYear_group.values.min(),y2=ftYear_group_outlier.values,color=light,alpha=.4)

ax.plot(ftYear_group.index,ftYear_group.values,label="Median FT Attempts",color=dark,linewidth=1.8) #Need to fix the indices for now and make it look nice
ax.fill_between(x=xx,y1=ftYear_group.values.min(),y2=ftYear_group.values,color=dark,alpha=.4)

# ax.vlines(x=28,ymin=ftYear_group.values.min(),ymax=ftYear_group_outlier.values.max(),color='#4E5961',alpha=.5,linewidth=1.4,linestyles='dotted',label="3pt Line Introduction")


label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0,72,5))
ax.set_xticklabels(label,rotation=45)
ax.tick_params(axis='x',bottom=False)
ax.tick_params(axis='y',left=False)

ax.grid(color='grey', axis = 'y', linestyle='solid', linewidth=.35,alpha=.3)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

# plt.legend()
plt.show()

Playoff Plot Free Throw

In [None]:
# Median of the "FT/Game" column per year for the playoffs: all rows vs. outlier players.
# Each groupby was previously evaluated twice with identical results; once is enough.
# NOTE: the fg-prefixed names hold free-throw data in this cell; they are kept because
# the plotting cell that follows reads them.
fgYear_groupPlayoff = nbaPlayoff.groupby("Year")["FT/Game"].median()
fgYear_groupPlayoff_outlier = outlierDf_yoff.groupby("Year")["FT/Game"].median()

In [None]:
#Playoff FT Plot
fig,ax = plt.subplots(figsize=(10,8),)

ax.vlines(x=28,ymin=.7,ymax=1.0999999999,color='#B8B2B2',alpha=.8,linewidth=1.4,linestyles='dashed',label="3 Pt line Introduction")
xx = range(0,72)

ax.plot(fgYear_groupPlayoff_outlier.index,fgYear_groupPlayoff_outlier.values,color="#C9082A",label="Outlier Median FT Attempts",linewidth=1.8,)
ax.fill_between(x=xx,y1=0,y2=fgYear_groupPlayoff_outlier.values,color="#C9082A",alpha=.4)

ax.plot(fgYear_groupPlayoff.index,fgYear_groupPlayoff.values,label="Median FT Attempts",color="#17408B",linewidth=1.8) #Need to fix the indices for now and make it look nice
ax.fill_between(x=xx,y1=0,y2=fgYear_groupPlayoff.values,color="#17408B",alpha=.4)


label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0,72,5))
ax.set_xticklabels(label,rotation=45)
ax.tick_params(axis='x',bottom=False)
ax.tick_params(axis='y',left=False)

ax.grid(color='grey', axis = 'y', linestyle='solid', linewidth=.35,alpha=.3)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

# plt.legend()
plt.show()

Playoff FG Plot

In [None]:
# Median of the "FGA/Game" column per year for the playoffs: all rows vs. outlier players.
# Each groupby was previously evaluated twice with identical results; once is enough.
fgYear_groupPlayoff = nbaPlayoff.groupby("Year")["FGA/Game"].median()
fgYear_groupPlayoff_outlier = outlierDf_yoff.groupby("Year")["FGA/Game"].median()

In [None]:
#Playoff FG Plot
fig,ax = plt.subplots(figsize=(10,8),)

xx = range(0,72)
gg="#998179"
dark="#105F73"
light = "#9DBCC9"

ax.plot(fgYear_groupPlayoff_outlier.index,fgYear_groupPlayoff_outlier.values,color=light,label="Outlier Median FT Attempts",linewidth=1.8,)
ax.fill_between(x=xx,y1=fgYear_groupPlayoff.values.min(),y2=fgYear_groupPlayoff_outlier.values,color=light,alpha=.4)

ax.plot(fgYear_groupPlayoff.index,fgYear_groupPlayoff.values,label="Median FT Attempts",color=dark,linewidth=1.8) #Need to fix the indices for now and make it look nice
ax.fill_between(x=xx,y1=fgYear_groupPlayoff.values.min(),y2=fgYear_groupPlayoff.values,color=dark,alpha=.4)


label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0,72,5))
ax.set_xticklabels(label,rotation=45)
ax.tick_params(axis='x',bottom=False)
ax.tick_params(axis='y',left=False)

# ax.vlines(x=28,ymin=fgYear_groupPlayoff.values.min(),ymax=fgYear_groupPlayoff_outlier.values.max(),color='#4E5961',alpha=.5,linewidth=1.4,linestyles='dotted',label="3pt Line Introduction")


ax.grid(color='grey', axis = 'y', linestyle='solid', linewidth=.35,alpha=.3)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

# plt.legend()
plt.show()

Playoff FG Percentage Plot

In [None]:
# Median of the "FG %" column per year for the playoffs: all rows vs. outlier players.
# Each groupby was previously evaluated twice with identical results; once is enough.
fgpYear_groupPlayoff = nbaPlayoff.groupby("Year")["FG %"].median()
fgpYear_groupPlayoff_outlier = outlierDf_yoff.groupby("Year")["FG %"].median()

In [None]:
# Playoff FG %: all-player median vs. outlier-player median, per year.
# `dark` and `light` are the colours defined in the FG-attempt plotting cells.
fig, ax = plt.subplots(figsize=(10, 8))

# All players.
ax.plot(fgpYear_groupPlayoff.index, fgpYear_groupPlayoff.values,
        label="Regular FG %", color=dark, linewidth=3)
# ax.set_title("Playoff FG%")

# Outlier players.
ax.plot(fgpYear_groupPlayoff_outlier.index, fgpYear_groupPlayoff_outlier.values,
        color=light, label="Outlier Player FG%", linewidth=3)

# Label every fifth position with every fifth entry of uniqueYear_playoff.
label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0, 72, 5))
ax.set_xticklabels(label, rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)
ax.set_yticks(np.arange(0, .81, .05))

# ax.vlines(x=28,ymin=.7,ymax=1.0999999999,color='#B8B2B2',alpha=.8,linewidth=1.4,linestyles='dashed',label="3 Pt line Introduction")

ax.grid(axis='y', color='grey', linestyle='solid', linewidth=.35, alpha=.3)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# plt.legend()
plt.show()

Playoff FT % Plot

In [None]:
# Median of the "FT %" column per year for the playoffs: all rows vs. outlier players.
# Each groupby was previously evaluated twice with identical results; once is enough.
# NOTE: the fgp-prefixed names hold free-throw percentages in this cell; they are kept
# because the plotting cell that follows reads them.
fgpYear_groupPlayoff = nbaPlayoff.groupby("Year")["FT %"].median()
fgpYear_groupPlayoff_outlier = outlierDf_yoff.groupby("Year")["FT %"].median()

In [None]:
# Playoff FT %: all-player median vs. outlier-player median, per year.
# The outlier series holds FT % values, so its label says "FT%" (it previously read "FG%").
fig, ax = plt.subplots(figsize=(10, 8))

red = "#C9082A"
blue = "#3719E0"

# All players.
ax.plot(fgpYear_groupPlayoff.index, fgpYear_groupPlayoff.values,
        label="Regular FT %", color=blue, linewidth=3)
# ax.set_title("Playoff FT%")

# Outlier players.
ax.plot(fgpYear_groupPlayoff_outlier.index, fgpYear_groupPlayoff_outlier.values,
        color=red, label="Outlier Player FT%", linewidth=3)

# Label every fifth position with every fifth entry of uniqueYear_playoff.
label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0, 72, 5))
ax.set_xticklabels(label, rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)
ax.set_yticks(np.arange(0, 1.01, .05))

# ax.vlines(x=28,ymin=.7,ymax=1.0999999999,color='#B8B2B2',alpha=.8,linewidth=1.4,linestyles='dashed',label="3 Pt line Introduction")

ax.grid(color='grey', axis='y', linestyle='solid', linewidth=.35, alpha=.3)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# plt.legend()
plt.show()

Regular Season FG Percent Plot

In [None]:
# Median of the "FG %" column per year for the regular season: all rows vs. outlier players.
# Each groupby was previously evaluated twice with identical results; once is enough.
fgpYear_group = nbaRegular.groupby("Year")["FG %"].median()
fgpYear_group_outlier = outlierDf_reg.groupby("Year")["FG %"].median()

In [None]:
# Regular-season FG %: all-player median vs. outlier-player median, per year.
# The all-player series holds FG % values, so its label says "FG %" (it previously read "FT %").
# `dark` and `light` are the colours defined in the FG-attempt plotting cells.
fig, ax = plt.subplots(figsize=(10, 8))

red = "#C9082A"   # not used in this cell
blue = "#3719E0"  # not used in this cell

# All players.
ax.plot(fgpYear_group.index, fgpYear_group.values,
        label="Regular FG %", color=dark, linewidth=3)

# Outlier players.
ax.plot(fgpYear_group_outlier.index, fgpYear_group_outlier.values,
        color=light, label="Outlier Player FG%", linewidth=3)

# Label every fifth position with every fifth entry of uniqueYear_playoff.
label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0, 72, 5))
ax.set_xticklabels(label, rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)
ax.set_yticks(np.arange(0.2, .8, .05))

# ax.vlines(x=28,ymin=.7,ymax=1.0999999999,color='#B8B2B2',alpha=.8,linewidth=1.4,linestyles='dashed',label="3 Pt line Introduction")

ax.grid(color='grey', axis='y', linestyle='solid', linewidth=.35, alpha=.3)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# plt.legend()
plt.show()

Regular Season FT %

In [None]:
# Median of the "FT %" column per year for the regular season: all rows vs. outlier players.
# Each groupby was previously evaluated twice with identical results; once is enough.
# NOTE: the fgp-prefixed names hold free-throw percentages in this cell; they are kept
# because the plotting cell that follows reads them.
fgpYear_group = nbaRegular.groupby("Year")["FT %"].median()
fgpYear_group_outlier = outlierDf_reg.groupby("Year")["FT %"].median()

In [None]:
# Regular-season FT %: all-player median vs. outlier-player median, per year.
# The outlier series holds FT % values, so its label says "FT%" (it previously read "FG%").
fig, ax = plt.subplots(figsize=(10, 8))

red = "#C9082A"
blue = "#3719E0"

# All players.
ax.plot(fgpYear_group.index, fgpYear_group.values,
        label="Regular FT %", color=blue, linewidth=3)

# Outlier players.
ax.plot(fgpYear_group_outlier.index, fgpYear_group_outlier.values,
        color=red, label="Outlier Player FT%", linewidth=3)

# Label every fifth position with every fifth entry of uniqueYear_playoff.
label = uniqueYear_playoff[0::5]
ax.set_xticks(range(0, 72, 5))
ax.set_xticklabels(label, rotation=45)
ax.tick_params(axis='x', bottom=False)
ax.tick_params(axis='y', left=False)
ax.set_yticks(np.arange(0, 1.01, .05))

# ax.vlines(x=28,ymin=.7,ymax=1.0999999999,color='#B8B2B2',alpha=.8,linewidth=1.4,linestyles='dashed',label="3 Pt line Introduction")

ax.grid(color='grey', axis='y', linestyle='solid', linewidth=.35, alpha=.3)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# plt.legend()
plt.show()

Strip plot (unused in the presentation, more for EDA) that helps visualize what our player outliers "look" like in terms of Points Per Game.

In [None]:
# corrDf = regularFGA_playerDF.loc[:,regularFGA_playerDF.drop(["index","Season Start Year","Season Type","Player ID","Games Played","Defensive Rebounds","Assists","Blocks","Personal Fouls","STL/TOV","This Year 75% FGA/Game","League-Wide Player 25% PPS this Year","Player 90% PPS This Year","Team PPS % above Median","This Year 75% Team PPS Above Med",'This Year Top 10 Team PPS Above Med'])]
# Copy of regularFGA_playerDF without its 1st, 3rd and 4th columns (chosen by position,
# so the result depends on the frame's current column order).
t=regularFGA_playerDF.columns
corrDf = regularFGA_playerDF.drop(columns=[t[0],t[2],t[3]])

In [None]:
#StripPlot for PPG
outGroup = outlierDf_reg.groupby("Year")["This Player PPG"]
outMed = outlierDf_reg["This Player PPG"].median()


x_values = list(outGroup.groups.keys())

yv = {}
for yr in x_values:
    yv[yr] = outGroup.get_group(yr)

df = pd.DataFrame.from_dict(yv, orient='index').transpose()
melted_df = df.melt(var_name='Year', value_name="Points Per Game")

total = 0
for key in yv:
    thisG = yv[key]
    for val in thisG:
        total+=1

In [None]:
# Strip plot of outlier players' points per game, one column of points per year.
fig, ax = plt.subplots(figsize=(11, 6))

xx = range(0, 72)

# label = uniqueYear_playoff[0::5]
# ax.set_xticks(range(0,72,5))

cmap = sns.color_palette("icefire")
cmapp = "icefire"

# Draw on this figure's axes explicitly instead of relying on the "current axes".
sns.stripplot(data=melted_df, x='Year', y="Points Per Game", jitter=True, palette=cmapp, ax=ax)

# Tick labels are left to seaborn so they always match the plotted categories.
# They used to be set from nba["Year"].unique() before anything was drawn, which is
# not tied to the categories in melted_df; only the 90-degree rotation is kept.
ax.tick_params(axis='x', bottom=False, labelrotation=90)
ax.tick_params(axis='y', left=False)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

plt.show()

Presented strip plot showing that our outliers are mostly 10% above the median PPS for their respective year

In [None]:
# Per-year values of the "Player PPS % above Median" column for regular-season
# outlier players, reshaped for seaborn.
outGroup = outlierDf_reg.groupby("Year")["Player PPS % above Median"]
# print(outGroup.groups.keys())

# Create a list of x-axis values (years) from the dictionary keys
x_values = list(outGroup.groups.keys())

# Create a list of y-axis values (individual data points) from the dictionary values

yv = {}
for yr in x_values:
    yv[yr] = outGroup.get_group(yr)

# One column per year, then melt to long form for the strip plot.
df = pd.DataFrame.from_dict(yv, orient='index').transpose()
melted_df = df.melt(var_name='Year', value_name="% Difference from year's PPS Median")

# total = number of outlier values; zCount = how many of them are <= 0.
# NOTE(review): the loop variable thisG stays defined after this cell and is read by the
# non-outlier cell further down, so keep this loop form unless that cell is changed too.
zCount = 0
total = 0
for key in yv:
    thisG = yv[key]
    for val in thisG:
        total+=1
        if val <=0:
            zCount+=1

In [None]:
# Non-outlier counterpart of the outlier strip-plot data.
NonoutlierDf_reg = nbaRegular.loc[nbaRegular["Outlier Player"]==False]
NonoutlierDf_yoff = nbaPlayoff.loc[nbaPlayoff["Outlier Player"]==False]

# Remove rows whose "Player PPS % above Median" is 90 or more.
rows_to_remove = np.where(NonoutlierDf_reg["Player PPS % above Median"] >=90)
NonoutlierDf_reg = NonoutlierDf_reg.drop(NonoutlierDf_reg.index[rows_to_remove[0]])

# Per-year PPS-above-median values for the remaining non-outliers.
NonoutGroup = NonoutlierDf_reg.groupby("Year")["Player PPS % above Median"]
NonoutMed = NonoutlierDf_reg["This Player PPG"].median()

# Years come from the non-outlier groups themselves. This used to read outGroup (the
# outlier groups), which fails in get_group() for a year that has no non-outlier rows
# and silently skips years that have only non-outliers.
Nonx_values = list(NonoutGroup.groups.keys())

Nonyv = {}
for yr in Nonx_values:
    Nonyv[yr] = NonoutGroup.get_group(yr)

Nondf = pd.DataFrame.from_dict(Nonyv, orient='index').transpose()
Nonmelted_df = Nondf.melt(var_name='Year', value_name="% Difference from year's PPS Median")

# Count the non-outlier values. The loop used to iterate the outlier dict (yv) and a
# stale thisG left over from another cell, so Nontotal did not describe this data.
Nontotal = 0
for key in Nonyv:
    NonthisG = Nonyv[key]
    for val in NonthisG:
        Nontotal+=1

In [None]:
# Strip plot of how far each outlier player sits from the year's PPS median.
fig, ax = plt.subplots(figsize=(11, 6))

xx = range(0, 72)


#Textbook % of values below median

# label = uniqueYear_playoff[0::5]
# ax.set_xticks(range(0,72,5))
cmap = sns.color_palette("flare",) #Repeats colors versus not
cmapp = "flare"

# Reference line at 0, i.e. exactly at the median.
ax.hlines(y=0,xmin=0,xmax=72,color='#017580',alpha=0.6,linewidth=2.5,label="Median")  #.7
# sns.stripplot(data=Nonmelted_df, x='Year', y="% Difference from year's PPS Median", jitter=True,color="#89a0b0",alpha=.3) #.5

# Draw on this figure's axes explicitly instead of relying on the "current axes".
sns.stripplot(data=melted_df, x='Year', y="% Difference from year's PPS Median", jitter=True,palette=cmapp,alpha=.7, ax=ax) #.5

# Tick labels are left to seaborn so they always match the plotted categories.
# They used to be set from nba["Year"].unique() before anything was drawn, which is
# not tied to the categories in melted_df; only the 90-degree rotation is kept.
ax.tick_params(axis='x', bottom=False, labelrotation=90)
ax.tick_params(axis='y', left=False)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

plt.show()

More EDA on just outlier players

In [None]:
# Column dtypes and non-null counts for the regular-season outlier rows.
# (Fixes the "Regualar" typo in the printed heading.)
print("Regular season outlier info:\n")
outlierDf_reg.info()


In [None]:
# Column dtypes and non-null counts for the playoff outlier rows.
print("Playoff outlier info:\n")
outlierDf_yoff.info()

In [None]:
#FTA correlation with rank across all years
ftCorr = outlierDf_reg.corr()

#set figure size
fig, ax = plt.subplots(figsize=(20,9))
ax.set_title("Correlation of for outlier players only, during the Regular Season",fontsize=14,weight="bold")


#plot heatmap
sns.heatmap(ftCorr)

In [None]:
print("Regular season outlier player raw correlations with FT Attempts\n\n")


# Correlation of every numeric/boolean column with "FT Attempts" for regular-season
# outliers. String columns are dropped first because DataFrame.corr() raises on them
# under pandas >= 2.0.
regFTCorr_outlier = outlierDf_reg.select_dtypes(include=["number", "bool"]).corr()["FT Attempts"]
regFTCorr_outlier

In [None]:
print("Regular season outlier player correlation with\nBoolean value, 'if the team is an outlier in pps'\n\n")

# Correlation of every numeric/boolean column with the boolean "Team Outlier in PPS" flag.
# String columns are dropped first because DataFrame.corr() raises on them under pandas >= 2.0.
outlierDf_reg.select_dtypes(include=["number", "bool"]).corr()["Team Outlier in PPS"]


In [None]:
#FTA correlation with rank across all years

# Pairwise correlation matrix over the numeric/boolean columns of the playoff outlier
# rows. String columns are dropped first because DataFrame.corr() raises on them under
# pandas >= 2.0.
ftCorr = outlierDf_yoff.select_dtypes(include=["number", "bool"]).corr()

#set figure size
fig, ax = plt.subplots(figsize=(20,9))
ax.set_title("Correlation for outlier players only, during the playoff",fontsize=14,weight="bold")

#plot heatmap on the axes created above
sns.heatmap(ftCorr, ax=ax)

In [None]:
print("Playoff outlier player raw correlations with FT Attempts\n\n")


# Correlation of every numeric/boolean column with "FT Attempts" for playoff outliers.
# String columns are dropped first because DataFrame.corr() raises on them under pandas >= 2.0.
yoffFTCorr_outlier = outlierDf_yoff.select_dtypes(include=["number", "bool"]).corr()["FT Attempts"]
yoffFTCorr_outlier

In [None]:
print("Playoff outlier player correlation with\nBoolean value, 'if the team is an outlier in pps'\n\n")

# Correlation of every numeric/boolean column with the boolean "Team Outlier in PPS" flag.
# String columns are dropped first because DataFrame.corr() raises on them under pandas >= 2.0.
outlierDf_yoff.select_dtypes(include=["number", "bool"]).corr()["Team Outlier in PPS"]

Plot of win percentages for the teams that are top 10 in revenue

In [None]:
# Win percentages for the ten teams in the revenue comparison.
gamesInSeason = 82

# Derived from win totals over 82 games.
nykWin = 47 / gamesInSeason
lalWin = 43 / gamesInSeason
chiWin = 40 / gamesInSeason
gswWin = 44 / gamesInSeason
houWin = 20 / gamesInSeason
mkwWin = 58 / gamesInSeason

# Entered directly as percentages (found percentage data).
bknWin = .54
dmvWin = .46
lacWin = .52
bosWin = .69

# Plain mean of the ten values.
avgWinpct = (
    bosWin + lacWin + dmvWin + bknWin + mkwWin
    + houWin + gswWin + chiWin + lalWin + nykWin
) / 10

In [None]:

# Values for the distribution plot: the ten named teams first, followed by 22 further
# win percentages entered as literals (presumably the remaining teams — TODO confirm).
vals = [
    nykWin, lalWin, chiWin, gswWin, houWin,
    mkwWin, bknWin, dmvWin, lacWin, bosWin,
    0.35, 0.38, 0.35, 0.51, 0.45,
    0.39, 0.35, 0.48, 0.51, 0.42,
    0.41, 0.37, 0.34, 0.43, 0.39,
    0.24, 0.41, 0.31, 0.53, 0.57,
    0.43, 0.42,
]
print(len(vals))


In [None]:
# Distribution of team win percentages with the top-10-revenue average called out.
fig, ax = plt.subplots()
sns.set_style("white")

# sns.distplot is deprecated; draw the same two layers with its replacements.
# bins="fd" uses the Freedman-Diaconis rule, the same rule distplot used for its default
# bin count, and stat="density" keeps the y axis on the density scale that the
# annotation at y=2.7 assumes.
# (The previous sns.color_palette("rocket", as_cmap=True) call discarded its result
# and has been removed.)
sns.histplot(x=vals, bins="fd", stat="density", color="#8EB9ED", alpha=.45, ax=ax)  # Histogram color
sns.kdeplot(x=vals, color="#104A73", linewidth=2, ax=ax)

# Marker, connector and text box for the average of the top-10-revenue teams.
ax.scatter(avgWinpct,2.7,s=100,c="#EDB707")
ax.text(.64,2.7,"Top 10 Revenue Teams \n won over 50% of \n their games in 2022",fontsize=9,fontname='serif',color='black',bbox=dict(facecolor='#EDB707', edgecolor='grey'))
ax.hlines(y=2.7,xmin=avgWinpct,xmax=.64,color='#EDB707',linewidth=5,alpha=0.4, clip_on=False)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

plt.show()

External links used

In [None]:
# Source links kept as plain strings for reference; nothing in this cell fetches them.
nbaRevenuelink = "https://www.cnbc.com/2021/10/18/nba-2021-2022-season-10-billion-revenue-tv-viewership-rebound.html"
linkforWins2022 = "https://www.nba.com/standings"
# The Forbes link below is a bare expression, so it is shown as the cell's output
# rather than stored in a variable.
"https://www.forbes.com/sites/mikeozanian/2022/10/27/nba-team-values-2022-for-the-first-time-in-two-decades-the-top-spot-goes-to-a-franchise-thats-not-the-knicks-or-lakers/?sh=1923f1911cce"

In [None]:
# nbaRegular.to_excel(r"/Users/obtin/Desktop/GradSchool/MSDS593/MSDS593_Final/nbaStats/nbaRegular_full.xlsx", index=False)
# nbaPlayoff.to_excel(r"/Users/obtin/Desktop/GradSchool/MSDS593/MSDS593_Final/nbaStats/nbaPlayoff_full.xlsx", index=False)