In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Question: Are there hot or cold streaks? Is the shooting percentage higher/lower the more consecutive shots were made/missed immediately before?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw NBA shot log into the working frame used by the whole notebook.
shot_log_csv = "../input/nba-shot-logs/shot_logs.csv"
df = pd.read_csv(shot_log_csv)

## Getting familiar with the data

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# List the column names available in the shot log.
df.columns

In [None]:
# Peek at the first 30 rows in their original file order.
df.head(30)

In [None]:
# Equal unique counts for name and id are a first hint that the two columns
# map 1:1 onto each other.
player_identifier_columns = ['player_name', 'player_id']
df[player_identifier_columns].nunique()

In [None]:
# Number of recorded shots for every (player_name, player_id) combination.
player_identifiers = df[['player_name', 'player_id']]
player_identifiers.value_counts()

`player_name` and `player_id` have the same number of unique values, even though every player name in the sample shown above is "brian roberts". One of these columns is therefore redundant and will be dropped in the end. To model hot streaks the rows need a different order: I want to sort by game, player, and shot number, so that each player's shots within each game are in sequence, and then create a new column that represents consecutively made shots.

Compared to https://www.basketball-reference.com/leagues/NBA_2015_totals.html#totals_stats::fga, the shots aren't fully recorded in this dataset; in the following analysis we will assume they are. At least 4 of bbref's top 5 are present, so it's close enough.

In [None]:
# Order the shots by game, player and shot number so that each player's shots
# within a game sit on consecutive rows.
# The index is rebuilt afterwards so that the integer labels 0..n-1 match the
# sorted row order; without this, label-based access (df.at[i, ...] or
# index-aligned assignment) would still follow the original file order and
# silently ignore the sort.
df = df.sort_values(by=["GAME_ID", "player_id", "SHOT_NUMBER"]).reset_index(drop=True)

In [None]:
# Inspect the last 30 rows after sorting by game, player and shot number.
df.tail(30)

In [None]:
# Compare how many distinct game ids and matchup strings exist; equal counts
# would suggest a 1:1 relation between the two columns.
game_identifier_columns = ['GAME_ID', 'MATCHUP']
df[game_identifier_columns].nunique()

There are twice as many distinct matchups as game IDs. Let's see how the MATCHUP values are distributed across games.

In [None]:
# Count the distinct matchup strings per game, then summarise those counts.
matchups_per_game = df[['GAME_ID', 'MATCHUP']].groupby("GAME_ID").nunique()
matchups_per_game.describe()

In [None]:
# Show the distinct matchup strings recorded for one example game.
is_sample_game = df["GAME_ID"] == 21400899
df.loc[is_sample_game, 'MATCHUP'].unique()

The notation differs within the same game depending on the point of view: the home team's rows use a different notation from the away team's. The team whose point of view it is puts its own name first, followed by @ if it's an away game and by vs. if it's a home game.

In [None]:
# Columns that play no role in the streak analysis; removing them keeps the
# frame narrow and easier to inspect.
unused_columns = [
    'MATCHUP',
    'FINAL_MARGIN',
    'SHOT_CLOCK',
    'DRIBBLES',
    'TOUCH_TIME',
    'SHOT_DIST',
    'PTS_TYPE',
    'CLOSEST_DEFENDER',
    'CLOSEST_DEFENDER_PLAYER_ID',
    'CLOSE_DEF_DIST',
    'FGM',
    'PTS',
]
df = df.drop(columns=unused_columns)

In [None]:
# Preview the first 50 rows of the reduced frame.
df.head(50)

Now that the data is properly sorted (confirmed by SHOT_NUMBER also being in order for each player), we need a new column for the streaks of consecutive made shots.

In order to do that, we transform missed shots to -1 and made shots to +1. The new column then counts how many consecutive shots immediately before had the same result, so e.g. 4 consecutive missed shots are represented by -4 and 4 consecutive made shots by +4.

Also, the shot result column will be transformed to 0 and 1 to make calculations of averages easier to comprehend.

In [None]:
# Encode the textual outcome twice, both times from the untouched text column:
#   SHOT_RESULT_BINARY -> 0/1, so that a mean is directly a make rate
#   SHOT_RESULT        -> -1/+1, so that results can be summed into signed streaks
outcome_text = df["SHOT_RESULT"]
as_binary = {"missed": 0, "made": 1}
as_signed = {"missed": -1, "made": 1}

df["SHOT_RESULT_BINARY"] = outcome_text.map(as_binary)
df["SHOT_RESULT"] = outcome_text.map(as_signed)

In [None]:
def entering_streak(shots: pd.DataFrame) -> pd.Series:
    """Return the signed streak a player carries into each shot.

    The streak of a shot is the number of consecutive identical results the
    same player produced immediately before it in the same game: +k after k
    makes in a row, -k after k misses in a row, and 0 for the first recorded
    shot of a player in a game.

    The computation is vectorised and grouped per game and player, so streaks
    never carry over between players or games, every row receives a value, and
    the result does not depend on the row order or index labels of ``shots``.

    Parameters
    ----------
    shots : pd.DataFrame
        Needs the columns GAME_ID, player_id, SHOT_NUMBER and SHOT_RESULT,
        with SHOT_RESULT encoded as +1 (made) / -1 (missed). The index must be
        unique because the result is aligned back on it.

    Returns
    -------
    pd.Series
        One streak value per row, indexed like ``shots``.
    """
    keys = ["GAME_ID", "player_id"]
    ordered = shots.sort_values(by=keys + ["SHOT_NUMBER"])
    result = ordered["SHOT_RESULT"]

    # A new run starts when the player or game changes, or when the result flips.
    new_player_game = ordered[keys].ne(ordered[keys].shift()).any(axis=1)
    new_run = new_player_game | result.ne(result.shift())

    # Length of the current run up to and including the current shot.
    run_length = result.groupby(new_run.cumsum()).cumcount() + 1

    # Shift by one shot inside each game/player so that every row holds the
    # streak *before* the shot was taken; the first shot of a group gets 0.
    return (
        ordered.assign(streak_after_shot=result * run_length)
        .groupby(keys)["streak_after_shot"]
        .shift(fill_value=0)
        .reindex(shots.index)
    )


df["streak"] = entering_streak(df)

## Distribution of shooting streaks

Next, I will take a look at the distribution of the shots to get a better idea of the data.

In [None]:
# Text summary of the streak column and of the make rate for every streak value.
print(df[["streak"]].describe())
print(df.groupby("streak")["SHOT_RESULT_BINARY"].describe())

# Count the shots per streak value once and reuse the result for both bar arguments.
streak_counts = df["streak"].value_counts()

# The bars carry a label so that plt.legend() has an entry to show; without a
# labelled artist the legend call only emits a warning and draws nothing.
plt.bar(streak_counts.index, streak_counts, label="Number of shots")
plt.xlabel("Streak")
plt.ylabel("Number of shots")
plt.legend()
plt.show()

The counts look roughly normally distributed, centered around -1 and 1. But there is a very low count (less than 50) for streaks from -13 to -9 and +9 to +13, so we will drop these for this analysis.

In [None]:
# Make rate for every streak value, restricted to streaks between -8 and +8.
# Streaks of length 9 and more (in either direction) are excluded because they
# have too few shots. A symmetric bound check is used instead of dropping
# hard-coded labels: range(-13, -9) stops at -10 and so kept the -9 streak
# while +9 was removed, and drop() raises a KeyError as soon as one of the
# listed streak values does not occur in the data.
MAX_ABS_STREAK = 8
streak_pct = df.groupby("streak")["SHOT_RESULT_BINARY"].mean()
within_bounds = (streak_pct.index >= -MAX_ABS_STREAK) & (streak_pct.index <= MAX_ABS_STREAK)
streak_pct = streak_pct[within_bounds]

In [None]:
# Make rate per streak value on matplotlib's automatic y-range.
fig, ax = plt.subplots()
ax.scatter(streak_pct.index, streak_pct, label="Shooting percentage with streaks")
ax.set(xticks=streak_pct.index, xlabel="Streak", ylabel="Shooting Percentage")
ax.legend()

At first glance there's a large spike at the outer bounds but no obvious trend. Let's look at the same data on the full scale from 0% to 100%.

In [None]:
# Same scatter as before, but with the y-axis fixed to the full 0..1 range so
# the size of the differences is seen in proportion.
fig, ax = plt.subplots()
ax.scatter(streak_pct.index, streak_pct, label="Shooting percentage with streaks")
ax.set(xticks=streak_pct.index, xlabel="Streak", ylabel="Shooting Percentage", ylim=(0, 1))
ax.legend()

It looks like there is no trend either. Next we'll add a linear regression in the first plot with the narrower y scale to see if there's a trend according to the streak status.

In [None]:
from scipy import stats
import numpy as np

# Least-squares line through the (streak, make rate) points.
slope, intercept, r_value, p_value, std_err = stats.linregress(streak_pct.index, streak_pct)

# Evaluate the fitted line on an evenly spaced grid from -9 to 9.
x_reg = np.linspace(-9, 9)
y_reg = intercept + slope * x_reg

# Scatter of the make rates with the fitted line drawn on top.
fig, ax = plt.subplots()
ax.scatter(streak_pct.index, streak_pct, label="Shooting percentage with streaks")
ax.plot(x_reg, y_reg, label="regression line", color="r")
ax.set(xticks=streak_pct.index, xlabel="Streak", ylabel="Shooting Percentage")
ax.legend()

print(f"slope:\t\t{slope},\nintercept:\t{intercept}\nr_value:\t{r_value}\np_value:\t{p_value}\nstd_err:\t{std_err}")

In [None]:
# Scatter and fitted line again, this time on the full 0..1 y-range.
fig, ax = plt.subplots()
ax.scatter(streak_pct.index, streak_pct, label="Shooting percentage with streaks")
ax.plot(x_reg, y_reg, label="regression line", color="r")
ax.set(xticks=streak_pct.index, ylim=(0, 1), xlabel="Streak", ylabel="Shooting Percentage")
ax.legend()

# Conclusion

Looking at the average shooting percentage per streak, we fail to reject the null hypothesis that there is no relationship between being "hot" and shooting better: the p-value of .2768 is above the threshold of .05.

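According to the fitted slope, the shooting percentage goes down by about 0.14% with each additional consecutive missed shot and, correspondingly, up by about 0.14% with each additional consecutive made shot. The standard error of that slope is 0.125%, which is roughly 90% of the 0.14% estimate itself, so the effect cannot be distinguished from zero.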

## Possible improvements for this model:

* Only count streaks that happen within a shorter time span (so-called "ice-cold" or "red-hot" stretches). This model looks at streaks over the whole game.
* Look at more seasons
* Only look at players that shoot a high volume every game in order to exclude role players that can't run hot because they don't shoot as much.
* Distinguish between 3-pointers and 2-Pointers

In [None]:
# Final look at the first 30 rows, now including the SHOT_RESULT_BINARY and streak columns.
df.head(30)