In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, leaf)
    for folder, _subfolders, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import missingno as msno
%matplotlib inline

In [None]:
# Read the shot log once and keep it untouched in `nba`;
# all later work happens on the separate copy `df`.
SHOT_LOGS_PATH = "/kaggle/input/nba-shot-logs/shot_logs.csv"
nba = pd.read_csv(SHOT_LOGS_PATH)
df = nba.copy()
df.head()

In [None]:
# Normalise every column label to lower case.
df.columns = df.columns.map(str.lower)

In [None]:
# (number of rows, number of columns) of the working copy
df.shape

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Take a quick look at the first rows whose shot_clock value is missing (NaN).
df.loc[df["shot_clock"].isna()].head()

In [None]:
# Drop the columns that won't be used and keep the result in a new dataframe;
# `df` itself is left unchanged.
# `columns=` already identifies the axis, so no separate `axis` argument is passed.
UNUSED_COLUMNS = [
    "matchup",
    "final_margin",
    "shot_number",
    "closest_defender",
    "closest_defender_player_id",
    "fgm",
    "pts",
    "player_id",
]
new_df = df.drop(columns=UNUSED_COLUMNS)

In [None]:
new_df.describe()

# Pay attention to the touch_time column: its minimum value is negative.
# touch_time is the time that passes between the player's first touch of the ball
# and the moment the player takes the shot, within a single offensive play,
# so it cannot be negative.

In [None]:
# Number of rows whose touch_time is negative.
int((new_df["touch_time"] < 0).sum())

In [None]:
# Replace the negative touch_time values with the column mean.
# A single .loc assignment writes directly into new_df. The chained form
# `new_df.touch_time[mask] = ...` assigns into an intermediate Series, which
# pandas warns about and which does not update the frame under Copy-on-Write.
# NOTE(review): the mean is taken over the whole column, negatives included,
# exactly as before - confirm that is the intended replacement value.
negative_touch_time = new_df["touch_time"] < 0
new_df.loc[negative_touch_time, "touch_time"] = new_df["touch_time"].mean()

In [None]:
# Missing-value count per column of the reduced frame.
new_df.isna().sum()

In [None]:
# Print the column dtypes and non-null counts of the reduced frame.
new_df.info()

In [None]:
# Encode the two-valued categorical columns as 0/1 so they can take part in
# numeric calculations. Values missing from a mapping become NaN.
BINARY_ENCODINGS = {
    "location": {"H": 1, "A": 0},
    "w": {"W": 1, "L": 0},
    "pts_type": {2: 0, 3: 1},
    "shot_result": {"made": 1, "missed": 0},
}
for column_name, encoding in BINARY_ENCODINGS.items():
    new_df[column_name] = new_df[column_name].map(encoding)

In [None]:
# Rename three columns: location -> home_away, pts_type -> 3pts_shot, shot_result -> hit.
RENAMED_COLUMNS = {
    "location": "home_away",
    "pts_type": "3pts_shot",
    "shot_result": "hit",
}
new_df = new_df.rename(columns=RENAMED_COLUMNS)

In [None]:
# Number of distinct values in each column.
# Author's note: there are 904 games and 281 players in the dataframe.
new_df.nunique()

In [None]:
# Preview the first rows after the encoding and renaming steps.
new_df.head()

In [None]:
# Summary statistics of the frame after the encoding and renaming steps.
new_df.describe()

In [None]:
# Draw the missingno matrix plot for new_df; the trailing `;` hides the returned object.
msno.matrix(new_df);

In [None]:
# Fill the NaN values in shot_clock with the mean of the shot_clock column.
shot_clock_mean = new_df["shot_clock"].mean()
new_df["shot_clock"] = new_df["shot_clock"].fillna(shot_clock_mean)

In [None]:
# Check the missing-value counts again after filling shot_clock.
new_df.isna().sum()

In [None]:
# Convert the game_clock values ("minutes:seconds" text) to seconds.
def _clock_to_seconds(clock):
    """Return minutes * 60 + seconds for a colon-separated clock string."""
    parts = clock.split(":")
    return int(parts[0]) * 60 + int(parts[1])


new_df["game_clock"] = new_df["game_clock"].apply(_clock_to_seconds)

In [None]:
# Preview the first rows after converting game_clock to seconds.
new_df.head()

In [None]:
# Mean of `hit` per player, highest first, then expressed as a percentage
# rounded to two decimals.
shot_perc_by_player = (
    new_df
    .groupby("player_name", as_index=False)[["hit"]]
    .mean()
    .sort_values(by="hit", ascending=False)
)
shot_perc_by_player["hit"] = shot_perc_by_player["hit"].apply(
    lambda hit_rate: round(hit_rate * 100, 2)
)
# The 10 players with the highest shot percentage.
shot_perc_by_player.head(10)

In [None]:
def perc_log(name, size):
    """Plot the distribution of a player's shot percentage over blocks of games.

    The player's hit rate is computed for every game. The games, in ``game_id``
    order, are split into consecutive, non-overlapping blocks of ``size`` games,
    and each block is reduced to the unweighted mean of its per-game hit rates.
    Games left over after the last complete block are ignored.

    Parameters
    ----------
    name : str
        Value matched against ``new_df.player_name``.
    size : int
        Number of games per block; must be at least 1.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the histogram and its KDE curve.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1, or the player has fewer than ``size``
        games (including a name that matches no rows).
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    per_game = new_df[new_df.player_name == name].groupby("game_id")["hit"].mean().values
    n_blocks = len(per_game) // size
    if n_blocks == 0:
        raise ValueError(
            f"{name!r} has {len(per_game)} game(s); at least {size} are needed"
        )

    # Collapse each block of `size` games to one value. Plotting the reshaped
    # 2-D array directly gets flattened by the plotting call, which reproduces
    # the plain per-game distribution and makes the grouping a no-op.
    block_means = per_game[:n_blocks * size].reshape(n_blocks, size).mean(axis=1)

    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the documented replacement.
    return sns.histplot(x=block_means, kde=True, stat="density")

In [None]:
# Shot-percentage distribution for James Harden, taking games 5 at a time.
perc_log(name="james harden", size=5);

In [None]:
# Shot-percentage distribution for Damian Lillard, taking games 5 at a time.
perc_log(name="damian lillard", size=5);

In [None]:
# Shot-percentage distribution for LeBron James, taking games 5 at a time.
perc_log(name="lebron james", size=5);

In [None]:
# Distribution of shots over the shot_clock values (histogram with KDE overlay).
# histplot replaces sns.distplot, which is deprecated in seaborn.
sns.histplot(x=new_df["shot_clock"], kde=True, stat="density");

In [None]:
# Mean of `hit` for every distinct shot_clock value.
hit_clock = (
    new_df[["shot_clock", "hit"]]
    .groupby("shot_clock", as_index=False)
    .mean()
)

In [None]:
# shot_clock value(s) with the highest hit rate.
best_hit_rate = hit_clock["hit"].max()
hit_clock.loc[hit_clock["hit"] == best_hit_rate]

In [None]:
# shot_clock value(s) with the lowest hit rate.
worst_hit_rate = hit_clock["hit"].min()
hit_clock.loc[hit_clock["hit"] == worst_hit_rate]

In [None]:
# Scatter plot of the hit rate (y) against each shot_clock value (x).
sns.scatterplot(x="shot_clock", y="hit", data=hit_clock);

In [None]:
# Preview the first rows of new_df.
new_df.head()

In [None]:
# Average shot_clock of the missed shots (hit == 0), rounded to two decimals.
missed_shot_clock = new_df.loc[new_df["hit"] == 0, "shot_clock"]
round(missed_shot_clock.mean(), 2)

In [None]:
# Average shot_clock of the made shots (hit == 1), rounded to two decimals.
made_shot_clock = new_df.loc[new_df["hit"] == 1, "shot_clock"]
round(made_shot_clock.mean(), 2)

In [None]:
# Hit rate of the 3-point shots.
new_df.loc[new_df["3pts_shot"] == 1, "hit"].mean()

In [None]:
# Hit rate of the 2-point shots.
new_df.loc[new_df["3pts_shot"] == 0, "hit"].mean()

In [None]:
# Average distance of the closest defender on successful 3-point shots.
made_threes = (new_df["3pts_shot"] == 1) & (new_df["hit"] == 1)
new_df.loc[made_threes, "close_def_dist"].mean()

In [None]:
# Average distance of the closest defender on missed 3-point shots.
missed_threes = (new_df["3pts_shot"] == 1) & (new_df["hit"] == 0)
new_df.loc[missed_threes, "close_def_dist"].mean()