In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn.linear_model import LinearRegression
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [12]:
# data extracted from https://www.basketball-reference.com/teams/WAS/2023.html#all_per_game-playoffs_per_game

directory = os.listdir('../data/uncleanData')

In [13]:
def getMinutes(x):
    return int(x.split(':')[0])

In [14]:
totalPoints = 0
df = pd.DataFrame()
for i in range(len(directory)):
    if directory[i].find('csv') == -1:
        continue
    data = pd.read_csv("../data/uncleanData/" + directory[i])
    data = data[data.GS!='Did Not Play']
    data = data[data.GS!='Inactive']
    data = data[data.GS!='Did Not Dress']
    data = data[data.GS!='Not With Team']
    data.MP = data.MP.apply(getMinutes)
    cols = ['MP','GS', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']
    date = data['Date']
    data = data[cols].astype(float)
    data['Date'] = date
    data.FGA = data.FGA.fillna(data.FGA.mean())
    data.FTA = data.FTA.fillna(data.FTA.mean())
    data.PTS = data.PTS.fillna(data.PTS.mean())
    data['TSA'] = (0.5*data.PTS)/(data.FGA + 0.44*data.FTA)
    data['TS'] = data.FTA + data.FGA
    data = data[data.MP > 10]
    data = data[data.TSA < 1]
    data = data[data.TSA > 0.2]
    currentFileName = directory[i]
    currentFileName = currentFileName.replace('unclean','clean')
    player_name = directory[i].replace('-unclean.csv','')
    player_name = player_name.replace('-','_')
    data['Player_Name'] = player_name
    data.to_csv("../data/cleanData/" + currentFileName, index=False)
    if len(df.columns) == 0:
        df = data.copy()
    else:
        df = pd.concat([df,data], ignore_index=False)

In [15]:
df

Unnamed: 0,MP,GS,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,STL,BLK,TOV,PF,PTS,GmSc,Date,TSA,TS,Player_Name
0,34.0,1.0,8.0,19.0,0.421,3.0,10.0,0.300,3.0,5.0,...,0.0,2.0,2.0,4.0,22.0,14.6,2022-10-19,0.518868,24.0,kyle_kuzma
1,33.0,1.0,8.0,16.0,0.500,4.0,8.0,0.500,6.0,7.0,...,0.0,0.0,1.0,1.0,26.0,19.8,2022-10-21,0.681342,23.0,kyle_kuzma
2,37.0,1.0,4.0,8.0,0.500,1.0,2.0,0.500,2.0,4.0,...,0.0,2.0,0.0,4.0,11.0,8.7,2022-10-23,0.563525,12.0,kyle_kuzma
3,27.0,1.0,9.0,17.0,0.529,0.0,5.0,0.000,7.0,8.0,...,0.0,1.0,3.0,4.0,25.0,14.9,2022-10-25,0.609162,25.0,kyle_kuzma
4,37.0,1.0,7.0,19.0,0.368,3.0,7.0,0.429,1.0,1.0,...,1.0,0.0,3.0,4.0,18.0,8.7,2022-10-28,0.462963,20.0,kyle_kuzma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,24.0,0.0,5.0,11.0,0.455,1.0,3.0,0.333,0.0,0.0,...,1.0,0.0,0.0,2.0,11.0,13.0,2023-04-02,0.500000,11.0,jordan_goodwin
78,28.0,1.0,6.0,16.0,0.375,2.0,4.0,0.500,0.0,0.0,...,0.0,1.0,3.0,2.0,14.0,10.0,2023-04-04,0.437500,16.0,jordan_goodwin
79,29.0,1.0,4.0,10.0,0.400,0.0,3.0,0.000,2.0,2.0,...,4.0,1.0,1.0,2.0,10.0,12.3,2023-04-05,0.459559,12.0,jordan_goodwin
80,19.0,0.0,3.0,9.0,0.333,0.0,3.0,0.000,0.0,0.0,...,0.0,0.0,1.0,0.0,6.0,6.2,2023-04-07,0.333333,9.0,jordan_goodwin


In [16]:
df.to_csv('../data/seasonData/aggregate_player_date_for_season.csv', index=False)

In [17]:
pts_sum_by_date = df.groupby('Date')['PTS'].sum()
dates_with_pts_sum_gt_60 = pts_sum_by_date[pts_sum_by_date > 60]
dates_map_all_players = dates_with_pts_sum_gt_60.sort_values(ascending=False).to_dict()


shots_sum_by_date = df.groupby('Date')['FGA','FTA'].sum()
shots_sum_by_date['total_shots'] = shots_sum_by_date.FGA + shots_sum_by_date.FTA
shots_sum_by_date = shots_sum_by_date.drop(['FGA','FTA'], axis = 1)

In [18]:
dates_map_all_players

{'2023-03-24': 134.0,
 '2023-01-18': 130.0,
 '2023-03-28': 128.0,
 '2023-04-04': 128.0,
 '2023-02-11': 127.0,
 '2023-01-30': 125.0,
 '2022-11-28': 124.0,
 '2023-02-13': 124.0,
 '2023-02-28': 119.0,
 '2023-01-24': 119.0,
 '2023-03-08': 118.0,
 '2023-01-16': 117.0,
 '2023-03-02': 116.0,
 '2022-12-07': 115.0,
 '2023-01-06': 114.0,
 '2023-04-07': 114.0,
 '2023-02-03': 114.0,
 '2023-02-04': 113.0,
 '2022-12-09': 111.0,
 '2022-11-20': 110.0,
 '2023-01-09': 108.0,
 '2023-03-10': 107.0,
 '2022-12-02': 107.0,
 '2023-02-14': 106.0,
 '2022-12-14': 106.0,
 '2022-11-30': 105.0,
 '2023-03-05': 105.0,
 '2022-12-23': 104.0,
 '2022-11-25': 103.0,
 '2023-02-24': 103.0,
 '2022-12-30': 103.0,
 '2023-04-02': 102.0,
 '2023-04-05': 102.0,
 '2022-10-28': 102.0,
 '2023-01-28': 101.0,
 '2022-12-04': 101.0,
 '2023-02-08': 101.0,
 '2022-12-18': 100.0,
 '2022-12-10': 100.0,
 '2023-03-31': 99.0,
 '2023-03-26': 99.0,
 '2022-12-20': 99.0,
 '2022-11-16': 99.0,
 '2023-01-25': 98.0,
 '2022-11-02': 98.0,
 '2023-01-03': 9

In [19]:
shots_sum_by_date = shots_sum_by_date.reset_index()
shots_sum_by_date

Unnamed: 0,Date,total_shots
0,2022-10-18,9.0
1,2022-10-19,92.0
2,2022-10-21,70.0
3,2022-10-23,70.0
4,2022-10-25,85.0
...,...,...
87,2023-04-02,93.0
88,2023-04-04,120.0
89,2023-04-05,111.0
90,2023-04-07,121.0


In [20]:
directory = os.listdir('../data/cleanData/')
for i in range(len(directory)):
    if directory[i].find('csv') == -1:
        continue
        
    data = pd.read_csv("../data/cleanData/" + directory[i])
    data['percentage_of_TS'] = float('nan')
    for j in range(data.shape[0]):
        if data.iloc[j]['Date'] in dates_map_all_players:
            date_to_find = data.iloc[j]['Date']
            total_shots_for_date = shots_sum_by_date.loc[shots_sum_by_date['Date'] == date_to_find, 'total_shots'].values[0]
            data.at[j, 'percentage_of_TS'] = (data.iloc[j]['TS']/total_shots_for_date)*(48.0/data.iloc[j]['MP'])
            data.to_csv("../data/cleanData/" + directory[i], index=False)