In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Columns:

0-6: Identifying info -- player name, birthday, draft pick #, season, YrsOff = number of seasons missed (if player had played previously but did not play in the most recent prior season), age, # of prior years in league (so 0 for a rookie)\
7: Team(s) played on that season\
8: # of games the team played that season\
REGULAR SEASON STATS\
9-39: raw counting stats (some of these are rate stats and that's why there are fewer columns in the next 2 sections)\
40-60: same stats but averaged per game played\
61-80: same stats but averaged per minute played\
81-93: advanced rate statistics (these don't need to be averaged per game or per minute because they are all rate stats)\
94-116: cumulative stats for counting stats (including this season, their total stats in each category for their career)\
PLAYOFF STATS\
117-147: raw counting stats\
148-168: per game\
169-188: per minute\
189-201: advanced rate stats\
202-224: cumulative stats for counting stats\
AWARDS\
225-231: 1 for winning award, 0 for not\
232-236: cumulative award totals\
237-241: previous award totals (different from cumulative because t does not include current season)\
242-265: same pattern for all-defense team awards and all-rookie team awards\
266-290: More award info / award voting info\
291-300: team stats for that season\

In [2]:
data = pd.read_csv('all.csv', encoding = 'unicode_escape')

In [3]:
data['Birthdate'] = pd.to_datetime(data['Birthdate']).dt.strftime('%m%d%y')
data['Player_ID'] = data['Player'].str.replace(' ', '_')
data['Player_ID'] = data['Player_ID'].str.lower() + '_' + data['Birthdate']
data['AllNBAV'] = data['AllNBA1']*10 + data['AllNBA2']*5 + data['AllNBA3']*1
data = data.sort_values(by=['Player_ID', 'Year'], ascending=True).reset_index(drop=True, level=0)

In [4]:
data

Unnamed: 0,Player,Birthdate,Pick,Year,YrsOff,Age,Exp,Team,Team_Gm,GP,...,WinPct,W,L,PW,PL,MOV,ORtg,DRtg,NRtg,Player_ID
0,A.C. Green,100463,23,1986,0,22,0,LAL,82,82,...,0.756,62,20,59,23,7.74,113.3,105.8,7.5,a.c._green_100463
1,A.C. Green,100463,23,1987,0,23,1,LAL,82,79,...,0.793,65,17,62,20,9.30,115.6,106.5,9.1,a.c._green_100463
2,A.C. Green,100463,23,1988,0,24,2,LAL,82,82,...,0.756,62,20,56,26,5.84,113.1,107.3,5.8,a.c._green_100463
3,A.C. Green,100463,23,1989,0,25,3,LAL,82,82,...,0.695,57,25,58,24,7.17,113.8,106.7,7.1,a.c._green_100463
4,A.C. Green,100463,23,1990,0,26,4,LAL,82,82,...,0.768,63,19,58,24,6.78,114.0,107.0,7.0,a.c._green_100463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18804,Zydrunas Ilgauskas,060575,20,2009,0,33,10,CLE,82,65,...,0.805,66,16,65,17,8.93,112.4,102.4,10.0,zydrunas_ilgauskas_060575
18805,Zydrunas Ilgauskas,060575,20,2010,0,34,11,CLE,82,64,...,0.744,61,21,59,23,6.52,111.2,104.1,7.1,zydrunas_ilgauskas_060575
18806,Zydrunas Ilgauskas,060575,20,2011,0,35,12,MIA,82,72,...,0.707,58,24,61,21,7.46,111.7,103.5,8.2,zydrunas_ilgauskas_060575
18807,Zylan Cheatham,111795,65,2020,0,24,0,NOP,72,4,...,0.417,30,42,33,39,-1.29,110.7,111.9,-1.2,zylan_cheatham_111795


In [5]:
columns = [
    'Player_ID',
    'Player',
    'Pick',
    'Exp',
    'Age',
    'YrsOff',
    'Year',
    'GP',
    'MP',
    'PTS', # use PTS and GP to compute PTS_g
    'FGM',
    'FGA', # use FGA to compute TSp and FTr over multiple previous seasons
    'FGM3', # use FGM3 and FGA3 to compute FG3p over multiple previous seasons
    'FGA3',
    'FGM2', # use FGM2 and FGA2 to compute FG2p over multiple previous seasons
    'FGA2',
    'FTM', # use FTM and FTA to compute FTp over multiple previous seasons
    'FTA',
    # exclude FGp = FGM / FGA since FGM = FGM2 + FGM3 and FGA = FGA2 + FGA3
    # compute EFGp over multiple previous seasons, the formula is (FG + 0.5 * FGM3) / FGA
    # compute TSp over multiple previous seasons, the formula is PTS / (2 * (FGA + 0.44 * FTA))
    # compute FTr over multiple previous seasons, the formula is FTA / FGA
    'OREB', # use OREB and GP to compute OREB_g over multiple previous seasons
    'DREB', # use DREB and GP to compute DREB_g over multiple previous seasons
    'REB', # use REB and GP to compute REB_g over multiple previous seasons
    'AST', # use AST and GP to compute AST_g over multiple previous seasons
    'STL', # use STL and GP to compute STL_g over multiple previous seasons
    'BLK', # use BLK and GP to compute BLK_g over multiple previous seasons
    'TOV', # use TOV and GP to compute TOV_g over multiple previous seasons
    'PF', # use PF and GP to compute PF_g over multiple previous seasons
    'OWS', # use OWS and GP to compute OWS_g over multiple previous seasons
    'DWS', # use DWS and GP to compute DWS_g over multiple previous seasons
    'WS', # use WS and GP to compute WS_g over multiple previous seasons
    'VORP', # use VORP and GP to compute VORP_g over multiple previous seasons
    'pMP',
    'pPTS',
    'pWS',
    'AS', # use AS and Cum_AS to compute Prev_AS, use AS to compute AS over multiple previous seasons
    'AllDef',
    'AllDef1',
    'AllDef2',
    'AllRook',
    'AllRook1',
    'AllRook2',
    'AllNBAV',
    'AllNBA', # target, use ALLNBA to compute ALLNBA over multiple previous seasons
    'AllNBA1', # target, use ALLNBA1 to compute ALLNBA1 over multiple previous seasons
    'AllNBA2', # target, use ALLNBA2 to compute ALLNBA2 over multiple previous seasons
    'AllNBA3', # target, use ALLNBA3 to compute ALLNBA3 over multiple previous seasons
    'MVP_Share', # target, use MVP_Share to compute MVP_Share over multiple previous seasons
    'PW', # use PW and PL to compute PW / (PW + PL) in the previous season
    'PL'
]

In [6]:
data_1 = data[columns]
data_1

Unnamed: 0,Player_ID,Player,Pick,Exp,Age,YrsOff,Year,GP,MP,PTS,...,AllRook1,AllRook2,AllNBAV,AllNBA,AllNBA1,AllNBA2,AllNBA3,MVP_Share,PW,PL
0,a.c._green_100463,A.C. Green,23,0,22,0,1986,82,1542,521,...,0,0,0,0,0,0,0,0.0,59,23
1,a.c._green_100463,A.C. Green,23,1,23,0,1987,79,2240,852,...,0,0,0,0,0,0,0,0.0,62,20
2,a.c._green_100463,A.C. Green,23,2,24,0,1988,82,2636,937,...,0,0,0,0,0,0,0,0.0,56,26
3,a.c._green_100463,A.C. Green,23,3,25,0,1989,82,2510,1088,...,0,0,0,0,0,0,0,0.0,58,24
4,a.c._green_100463,A.C. Green,23,4,26,0,1990,82,2709,1061,...,0,0,0,0,0,0,0,0.0,58,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18804,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,10,33,0,2009,65,1765,838,...,0,0,0,0,0,0,0,0.0,65,17
18805,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,11,34,0,2010,64,1339,474,...,0,0,0,0,0,0,0,0.0,59,23
18806,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,12,35,0,2011,72,1145,360,...,0,0,0,0,0,0,0,0.0,61,21
18807,zylan_cheatham_111795,Zylan Cheatham,65,0,24,0,2020,4,51,12,...,0,0,0,0,0,0,0,0.0,33,39


In [7]:
def decayed_sum(colum):
    beta = 0.9
    weights = [beta ** x for x in range(len(colum))]
    weights = weights[::-1]
    weighted_sum = np.sum(colum**weights)
    return weighted_sum

col_list = ['GP', 'MP', 'PTS', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FGM2', 'FGA2', 'FTM', 'FTA', 'OREB', 'DREB', 
            'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'OWS', 'DWS', 'WS', 'VORP', 'pMP', 'pPTS', 'pWS',
            'AS', 'AllDef', 'AllDef1', 'AllDef2', 'AllRook', 'AllRook1', 'AllRook2', 'AllNBAV', 'AllNBA',
            'AllNBA1', 'AllNBA2', 'AllNBA3', 'MVP_Share', 'PW', 'PL']
years_back = 5

for c in col_list:
    col_1 = 'Prev_1_' + c
    data_1[col_1] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
                    .groupby(['Player_ID'])[c].shift(1)
    for y in range(2, years_back+1):
        col_y = 'Prev_' + str(y) + '_' + c
        data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
                            .groupby(['Player_ID'])[c]\
                            .rolling(y, min_periods = 1, closed = 'left').sum()\
                            .reset_index(drop=True, level=0)
    col_cum = 'Prev_' + c    
    data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()

data_1['Prev_AllNBAV_decay'] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
                    .groupby(['Player_ID'])['AllNBAV']\
                    .rolling(25, min_periods = 1, closed = 'left').apply(decayed_sum, raw = False)\
                    .reset_index(drop=True, level=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_1] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_1] = data_1.sort_values(by=['Player_ID', 'Year'], as

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_1] = data_1.sort_values(by=['Player_ID', 'Year'], as

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()
  data_1[col_1] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_inde

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()
  data_1[col_1] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()
  data_1[col_1] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascen

  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_y] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\
  data_1[col_cum] = data_1.groupby(['Player_ID'])[col_1].cumsum()
  data_1['Prev_AllNBAV_decay'] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=True)\


In [8]:
data_1['Fut_AllNBA'] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=False)\
                            .groupby(['Player_ID'])['AllNBA']\
                            .rolling(25, min_periods = 1, closed = 'both').sum()\
                            .reset_index(drop=True, level=0)

data_1['Fut_MVP_Share'] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=False)\
                            .groupby(['Player_ID'])['MVP_Share']\
                            .rolling(25, min_periods = 1, closed = 'both').sum()\
                            .reset_index(drop=True, level=0)

data_1['Fut_AllNBA'].fillna(0, inplace=True)
data_1['Fut_MVP_Share'].fillna(0, inplace=True)

  data_1['Fut_AllNBA'] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=False)\
  data_1['Fut_MVP_Share'] = data_1.sort_values(by=['Player_ID', 'Year'], ascending=False)\


In [9]:
per_game_cols = ['MP', 'PTS', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FGM2', 'FGA2', 'FTM', 'FTA', 'OREB', 'DREB',
                 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'OWS', 'DWS', 'WS', 'VORP', 'pMP', 'pPTS', 'pWS']

for c in per_game_cols:
    for y in range(1, years_back+1):
        col_y = 'Prev_' + str(y) + '_' + c
        col_y_gp = 'Prev_' + str(y) + '_GP'
        col_new = 'Prev_' + str(y) + '_' + c + '_g'
        data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
    col_cum = 'Prev_' + c
    col_cum_pergame = 'Prev_' + c + '_g'
    data_1[col_cum_pergame] = data_1[col_cum] / data_1['Prev_GP']

data_1['Prev_1_FGp'] = data_1['Prev_1_FGM'] / data_1['Prev_1_FGA']
data_1['Prev_2_FGp'] = data_1['Prev_2_FGM'] / data_1['Prev_2_FGA']
data_1['Prev_3_FGp'] = data_1['Prev_3_FGM'] / data_1['Prev_3_FGA']
data_1['Prev_4_FGp'] = data_1['Prev_4_FGM'] / data_1['Prev_4_FGA']
data_1['Prev_5_FGp'] = data_1['Prev_5_FGM'] / data_1['Prev_5_FGA']
data_1['Prev_FGp'] = data_1['Prev_FGM'] / data_1['Prev_FGA']
    
data_1['Prev_1_FG3p'] = data_1['Prev_1_FGM3'] / data_1['Prev_1_FGA3']
data_1['Prev_2_FG3p'] = data_1['Prev_2_FGM3'] / data_1['Prev_2_FGA3']
data_1['Prev_3_FG3p'] = data_1['Prev_3_FGM3'] / data_1['Prev_3_FGA3']
data_1['Prev_4_FG3p'] = data_1['Prev_4_FGM3'] / data_1['Prev_4_FGA3']
data_1['Prev_5_FG3p'] = data_1['Prev_5_FGM3'] / data_1['Prev_5_FGA3']
data_1['Prev_FG3p'] = data_1['Prev_FGM3'] / data_1['Prev_FGA3']

data_1['Prev_1_FG2p'] = data_1['Prev_1_FGM2'] / data_1['Prev_1_FGA2']
data_1['Prev_2_FG2p'] = data_1['Prev_2_FGM2'] / data_1['Prev_2_FGA2']
data_1['Prev_3_FG2p'] = data_1['Prev_3_FGM2'] / data_1['Prev_3_FGA2']
data_1['Prev_4_FG2p'] = data_1['Prev_4_FGM2'] / data_1['Prev_4_FGA2']
data_1['Prev_5_FG2p'] = data_1['Prev_5_FGM2'] / data_1['Prev_5_FGA2']
data_1['Prev_FG2p'] = data_1['Prev_FGM2'] / data_1['Prev_FGA2']

data_1['Prev_1_FTp'] = data_1['Prev_1_FTM'] / data_1['Prev_1_FTA']
data_1['Prev_2_FTp'] = data_1['Prev_2_FTM'] / data_1['Prev_2_FTA']
data_1['Prev_3_FTp'] = data_1['Prev_3_FTM'] / data_1['Prev_3_FTA']
data_1['Prev_4_FTp'] = data_1['Prev_4_FTM'] / data_1['Prev_4_FTA']
data_1['Prev_5_FTp'] = data_1['Prev_5_FTM'] / data_1['Prev_5_FTA']
data_1['Prev_FTp'] = data_1['Prev_FTM'] / data_1['Prev_FTA']


# compute EFGp over multiple previous seasons, the formula is (FG + 0.5 * FGM3) / FGA
# compute TSp over multiple previous seasons, the formula is PTS / (2 * (FGA + 0.44 * FTA))
# compute FTr over multiple previous seasons, the formula is FTA / FGA

data_1['Prev_1_EFGp'] = (data_1['Prev_1_FGM2'] + 1.5 * data_1['Prev_1_FGM3']) / data_1['Prev_1_FGA']
data_1['Prev_2_EFGp'] = (data_1['Prev_2_FGM2'] + 1.5 * data_1['Prev_2_FGM3']) / data_1['Prev_2_FGA']
data_1['Prev_3_EFGp'] = (data_1['Prev_3_FGM2'] + 1.5 * data_1['Prev_3_FGM3']) / data_1['Prev_3_FGA']
data_1['Prev_4_EFGp'] = (data_1['Prev_4_FGM2'] + 1.5 * data_1['Prev_4_FGM3']) / data_1['Prev_4_FGA']
data_1['Prev_5_EFGp'] = (data_1['Prev_5_FGM2'] + 1.5 * data_1['Prev_5_FGM3']) / data_1['Prev_5_FGA']
data_1['Prev_EFGp'] = (data_1['Prev_FGM2'] + 1.5 * data_1['Prev_FGM3']) / data_1['Prev_FGA']

data_1['Prev_1_TSp'] = data_1['Prev_1_PTS'] / (2 * (data_1['Prev_1_FGA'] + 0.44 * data_1['Prev_1_FTA']))
data_1['Prev_2_TSp'] = data_1['Prev_2_PTS'] / (2 * (data_1['Prev_2_FGA'] + 0.44 * data_1['Prev_2_FTA']))
data_1['Prev_3_TSp'] = data_1['Prev_3_PTS'] / (2 * (data_1['Prev_3_FGA'] + 0.44 * data_1['Prev_3_FTA']))
data_1['Prev_4_TSp'] = data_1['Prev_4_PTS'] / (2 * (data_1['Prev_4_FGA'] + 0.44 * data_1['Prev_4_FTA']))
data_1['Prev_5_TSp'] = data_1['Prev_5_PTS'] / (2 * (data_1['Prev_5_FGA'] + 0.44 * data_1['Prev_5_FTA']))
data_1['Prev_TSp'] = data_1['Prev_PTS'] / (2 * (data_1['Prev_FGA'] + 0.44 * data_1['Prev_FTA']))

data_1['Prev_1_FTr'] = data_1['Prev_1_FTA'] / data_1['Prev_1_FGA']
data_1['Prev_2_FTr'] = data_1['Prev_2_FTA'] / data_1['Prev_2_FGA']
data_1['Prev_3_FTr'] = data_1['Prev_3_FTA'] / data_1['Prev_3_FGA']
data_1['Prev_4_FTr'] = data_1['Prev_4_FTA'] / data_1['Prev_4_FGA']
data_1['Prev_5_FTr'] = data_1['Prev_5_FTA'] / data_1['Prev_5_FGA']
data_1['Prev_FTr'] = data_1['Prev_FTA'] / data_1['Prev_FGA']

# use PW and PL to compute PW / (PW + PL) in the previous season

data_1['Prev_1_PWp'] = data_1['Prev_1_PW'] / (data_1['Prev_1_PW'] + data_1['Prev_1_PL'])
data_1['Prev_2_PWp'] = data_1['Prev_2_PW'] / (data_1['Prev_2_PW'] + data_1['Prev_2_PL'])
data_1['Prev_3_PWp'] = data_1['Prev_3_PW'] / (data_1['Prev_3_PW'] + data_1['Prev_3_PL'])
data_1['Prev_4_PWp'] = data_1['Prev_4_PW'] / (data_1['Prev_4_PW'] + data_1['Prev_4_PL'])
data_1['Prev_5_PWp'] = data_1['Prev_5_PW'] / (data_1['Prev_5_PW'] + data_1['Prev_5_PL'])
data_1['Prev_PWp'] = data_1['Prev_PW'] / (data_1['Prev_PW'] + data_1['Prev_PL'])

  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_cum_pergame] = data_1[col_cum] / data_1['Prev_GP']
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_cum_pergame] = data_1[col_cum] / data_1['Prev_GP']
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_cum_pergame] = data_1[col_cum] / data_1['Prev_GP']
  data_1[col_

  data_1[col_cum_pergame] = data_1[col_cum] / data_1['Prev_GP']
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_cum_pergame] = data_1[col_cum] / data_1['Prev_GP']
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_cum_pergame] = data_1[col_cum] / data_1['Prev_GP']
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_new] = data_1[col_y] / data_1[col_y_gp]
  data_1[col_

In [10]:
final_columns = ['Player_ID', 'Player', 'Pick', 'Age', 'Exp', 'YrsOff', 'Year']

stat_cols = ['GP', 'MP_g', 'PTS_g', 'FGM_g', 'FGA_g', 'FGp', 'FGM3_g', 'FGA3_g', 'FG3p', 'FGM2_g',
             'FGA2_g', 'FG2p', 'FTM_g', 'FTA_g', 'FTp', 'OREB_g', 'DREB_g', 'REB_g', 'AST_g', 'STL_g',
             'BLK_g', 'TOV_g', 'PF_g', 'OWS_g', 'DWS_g', 'WS_g', 'VORP_g', 'EFGp', 'TSp', 'FTr',
             'MP', 'PTS', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FGM2', 'FGA2', 'FTM', 'FTA', 'OREB', 'DREB',
             'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'OWS', 'DWS', 'WS', 'VORP', 'pMP', 'pPTS', 'pWS',
             'pMP_g', 'pPTS_g', 'pWS_g', 'AllDef', 'AllDef1', 'AllDef2', 'AllRook', 'AllRook1',
             'AllRook2', 'AS', 'AllNBAV', 'AllNBA', 'AllNBA1', 'AllNBA2', 'AllNBA3', 'MVP_Share', 'PWp']

for c in stat_cols:
    for y in range(1, years_back+1):
        final_columns.append('Prev_' + str(y) + '_' + c)
    final_columns.append('Prev_' + c)
    if c == 'AllNBA3':
        final_columns.append('Prev_AllNBAV_decay')

# targets
final_columns.append('AllNBA')
final_columns.append('AllNBA1')
final_columns.append('AllNBA2')
final_columns.append('AllNBA3')
final_columns.append('MVP_Share')
final_columns.append('Fut_AllNBA')
final_columns.append('Fut_MVP_Share')

In [11]:
data_1

Unnamed: 0,Player_ID,Player,Pick,Exp,Age,YrsOff,Year,GP,MP,PTS,...,Prev_3_FTr,Prev_4_FTr,Prev_5_FTr,Prev_FTr,Prev_1_PWp,Prev_2_PWp,Prev_3_PWp,Prev_4_PWp,Prev_5_PWp,Prev_PWp
0,a.c._green_100463,A.C. Green,23,0,22,0,1986,82,1542,521,...,,,,,,,,,,
1,a.c._green_100463,A.C. Green,23,1,23,0,1987,79,2240,852,...,0.430412,0.430412,0.430412,0.430412,0.719512,0.719512,0.719512,0.719512,0.719512,0.719512
2,a.c._green_100463,A.C. Green,23,2,24,0,1988,82,2636,937,...,0.460513,0.460513,0.460513,0.460513,0.756098,0.737805,0.737805,0.737805,0.737805,0.737805
3,a.c._green_100463,A.C. Green,23,3,25,0,1989,82,2510,1088,...,0.512693,0.512693,0.512693,0.512693,0.682927,0.719512,0.719512,0.719512,0.719512,0.719512
4,a.c._green_100463,A.C. Green,23,4,26,0,1990,82,2709,1061,...,0.513854,0.500211,0.500211,0.500211,0.707317,0.695122,0.715447,0.716463,0.716463,0.716463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18804,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,10,33,0,2009,65,1765,838,...,0.318235,0.372316,0.382686,0.396957,0.487805,0.560976,0.569106,0.557927,0.526829,0.459391
18805,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,11,34,0,2010,64,1339,474,...,0.254174,0.301065,0.349906,0.382931,0.792683,0.640244,0.638211,0.625000,0.604878,0.490805
18806,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,12,35,0,2011,72,1145,360,...,0.253797,0.250529,0.292775,0.375103,0.719512,0.756098,0.666667,0.658537,0.643902,0.510504
18807,zylan_cheatham_111795,Zylan Cheatham,65,0,24,0,2020,4,51,12,...,,,,,,,,,,


In [12]:
data_2 = data_1[final_columns].copy()

In [13]:
data_3 = data_2.copy()
data_3.fillna(0, inplace = True)
data_3.loc[data_3['Prev_1_FTr'] == np.inf, 'Prev_1_FTr'] = 1
data_3.loc[data_3['Prev_2_FTr'] == np.inf, 'Prev_2_FTr'] = 1
data_3.loc[data_3['Prev_3_FTr'] == np.inf, 'Prev_3_FTr'] = 1
data_3.loc[data_3['Prev_4_FTr'] == np.inf, 'Prev_4_FTr'] = 1
data_3.loc[data_3['Prev_5_FTr'] == np.inf, 'Prev_5_FTr'] = 1
data_3.loc[data_3['Prev_FTr'] == np.inf, 'Prev_FTr'] = 1
data_3

Unnamed: 0,Player_ID,Player,Pick,Age,Exp,YrsOff,Year,Prev_1_GP,Prev_2_GP,Prev_3_GP,...,Prev_4_PWp,Prev_5_PWp,Prev_PWp,AllNBA,AllNBA1,AllNBA2,AllNBA3,MVP_Share,Fut_AllNBA,Fut_MVP_Share
0,a.c._green_100463,A.C. Green,23,22,0,0,1986,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0,0,0,0,0.0,0.0,0.0
1,a.c._green_100463,A.C. Green,23,23,1,0,1987,82.0,82.0,82.0,...,0.719512,0.719512,0.719512,0,0,0,0,0.0,0.0,0.0
2,a.c._green_100463,A.C. Green,23,24,2,0,1988,79.0,161.0,161.0,...,0.737805,0.737805,0.737805,0,0,0,0,0.0,0.0,0.0
3,a.c._green_100463,A.C. Green,23,25,3,0,1989,82.0,161.0,243.0,...,0.719512,0.719512,0.719512,0,0,0,0,0.0,0.0,0.0
4,a.c._green_100463,A.C. Green,23,26,4,0,1990,82.0,164.0,243.0,...,0.716463,0.716463,0.716463,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18804,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,33,10,0,2009,73.0,151.0,229.0,...,0.557927,0.526829,0.459391,0,0,0,0,0.0,0.0,0.0
18805,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,34,11,0,2010,65.0,138.0,216.0,...,0.625000,0.604878,0.490805,0,0,0,0,0.0,0.0,0.0
18806,zydrunas_ilgauskas_060575,Zydrunas Ilgauskas,20,35,12,0,2011,64.0,129.0,202.0,...,0.658537,0.643902,0.510504,0,0,0,0,0.0,0.0,0.0
18807,zylan_cheatham_111795,Zylan Cheatham,65,24,0,0,2020,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0,0,0,0,0.0,0.0,0.0


In [14]:
data_2.describe()

Unnamed: 0,Pick,Age,Exp,YrsOff,Year,Prev_1_GP,Prev_2_GP,Prev_3_GP,Prev_4_GP,Prev_5_GP,...,Prev_4_PWp,Prev_5_PWp,Prev_PWp,AllNBA,AllNBA1,AllNBA2,AllNBA3,MVP_Share,Fut_AllNBA,Fut_MVP_Share
count,18809.0,18809.0,18809.0,18809.0,18809.0,15298.0,15298.0,15298.0,15298.0,15298.0,...,15298.0,15298.0,15298.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0
mean,30.579138,26.606571,4.351215,0.058376,2003.579776,59.008432,110.113217,153.851092,190.867499,221.741992,...,0.499014,0.498028,0.49358,0.0319,0.011431,0.011431,0.009038,0.005857,0.223191,0.042799
std,25.047951,4.094304,3.926753,0.358845,12.449025,22.286635,43.782457,69.120108,95.911571,122.467644,...,0.117617,0.115093,0.110323,0.175738,0.106304,0.106304,0.094641,0.058216,1.135799,0.364823
min,1.0,18.0,0.0,0.0,1980.0,0.0,0.0,0.0,0.0,0.0,...,0.106061,0.106061,0.106061,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,23.0,1.0,0.0,1993.0,46.0,79.0,95.0,103.0,105.0,...,0.42,0.421951,0.42378,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24.0,26.0,3.0,0.0,2004.0,66.0,122.0,169.0,214.0,240.0,...,0.5,0.5,0.496622,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,47.0,29.0,7.0,0.0,2015.0,78.0,147.0,213.0,276.0,332.0,...,0.582472,0.579365,0.568293,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,221.0,44.0,21.0,7.0,2023.0,85.0,166.0,248.0,330.0,412.0,...,0.853659,0.853659,0.853659,1.0,1.0,1.0,1.0,1.0,18.0,8.818


In [15]:
data_3.describe()

Unnamed: 0,Pick,Age,Exp,YrsOff,Year,Prev_1_GP,Prev_2_GP,Prev_3_GP,Prev_4_GP,Prev_5_GP,...,Prev_4_PWp,Prev_5_PWp,Prev_PWp,AllNBA,AllNBA1,AllNBA2,AllNBA3,MVP_Share,Fut_AllNBA,Fut_MVP_Share
count,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,...,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0,18809.0
mean,30.579138,26.606571,4.351215,0.058376,2003.579776,47.993567,89.558828,125.13233,155.239035,180.350311,...,0.405865,0.405063,0.401445,0.0319,0.011431,0.011431,0.009038,0.005857,0.223191,0.042799
std,25.047951,4.094304,3.926753,0.358845,12.449025,30.539222,58.309423,86.48448,114.074603,140.228021,...,0.221493,0.220073,0.216536,0.175738,0.106304,0.106304,0.094641,0.058216,1.135799,0.364823
min,1.0,18.0,0.0,0.0,1980.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,23.0,1.0,0.0,1993.0,18.0,35.0,38.0,39.0,39.0,...,0.325203,0.329268,0.329268,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24.0,26.0,3.0,0.0,2004.0,58.0,105.0,146.0,164.0,174.0,...,0.466216,0.465854,0.46477,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,47.0,29.0,7.0,0.0,2015.0,76.0,142.0,205.0,263.0,316.0,...,0.560976,0.560606,0.54878,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,221.0,44.0,21.0,7.0,2023.0,85.0,166.0,248.0,330.0,412.0,...,0.853659,0.853659,0.853659,1.0,1.0,1.0,1.0,1.0,18.0,8.818


In [16]:
data_2.to_csv('preprocessed_stats_NAs.csv', index = False)
data_3.to_csv('preprocessed_stats.csv', index = False)