In [31]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

pd.__version__

'1.0.1'

In [2]:
with open("all_player_df.pkl",'rb') as f:
    dataset = pickle.load(f)

In [3]:
dataset.head(3)

Unnamed: 0,GP,PTS,FG3M,REB,OREB,DREB,AST,STL,BLK,TOV,...,TEAMS_PLAYED,AWARDS,DISPLAY_FIRST_LAST,PERSON_ID,BIRTHDATE,HEIGHT,WEIGHT,SCHOOL,POSITION,JERSEY
0,515,5020,0,3890,1776,2114,593,452,517,737,...,[OKC],[All-Rookie Team],Steven Adams,203500,1993-07-20,6.11,265,Pittsburgh,Center,12
1,204,2052,4,1529,421,1108,546,166,168,331,...,[MIA],[NBA Player of the Week],Bam Adebayo,1628389,1997-07-18,6.9,255,Kentucky,Center-Forward,13
2,1000,19552,180,8347,2679,5668,1977,729,1106,1569,...,"[POR, SAS]","[NBA Player of the Week, All-NBA, All-Rookie T...",LaMarcus Aldridge,200746,1985-07-19,6.11,250,Texas,Center-Forward,12


In [4]:
def convert_one_hot(df, column):
    df[column] = df[column].apply(lambda x: str(x))
    df[column].replace(to_replace = '<NA>', value = 'to_drop', inplace = True)
    new_df = pd.get_dummies(dataset[column].apply(pd.Series).stack()).sum(level=0)
    new_df.drop(columns = 'to_drop', inplace = True)
    return new_df

In [5]:
positions = convert_one_hot(dataset, 'POSITION')
positions

Unnamed: 0,Center,Center-Forward,Forward,Forward-Center,Forward-Guard,Guard,Guard-Forward
0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...
4471,1,0,0,0,0,0,0
4472,0,0,1,0,0,0,0
4473,1,0,0,0,0,0,0
4474,0,0,0,0,0,1,0


## Check for <NA> in columns

In [6]:
def check_NA_column(col):
    return '<NA>' in [str(x) for x in list(set(dataset[col]))]

In [7]:
def average_wo_na_int(col):
    dataset[col] = dataset[col].apply(lambda x: str(x))
    mean_col = int(dataset[col][dataset[col]!='<NA>'].apply(lambda x: float(x)).mean())
    dataset[col].replace(to_replace = '<NA>', value = mean_col, inplace = True)
    dataset[col] = dataset[col].apply(lambda x: int(x))
    return dataset[col]

In [8]:
def average_wo_na_float(col):
    dataset[col] = dataset[col].apply(lambda x: str(x))
    mean_col = float(dataset[col][dataset[col]!='<NA>'].apply(lambda x: float(x)).mean())
    dataset[col].replace(to_replace = '<NA>', value = mean_col, inplace = True)
    dataset[col] = dataset[col].apply(lambda x: float(x))
    return dataset[col]

In [9]:
average_wo_na_int('WEIGHT')

0       265
1       255
2       250
3       205
4       198
       ... 
4471    240
4472    226
4473    240
4474    170
4475    195
Name: WEIGHT, Length: 4476, dtype: int64

In [10]:
average_wo_na_float('HEIGHT').mean()

6.4532074482035835

In [11]:
check_NA_column('DISPLAY_FIRST_LAST')

False

In [12]:
check_NA_column('PERSON_ID')

False

In [13]:
check_NA_column('BIRTHDATE')

False

In [14]:
check_NA_column('HEIGHT')

False

In [15]:
check_NA_column('WEIGHT')

False

In [16]:
check_NA_column('SCHOOL')

True

In [17]:
check_NA_column('POSITION')

False

In [18]:
check_NA_column('JERSEY')

True

In [19]:
## JERSEY
dataset['JERSEY'] = dataset['JERSEY'].apply(lambda x: str(x))
dataset['JERSEY'] = dataset['JERSEY'].apply(lambda x: x.split("-")[0])
dataset['JERSEY'] = dataset['JERSEY'].apply(lambda x: x.split(" ")[0])
dataset['JERSEY'].replace(to_replace = '<NA>', value = 0, inplace = True)
dataset['JERSEY'] = dataset['JERSEY'].apply(lambda x: int(x))
dataset['JERSEY'].loc[0]

12

In [20]:
check_NA_column('JERSEY')

False

In [21]:
## SCHOOL
dataset['SCHOOL'] = dataset['SCHOOL'].apply(lambda x: str(x))
dataset['SCHOOL'].replace(to_replace = '<NA>', value = None, inplace = True)

In [22]:
check_NA_column('SCHOOL')

False

## Convert columns to replace NAs

In [23]:
new_dataset = dataset[['GP',
                       'PTS',
                       'FG3M',
                       'REB',
                       'OREB',
                       'DREB',
                       'AST',
                       'STL',
                       'BLK',
                       'TOV',
                       'PF', 
                       'DISPLAY_FIRST_LAST',
                       'PERSON_ID',
                       'WEIGHT',
                       'HEIGHT',
                       'JERSEY',
                       'SCHOOL',
                       'TEAMS_PLAYED'
                      ]]
new_dataset.fillna(value=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [24]:
## Teams Played
teams_played = pd.get_dummies(dataset['TEAMS_PLAYED'].apply(pd.Series).stack()).sum(level=0)

In [25]:
new_dataset = pd.concat((new_dataset, positions, teams_played), axis=1, join='inner')
new_dataset.tail()

Unnamed: 0,GP,PTS,FG3M,REB,OREB,DREB,AST,STL,BLK,TOV,...,STL.1,SYR,TCB,TOR,TOT,UTA,UTH,VAN,WAS,WAT
4471,19,23,0,35,11,24,4,2,5,3,...,0,0,0,0,0,0,0,0,0,0
4472,98,458,70,256,28,228,82,35,31,83,...,0,0,0,0,0,0,0,0,0,0
4473,7,2,0,8,3,5,1,1,3,4,...,0,0,0,0,0,0,0,0,0,0
4474,53,118,0,46,0,0,73,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4475,56,273,0,0,0,0,50,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [26]:
new_dataset['DISPLAY_FIRST_LAST']

0                   Steven Adams
1                    Bam Adebayo
2              LaMarcus Aldridge
3       Nickeil Alexander-Walker
4                  Grayson Allen
                  ...           
4471           Stephen Zimmerman
4472                 Paul Zipser
4473                    Jim Zoet
4474                   Bill Zopf
4475                  Matt Zunic
Name: DISPLAY_FIRST_LAST, Length: 4475, dtype: object

In [27]:
new_dataset['SCHOOL']

0              Pittsburgh
1                Kentucky
2                   Texas
3           Virginia Tech
4                    Duke
              ...        
4471                 UNLV
4472                 None
4473           Kent State
4474             Duquesne
4475    George Washington
Name: SCHOOL, Length: 4475, dtype: object

## Encoding Labels

In [32]:
school_encoder = LabelEncoder()
school_encoder.fit(new_dataset['SCHOOL'])
list(school_encoder.classes_)

['Abraham Lincoln',
 'Acadia (CAN)',
 'Ahvaz, Iran',
 'Akron',
 'Alabama',
 'Alabama A&M',
 'Alabama Huntsville',
 'Alabama State',
 'Alabama-Birmingham',
 'Albany State (GA)',
 'Alcorn State',
 'Alliance',
 'American',
 'American International',
 'American University',
 'Appalachian State',
 'Argentina',
 'Arizona',
 'Arizona State',
 'Arkansas',
 'Arkansas-Little Rock',
 'Army',
 'Assumption',
 'Auburn',
 'Auburn-Montgomery',
 'Augsburg',
 'Augusta State',
 'Augustana (SD)',
 'Aurora',
 'Austin Peay',
 'Averett',
 'Bakersfield',
 'Ball State',
 'Baltimore',
 'Barton County C.C.',
 'Baylor',
 'Belmont',
 'Belmont Abbey',
 'Beloit',
 'Bemidji State',
 'Bethel (TN)',
 'Blinn',
 'Bloomsburg',
 'Boise State',
 'Boston College',
 'Boston U.',
 'Boston University',
 'Bowling Green',
 'Bradley',
 'Brazil',
 'Bridgeport',
 'Brigham Young',
 'Brigham Young-Hawaii',
 'Brooklyn',
 'Brown',
 'Bucknell',
 'Budapest AEH',
 'Buffalo State',
 'Butler',
 'Butler Community College',
 'Cal Poly-Obispo',

In [33]:
new_dataset['SCHOOL'] = new_dataset['SCHOOL'].apply(lambda x: school_encoder.transform([x])[0])
new_dataset.tail()

Unnamed: 0,GP,PTS,FG3M,REB,OREB,DREB,AST,STL,BLK,TOV,...,STL.1,SYR,TCB,TOR,TOT,UTA,UTH,VAN,WAS,WAT
4471,19,23,0,35,11,24,4,2,5,3,...,0,0,0,0,0,0,0,0,0,0
4472,98,458,70,256,28,228,82,35,31,83,...,0,0,0,0,0,0,0,0,0,0
4473,7,2,0,8,3,5,1,1,3,4,...,0,0,0,0,0,0,0,0,0,0
4474,53,118,0,46,0,0,73,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4475,56,273,0,0,0,0,50,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Entity Embeddings

In [34]:
new_dataset[new_dataset['DISPLAY_FIRST_LAST']=='Michael Jordan']

Unnamed: 0,GP,PTS,FG3M,REB,OREB,DREB,AST,STL,BLK,TOV,...,STL.1,SYR,TCB,TOR,TOT,UTA,UTH,VAN,WAS,WAT
2355,1072,32292,581,6672,1668,5004,5633,2514,893,2924,...,0,0,0,0,0,0,0,0,1,0


In [35]:
new_dataset[new_dataset['DISPLAY_FIRST_LAST']=='Magic Johnson']['LAL']

2281    1
Name: LAL, dtype: uint8

In [43]:
new_dataset.sort_values('PTS', ascending=False)[['DISPLAY_FIRST_LAST','PTS']]

Unnamed: 0,DISPLAY_FIRST_LAST,PTS
514,Kareem Abdul-Jabbar,38387
2686,Karl Malone,36928
242,LeBron James,33785
969,Kobe Bryant,33643
2355,Michael Jordan,32292
...,...,...
4130,Slavko Vranes,0
2845,Chet McNabb,0
3427,Rob Rensberger,0
489,Quinndary Weatherspoon,0


## Some working stuff
1. https://towardsdatascience.com/deep-learning-for-tabular-data-using-pytorch-1807f2858320
2. https://github.com/fastai/fastai/blob/master/courses/dl1/lesson3-rossman.ipynb