# Trying PCA on NBA Players

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from sklearn.decomposition import PCA

In [2]:
def extract_data(folder, base_dir='C://Users//sabar//Downloads//nba_pca//'):
    """Read every CSV in one stats sub-folder and stack the rows into one frame.

    Parameters
    ----------
    folder : str
        Name of the sub-folder (under ``base_dir``) whose ``*.csv`` files are read.
    base_dir : str, optional
        Directory that contains the per-category sub-folders. The default is the
        location this notebook was originally written against; pass another
        directory to run the notebook elsewhere.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files concatenated along axis 0, each file keeping
        its own row index. An empty DataFrame is returned when no CSV matches.
    """
    import os  # local import: nothing else in the notebook needs it

    # glob returns matches in arbitrary, OS-dependent order; sort so the row
    # order of the result is reproducible from run to run.
    csv_paths = sorted(glob.glob(os.path.join(base_dir, folder, '*.csv')))
    if not csv_paths:
        # pd.concat raises on an empty list; keep the "nothing found" result
        # an empty frame, as before.
        return pd.DataFrame()

    # Read everything first and concatenate once: concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    frames = [pd.read_csv(path, low_memory=False) for path in csv_paths]
    return pd.concat(frames, axis=0)

In [3]:
# Load one table per stats category; each category lives in a sub-folder of
# the same name. The calls run left to right, one per folder.
(fouls, misc, penalty_pts, second_chance, shot_distribution,
 FT_source, turnovers, rebounds, assists, scoring) = (
    extract_data(category)
    for category in (
        'fouls', 'misc', 'penalty_pts', 'second_chance', 'shot_distribution',
        'FT_source', 'turnovers', 'rebounds', 'assists', 'scoring',
    )
)

# Shape check (disabled): printing `.shape` of each of the ten tables above
# shows whether they all hold the same number of rows.

(592, 17)
(592, 26)
(592, 33)
(592, 33)
(592, 43)
(592, 14)
(592, 16)
(592, 33)
(592, 13)
(592, 31)


In [4]:

# Some players appear more than once (they played for two or three teams).
# For each player keep a single row: the one with the largest 'Minutes' value.
def _keep_top_minutes_row(stats):
    """Sort `stats` by 'Minutes' ascending and keep the last row of each 'Name'."""
    ordered = stats.sort_values(by='Minutes')
    return ordered.drop_duplicates('Name', keep='last')


fouls = _keep_top_minutes_row(fouls)
misc = _keep_top_minutes_row(misc)
penalty_pts = _keep_top_minutes_row(penalty_pts)
second_chance = _keep_top_minutes_row(second_chance)
shot_distribution = _keep_top_minutes_row(shot_distribution)
FT_source = _keep_top_minutes_row(FT_source)
turnovers = _keep_top_minutes_row(turnovers)
assists = _keep_top_minutes_row(assists)
rebounds = _keep_top_minutes_row(rebounds)
scoring = _keep_top_minutes_row(scoring)

# Shape check (disabled): printing `.shape` of each table here shows how many
# rows remain after de-duplication.



(530, 17)
(530, 26)
(530, 33)
(530, 33)
(530, 43)
(530, 14)
(530, 16)
(530, 33)
(530, 13)
(530, 31)


In [5]:
# FT_source.columns

Index(['Name', 'TeamAbbreviation', 'GamesPlayed', 'Minutes', 'FTA',
       'Technical Free Throw Trips', 'TwoPtShootingFoulsDrawn',
       '2pt And 1 Free Throw Trips', 'ThreePtShootingFoulsDrawn',
       '3pt And 1 Free Throw Trips', 'NonShootingFoulsDrawn',
       'ShootingFoulsDrawnPct', 'TwoPtShootingFoulsDrawnPct',
       'ThreePtShootingFoulsDrawnPct'],
      dtype='object')

In [6]:
# Tables to combine, in merge order (left to right).
df_list = [
    fouls, misc, penalty_pts, second_chance, shot_distribution,
    FT_source, turnovers, assists, rebounds, scoring,
]


def _join_on_player_keys(left, right):
    """Inner-join two stat tables on the four columns that identify a player row."""
    return pd.merge(
        left,
        right,
        how='inner',
        on=['Name', 'TeamAbbreviation', 'GamesPlayed', 'Minutes'],
    )


# Fold the list into one wide frame: ((fouls + misc) + penalty_pts) + ...
df_merged = reduce(_join_on_player_keys, df_list)
# Shape check (disabled): df_merged.shape

(530, 223)


In [7]:
# Keep players with more than 10 games, derive minutes per game ('mpg'), and
# tidy the columns.
#
# Written as a single chain so that every step returns a new frame. The
# previous version assigned 'mpg' onto a `.loc`-filtered slice (which raises
# pandas' SettingWithCopyWarning), dropped columns in place, and contained a
# `sort_values(...).head()` statement whose result was never used.
df_merged = (
    df_merged
    .loc[df_merged['GamesPlayed'] > 10]
    # 'mpg' is appended as the last column, computed on the filtered rows.
    .assign(mpg=lambda frame: frame['Minutes'] / frame['GamesPlayed'])
    # 'ShotQualityAvg' arrived from two source tables, so the merge produced
    # '_x'/'_y' copies; keep the '_x' one under the plain name.
    .drop(columns=['Minutes', 'GamesPlayed', 'ShotQualityAvg_y'])
    .rename(columns={'ShotQualityAvg_x': 'ShotQualityAvg'})
)
# Shape check (disabled): df_merged.shape

(452, 221)


In [8]:

# Column lists that let the PCA run on a single stat category at a time.

# Every column of the merged frame after its first two is treated as a feature.
features = df_merged.columns[2:].tolist()

# Per-category lists: each source table's columns after its first four
# (for FT_source those four are Name, TeamAbbreviation, GamesPlayed, Minutes;
# the other tables are presumably laid out the same way - verify if in doubt).
only_fouls = fouls.columns[4:].tolist()
only_misc = misc.columns[4:].tolist()
only_penalty_pts = penalty_pts.columns[4:].tolist()
only_sec_chance = second_chance.columns[4:].tolist()
only_shot_dist = shot_distribution.columns[4:].tolist()
only_FT_source = FT_source.columns[4:].tolist()
only_turovers = turnovers.columns[4:].tolist()
only_rebs = rebounds.columns[4:].tolist()
only_assits = assists.columns[4:].tolist()
only_scoring = scoring.columns[4:].tolist()


In [9]:
def calc_pca(input_df, feat, var_pres = 0.9):
    """Project the selected feature columns onto their principal components.

    The features are min-max scaled, NaNs are replaced by 0, and a PCA is
    fitted that keeps as many components as needed to explain ``var_pres``
    of the variance.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame holding a 'Name' column and the columns listed in ``feat``.
        It is not modified.
    feat : list of str
        Names of the columns to use as PCA input.
    var_pres : float, optional
        Passed to ``PCA`` as ``n_components``; a value between 0 and 1 is
        the fraction of variance to preserve. Defaults to 0.9.

    Returns
    -------
    pandas.DataFrame
        One row per input row; integer-labelled columns ``0..k-1`` hold the
        component scores and the last column, 'Name', the player name.
    """
    # Scale into a separate array. Writing the scaled values back into
    # `input_df` (as was done before) silently altered the caller's frame.
    scaled = MinMaxScaler().fit_transform(input_df[feat])
    # PCA cannot handle NaN; treat missing values as 0 after scaling.
    scaled = np.nan_to_num(scaled)

    # Honour the requested variance; this used to be hard-coded to 0.9 even
    # though the message below reported `var_pres`.
    pca = PCA(var_pres)
    pca_components = pca.fit_transform(scaled)
    print('Number of PCA components to preserve {}% variance is {} '.format(var_pres*100, pca.n_components_))

    principal_dataframe = pd.DataFrame(data = pca_components)
    principal_dataframe['Name'] = input_df['Name'].values
    return principal_dataframe


In [10]:
def calc_dist(input_df):
    """Build a square table of Euclidean distances between the rows of `input_df`.

    All columns except the last are used as coordinates. The 'Name' column
    supplies both the row and the column labels of the returned DataFrame.
    """
    coordinates = input_df[input_df.columns[:-1].values]
    labels = input_df['Name'].values
    pairwise = euclidean_distances(coordinates, coordinates)
    return pd.DataFrame(pairwise, index=labels, columns=labels)


def sorted_dist_df(distance_matrix):
    """Rank, for every row, the column labels from nearest to farthest.

    Parameters
    ----------
    distance_matrix : pandas.DataFrame
        Table of distances; rows and columns are labelled (e.g. by player name).

    Returns
    -------
    pandas.DataFrame
        Same row index as ``distance_matrix``. Column ``j`` (0-based integer
        label) holds the label of the column with the j-th smallest value in
        that row, so column 0 is the closest entry.
    """
    # Stable sort: equal distances keep their original column order, making
    # the ranking deterministic.
    order = np.argsort(distance_matrix.values, axis=1, kind='stable')
    # Index a plain NumPy array with the 2-D `order`; a pandas Index does not
    # accept multi-dimensional indexers in current pandas versions.
    labels = distance_matrix.columns.to_numpy()
    return pd.DataFrame(labels[order], index=distance_matrix.index)

In [11]:
# player_names = df_merged['Name'].sort_values()

# pca_df = calc_pca(df_merged, only_sec_chance)
# dist_df = calc_dist(pca_df)
# sorted_distances = sorted_dist_df(dist_df)

# def closest_neighbor(player, num_close):
#     return sorted_distances.loc[player,1:num_close]

# def closest_neighbor(player, num_close):
#     return sorted_distances.loc[player,1:num_close]

In [12]:
#interact(closest_neighbor, player = player_names, num_close = range(1,10))

In [13]:
# Dropdown label -> list of feature columns the PCA is run on.
# The fouls entry used to be keyed 'scoring' as well; a dict literal keeps only
# the last value for a repeated key, so the fouls view could never be selected.
options = {
'overall': features ,
'fouls': only_fouls,
'other' : only_misc ,
'penalty points' :only_penalty_pts ,
'second chance points' : only_sec_chance ,
'shot distances' :only_shot_dist,
'free throw' : only_FT_source ,
'turnovers': only_turovers ,
'rebounds' : only_rebs ,
'assists': only_assits ,
'scoring':only_scoring
}

# Player names in alphabetical order, used to fill the player dropdown.
player_names = df_merged['Name'].sort_values()

In [14]:
def closest_neighbor(player, ft, num_close):
    """Plot the players in PCA space and return the ones closest to `player`.

    Parameters
    ----------
    player : str
        Value of the 'Name' column identifying the player of interest.
    ft : str
        Key of ``options`` selecting which feature columns feed the PCA.
    num_close : int
        Number of nearest neighbours to return.

    Returns
    -------
    pandas.Series
        Entries 1..num_close of the player's nearest-first ranking (entry 0,
        the closest one, is skipped - normally the player themselves).

    Side effects
    ------------
    Draws a scatter plot of the first two principal components with the
    selected player annotated.
    """
    pca_df = calc_pca(df_merged, options[ft])
    sorted_distances = sorted_dist_df(calc_dist(pca_df))

    first_pc = pca_df[0]
    # The PCA may keep a single component; in that case there is no column 1,
    # so draw the points on a flat line instead of failing with a KeyError.
    if 1 in pca_df.columns:
        second_pc = pca_df[1]
    else:
        second_pc = pd.Series(0.0, index=pca_df.index)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(first_pc, second_pc)

    # Compute the row mask for the selected player once and reuse it.
    is_player = pca_df['Name'] == player
    for label, x, y in zip(pca_df.loc[is_player, 'Name'],
                           first_pc[is_player],
                           second_pc[is_player]):
        ax.annotate(label,
                    xy=(x, y), xytext=(-20, 20),
                    textcoords='offset pixels', ha='left', va='bottom',
                    bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                    arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))

    # Figure-level decoration belongs outside the loop: it does not depend on
    # the annotated point and must be applied even if no row matched.
    ax.set_xlabel('1st Principal Component', fontsize = 20)
    ax.set_ylabel('2nd Principal Component', fontsize = 20)
    ax.set_title('1st and 2nd Principal Component of NBA Players.(2019-20)', fontsize=20)
    ax.set_xticks([])

    return sorted_distances.loc[player,1:num_close]

In [15]:
# Interactive explorer: one control each for the player, the feature group
# (the keys of `options`) and the number of neighbours (1 through 10).
interact(
    closest_neighbor,
    player=player_names,
    ft=list(options),
    num_close=range(1, 11),
)

interactive(children=(Dropdown(description='player', options=('Aaron Gordon', 'Aaron Holiday', 'Abdel Nader', …

<function __main__.closest_neighbor(player, ft, num_close)>