In [124]:
import pandas as pd
import numpy as np
import math

In [125]:
# https://www.nbastuffer.com/2016-2017-nba-regular-season-player-stats/
nba = pd.read_csv('NBA_Player_Stats_16-17.csv', encoding = "ISO-8859-1")
nba = pd.DataFrame(nba)

# Normalize quantitative variables
cols_to_norm = nba.columns[4:]
nba[cols_to_norm] = nba[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
nba.columns

Index(['Player', 'Team', 'Pos', 'Class', 'Age', 'GP', 'MPG', 'MIN_PERC',
       'USG_PERC', 'Tor', 'FTA', 'FT_PERC', '2PA', '2P_PERC', '3PA', '3P_PERC',
       'TS_PERC', 'PPG', 'RPG', 'TRB_PERC', 'APG', 'AST_PERC', 'SPG', 'BPG',
       'VI'],
      dtype='object')

In [126]:
train = nba.sample(frac=0.7)
train = pd.DataFrame(train)

test = nba.drop(train.index)
test = pd.DataFrame(test)

list_of_players = np.array(nba.iloc[0:]["Player"])
list_of_players_test = np.array(test.iloc[0:]["Player"])
list_of_players_train = np.array(train.iloc[0:]["Player"])
#random_state=nba.shape[0]

In [127]:
# Function returns tuple (actual distance from player 1, player 2 name)
def EuclidianDistance(Player_1, Player_2, data_set):
    player1 = data_set.loc[data_set['Player'] == Player_1]
    #player1.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)
                
    player2 = data_set.loc[data_set['Player'] == Player_2]
    #player2.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)

    distance = math.sqrt(sum(player1.iloc[0][4:24] - player2.iloc[0][4:24])**2)
    return distance, player2.iloc[0][0]

In [153]:
# itertate through all players and compare to Russell Westbrook. Store tuple of distances, player names 
def ClosestPlayersTo(Player_Name):
    distances = ()
    for i in range(len(list_of_players)):
        distances = distances + EuclidianDistance(Player_Name, list_of_players[i], nba)

    # convert stored tuples of distances and players into seperate numpy arrays and 
    # combine into pandas data frame and sort by dist.
    dist = []
    for i in range(0,len(distances), 2):
        dist.append(distances[i])
    dist = np.array(dist)
    players = []
    for i in range(1, len(distances), 2):
        players.append(distances[i])
    players = np.array(players)
    data = pd.DataFrame({'Distances':dist, 'Players':players})
    data = data.sort_values('Distances')
    return data.iloc[1:10]

In [154]:
###### Question 1
ClosestPlayersTo("Russell Westbrook")

Unnamed: 0,Distances,Players
175,0.604736,James Harden
270,1.887375,LeBron James
209,2.315093,John Wall
27,2.481884,Anthony Davis
145,2.600688,Giannis Antetokounmp
100,2.621476,DeMarcus Cousins
398,2.75504,Stephen Curry
236,3.015687,Karl-Anthony Towns
198,3.052168,Jimmy Butler


In [130]:
#nba.loc[nba['Player'] == "LeBron James"]
#nba.loc[nba['Player'] == "John Wall"]

In [131]:
###### Question 2, KNN Algorithm

In [132]:
# Function returns actual distance from player 1, player 2
def EuclidianDistance_KNN(Player_1, Player_2):
    player1 = train.loc[train['Player'] == Player_1]
    #player1.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)
                
    player2 = test.loc[test['Player'] == Player_2]
    #player2.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)

    distance = math.sqrt(sum(player1.iloc[0][4:24] - player2.iloc[0][4:24])**2)
    return distance

In [133]:
# gets neigbors for only 1 observation in test set
import operator
def getNeighbors(train_set, Test_Player, k):
    distances = []
    #testInstance = list_of_players_test[0]
    for x in range(len(train_set)):
        dist = EuclidianDistance_KNN(str(train_set.iloc[x][0]), Test_Player)
        distances.append((train_set.iloc[x][0], dist))
    distances.sort(key=operator.itemgetter(1))
    #print(distances)
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors
    #print(neighbors)

In [134]:
# Get ALL Top K Neighbors for each observation in test set
def getALLNeighbors(train_set, k):
    neighbors_array = []
    #list_of_players_test = np.array(test.iloc[0:]["Player"])
    #list_of_players_train = np.array(train.iloc[0:]["Player"])
    
    for i in range(5): # Look at first 5 test observations only for now
        neighbors_array.append(getNeighbors(train, list_of_players_test[i], k))
    return neighbors_array

In [135]:
# Resize 1d list into mutlidimensional list
def to_matrix(l, n):
    return [l[i:i+n] for i in range(0, len(l), n)]
    #[myList[i:i+n] for i in range(0, len(myList), n)]

In [136]:
def GetLabels(neighbors_array, k):
    labels_train = []
    for i in range(len(neighbors_array)):
        for j in range(len(neighbors_array[i])):
            player = train.loc[train['Player'] == neighbors_array[i][j]]
            #print(player.iloc[0][2])
            labels_train.append(player.iloc[0][2])

    labels_train = to_matrix(labels_train, k)
    labels_train = np.array(labels_train)
    #print(labels_train)

    labels_test = []
    for i in list_of_players_test[0:5]: # Look at first 4 test observations only for now
        player = test.loc[test['Player'] == i] 
        label = player.iloc[0][2]
        labels_test.append(label)
    labels_test = np.array(labels_test)
    return labels_train, labels_test

In [137]:
# For now, use k = 9
neighbors_array = getALLNeighbors(train, 9)
knn_labels, true_labels = GetLabels(neighbors_array, 9)

In [147]:
import operator
def getResponse(neighbors_arrar):
    answer = []
    for x in range(len(neighbors_array)):
        classVotes = {}
        for i in range(len(neighbors_array[x])):
            #print(neighbors_array[i])
            player = train.loc[train['Player'] == neighbors_array[x][i]]
            response = player.iloc[0][2]
            if response in classVotes:
                classVotes[response] += 1
            else: 
                classVotes[response] = 1
        answer.append(classVotes)
    sortedValues = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return answer

In [152]:
import operator
answer = getResponse(neighbors_array)

predict = []
for i in answer:
    predict.append(max(i.items(), key=operator.itemgetter(1))[0])
predict

['C', 'PF', 'PF', 'PF', 'SG']

In [150]:
print(true_labels)

['PF' 'SG' 'SG' 'SF' 'C']
