In [42]:
import pandas as pd
import numpy as np
import math
import operator

In [43]:
# https://www.nbastuffer.com/2016-2017-nba-regular-season-player-stats/
# https://www.nbastuffer.com/2017-2018-nba-player-stats/
nba = pd.read_csv('NBA_Player_Stats_16-17.csv', encoding = "ISO-8859-1")
nba = pd.DataFrame(nba)

# Normalize quantitative variables
cols_to_norm = nba.columns[5:]
nba[cols_to_norm] = nba[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [44]:
train = nba.sample(frac=0.7)
train = pd.DataFrame(train)

test = nba.drop(train.index)
test = pd.DataFrame(test)

list_of_players = np.array(nba.iloc[0:]["Player"])
list_of_players_test = np.array(test.iloc[0:]["Player"])
list_of_players_train = np.array(train.iloc[0:]["Player"])
#random_state=nba.shape[0]

In [45]:
# Function returns tuple (actual distance from player 1, player 2 name)
def EuclidianDistance(Player_1, Player_2, data_set):
    player1 = data_set.loc[data_set['Player'] == Player_1]
    #player1.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)
                
    player2 = data_set.loc[data_set['Player'] == Player_2]
    #player2.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)

    distance = math.sqrt(sum(player1.iloc[0][5:24] - player2.iloc[0][5:24])**2)
    return distance, player2.iloc[0][0]

In [46]:
# itertate through all players and compare to Russell Westbrook. Store tuple of distances, player names 
def ClosestPlayersTo(Player_Name):
    distances = ()
    for i in range(len(list_of_players)):
        distances = distances + EuclidianDistance(Player_Name, list_of_players[i], nba)

    # convert stored tuples of distances and players into seperate numpy arrays and 
    # combine into pandas data frame and sort by dist.
    dist = []
    for i in range(0,len(distances), 2):
        dist.append(distances[i])
    dist = np.array(dist)
    players = []
    for i in range(1, len(distances), 2):
        players.append(distances[i])
    players = np.array(players)
    data = pd.DataFrame({'Distances':dist, 'Players':players})
    data = data.sort_values('Distances')
    return data.iloc[1:10]

In [47]:
###### Question 1
ClosestPlayersTo("Russell Westbrook")

Unnamed: 0,Distances,Players
175,0.638827,James Harden
270,1.96692,LeBron James
209,2.40979,John Wall
398,2.694433,Stephen Curry
100,2.9889,DeMarcus Cousins
198,3.063531,Jimmy Butler
161,3.08321,Isaiah Thomas
145,3.172658,Giannis Antetokounmp
27,3.175066,Anthony Davis


In [48]:
#nba.loc[nba['Player'] == "LeBron James"]
#nba.loc[nba['Player'] == "John Wall"]

In [49]:
###### Question 2, KNN Algorithm

In [57]:
# Function returns actual distance from player 1, player 2
def EuclidianDistance_KNN(Player_1, Player_2):
    player1 = train.loc[train['Player'] == Player_1]
    #player1.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)
                
    player2 = test.loc[test['Player'] == Player_2]
    #player2.drop(player.columns[[0, 1, 2, 3]], axis=1, inplace= True)

    distance = math.sqrt(sum(player1.iloc[0][5:24] - player2.iloc[0][5:24])**2)
    return distance

In [58]:
# gets neighbors for only 1 observation in test set
import operator
def getNeighbors(train_set, Test_Player, k):
    distances = []
    #testInstance = list_of_players_test[0]
    for x in range(len(train_set)):
        dist = EuclidianDistance_KNN(str(train_set.iloc[x][0]), Test_Player)
        distances.append((train_set.iloc[x][0], dist))
    distances.sort(key=operator.itemgetter(1))
    #print(distances)
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors
    #print(neighbors)

In [59]:
# Get ALL Top K Neighbors for each observation in test set
def getALLNeighbors(train_set, k):
    neighbors_array = []
    #list_of_players_test = np.array(test.iloc[0:]["Player"])
    #list_of_players_train = np.array(train.iloc[0:]["Player"])
    #for i in range (5):  # Look at first 5 test observations only for now
    for i in range(len(test)):
        neighbors_array.append(getNeighbors(train, list_of_players_test[i], k))
    return neighbors_array

In [60]:
# Resize 1d list into mutlidimensional list
def to_matrix(l, n):
    return [l[i:i+n] for i in range(0, len(l), n)]
    #[myList[i:i+n] for i in range(0, len(myList), n)]

In [61]:
def GetLabels(neighbors_array, k):
    labels_train = []
    for i in range(len(neighbors_array)):
        for j in range(len(neighbors_array[i])):
            player = train.loc[train['Player'] == neighbors_array[i][j]]
            #print(player.iloc[0][2])
            labels_train.append(player.iloc[0][3])

    labels_train = to_matrix(labels_train, k)
    labels_train = np.array(labels_train)
    #print(labels_train)

    labels_test = []
    #for i in list_of_players_test[0:5]: # Look at first 4 test observations only for now
    for i in list_of_players_test:
        player = test.loc[test['Player'] == i] 
        label = player.iloc[0][3]
        labels_test.append(label)
    labels_test = np.array(labels_test)
    return labels_train, labels_test

In [62]:
import operator
def getResponse(neighbors_arrar):
    answer = []
    for x in range(len(neighbors_array)):
        classVotes = {}
        for i in range(len(neighbors_array[x])):
            #print(neighbors_array[i])
            player = train.loc[train['Player'] == neighbors_array[x][i]]
            response = player.iloc[0][3]
            if response in classVotes:
                classVotes[response] += 1
            else: 
                classVotes[response] = 1
        np.array(answer.append(classVotes))
    sortedValues = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return answer

In [63]:
def Accuracy(predictions, actual):
    N = len(predictions)
    count = 0
    for i in range(len(predictions)):
        if abs(actual[i] - predictions[i]) <= 1:
            count = count + 1
    return count / N

In [64]:
# Iterate through k-values and see how accuracy changes 
accuracies = []
for i in range(1,11):
    neighbors_array = getALLNeighbors(train, i)
    knn_labels, true_labels = GetLabels(neighbors_array, i)
    #print(knn_labels)
    #print(true_labels)
    answer = getResponse(neighbors_array)
    predict = []
    for j in answer:
        np.array(predict.append(max(j.items(), key=operator.itemgetter(1))[0]))
    accuracy = Accuracy(predict, true_labels)
    accuracies.append(accuracy)
accuracies

[0.49635036496350365,
 0.4744525547445255,
 0.45255474452554745,
 0.5182481751824818,
 0.48905109489051096,
 0.5109489051094891,
 0.48905109489051096,
 0.4744525547445255,
 0.5109489051094891,
 0.48175182481751827]

In [66]:
np.mean(accuracies)

0.48978102189781031