In [1]:
import pandas
# Load the 2013 NBA player statistics. read_csv accepts a path directly and
# manages the file handle itself, so no explicit open() is needed.
nba_raw = pandas.read_csv("nba_2013.csv")

# Drop every row that has at least one missing value. `thresh` is deliberately
# not passed: supplying both `how` and `thresh` (even thresh=None) is rejected
# by recent pandas releases.
nba = nba_raw.dropna(axis=0, how="any")
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [2]:
import math
# Reference player: the first row whose "player" field equals "LeBron James".
selected_player = nba.loc[nba["player"].eq("LeBron James")].iloc[0]

# Columns that take part in the distance computation (numeric stats only).
distance_columns = [
    "age", "g", "gs", "mp",
    "fg", "fga", "fg.",
    "x3p", "x3pa", "x3p.",
    "x2p", "x2pa", "x2p.",
    "efg.",
    "ft", "fta", "ft.",
    "orb", "drb", "trb",
    "ast", "stl", "blk", "tov", "pf",
    "pts",
]

def euclidean_distance(row, target=None, columns=None):
    """
    Euclidean distance between ``row`` and a reference record.

    Parameters
    ----------
    row : mapping-like (e.g. pandas.Series or dict)
        Record to measure; must support ``row[k]`` for every key used.
    target : mapping-like, optional
        Reference record. When omitted, the module-level ``selected_player``
        is used, which keeps one-argument calls such as
        ``nba.apply(euclidean_distance, axis=1)`` working unchanged.
    columns : iterable of str, optional
        Keys to include in the distance. When omitted, the module-level
        ``distance_columns`` is used.

    Returns
    -------
    float
        Square root of the summed squared differences over ``columns``.
    """
    # Fall back to the notebook-level defaults only at call time, so the
    # function can also be used (and tested) with explicit arguments.
    if target is None:
        target = selected_player
    if columns is None:
        columns = distance_columns
    return math.sqrt(sum((row[k] - target[k]) ** 2 for k in columns))

# Row by row, measure how far each player in the dataset is from the selected player.
lebron_distance = nba.apply(func=euclidean_distance, axis="columns")


In [3]:
# Keep just the columns listed in distance_columns.
nba_numeric = nba.loc[:, distance_columns]
# Standardize each column: subtract its mean, then divide by its standard deviation.
nba_normalized = (
    nba_numeric
    .sub(nba_numeric.mean(), axis="columns")
    .div(nba_numeric.std(), axis="columns")
)

In [4]:
from scipy.spatial import distance

# Replace any NaN left in the normalized frame with 0 (reassigned rather than
# mutated in place so the step reads as an explicit transformation).
nba_normalized = nba_normalized.fillna(0)

# Normalized stat vector for LeBron James. The boolean selection yields a
# one-row DataFrame; .iloc[0] reduces it to a 1-D Series because
# distance.euclidean expects 1-D vectors and rejects 2-D input on current scipy.
lebron_normalized = nba_normalized.loc[nba["player"] == "LeBron James"].iloc[0]

# Distance between LeBron James and every player (including himself).
euclidean_distances = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_normalized), axis=1
)

# Frame of distances alongside the original index labels, closest first.
distance_frame = pandas.DataFrame(
    data={"dist": euclidean_distances, "idx": euclidean_distances.index}
).sort_values("dist")

# The smallest distance is LeBron to himself, so the second row is the most
# similar other player. Reading the "idx" column first keeps the label in its
# own dtype instead of round-tripping it through a float row.
second_smallest = distance_frame["idx"].iloc[1]
most_similar_to_lebron = nba.loc[second_smallest, "player"]

In [5]:
import random
from numpy.random import permutation

# Randomly shuffle the index labels of nba.
# NOTE(review): the shuffle is unseeded, so the split (and every number
# derived from it) changes on each run; consider seeding for reproducibility.
random_indices = permutation(nba.index)

# Size of the test set: one third of the rows, rounded down.
test_cutoff = len(nba) // 3

# Test set: the first test_cutoff shuffled labels. The slice must start at 0;
# starting at 1 would leave the first shuffled row out of both test and train.
test = nba.loc[random_indices[:test_cutoff]]

# Train set: every remaining shuffled label.
train = nba.loc[random_indices[test_cutoff:]]

In [6]:
# Feature columns fed to the model.
x_columns = [
    "age", "g", "gs", "mp",
    "fg", "fga", "fg.",
    "x3p", "x3pa", "x3p.",
    "x2p", "x2pa", "x2p.",
    "efg.",
    "ft", "fta", "ft.",
    "orb", "drb", "trb",
    "ast", "stl", "blk", "tov", "pf",
]
# Target column, kept as a one-element list.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Build a regressor that looks at the five closest neighbors and fit it on the
# training data (fit returns the estimator itself, so the calls are chained).
knn = KNeighborsRegressor(n_neighbors=5).fit(train[x_columns], train[y_column])
# Point predictions for the test set from the fitted model.
predictions = knn.predict(test[x_columns])


In [7]:
# Display the array returned by knn.predict for the test set.
predictions

array([[ 282. ],
       [ 796.4],
       [ 505.4],
       [ 900.2],
       [1085.6],
       [ 614. ],
       [1188.2],
       [1317.2],
       [ 965.8],
       [1405. ],
       [ 792.4],
       [  83.8],
       [ 781.8],
       [  36.4],
       [ 286.8],
       [ 679.6],
       [ 141. ],
       [ 184.2],
       [ 321.4],
       [ 349.2],
       [ 749.8],
       [1355.6],
       [1048.6],
       [  74. ],
       [  85.6],
       [ 338. ],
       [1023. ],
       [ 895.8],
       [ 358. ],
       [ 107.6],
       [  32.6],
       [ 229.8],
       [ 207.8],
       [1100. ],
       [ 969.4],
       [1049.6],
       [ 824.6],
       [ 320. ],
       [ 813.2],
       [ 115.8],
       [  59.2],
       [ 184.8],
       [  84. ],
       [ 535. ],
       [  58.6],
       [ 600.8],
       [ 703.4],
       [ 198. ],
       [  23. ],
       [ 455.8],
       [1104.8],
       [ 142.8],
       [1232.8],
       [ 176. ],
       [ 174. ],
       [ 889.4],
       [ 101.4],
       [  86.2],
       [ 327. 

In [8]:
# Observed target values for the rows in the test set.
actual = test.loc[:, y_column]

# Mean squared error: summed squared residuals divided by the number of predictions.
mse = (predictions - actual).pow(2).sum() / len(predictions)


In [9]:
# Show the mean squared error of the predictions against the test-set values.
print(mse)

pts    4344.651128
dtype: float64
