In [1]:
import math
import sys
import csv
import pandas as pd

from collections import Counter

In [2]:
#Path to file used in this script
ATHLETES_FILE = './datasets/athletes.csv'

In [3]:
# Original column indices
AGE_COL    = 2
HEIGHT_COL = 3
WEIGHT_COL = 4
GENDER_COL = 5
SPORT_COL  = 12
EVENTS_COL = 13

# Added-on scaled column indices 
#   (assumes events were condensed into a single column)
SCALED_AGE_COL = 14
SCALED_HEIGHT_COL = 15
SCALED_WEIGHT_COL = 16

# These column indices define a point
POINT_COLS = [AGE_COL, HEIGHT_COL, WEIGHT_COL]
SCALED_POINT_COLS = [SCALED_AGE_COL, SCALED_HEIGHT_COL, SCALED_WEIGHT_COL]

In [4]:
def load_athletes_lazy(filename): 
    
    df = pd.read_csv(filename)
    df[['age', 'height', 'weight']]
    
    return df

In [5]:
def get_input():
    """
    Prompts user for an age, height, and weight.
    """

    age = input("Age (years)? ")
    height = input("Height (cm)? ")
    weight = input("Weight (kg)? ")

    return (age, height, weight)

In [6]:
def load_athletes(filename):
    """
    Loads athlete data from 'filename' into a list of tuples.
    Returns a list of tuples of each athlete's attributes, where
      the last element of each tuple is a list of events the athlete
      competed in.
      (age, height, weight) 
    The header line is skipped, and rows are removed if missing a value
      for the age, height, or weight.
    For example:
    [...,
     ['Zhaoxu Zhang', "People's Republic of China", 
      '24', '221', '110', 'M', '11/18/1987', 
      '', '0', '0', '0', '0', 
      'Basketball', ["Men's Basketball"]],
     ...
    ]
    """
    #Testing approach to account errors
    assert(type(filename) == str and len(filename) > 0)

    athletes = []

    #read in data with csv module
    with open(filename, 'r') as fin:
        
        reader = csv.reader(fin)
        
        next(reader)            # Skip the header
         
        # Places all athletes into a list
        athletes = list(reader)
        
        # Remove rows with empty age/height/weight
        # NOTE: Could alternatively replace empty values with the column mean
        athletes = [row for row in athletes if (all(field != '' for field in row[2:5]))]

        # Place all events in one list in column 'EVENTS_COL'
        athletes = [row[:EVENTS_COL] + [row[EVENTS_COL:]] for row in athletes]

        print athletes[0]

    return athletes

In [7]:
def dist(x, y):
    """ 
    Euclidean distance between vectors x and y. 
    Each element of x and y must be numeric or a numeric string.
    Requires that len(x) == len(y).
    For example: 
        (age, height, weight) 
        dist((0, 0, 0), (0, 5, 0)) == 5.0
        dist((1, 1, 1), (2, 2, 2)) == 1.7320508075688772
        dist(('1', '1', '1'), ('2', '2', '2')) == 1.7320508075688772
    """
    #testing 
    assert(len(x) == len(y))
    
    sq_distances = ((float(x[index]) - float(y[index]))**2 for index in range(len(x)))
    
    return math.sqrt(sum(sq_distances)) 

#print dist((0, 0, 0), (0, 5, 0))

In [8]:
def nearest_athletes(point, athletes, k = 1):
    """
    Returns the 'k' athletes closest to 'point'.
    Sorts the athletes based on distance to 'point', then return the closest.
    """
    
    # calculate the distance between test_point with every athlete
    # sort the list of distances
    
    nearest = sorted(athletes, key=lambda athlete: dist(point, athlete[2:5]))
    
    return nearest[:k]


In [9]:
def most_common_event(athletes):
    """
    Returns the most frequently occuring event in all 'athletes'.
    Consider using Counter.
    """
    events_list = []
    
    for event in athletes:
        events_list.append(event[EVENTS_COL][0])
    
    events_count = Counter(events_list)

    return events_count.most_common(1)[0][0]


In [10]:
# for event in athletes[:3]:
# print event[EVENTS_COL][0]
    
#print athletes[0][EVENTS_COL]

# events = Counter(event[-1] for athlete in athletes for event in athletes[0][EVENTS_COL])
# events

In [11]:
#MAIN IMPLEMENTATION AREA

In [12]:
# load in the file, prints the first record as a sample
athletes = load_athletes(ATHLETES_FILE)

['Lamusi A', "People's Republic of China", '23', '170', '60', 'M', '6/2/1989', 'NEIMONGGOL (CHN)', '0', '0', '0', '0', 'Judo', ["Men's -60kg"]]


In [13]:
#add any test point that you want
test_point = (24, 150, 65) #age, height cm, weight kg
print "TEST POINT: ", test_point

TEST POINT:  (24, 150, 65)


In [14]:
#Perform KNN
nearest = nearest_athletes(test_point, athletes, k=5)

print "NEAREST ATHLETE(S): ", nearest

# Find the most common event of the nearest athletes
event = most_common_event(nearest)
print("RECOMMENDED EVENT: ", event)

NEAREST ATHLETE(S):  [['Jillian Tyler', 'Canada', '23', '152', '65', 'F', '9/5/1988', 'DIDSBURY (CAN)', '0', '0', '0', '0', 'Swimming', ["Women's 100m Breaststroke"]], ['Ogho-Oghene Egwero', 'Nigeria', '23', '152', '64', 'M', '11/26/1988', 'EGBO', '0', '0', '0', '0', 'Athletics', ["Men's 100m"]], ['Lisa Dahlkvist', 'Sweden', '25', '147', '68', 'F', '2/6/1987', '', '0', '0', '0', '0', 'Football', ["Women's Football"]], ['Mira Suhonen', 'Finland', '27', '153', '63', 'F', '7/9/1985', 'KUORTANE (FIN)', '0', '0', '0', '0', 'Shooting', ["Women's 10m Air Pistol"]], ['Geraldine Lee', 'Singapore', '25', '154', '62', 'F', '6/19/1987', '', '0', '0', '0', '0', 'Canoe Sprint', ["Women's Kayak Single (K1) 200m, Women's Kayak Single (K1) 500m"]]]
('RECOMMENDED EVENT: ', "Women's 100m Breaststroke")


In [37]:
targets = []
for i in athletes: 
    targets.append(i[-1][0])
    
print np.array(targets)

["Men's -60kg" "Men's Hammer Throw" "Men's 1500m" ..., "Men's Handball"
 "Women's 400m Hurdles, Women's 4 x 400m Relay" "Men's Basketball"]


In [31]:
###Using KNN appplication to this dataset
from sklearn import datasets, neighbors, metrics
import pandas as pd
import numpy as np

knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform') # add in the weights parameter here
knn.fit(athletes[2:5], np.array(targets))
print knn.predict(athletes[2:5])

ValueError: cannot set an array element with a sequence