In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cross_validation import train_test_split
from sklearn.metrics import average_precision_score
import numpy as np
import numexpr as ne

In [2]:
# Load the training table as a plain array; row_id becomes the index and is dropped by .values.
train = pd.read_csv("train.csv", index_col="row_id").values

# Column views used throughout the notebook (column order: x, y, accuracy, time, place_id).
xy = train[:, :2]
x, y = train[:, 0], train[:, 1]  # units are kilometers
accuracy = train[:, 2] * 0.001   # assumed to be reported in meters, so convert to kilometers
time = train[:, 3]               # units are minutes
time_of_day = time % 1440        # minutes since midnight (1440 minutes per day)
time_of_week = time % 10080      # minutes since start of week (10080 minutes per week)
place_id = train[:, 4]

In [3]:
# Build a KD-tree over the (x, y) coordinates of every training row so that the
# spatially closest check-ins can be looked up later; n_jobs=-1 requests all cores.
neigh = NearestNeighbors(n_jobs=-1, algorithm='kd_tree')
# NOTE(review): place_id is passed as the second positional argument (the usual "y" slot);
# presumably an unsupervised neighbour index ignores it -- confirm against the sklearn docs.
%time neigh.fit(xy, place_id)

CPU times: user 1min 51s, sys: 623 ms, total: 1min 51s
Wall time: 1min 51s


NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [4]:
# Shuffle the row indices with a fixed seed so that the validation sample -- and therefore
# the reported score -- is identical after every "Restart Kernel -> Run All".
permutation = np.random.RandomState(0).permutation(len(xy))

In [5]:
# Maximum number of rows to validate on (capped at the size of the data set).
limit = 400000

# Take the leading entries of the shuffled index as the validation sample.
validation_indicies = permutation[:min(limit, len(xy))]

# Full feature rows (x, y, accuracy, time, place_id) for the sampled indices.
X_validation = train[validation_indicies]

In [6]:
# Durations expressed in minutes, the unit of the time column.
hour = 60
day = 24 * hour
week = 7 * day

# Tunable model parameters.
#   kNN            -- number of neighbours fetched per query row
#   a_*            -- rescaling applied to the reported accuracy
#   <name>_w1/_w2  -- start / end of the linear ramp applied to a time difference
#   <name>_mp      -- weight used once the difference exceeds <name>_w2
parameters = {
    'kNN': 400,
    'a_scale': 1,
    'a_min': 1,
    'a_bias': 0,
    'day_w1': 0,
    'day_w2': 12 * hour,
    'day_mp': 0,
    'week_w1': 0,
    'week_w2': 3.5 * day,
    'week_mp': 0,
    'time_w1': 20 * week,
    'time_w2': 50 * week,
    'time_mp': 1,
}

def time_difference(time1, time2, period=None):
    """Return the absolute difference between two times, optionally on a cycle.

    With a falsy ``period`` (``None`` or 0) the result is ``|time1 - time2|``.
    Otherwise the result is the wrap-around distance
    ``period/2 - ||time1 - time2| - period/2|``, which is the shorter way
    around the cycle as long as ``|time1 - time2|`` does not exceed ``period``
    (for example when both inputs were already reduced modulo ``period``).

    The inputs are combined element-wise by numexpr, so arrays broadcast
    against each other.
    """
    if not period:
        return ne.evaluate('abs(time1-time2)')

    # `hp` is read by name from inside the numexpr expression below.
    hp = 0.5 * period
    return ne.evaluate('hp-abs(abs(time1-time2) - hp)')

def prob_overlap_time(diff, w1, w2, mp):
    """Weight a time difference with a clamped linear ramp.

    Parameters
    ----------
    diff : array
        Time differences.
    w1, w2 : number
        Start and end of the ramp.
    mp : number
        Weight assigned once ``diff`` exceeds ``w2``.

    Returns
    -------
    array
        ``1`` where ``diff < w1``, ``mp`` where ``diff > w2``, and the value
        of the straight line through ``(w1, 1)`` and ``(w2, mp)`` in between.
        When ``w1 == w2`` the ramp has no width and the result is a step:
        ``1`` for ``diff <= w1`` and ``mp`` above it.
    """
    if w1 == w2:
        # A zero-width ramp would divide by zero in the slope below and leave
        # non-finite weights at diff == w1; treat it as a step function.
        return np.where(np.asarray(diff) > w2, float(mp), 1.0)

    # Line through (w1, 1) and (w2, mp):
    #   slope m = (mp - 1) / (w2 - w1)
    #   1 = m * w1 + b   =>   b = 1 - m * w1
    #   y = m * x + 1 - m * w1
    prob = ne.evaluate('(mp-1)/(w2-w1) * diff + 1 - (mp-1)/(w2-w1) * w1')
    # Clamp both ends of the ramp.
    prob = np.where(diff < w1, 1, prob)
    return np.where(diff > w2, mp, prob)

def uniqify(seq):
    """Return the distinct items of ``seq`` as an int64 array, in first-seen order."""
    seen = set()
    kept = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            kept.append(item)
    return np.fromiter(kept, dtype=np.int64)

def prob_overlap_locations(x1, y1, x2, y2, accuracy1, accuracy2):
    """Score how likely two location measurements refer to the same point.

    The score is ``exp(-d**2 / (2 * s)) / s`` where ``d**2`` is the squared
    distance between ``(x1, y1)`` and ``(x2, y2)`` and
    ``s = accuracy1**2 + accuracy2**2``. The constant ``1 / (2 * pi)`` factor
    is left out. Inputs are combined element-wise by numexpr, which looks the
    argument names up in this function's frame.
    """
    # The run of spaces inside the second literal keeps the expression text
    # exactly as it was when it was written with a line continuation.
    expression = ('exp(-0.5 * ((x1-x2)**2+(y1-y2)**2) / (accuracy1 ** 2 + accuracy2 ** 2)) / '
                  '                        (accuracy1 ** 2 + accuracy2 ** 2)')
    return ne.evaluate(expression)

def sum_by_group(values, groups):
    """Sum ``values`` within each distinct label of ``groups``.

    Parameters
    ----------
    values : array-like
        Numbers to add up.
    groups : array-like
        Group label for each entry of ``values`` (same length).

    Returns
    -------
    (sums, labels) : tuple of arrays
        ``labels`` holds the distinct group labels in ascending order and
        ``sums[i]`` is the total of the values labelled ``labels[i]``.
        The inputs are not modified.
    """
    values = np.asarray(values)
    groups = np.asarray(groups)
    if len(groups) == 0:
        return values.copy(), groups.copy()

    # Stable sort: equal labels become contiguous runs and the values inside
    # a run keep their original relative order.
    order = np.argsort(groups, kind='mergesort')
    groups = groups[order]
    values = values[order]

    # Mark the first element of every run of equal labels.
    run_start = np.ones(len(groups), dtype=bool)
    run_start[1:] = groups[1:] != groups[:-1]

    # Add each run on its own. Differencing one running total over the whole
    # array would let large earlier groups swallow small later ones in
    # floating point (e.g. 1e16 followed by 1.0 + 1.0 would come out as 0.0).
    sums = np.add.reduceat(values, np.flatnonzero(run_start))
    return sums, groups[run_start]

    
def predict(X, neighbors, parameters, self_validation=False):
    """Rank candidate places for every query row from its nearest neighbours.

    Each neighbour gets a weight that is the product of a spatial-overlap score
    and three time-based ramps (time of day, time of week, absolute time).
    Weights are summed per place and the three places with the largest totals
    are returned.

    Relies on the module-level arrays ``x``, ``y``, ``accuracy``, ``time``,
    ``time_of_day``, ``time_of_week`` and ``place_id`` and on the constants
    ``day`` and ``week``.

    Parameters
    ----------
    X : 2-D array
        Query rows; columns 0-3 are read as x, y, accuracy and time.
    neighbors : 2-D integer array
        Indices into the module-level arrays, one row per query row.
    parameters : dict
        Tunables: ``a_scale``, ``a_bias``, ``a_min`` and, for each of ``day``,
        ``week`` and ``time``, the keys ``*_w1``, ``*_w2`` and ``*_mp``.
    self_validation : bool
        When true, the first neighbour of every row is skipped.

    Returns
    -------
    array of shape ``(len(X), 3)``
        Place ids ordered by descending summed weight.
    """
    def column(j):
        # Column j of X as an (n, 1) array so it broadcasts against (n, k) neighbour data.
        return X[:, j].reshape((-1, 1))

    query_x = column(0)            # units are kilometers
    query_y = column(1)            # units are kilometers
    query_acc = column(2) * 0.001
    query_time = column(3)
    query_tod = query_time % 1440
    query_tow = query_time % 10080

    def rescale(raw):
        # Shift, clamp from below, then scale the reported accuracy.
        scale = parameters['a_scale']
        bias = parameters['a_bias']
        floor = parameters['a_min']
        return np.maximum(raw + bias, floor) * scale

    neighbor_acc = rescale(accuracy[neighbors])
    spatial = prob_overlap_locations(query_x, query_y, x[neighbors], y[neighbors],
                                     rescale(query_acc), neighbor_acc)

    tod_gap = time_difference(query_tod, time_of_day[neighbors], day)
    tod_weight = prob_overlap_time(tod_gap, parameters['day_w1'], parameters['day_w2'],
                                   parameters['day_mp'])

    tow_gap = time_difference(query_tow, time_of_week[neighbors], week)
    tow_weight = prob_overlap_time(tow_gap, parameters['week_w1'], parameters['week_w2'],
                                   parameters['week_mp'])

    abs_gap = time_difference(query_time, time[neighbors])
    abs_weight = prob_overlap_time(abs_gap, parameters['time_w1'], parameters['time_w2'],
                                   parameters['time_mp'])

    weights = ne.evaluate(
        'colocation_prob * time_of_day_prob * time_of_week_prob * time_prob',
        local_dict={'colocation_prob': spatial,
                    'time_of_day_prob': tod_weight,
                    'time_of_week_prob': tow_weight,
                    'time_prob': abs_weight})

    # Skip the first neighbour when self validating.
    first = 1 if self_validation else 0
    candidate_weights = weights[:, first:]
    candidate_places = place_id[neighbors][:, first:]

    predictions = np.zeros((len(X), 3))
    for row, (row_weights, row_places) in enumerate(zip(candidate_weights, candidate_places)):
        # Pad with two zero-weight dummy places (ids 0 and 1) in case only one
        # real place is nearby; three entries are needed per row.
        padded_weights = np.append(row_weights, [0, 0])
        padded_places = np.append(row_places, [0, 1])
        totals, ids = sum_by_group(padded_weights, padded_places)
        # Descending by (total weight, place id).
        ranked = sorted(zip(totals, ids), reverse=True)
        predictions[row, :] = [place for _, place in ranked[:3]]
    return predictions
        
def mean_average_precision3(true, test):
    """Mean average precision at 3.

    Parameters
    ----------
    true : array of shape (n,) or (n, 1)
        The correct label for each row.
    test : array of shape (n, 3)
        Three ranked guesses per row, best guess first.

    Returns
    -------
    float
        The mean over rows of ``1/k`` where ``k`` is the 1-based rank at which
        the correct label appears, counting 0 for rows where it is absent.
    """
    true = np.asarray(true)
    test = np.asarray(test)
    if true.ndim == 1:
        # Make the labels a column; a flat vector would otherwise broadcast
        # along the rank axis and compare labels with the wrong rows.
        true = true[:, None]

    precision = np.array([1, 1/2, 1/3])
    # Count hits per rank first (exact integers), then weight the three counts.
    hits_per_rank = (true == test).sum(axis=0)
    return hits_per_rank.dot(precision) / len(true)

In [7]:
# Look up the parameters['kNN'] closest training rows for every validation row, using
# only the (x, y) columns. return_distance=False asks for neighbour indices only;
# they are stored as int32.
print("find nearest neighbors")
%time neighbors = neigh.kneighbors(X_validation[:,0:2], n_neighbors=parameters['kNN'], \
                                   return_distance=False).astype(np.int32)

# del neigh # free up lots of memory

find nearest neighbors
CPU times: user 1min 21s, sys: 9.71 s, total: 1min 30s
Wall time: 22.6 s


In [8]:
# Rank places for the validation rows. These rows come from the same table the neighbour
# index was built on, so self_validation=True makes predict() skip each row's first neighbour.
print("predict")
%time predictions = predict(X_validation, neighbors, parameters, self_validation=True)

# Score the ranked guesses; indexing with [validation_indicies, None] turns the true
# labels into a column so they line up with the three-column prediction array.
print("evaluate")
%time mean_average_precision3(place_id[validation_indicies, None], predictions)

predict
CPU times: user 1min 48s, sys: 1min 52s, total: 3min 40s
Wall time: 3min 43s
evaluate
CPU times: user 62.9 ms, sys: 187 ms, total: 250 ms
Wall time: 305 ms


0.57452000000021741