In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cross_validation import train_test_split
from sklearn.metrics import average_precision_score
import numpy as np
import numexpr as ne

In [2]:
train = pd.read_csv("train.csv", index_col="row_id")

xy = train.iloc[:,:2].values # units are kilometers
accuracy = train.iloc[:,2].values * 0.001 # assume accuracy is reported in meters so convert to kilometers
time = train.iloc[:,3].values # units are minutes
time_of_day = train.iloc[:,3].values % 1440 # minutes
time_of_week = train.iloc[:,3].values % 10080 # minutes
place_id = train.iloc[:,4].values

In [3]:
neigh = NearestNeighbors(n_jobs=-1, algorithm='kd_tree')
%time neigh.fit(xy, place_id)

CPU times: user 1min 50s, sys: 718 ms, total: 1min 51s
Wall time: 1min 51s


NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [4]:
permutation = np.random.permutation(len(xy))

In [5]:
limit = 400000

validation_indicies = permutation[slice(0,min(limit,len(xy)))]

In [11]:
day = 1440
hour = 60
week = 10080

# parameters
parameters = { 'accuracy' : { 'scale': 1, # 1 assumes unit is meters
                              'min': 1,
                              'bias': 0
                            },
               'time': { 
                         'day': { 'window_max': 0*hour, 'window_not_min': 12*hour, 'min_prob': 0 },
                         'week': { 'window_max': 0*day, 'window_not_min': 3.5*day, 'min_prob': 0 },
                         'abs': { 'window_max': 20*week, 'window_not_min': 50*week, 'min_prob': 1 }
                       },
               'kNN': 400
             }

def time_difference(time1, time2, period=None):
    if period:
        hp = 0.5 * period
        return ne.evaluate('hp-abs(abs(time2-time1) - hp)')
    else:
        return ne.evaluate('abs(time2-time2)')

def prob_overlap_time(diff, params):
    w1 = params['window_max']
    w2 = params['window_not_min']
    mp = params['min_prob']
    # derive equation of line that connects end of w1 and w2
    # (w1, 1), (w2, mp)
    # dy = mp-1, dx = w2-w1, m = (mp-1)/(w2-w1)
    # 1 = (mp-1)/(w2-w1) * w1 + b
    # b = 1 - (mp-1)/(w2-w1) * w1
    # y = (mp-1)/(w2-w1) * x + 1 - (mp-1)/(w2-w1)
    prob = ne.evaluate('(mp-1)/(w2-w1) * diff + 1 - (mp-1)/(w2-w1) * w1')
    prob = np.where(diff < w1, 1, prob)
    return np.where(diff > w2, mp, prob)

def uniqify(seq):
    seen = set()
    seen_add = seen.add
    return np.fromiter((x for x in seq if not (x in seen or seen_add(x))), dtype=np.int64)

def prob_overlap_locations(dist, accuracy1, accuracy2):
    return ne.evaluate('exp(-0.5 * dist * dist / (accuracy1 ** 2 + accuracy2 ** 2)) / \
                        (accuracy1 ** 2 + accuracy2 ** 2)') # / (2 * np.pi)

def sum_by_group(values, groups):
    order = np.argsort(groups)
    groups = groups[order]
    values = values[order]
    values.cumsum(out=values)
    index = np.ones(len(groups), 'bool')
    index[:-1] = groups[1:] != groups[:-1]
    values = values[index]
    groups = groups[index]
    values[1:] = values[1:] - values[:-1]
    return values, groups

def predict_xy_accuracy_time(test_points, distances, neighbors, parameters, self_validation=False):
    def scale_accuracy(accuracy):
        scale = parameters['accuracy']['scale']
        bias = parameters['accuracy']['bias']
        a_min = parameters['accuracy']['min']
        return np.maximum(accuracy + bias, a_min) * scale
    
    neighbor_accuracies = scale_accuracy(accuracy[neighbors])
    test_accuracy = scale_accuracy(accuracy[test_points, None])
    colocation_prob = prob_overlap_locations(distances, test_accuracy, neighbor_accuracies)
    
    time_of_day_diff = time_difference(time_of_day[test_points, None], time_of_day[neighbors], day)
    time_of_day_prob = prob_overlap_time(time_of_day_diff, parameters['time']['day'])
    
    time_of_week_diff = time_difference(time_of_week[test_points, None], time_of_week[neighbors], week)
    time_of_week_prob = prob_overlap_time(time_of_week_diff, parameters['time']['week'])
    
    time_abs_diff = time_difference(time[test_points, None], time[neighbors])
    time_abs_prob = prob_overlap_time(time_abs_diff, parameters['time']['abs'])
    
    total_prob = ne.evaluate('colocation_prob * time_of_day_prob * time_of_week_prob * time_abs_prob')
    
    s = slice(1,None) if self_validation else slice(0,None) # skip the first neighbor is self validating
    predictions = np.zeros((len(distances),3))
    for i, (prob, places) in enumerate(zip(total_prob[:,s], place_id[neighbors][:,s])):
        # append a few zeros just incase there is only one nearby place
        # we need three for the precision calculation
        prob, places = sum_by_group(np.append(prob, [0,0]), np.append(places, [0,1]))
        prob, places = zip(*sorted(zip(prob, places),reverse=True))
        predictions[i,:] = places[:3]
    return predictions
        
def mean_average_precision3(true, test):
    values = np.array([1, 1/2, 1/3])
    return ne.evaluate('sum((true == test) * values)') / len(true)

In [7]:
print("find nearest neighbors")
%time distances, neighbors = neigh.kneighbors(xy[validation_indicies], n_neighbors=parameters['kNN'])

# del neigh # free up lots of memory

find nearest neighbors
CPU times: user 1min 20s, sys: 14.2 s, total: 1min 34s
Wall time: 30.3 s


In [12]:
print("predict")
%time predictions = predict_xy_accuracy_time(validation_indicies, distances, neighbors, parameters, \
                                             self_validation=True)

print("evaluate")
%time mean_average_precision3(place_id[validation_indicies, None], predictions)

predict
CPU times: user 1min 38s, sys: 1min 51s, total: 3min 30s
Wall time: 3min 34s
evaluate
CPU times: user 60.2 ms, sys: 401 ms, total: 461 ms
Wall time: 498 ms


0.57452375000021716

# parameters

* accuracy multiplier
* accuracy bias
* time window in day for full match
* time window in day for least match
* fraction of least match time to full match
* relative weight of time vs distance metric