In [9]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cross_validation import train_test_split
from sklearn.metrics import average_precision_score
import numpy as np

In [10]:
# Load the training table, using the row_id column as the index.
df = pd.read_csv("train.csv", index_col="row_id")

# Exclusive upper bound applied to both x and y to cut out a smaller working area.
max_coordinate = 2

# Keep only the rows whose x and y are both below max_coordinate, then drop
# the reference to the full frame so its memory can be reclaimed.
subset = df.loc[(df.x < max_coordinate) & (df.y < max_coordinate), :]
del df

# Columns are pulled out by position into plain arrays used by the later cells.
# NOTE(review): presumably the column order is x, y, accuracy, time, place_id —
# confirm against the CSV header.
xy = subset.iloc[:,:2].values
accuracy = subset.iloc[:,2].values
time = subset.iloc[:,3].values % 1440 # convert to minutes of the day
place_id = subset.iloc[:,4].values

In [73]:
# Build a KD-tree neighbor index over the rows of xy, using all available cores.
# NOTE(review): NearestNeighbors is an unsupervised estimator, so the place_id
# argument presumably has no effect on the fitted index — confirm against the
# scikit-learn documentation for the installed version.
neigh = NearestNeighbors(n_jobs=-1, algorithm='kd_tree', leaf_size=60)
%time neigh.fit(xy, place_id)

CPU times: user 2.48 s, sys: 25.3 ms, total: 2.5 s
Wall time: 2.5 s


NearestNeighbors(algorithm='kd_tree', leaf_size=60, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [74]:
# For every row of xy, look up 200 nearest rows in the index that was fitted on
# the same xy array; returns the distances and the row positions into xy.
# Because the queries are the fitted points themselves, each row is expected to
# show up in its own neighbor list (the prediction code below skips it).
%time all_distances, all_neighbor_indicies = neigh.kneighbors(xy, n_neighbors=200)

CPU times: user 2min 1s, sys: 27.5 s, total: 2min 29s
Wall time: 1min 9s


In [69]:
def time_difference(time1, time2, period=1440):
    """Return the circular (wrap-around) distance between two times.

    The result is the shorter way around a clock of length ``period``, so it
    lies in ``[0, period / 2]`` as long as ``|time2 - time1| <= period``
    (larger gaps are not wrapped a second time and produce negative values).

    Parameters
    ----------
    time1, time2 : scalar or array-like
        Times expressed in the same unit as ``period``; arrays broadcast.
    period : number, optional
        Length of one full cycle. Defaults to 1440.

    Returns
    -------
    scalar or numpy.ndarray
        Circular distance, broadcast over the inputs.
    """
    half_period = period / 2
    return half_period - np.absolute(np.absolute(time2 - time1) - half_period)

def prob_overlap_time(time1, time2, period=1440):
    """Score how close two times are on the cycle, from 1 (same time) to 0.

    The score falls linearly with the circular distance and reaches 0 when the
    two times are half a period apart.

    Parameters
    ----------
    time1, time2 : scalar or array-like
        Times expressed in the same unit as ``period``; arrays broadcast.
    period : number, optional
        Length of one full cycle. Defaults to 1440.

    Returns
    -------
    scalar or numpy.ndarray
        ``1 - 2 * circular_distance / period``.
    """
    # Forward `period` so the wrap-around and the normalisation use the same
    # cycle length; relying on time_difference's default would silently mix a
    # 1440-based wrap with a different divisor.
    return 1 - 2 * time_difference(time1, time2, period) / period

In [88]:
def uniqify(seq):
    """Return the distinct items of ``seq`` as an int64 array, in first-seen order."""
    def first_occurrences():
        # Lazily yield each item the first time it is encountered.
        already_seen = set()
        for item in seq:
            if item in already_seen:
                continue
            already_seen.add(item)
            yield item

    return np.fromiter(first_occurrences(), dtype=np.int64)

def prob_overlap_locations(dist, accuracy1, accuracy2):
    """Gaussian-shaped closeness score for two points ``dist`` apart.

    The two accuracies are combined as a sum of squares, and the score is
    ``exp(-dist**2 / (2 * (a1**2 + a2**2))) / (a1**2 + a2**2)``. The constant
    ``1 / (2 * pi)`` factor is deliberately left out. All arguments broadcast.
    """
    combined_sq = np.square(accuracy1) + np.square(accuracy2)
    weight = 1 / combined_sq
    exponent = -0.5 * np.square(dist) * weight
    # The 2 * np.pi constant is omitted: it cancels out whenever the result is
    # only used as a factor in a product that is compared across candidates.
    return np.exp(exponent) * weight  # / (2 * np.pi)

def sum_by_group(values, groups):
    """Sum ``values`` separately for each distinct key in ``groups``.

    Parameters
    ----------
    values : array-like, shape (n,)
        Numbers to add up.
    groups : array-like, shape (n,)
        Group key of each value.

    Returns
    -------
    sums : numpy.ndarray
        One total per distinct key, ordered like ``keys``.
    keys : numpy.ndarray
        The distinct keys in ascending order.

    Neither input is modified.
    """
    values = np.asarray(values)
    groups = np.asarray(groups)
    if groups.size == 0:
        # Nothing to aggregate: return empty arrays of the matching dtypes.
        return values[:0].copy(), groups[:0].copy()

    order = np.argsort(groups)
    sorted_groups = groups[order]
    sorted_values = values[order]

    # Mark the first element of every run of equal keys.
    is_run_start = np.ones(len(sorted_groups), dtype=bool)
    is_run_start[1:] = sorted_groups[1:] != sorted_groups[:-1]
    run_starts = np.flatnonzero(is_run_start)

    # Add each run on its own. Differencing one global cumulative sum would
    # lose small group totals to floating-point cancellation whenever an
    # earlier group is many orders of magnitude larger.
    sums = np.add.reduceat(sorted_values, run_starts)
    return sums, sorted_groups[run_starts]

# Factor applied to the raw accuracy values before they are combined with the
# neighbor distances. The working assumption is that accuracy is in meters.
# NOTE(review): 0.001 would then convert it to kilometers, which presumably
# matches the unit of the x/y coordinates — confirm.
accuracy_scale = 0.001

def predict_xy_accuracy_time(distances, neighbor_indicies):
    """Rank candidate places for each query row and return the best three.

    Row ``i`` of ``distances`` / ``neighbor_indicies`` is matched with row
    ``i`` of the module-level ``accuracy``, ``time`` and ``place_id`` arrays,
    so the queries must be the leading rows of the data set. Every neighbor
    contributes ``prob_overlap_locations(...) * prob_overlap_time(...)`` to
    the score of its own place id; scores are summed per place and the three
    highest-scoring places are reported.

    Parameters
    ----------
    distances : numpy.ndarray, shape (n_queries, n_neighbors)
        Distance from each query to each of its neighbors.
    neighbor_indicies : numpy.ndarray of int, shape (n_queries, n_neighbors)
        Row positions of those neighbors in the module-level arrays.

    Returns
    -------
    numpy.ndarray, shape (n_queries, 3), float
        Place ids ordered best first. Rows with fewer than three distinct
        candidate places are padded with 0.
    """
    distances = np.asarray(distances)
    predictions = np.zeros((len(distances), 3))
    if distances.size == 0:
        # No queries (or no neighbors): nothing to rank.
        return predictions

    n_queries = len(neighbor_indicies)

    # Spatial term: both accuracies are rescaled before being compared with
    # the neighbor distances.
    neighbor_accuracies = accuracy[neighbor_indicies] * accuracy_scale
    test_accuracy = accuracy[:n_queries, None] * accuracy_scale
    neighbor_place_id = place_id[neighbor_indicies]
    colocation_prob = prob_overlap_locations(distances, test_accuracy, neighbor_accuracies)

    # Temporal term: closeness of the query time to each neighbor's time.
    neighbor_time = time[neighbor_indicies]
    test_time = time[:n_queries, None]
    time_prob = prob_overlap_time(test_time, neighbor_time)

    total_prob = colocation_prob * time_prob

    # TODO: remove the following line for real data just in case a duplicate point is tested
    # Skip the first neighbor when it is the query itself; this is decided
    # from the first row only.
    s = slice(1, None) if distances[0][0] == 0 else slice(0, None)

    for i, (prob, places) in enumerate(zip(total_prob[:, s], neighbor_place_id[:, s])):
        prob, places = sum_by_group(prob, places)
        # Best first: order by score, ties broken by the larger place id.
        best = np.lexsort((places, prob))[::-1][:3]
        # Fill only as many slots as there are candidates; the rest stay 0.
        # Padding the inputs with two place-0 entries is not enough, because
        # they merge into one group and a query whose neighbors all share a
        # single place would then yield just two candidates.
        predictions[i, :len(best)] = places[best]
    return predictions
        
# Only a leading portion of the rows is scored, to keep the run time manageable.
limit = 200000
# NOTE(review): the stop index is limit+1, so up to limit+1 rows are selected —
# confirm whether the extra row is intended.
slice_predictions = slice(0,min(limit+1,len(all_distances)))
%time predictions = predict_xy_accuracy_time(all_distances[slice_predictions], all_neighbor_indicies[slice_predictions])

def mean_average_precision3(true, test):
    """Average, over all rows, of the rank-weighted hits among three guesses.

    ``true`` is compared element-wise with the three columns of ``test``; a
    match in column 1, 2 or 3 is worth 1, 1/2 or 1/3 respectively, and the
    per-row totals are averaged.
    """
    rank_weights = np.array([1, 1/2, 1/3])
    hits = (true == test)
    row_scores = np.sum(hits * rank_weights, axis=1)
    return np.average(row_scores)

# Score the predictions against the true place_id of the same leading rows;
# the added axis turns place_id into a column so it broadcasts across the
# three guesses of each row.
%time mean_average_precision3(place_id[slice(0,len(predictions)), None], predictions)

CPU times: user 23.2 s, sys: 4.37 s, total: 27.6 s
Wall time: 29.3 s
CPU times: user 11.6 ms, sys: 2.34 ms, total: 13.9 ms
Wall time: 13.8 ms


0.58352458237708815

# parameters

* accuracy multiplier
* accuracy bias
* time window in day for full match
* time window in day for least match
* fraction of least match time to full match
* relative weight of time vs distance metric