In [66]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
try:
    # Current scikit-learn releases expose train_test_split here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the deprecated module
    # (sklearn.cross_validation is no longer available in recent versions).
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import average_precision_score
import numpy as np
import numexpr as ne
# Let numexpr use 8 worker threads; the call returns the previous setting,
# which is what the notebook displays as this cell's output.
ne.set_num_threads(8)

8

In [2]:
# Load the training data; "row_id" becomes the index, so the positional column
# numbers used below count only the remaining columns.
df = pd.read_csv("train.csv", index_col="row_id")

# The columns are split by position, so this cell silently depends on the
# column order of train.csv.
# NOTE(review): presumably the order is x, y, accuracy, time, place_id --
# confirm against the file header.
xy = df.iloc[:,:2].values  # first two columns as one 2-column array
accuracy = df.iloc[:,2].values
# NOTE: the name `time` would shadow the standard-library module of the same
# name if that module were ever imported in this notebook.
time = df.iloc[:,3].values % 1440 # convert to minutes of the day (assumes the raw value is in minutes -- TODO confirm)
place_id = df.iloc[:,4].values

In [3]:
# Build a KD-tree neighbour index over the coordinates of every training row;
# later cells query it with kneighbors().
# NOTE(review): NearestNeighbors is an unsupervised estimator, so the second
# argument passed to fit() is presumably ignored -- confirm against the
# scikit-learn documentation.
neigh = NearestNeighbors(n_jobs=-1, algorithm='kd_tree', leaf_size=30)
%time neigh.fit(xy, place_id)

CPU times: user 1min 56s, sys: 1.22 s, total: 1min 57s
Wall time: 1min 57s


NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [104]:
limit = 400000

subset_indicies = np.random.permutation(len(xy))[:limit]

xy_subset = xy[subset_indicies]
accuracy_subset = accuracy[subset_indicies, None]
time_subset = time[subset_indicies, None]
place_id_subset = place_id[subset_indicies, None]

print("find nearest neighbors")
%time distances_subset, neighbor_indicies_subset = neigh.kneighbors(xy_subset, n_neighbors=100)

find nearest neighbors
CPU times: user 25.6 s, sys: 3.26 s, total: 28.9 s
Wall time: 6.3 s


In [105]:
# Factor applied to the raw accuracy values before they are used as the spread
# of the location model in predict_xy_accuracy_time.
# Assumes accuracy is given in meters.
# NOTE(review): 0.001 presumably converts meters to the unit of the x/y
# coordinates -- confirm.
accuracy_scale = 0.001

def time_difference(time1, time2, period=1440):
    """Return the wrap-around distance between two times on a cycle.

    Evaluates ``period/2 - abs(abs(time2 - time1) - period/2)`` elementwise
    with numexpr. For inputs that are at most one ``period`` apart, this is
    the shorter way round the cycle, i.e. a value between 0 and ``period/2``.

    Parameters
    ----------
    time1, time2 : scalars or arrays that broadcast against each other.
    period : length of the cycle, in the same unit as the times
        (default 1440).

    Returns
    -------
    The elementwise result of the numexpr expression.
    """
    # numexpr looks up `period`, `time1` and `time2` by name from this
    # function's local variables, so the parameter names must match the string.
    return ne.evaluate('period/2-abs(abs(time2-time1) - period/2)')

def prob_overlap_time(time1, time2, period=1440):
    """Return a linear time-of-cycle similarity weight.

    Computes ``1 - 2 * d / period`` where ``d`` is the same wrap-around
    distance expression used in ``time_difference``. For inputs at most one
    ``period`` apart the weight is 1 when the two times coincide and falls
    linearly to 0 when they are half a period apart.

    Parameters
    ----------
    time1, time2 : scalars or arrays that broadcast against each other.
    period : length of the cycle, in the same unit as the times
        (default 1440).

    Returns
    -------
    The elementwise result of the numexpr expression.
    """
    # The names inside the expression are resolved by numexpr from this
    # function's local variables.
    return ne.evaluate('1 - 2*(period/2-abs(abs(time2-time1) - period/2))/period')

def uniqify(seq):
    """Return the distinct items of ``seq`` as an int64 array.

    Items keep the order in which they first appear; every later repeat is
    dropped. ``seq`` may be any iterable of hashable values that can be stored
    as 64-bit integers.
    """
    already_seen = set()

    def first_occurrences():
        # Lazily yield each item the first time it is encountered.
        for item in seq:
            if item in already_seen:
                continue
            already_seen.add(item)
            yield item

    return np.fromiter(first_occurrences(), dtype=np.int64)

def prob_overlap_locations(dist, accuracy1, accuracy2):
    """Return an unnormalised Gaussian-style weight for two locations.

    With ``s = accuracy1**2 + accuracy2**2`` the value computed elementwise is
    ``exp(-0.5 * dist**2 / s) / s``. The constant factor ``1 / (2 * pi)`` is
    deliberately left out (see the trailing comment), so the result is only
    meaningful for comparing candidates with each other.

    Parameters
    ----------
    dist : distance(s) between the two points.
    accuracy1, accuracy2 : spread of each point, in the same unit as ``dist``;
        must broadcast against ``dist``.

    Returns
    -------
    The elementwise result of the numexpr expression.

    NOTE(review): ``s`` is zero when both accuracies are zero and nothing here
    guards against that division -- confirm the data never contains that case.
    """
    # The backslash continues the string literal onto the next line; the names
    # inside the expression are resolved by numexpr from the local variables.
    return ne.evaluate('exp(-0.5 * dist * dist / (accuracy1 ** 2 + accuracy2 ** 2)) / \
                        (accuracy1 ** 2 + accuracy2 ** 2)') # / (2 * np.pi)

def sum_by_group(values, groups):
    """Sum ``values`` separately for every distinct label in ``groups``.

    Parameters
    ----------
    values : 1-D array-like of numbers.
    groups : 1-D array-like of labels, same length as ``values``;
        ``groups[i]`` is the label of ``values[i]``.

    Returns
    -------
    (sums, labels) : two arrays of equal length. ``labels`` holds the distinct
        labels in ascending order and ``sums[k]`` is the total of all values
        whose label is ``labels[k]``. The inputs are not modified.

    Each group is summed directly from its own elements. Deriving the totals
    as differences of one running cumulative sum would let a large value in an
    earlier group wipe out a small later group through floating-point
    cancellation.
    """
    values = np.asarray(values)
    groups = np.asarray(groups)
    if groups.size == 0:
        # Nothing to aggregate: return empty results of the matching dtypes.
        return values[:0].copy(), groups[:0].copy()

    # Sort by label so that equal labels form contiguous runs.
    order = np.argsort(groups)
    groups = groups[order]
    values = values[order]

    # A run starts at position 0 and wherever the label differs from the
    # previous one.
    run_starts = np.ones(len(groups), 'bool')
    run_starts[1:] = groups[1:] != groups[:-1]
    starts = np.flatnonzero(run_starts)

    # reduceat adds up each slice values[starts[k]:starts[k + 1]] (the last
    # slice runs to the end), i.e. exactly one total per run.
    return np.add.reduceat(values, starts), groups[starts]

def predict_xy_accuracy_time(distances, neighbor_indicies):
    """Rank the three most likely place ids for every query row.

    Each neighbour gets a weight equal to the product of a location term
    (``prob_overlap_locations``) and a time-of-day term
    (``prob_overlap_time``). The weights are summed per place id and the three
    place ids with the largest totals are returned, best first.

    Parameters
    ----------
    distances : 2-D array, one row per query and one column per neighbour
        (as returned by ``kneighbors``).
    neighbor_indicies : array of the same shape holding, for every neighbour,
        its position in the module-level training arrays.

    Returns
    -------
    Float array of shape ``(len(distances), 3)`` with the ranked place ids.

    Notes
    -----
    Besides its arguments the function reads the module-level arrays
    ``accuracy``, ``time``, ``place_id``, ``accuracy_subset``, ``time_subset``
    and the constant ``accuracy_scale``. The rows of ``distances`` must
    therefore line up with the rows of ``accuracy_subset`` / ``time_subset``.
    """
    # Location term: scaled accuracies of the neighbours and of the query rows.
    neighbor_accuracies = accuracy[neighbor_indicies] * accuracy_scale
    test_accuracy = accuracy_subset * accuracy_scale
    neighbor_place_id = place_id[neighbor_indicies]
    colocation_prob = prob_overlap_locations(distances, test_accuracy, neighbor_accuracies)
    
    # Time term: compare each query's time with the time of each neighbour.
    neighbor_time = time[neighbor_indicies]
    time_prob = prob_overlap_time(time_subset, neighbor_time)
    
    # numexpr resolves both names from the local variables defined above.
    total_prob = ne.evaluate('colocation_prob * time_prob')
    
    # TODO: remove the following line for real data just in case a duplicate point is tested
    # NOTE(review): only the first row is inspected; if its nearest neighbour is
    # at distance 0 the first neighbour column is dropped for EVERY row.
    s = slice(1,None) if distances[0][0] == 0 else slice(0,None) # skip the first neighbor which will be itself
    predictions = np.zeros((len(distances),3))
    for i, (prob, places) in enumerate(zip(total_prob[:,s], neighbor_place_id[:,s])):
        # Append two zero-weight dummy places (ids 0 and 1) in case there are
        # fewer than three distinct nearby places; exactly three ranked entries
        # are needed for the precision calculation.
        prob, places = sum_by_group(np.append(prob, [0,0]), np.append(places, [0,1]))
        # Sort the (weight, place id) pairs in descending order; equal weights
        # are therefore ordered by descending place id.
        prob, places = zip(*sorted(zip(prob, places),reverse=True))
        predictions[i,:] = places[:3]
    return predictions
        
def mean_average_precision3(true, test):
    """Return the mean average precision over three ranked guesses per row.

    A match in the first, second or third column of ``test`` scores 1, 1/2 or
    1/3 respectively; the scores are summed over all rows and divided by the
    number of rows.

    Parameters
    ----------
    true : the correct label for every row. As called in this notebook it has
        a single column so that it broadcasts against ``test``.
    test : the ranked guesses; as called in this notebook it has three columns,
        best guess first.

    Returns
    -------
    The summed score divided by ``len(true)``.
    """
    # Rank weights; 1/2 and 1/3 rely on true division (Python 3 semantics).
    values = np.array([1, 1/2, 1/3])
    # `true`, `test` and `values` are looked up by name by numexpr.
    return ne.evaluate('sum((true == test) * values)') / len(true)

# Rank candidate places for every sampled row from the neighbours found in the
# previous cell, timing the call.
print("predict")
%time predictions = predict_xy_accuracy_time(distances_subset, neighbor_indicies_subset)

# Score the three ranked guesses per row against the true place ids; the value
# is displayed as the cell output.
print("evaluate")
%time mean_average_precision3(place_id_subset, predictions)

predict
CPU times: user 40.9 s, sys: 6.24 s, total: 47.1 s
Wall time: 42.9 s
evaluate
CPU times: user 14.4 ms, sys: 6.85 ms, total: 21.3 ms
Wall time: 21.4 ms


0.56473166666687025

# parameters

* accuracy multiplier
* accuracy bias
* time window in day for full match
* time window in day for least match
* fraction of least match time to full match
* relative weight of time vs distance metric