In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cross_validation import train_test_split
from sklearn.metrics import average_precision_score
import numpy as np
import numexpr as ne

In [2]:
train = pd.read_csv("train.csv", index_col="row_id")

xy = train.iloc[:,:2].values # units are kilometers
accuracy = train.iloc[:,2].values * 0.001 # assume accuracy is reported in meters so convert to kilometers
time = train.iloc[:,3].values # units are minutes
time_of_day = train.iloc[:,3].values % 1440 # minutes
time_of_week = train.iloc[:,3].values % 10080 # minutes
place_id = train.iloc[:,4].values

In [3]:
neigh = NearestNeighbors(n_jobs=-1, algorithm='kd_tree')
%time neigh.fit(xy, place_id)

CPU times: user 1min 51s, sys: 931 ms, total: 1min 52s
Wall time: 1min 52s


NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [4]:
permutation = np.random.permutation(len(xy))

In [5]:
limit = 400000

validation_indicies = permutation[slice(0,min(limit,len(xy)))]

In [6]:
day_period = 1440
week_period = 10080

# parameters
accuracy_scale = 1 # 1 assumes accuracy is reported in meters
accuracy_min = 0 # meters
accuracy_bias = 0 # meters
time_of_day_max_prob_diff = 0
time_of_day_min_prob = 0
time_of_week_max_prob_diff = week_period/7
time_of_week_min_prob = 0.1
time_abs_max_prob_diff = week_period * 26 # 6-months
time_abs_min_prob_diff = week_period * 52 # 12-months
time_abs_min_prob = 0.1

def time_difference(time1, time2, half_period):
    return ne.evaluate('half_period-abs(abs(time2-time1) - half_period)')

def prob_time_for_period(time1, time2, period, max_prob_diff, min_prob):
    hp = 0.5 * period
    diff = time_difference(time1, time2, hp)
    w = max_prob_diff
    #  (hp, 0), (w, 1)
    #  dy = 1, dx = w-hp
    #   y = x /(w-hp) + b
    #   0 = hp/(w-hp) + b
    #   b = -hp/(w-hp) = -1/(w/hp-1) = 1/(1-w/hp)
    #   y = x/(w-hp) + 1/(1-w/hp)
    prob = ne.evaluate('diff * 1/(w-hp) + 1/(1-w/hp)')
    prob = np.where(diff < max_prob_diff, 1, prob)
    return np.maximum(prob, min_prob)

def prob_time_diff(time1, time2):
    w1 = time_abs_max_prob_diff
    w2 = time_abs_min_prob_diff
    mp = time_abs_min_prob
    diff = np.abs(time1-time2)
    # (w1, 1), (w2, mp)
    # dy = mp-1, dx = w2-w1, m = (mp-1)/(w2-w1)
    # 1 = (mp-1)/(w2-w1) * w1 + b
    # b = 1 - (mp-1)/(w2-w1) * w1
    # y = (mp-1)/(w2-w1) * x + 1 - (mp-1)/(w2-w1)
    prob = ne.evaluate('(mp-1)/(w2-w1) * diff + 1 - (mp-1)/(w2-w1)')
    prob = np.where(diff < w1, 1, prob)
    return np.where(diff > w2, mp, prob)

def uniqify(seq):
    seen = set()
    seen_add = seen.add
    return np.fromiter((x for x in seq if not (x in seen or seen_add(x))), dtype=np.int64)

def prob_overlap_locations(dist, accuracy1, accuracy2):
    return ne.evaluate('exp(-0.5 * dist * dist / (accuracy1 ** 2 + accuracy2 ** 2)) / \
                        (accuracy1 ** 2 + accuracy2 ** 2)') # / (2 * np.pi)

def sum_by_group(values, groups):
    order = np.argsort(groups)
    groups = groups[order]
    values = values[order]
    values.cumsum(out=values)
    index = np.ones(len(groups), 'bool')
    index[:-1] = groups[1:] != groups[:-1]
    values = values[index]
    groups = groups[index]
    values[1:] = values[1:] - values[:-1]
    return values, groups

def predict_xy_accuracy_time(test_points, distances, neighbors, self_validation=False):
    
    def scale_accuracy(accuracy):
        return np.maximum(accuracy + accuracy_bias, accuracy_min) * accuracy_scale
    neighbor_accuracies = scale_accuracy(accuracy[neighbors])
    test_accuracy = scale_accuracy(accuracy[test_points, None])
    colocation_prob = prob_overlap_locations(distances, test_accuracy, neighbor_accuracies)
    
    time_of_day_prob = prob_time_for_period(time_of_day[test_points, None], time_of_day[neighbors], \
                                            day_period, time_of_day_max_prob_diff, time_of_day_min_prob)
    time_of_week_prob = prob_time_for_period(time_of_day[test_points, None], time_of_day[neighbors], \
                                             week_period, time_of_week_max_prob_diff, time_of_week_min_prob)
    time_abs_prob = prob_time_diff(time[test_points, None], time[neighbors])
    
    total_prob = ne.evaluate('colocation_prob * time_of_day_prob * time_of_week_prob * time_abs_prob')
    
    s = slice(1,None) if self_validation else slice(0,None) # skip the first neighbor is self validating
    predictions = np.zeros((len(distances),3))
    for i, (prob, places) in enumerate(zip(total_prob[:,s], place_id[neighbors][:,s])):
        # append a few zeros just incase there is only one nearby place
        # we need three for the precision calculation
        prob, places = sum_by_group(np.append(prob, [0,0]), np.append(places, [0,1]))
        prob, places = zip(*sorted(zip(prob, places),reverse=True))
        predictions[i,:] = places[:3]
    return predictions
        
def mean_average_precision3(true, test):
    values = np.array([1, 1/2, 1/3])
    return ne.evaluate('sum((true == test) * values)') / len(true)

In [7]:
print("find nearest neighbors")
%time distances, neighbors = neigh.kneighbors(xy[validation_indicies], n_neighbors=400)

# del neigh # free up lots of memory

find nearest neighbors
CPU times: user 1min 18s, sys: 11.2 s, total: 1min 29s
Wall time: 26.9 s


In [10]:
print("predict")
%time predictions = predict_xy_accuracy_time(validation_indicies, distances, neighbors, self_validation=True)

print("evaluate")
%time mean_average_precision3(place_id[validation_indicies, None], predictions)

predict
CPU times: user 1min 43s, sys: 1min 38s, total: 3min 22s
Wall time: 3min 20s
evaluate
CPU times: user 59 ms, sys: 420 ms, total: 479 ms
Wall time: 891 ms


0.57287333333354773

In [9]:
def batch_kneighbors(indicies, batch_size = 10):
    distances = []
    neighbors = []
    for i, batch in enumerate(np.split(indicies, batch_size)):
        print("batch {}, neighbors".format(i))
        d, n = neigh.kneighbors(xy[batch], n_neighbors=400)
        distances.append(d)
        neighbors.append(n)
    return np.array(distances).reshape((len(indicies),-1)), np.array(neighbors).reshape((len(indicies),-1))

# %time distances, neighbors = batch_kneighbors(validation_indicies, 20)

100-NN

10k   .56245
20k   .56590 .00345
40k   .56674 .00084
80k   .56489 .00185
160k  .56626 .00137
320k  .56594 .00032
640k  .56600 .00006
1280k .56582 .00018
2560k .56622 .00040

200-NN

10k   .56620
20k   .57058 .00438
40k   .57209 .00151
80k   .57021 .00188
160k  .57116 .00095
320k  .57082 .00034
640k  .57072 .00010
1280k .57061 .00011

400-NN

10k   .56042
20k   .56524 .00482
40k   .56731 .00207
80k   .56527 .00204
160k  .56579 .00052
320k  .56575 .00004
640k  .56709 .00134
1280k .56557 .00152
2560k .activ

# parameters

* accuracy multiplier
* accuracy bias
* time window in day for full match
* time window in day for least match
* fraction of least match time to full match
* relative weight of time vs distance metric