In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cross_validation import train_test_split
from sklearn.metrics import average_precision_score
import numpy as np
import numexpr as ne
from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
train = pd.read_csv("train.csv", index_col="row_id").values

X_train = train[:,0:4]
y_train = train[:,4]

# x = train[:,0] # units are kilometers
# y = train[:,1]
# accuracy = train[:,2]* 0.001 # assume accuracy is reported in meters so convert to kilometers
# time = train[:,3] # units are minutes
# time_of_day = train[:,3] % 1440 # minutes
# time_of_week = train[:,3] % 10080 # minutes
# place_id = train[:,4]

In [3]:
# neigh = NearestNeighbors(n_jobs=-1, algorithm='kd_tree')
# %time neigh.fit(train[:,0:2], place_id)

In [11]:
day = 1440
hour = 60
week = 10080

def np_hash(np_array):
    return hash(str(np_array)+str(np_array.shape)+str(np_array.size))

class FacebookCheckins(BaseEstimator, ClassifierMixin):
    
    X = None
    place_id = None
    neigh = None
    neighbors = None
    X_test = None
    y_test = None
    kNN = None
    
    def __init__(self, X, place_id, kNN=400, a_scale=1, a_min=1, a_bias=0, time_model='proximity',
                 day_w1=0, day_w2=12*hour, day_mp=0,
                 week_w1=0, week_w2=3.5*day, week_mp=0,
                 time_w1=20*week, time_w2=50*week, time_mp=1):
        if not self.X or np_hash(X) != np_hash(self.X):
            neigh = None
            neighbors = None
        if self.kNN == None or self.kNN != kNN:
            neighbors = None
        
        self.X = X
        self.x = X[:,0] # kilometers
        self.y = X[:,1] # kilometers
        self.accuracy = X[:,2] * 0.001 # convert meters to kilometers
        self.time = X[:,3] # units are minutes
        self.time_of_day = X[:,3] % 1440 # minutes
        self.time_of_week = X[:,3] % 10080 # minutes
        self.place_id = place_id
        self.kNN = kNN
        self.a_scale = a_scale
        self.a_min = a_min
        self.a_bias = a_bias
        self.time_model = time_model
        self.day_w1 = day_w1
        self.day_w2 = day_w2
        self.day_mp = day_mp
        self.week_w1 = week_w1
        self.week_w2 = week_w2
        self.week_mp = week_mp
        self.time_w1 = time_w1
        self.time_w2 = time_w2
        self.time_mp = time_mp
        
    def time_difference(self, time1, time2, period=None):
        """Find the different in time even if measure is periodic."""
        if period:
            hp = 0.5 * period
            return ne.evaluate('hp-abs(abs(time1-time2) - hp)')
        else:
            return ne.evaluate('abs(time1-time2)')

    def prob_overlap_time(self, diff, w1, w2, mp):
        """Compute the probability the the time difference is significant."""
        # derive equation of line that connects end of w1 and w2
        # points: (w1, 1), (w2, mp)
        # dy = mp-1, dx = w2-w1, m = (mp-1)/(w2-w1)
        # y = m * x + b
        # substitude in point 1
        # 1 = (mp-1)/(w2-w1) * w1 + b
        # solve for b
        # b = 1 - (mp-1)/(w2-w1) * w1
        # y = (mp-1)/(w2-w1) * x + 1 - (mp-1)/(w2-w1)
        prob = ne.evaluate('(mp-1)/(w2-w1) * diff + 1 - (mp-1)/(w2-w1) * w1')
        prob = np.where(diff < w1, 1, prob)
        return np.where(diff > w2, mp, prob)

    def prob_overlap_locations(self, x1, y1, x2, y2, accuracy1, accuracy2):
        """Compute the probability that location measurements represent the same point."""
        return ne.evaluate('exp(-0.5 * ((x1-x2)**2+(y1-y2)**2) / (accuracy1 ** 2 + accuracy2 ** 2)) / \
                            (accuracy1 ** 2 + accuracy2 ** 2)') # / (2 * np.pi)

    def sum_by_group(self, values, groups):
        """Sum a list of values by groups."""
        order = np.argsort(groups)
        groups = groups[order]
        values = values[order]
        values.cumsum(out=values)
        index = np.ones(len(groups), 'bool')
        index[:-1] = groups[1:] != groups[:-1]
        values = values[index]
        groups = groups[index]
        values[1:] = values[1:] - values[:-1]
        return values, groups


    def predict_internal(self, X, neighbors, self_validation=False):

        x_test = X[:,0].reshape((-1,1)) # units are kilometers
        y_test = X[:,1].reshape((-1,1)) # units are kilometers
        a_test = X[:,2].reshape((-1,1)) * 0.001
        time_test = X[:,3].reshape((-1,1))
        day_test = X[:,3].reshape((-1,1)) % 1440
        week_test = X[:,3].reshape((-1,1)) % 10080

        def scale_accuracy(accuracy):
            scale = self.a_scale
            bias = self.a_bias
            a_min = self.a_min
            return np.maximum(accuracy + bias, a_min) * scale

        neighbor_accuracies = scale_accuracy(accuracy[neighbors])
        test_accuracy = scale_accuracy(a_test)
        colocation_prob = self.prob_overlap_locations(x_test, y_test, x[neighbors], y[neighbors], test_accuracy, neighbor_accuracies)

        time_of_day_diff = self.time_difference(day_test, time_of_day[neighbors], day)
        time_of_day_prob = self.prob_overlap_time(time_of_day_diff, self.day_w1, self.day_w2, self.day_mp)

        time_of_week_diff = self.time_difference(week_test, time_of_week[neighbors], week)
        time_of_week_prob = self.prob_overlap_time(time_of_week_diff, self.week_w1, self.week_w2, self.week_mp)

        time_diff = self.time_difference(time_test, time[neighbors])
        time_prob = self.prob_overlap_time(time_diff, self.time_w1, self.time_w2, self.time_mp)

        total_prob = ne.evaluate('colocation_prob * time_of_day_prob * time_of_week_prob * time_prob')

        s = slice(1,None) if self_validation else slice(0,None) # skip the first neighbor if self validating
        self.predictions = np.zeros((len(X),3))
        for i, (prob, places) in enumerate(zip(total_prob[:,s], place_id[neighbors][:,s])):
            # append a few zeros just incase there is only one nearby place
            # we need three for the precision calculation
            prob, places = self.sum_by_group(np.append(prob, [0,0]), np.append(places, [0,1]))
            prob, places = zip(*sorted(zip(prob, places),reverse=True))
            predictions[i,:] = places[:3]
        return predictions

    def mean_average_precision3(self, true, test):
        precision = np.array([1, 1/2, 1/3])
        return ne.evaluate('sum((true == test) * precision)') / len(true)
    
    def fit(self, X_test, y_test):
        if not self.neigh:
            print("Setup NearestNeighbors and fit")
            self.neigh = NearestNeighbors(n_jobs=-1, algorithm='kd_tree').fit(self.X[:,0:2], self.place_id)
        if not self.neighbors or np_hash(self.X_test) != np_hash(X_test):
            print("Find nearest neighbors to test points")
            self.X_test = X_test
            self.neighbors = self.neigh.kneighbors(X_test[:,0:2], n_neighbors=self.kNN, return_distance=False).astype(np.int32)
        self.X_test = X_test
        self.y_test = y_test
        print("predict")
        self.predictions = self.predict_internal(X_test, self.neighbors, self_validation=True)
        
        return self    
    
    def predict(self, X):
        return self.predict_internal(self.X_test, self.neighbors, self_validation=False)
    
    def score(self, X=None, y=None, sample_weight=None):
        return self.mean_average_precision3(self.y_test, self.predictions)
        

In [12]:
limit = 4000

permutation = np.random.permutation(len(X_train))
test_indicies = permutation[slice(0,min(limit,len(X_train)))]

X_test = train[test_indicies, 0:4]
y_test = train[test_indicies, 4]

In [13]:
# parameters
parameters = { 'kNN': 400, 
               'a_scale': 1,
               'a_min': 1,
               'a_bias': 0,
               'time_model': 'proximity', # or 'histogram'
               'day_w1': 0,
               'day_w2': 12*hour,
               'day_mp': 0,
               'week_w1': 0,
               'week_w2': 3.5*day,
               'week_mp': 0,
               'time_w1': 20*week,
               'time_w2': 50*week,
               'time_mp': 1
             }

clf = FacebookCheckins(X=X_train, place_id=y_train)
clf.fit(X_test, y_test)
clf.score()

Setup NearestNeighbors and fit
Find nearest neighbors to test points
predict


NameError: name 'accuracy' is not defined

print("find nearest neighbors")
%time neighbors = neigh.kneighbors(X_validation[:,0:2], n_neighbors=parameters['kNN'], \
                                   return_distance=False).astype(np.int32)

# del neigh # free up lots of memory

print("predict")
%time predictions = predict_internal(X_validation, neighbors, parameters, self_validation=True)

print("evaluate")
%time mean_average_precision3(place_id[validation_indicies, None], predictions)