_author_ = https://www.kaggle.com\shubhamp05

In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
## Function to prepare data

def remove_infrequent_place_id(df):
    places,count = np.unique(df.place_id.values,return_counts=True)
    places = places[count>=10]
    df = df.loc[df.place_id.isin(places)]
    return(df)

In [3]:
## This is a way to calculate MAP@k with only one truth values
def mapkprecision(truth,prediction):
    z = (prediction == truth[:,None]).astype(np.float32)
    w = 1./(np.arange(prediction.shape[1],dtype = np.float32) +1.)
    z = z * w[None,:]
    return(np.mean(np.sum(z,axis=1)))

In [4]:
def load_data():
    types = {'row_id': np.dtype(np.int32),
         'x': np.dtype(float),
         'y' : np.dtype(float),
         'accuracy': np.dtype(np.int16),
         'place_id': np.int64,
         'time': np.dtype(np.int32)}
    train = pd.read_csv('train.csv/train.csv',dtype=types)
    test = pd.read_csv('test.csv/test.csv',dtype=types)
    return(train,test)

In [5]:
## Calculation distance
def calc_distance(distance):
    return(distance ** -2.225)

In [33]:
def process_cell(train,test,fw,th,n_neighbors):
    # remove infrequent places
    train = remove_infrequent_place_id(train)
    
    ## storing row_ids of test to generate submission
    row_id = test['row_id'].values
    test.drop('row_id',axis=1,inplace=True)
    
    ## preparing data
    le = LabelEncoder()
    y = le.fit_transform(train.place_id.values)
    train.drop('place_id',axis=1,inplace=True)
    
    ## Applying the classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors,weights=calc_distance,n_jobs=2,leaf_size=15,p=1)
    
    knn.fit(train,y)
    
    preds = knn.predict_proba(test)
    preds = le.inverse_transform(np.argsort(preds,axis=1)[:,::-1][:,0:3])
    return(preds)

In [28]:
## Create time_dict, to not to create in every iteration
def create_time_dict(t_cuts,time_factor,time_expansion):
    t_slice = 24/t_cuts
    time_dict = dict()
    
    for t in range(t_cuts):
        t_min = 2*np.pi*(t*t_slice*12 / 288)
        t_max = 2*np.pi*(((t+1)*t_slice*12 -1)/288)
        sin_t_start = np.sin(t_min) * time_factor
        sin_t_stop = np.sin(t_max) * time_factor
        cos_t_start = np.cos(t_min) * time_factor
        cos_t_stop = np.cos(t_max) * time_factor
        
        sin_t_min = min(sin_t_start,sin_t_stop)
        sin_t_max = max(sin_t_start,sin_t_stop)
        cos_t_min = min(cos_t_start,cos_t_stop)
        cos_t_max = max(cos_t_start,cos_t_stop)\
        
        time_dict[t] = [sin_t_min,sin_t_max,cos_t_min,cos_t_max]
        
        t_min = 2 * np.pi * ((t * t_slice - time_expansion) * 12 / 288)
        t_max = 2 * np.pi * ((((t + 1) * t_slice + time_expansion)* 12 - 1) / 288)
        sin_t_start = np.round(np.sin(t_min)+1, 4) * time_factor
        sin_t_stop = np.round(np.sin(t_max)+1, 4) * time_factor
        cos_t_start = np.round(np.cos(t_min)+1, 4) * time_factor
        cos_t_stop = np.round(np.cos(t_max)+1, 4) * time_factor
        sin_t_min = min((sin_t_start, sin_t_stop, sin_t_min))
        sin_t_max = max((sin_t_start, sin_t_stop, sin_t_max))
        cos_t_min = min((cos_t_start, cos_t_stop, cos_t_min))
        cos_t_max = max((cos_t_start, cos_t_stop, cos_t_max))
        time_dict[t] += [sin_t_min, sin_t_max, cos_t_min, cos_t_max]
        
        return(time_dict)

In [8]:
## Feature Engineering
## In this function I am going to utilize periodic property of Sine & Cosine and use "time" feature to extract more features

def feature_engineering(df):
    minute = 2*np.pi*((df.time//5)%288)/288
    
    ## sine feature of minute
    df['minute_sine'] = np.sin(minute)
    
    ## Cosine feature of minute
    df['minute_cosine'] = np.cos(minute)
    
    ## Weekday feature
    weekday = 2*np.pi*((df.time/(60*24))%7)
    
    ## sine of weekday
    df['weekday_sine'] = np.sin(weekday)
    
    ## Cosine of weekday
    df['weekday_cosine'] = np.cos(weekday)
    
    ## Day feature
    day = 2*np.pi*((df.time/(60*24))%365)
    
    ## Sine of day
    df['day_sine'] = np.sin(day)
    
    ## Cosine of day
    df['day_cosine'] = np.cos(day)
    
    ## Year feature
    year = 2*np.pi*((df.time/(60*24*365)))
    
    df['year_sine'] = np.sin(year)
    df['year_cosine'] = np.cos(year)
    
    ## Month Feature
    month = 2*np.pi*((df.time/(60*24*30))%12)
    
    df['month_sine'] = np.sin(month)
    df['month_cosine'] = np.cos(month)
    
    return(df)

In [31]:
## To Process grids
def  grid_process(train,test,x_cuts,y_cuts,t_cuts,x_border_expansion,y_border_expansion,time_epansion,fw,th,n_neighbors):
    preds_list = []
    
    ## Calculating x,y & time slice in train dataset
    x_slice = train['x'].max()/x_cuts
    y_slice = train['y'].max()/y_cuts
    time_max = train['minute_sine'].max()
    time_factor = time_max / 2
    
    ## Creating time_dict
    time_dict = create_time_dict(t_cuts,time_factor,time_expansion)
    
    ## Looping over x-axis cuts
    
    for i in range(x_cuts):
        x_min = x_slice * i
        x_max = x_slice * (i+1)
        x_max += int((i+1) == x_cuts)
        
        mask = (test['x'] >= x_min)& (test['x'] < x_max)
        
        df_test = test.loc[mask]
        x_min -= x_border_expansion
        x_max += x_border_expansion
        
        mask = (train['x']>=x_min)& (train['x']<x_max)
        df_train = train.loc[mask]
        
        for j in range(y_cuts):
            y_min = y_slice * j
            y_max = y_slice * (j+1)
            
            y_max += int((j+1) == y_cuts)
            
            mask = (test['y'] >= y_min) & (test['y']>y_max)
            
            df_test = df_test[mask]
            
            y_min -= y_border_expansion
            y_max += y_border_expansion
            
            mask = (train['y']>=x_min)& (train['y']<x_max)
            df_train = train.loc[mask]
        
            for t in range(t_cuts):
                #print(df_row_test.shape, df_row_train.shape)
                t_lim = time_dict[t]
                mask = df_test['minute_sine'].between(t_lim[0], t_lim[1])
                mask = mask & df_test['minute_cosine'].between(t_lim[2], t_lim[3])
                df_cell_test = df_test[mask].copy()
                mask = df_train['minute_sine'].between(t_lim[4], t_lim[5])
                mask = mask & df_train['minute_cosine'].between(t_lim[6], t_lim[7])
                df_cell_train = df_train[mask].copy()
                cell_pred = process_cell(df_cell_train.copy(), 
                                             df_cell_test.copy(), 
                                             fw, th, n_neighbors)
                preds_list.append(cell_pred)
        elapsed = (time.time() - row_start_time)
        print('Row', i, 'completed in:', timedelta(seconds=elapsed))
    preds = np.vstack(preds_list)
    return(preds)

In [9]:
def validation_split(df,validation_start_day):
    day = df['time']//1440
    df_val = df[day>=validation_start_day]
    df = df[day<validation_start_day]
    return(df,df_val)

In [10]:
def apply_weights(df,weights):
    df['accuracy'] *= weights[0]
    df['day_sine'] *= weights[1]
    df['day_cosine'] *= weights[1]
    df['minute_sine'] *= weights[2]
    df['minute_cosine'] *= weights[2]
    df['weekday_sine'] *= weights[3]
    df['weekday_cosine'] *= weights[3]
    df.x *= weights[4]
    df.y *= weights[5]
    df['year_sine'] *= weights[6]
    df['year_cosine'] *= weights[6]
    return(df)

In [11]:
def prepare_data(validation_start_day,fw):
    
    ## Loading data from input location
    train = pd.read_csv('train.csv/train.csv')
    test = pd.read_csv('test.csv/test.csv')
    
    print("the length of train before validation_split: ",len(train))
    
    ## Create Validation data from validation_split function
    ## validation is done on the time axis to remain aligned with kaggle private leaderboard
    
    train,validation = validation_split(train,validation_start_day)
    truth_val = validation.place_id.values
    validation.drop('place_id',axis=1,inplace=True)
    
    print("the length of train after validation_split: ",len(train))
    ## Some feature Engineering on Train
    
    ## Drop 'row_id' as row_id is just an enumerate field
    train.drop('row_id',axis = 1,inplace = True)
    
    print("length of train:",len(train))
    ## Drop infrequent Place_id's from train
    train = remove_infrequent_place_id(train)
    
    print(train.columns)
    
    ## Add new features derived
    train = feature_engineering(train)
    
    print("the length of train: ",len(train))
    
    train = apply_weights(train,fw)
    print("the length of train after apply weights: ",len(train))
    
    if validation_start_day == 0:
        test = feature_engineering(test)
        test = apply_weights(test,fw)
    
    return(train,test,truth_val)

In [14]:
def generate_submission(preds):    
    print('Writing submission file')
    print('Pred shape:', preds.shape)
    with open('KNN_submission.csv', "w") as out:
        out.write("row_id,place_id\n")
        rows = ['']*8607230
        n=0
        for num in range(8607230):
            rows[n]='%d,%d %d %d\n' % (preds[num,0],preds[num,1],preds[num,2],preds[num,3])
            n=n+1
        out.writelines(rows)

In [22]:
# Defining the parameters to be used in initializing the grid (x*y)
fw = [0.61,0.32435, 0.56525, 0.2670, 22, 52, 0.51885]
x_cuts = 20
y_cuts = 40
t_cuts = 4 ## Dividing the 24 hr time circle into 4 parts
x_border_expansion = 0.05
y_border_expansion = 0.017
time_expansion = 2
n_neighbors = 35
val_start_day = 0
th = 10 ## Threshold to seperate place_id. having less than 10 occurances
features = ['x','y','minute_sine','minute_cosine','accuracy','day_sine','day_cosine','weekday_sine','weekday_cosine','year','place_id']

In [34]:
train,test,truth_val = prepare_data(0,fw)
preds = grid_process(train, test, x_cuts, y_cuts, t_cuts,
                     x_border_expansion, y_border_expansion, time_expansion, 
                     fw, th, n_neighbors)

if val_start_day > 0:
    preds = preds[preds[:, 0] > 0] # only use rows predicted
    labels = val_label.loc[preds[:, 0]].values
    score = mapkprecision(labels, preds[:, 1:])
    print('Final score:', score)
else:
    generate_submission(preds)

('the length of train before validation_split: ', 29118021)
('the length of train after validation_split: ', 0)
('length of train:', 0)
Index([u'x', u'y', u'accuracy', u'time', u'place_id'], dtype='object')
('the length of train: ', 0)
('the length of train after apply weights: ', 0)




ValueError: Found array with 0 sample(s) (shape=(0, 14)) while a minimum of 1 is required.