In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
def _engineer_features(df, startdate, nx_cell, ny_cell, eps):
    """Return a copy of `df` with `time` replaced by engineered features.

    Adds, in this order: `x_d_y`, `x_m_y`, `hour`, `weekday`, `day`, `month`,
    `year` and `grid_cell`. The caller's frame is not modified.
    """
    # `time` is a count of minutes elapsed since `startdate`; convert the
    # whole column at once instead of building one timedelta per row.
    minutes = df.time.values.astype(np.int64)
    d_times = pd.DatetimeIndex(startdate + minutes.astype('timedelta64[m]'))

    # drop() returns a new frame, so all assignments below leave `df` intact.
    out = df.drop(['time'], axis=1)
    x = out.x.values
    y = out.y.values

    # Ratio and product of the coordinates (eps avoids division by zero).
    out['x_d_y'] = x / (y + eps)
    out['x_m_y'] = x * y

    # Calendar features.
    out['hour'] = np.asarray(d_times.hour)
    out['weekday'] = np.asarray(d_times.weekday)
    out['day'] = np.asarray(d_times.day)
    out['month'] = np.asarray(d_times.month)
    out['year'] = np.asarray(d_times.year)

    # Grid cell index. The map is treated as 10 units wide/high (assumes
    # x and y lie in [0, 10] -- TODO confirm against the data). Subtracting
    # eps keeps points lying exactly on the upper edge inside the last cell.
    size_x = 10. / nx_cell
    size_y = 10. / ny_cell
    x_x = np.where(x < eps, 0, x - eps)
    y_y = np.where(y < eps, 0, y - eps)
    # np.int was removed in NumPy 1.24; use an explicit fixed-width integer.
    positionx = (x_x / size_x).astype(np.int64)
    positiony = (y_y / size_y).astype(np.int64)
    out['grid_cell'] = positiony * nx_cell + positionx
    return out


def prep_data(df_train,df_test,nx_cell,ny_cell):
    """Engineer features, assign grid cells and z-score the feature columns.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with at least the columns `x`, `y` and `time` (minutes).
        They are not modified; new frames are returned.
    nx_cell, ny_cell : int
        Number of grid cells along x and y.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        New frames without `time`, with the engineered columns and
        `grid_cell` added. The feature columns are standardised using the
        mean and standard deviation of the *training* frame only.
    """
    print('Feature Engineering ..')
    eps = 0.00001
    startdate = np.datetime64('2014-01-01T01:01')  # minute resolution

    df_train = _engineer_features(df_train, startdate, nx_cell, ny_cell, eps)
    df_test = _engineer_features(df_test, startdate, nx_cell, ny_cell, eps)

    #feature normalization
    columns = ['x', 'y', 'x_d_y', 'x_m_y', 'hour','weekday', 'day', 'month', 'year']
    for col in columns:
        mean = df_train[col].mean()
        std_dev = df_train[col].std()
        # A constant column has std 0 (NaN for a single row); dividing by it
        # would fill the column with NaN/inf, so only centre it instead.
        if not std_dev > 0:
            std_dev = 1.0
        df_train[col] = (df_train[col].values - mean) / std_dev
        df_test[col] = (df_test[col].values - mean) / std_dev
    return df_train,df_test


    
    
    

In [3]:
def process_cell(df_train,df_test,grid_id,threshold):
    """Train a classifier on one grid cell and rank places for its test rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame with feature columns plus `place_id` and `grid_cell`.
    df_test : pandas.DataFrame
        Test frame with the same feature columns plus `grid_cell`.
    grid_id : int
        Grid cell to process.
    threshold : int
        Minimum number of training rows a `place_id` needs inside the cell
        to be kept as a class.

    Returns
    -------
    pred_labels : numpy.ndarray, shape (n_test_rows, k)
        Place ids ordered from most to least probable, with
        k = min(3, number of classes kept). k is 0 when no place reaches
        `threshold`.
    row_ids : pandas.Index
        Index labels of the test rows that fall in the cell.
    """
    # Training rows of this cell, restricted to sufficiently frequent places.
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_id_counter = df_cell_train.place_id.value_counts()
    frequent = place_id_counter.index[place_id_counter.values >= threshold]
    df_cell_train = df_cell_train.loc[df_cell_train.place_id.isin(frequent)]

    # Test rows of this cell.
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index
    n_test = len(row_ids)

    # Degenerate cells: nothing to predict, or nothing to learn from.
    classes = df_cell_train.place_id.unique()
    if n_test == 0 or len(classes) == 0:
        empty = np.empty((n_test, 0), dtype=df_cell_train.place_id.values.dtype)
        return empty, row_ids
    # A single surviving class needs no model: it is the only possible answer.
    if len(classes) == 1:
        return np.tile(classes, (n_test, 1)), row_ids

    #data prep
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    # Keep the features as floats: casting standardised values to int would
    # collapse almost all of them onto -1, 0 or 1.
    X = df_cell_train.drop(['place_id','grid_cell'],axis=1).values.astype(float)
    X_test = df_cell_test.drop(['grid_cell'], axis = 1).values.astype(float)

    #classifier
    clf = xgb.XGBClassifier()
    clf.fit(X,y)
    y_pred = clf.predict_proba(X_test)

    # Column indices of the (up to) three highest probabilities per row.
    top_idx = np.argsort(y_pred,axis=1)[:,::-1][:,:3]
    # Index classes_ directly; inverse_transform only accepts 1-D input.
    pred_labels = le.classes_[top_idx]
    return pred_labels,row_ids


In [4]:
def process_grid(df_train,df_test,df_sub,threshold,n_cells):
    """Run `process_cell` on every grid cell and fill the submission frame.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Prepared frames, passed through to `process_cell`.
    df_sub : pandas.DataFrame
        Submission frame indexed by the same row ids as `df_test`; it is
        updated in place and also returned.
    threshold : int
        Passed through to `process_cell`.
    n_cells : int
        Number of grid cells; ids 0 .. n_cells - 1 are processed.

    Returns
    -------
    pandas.DataFrame
        `df_sub`, where each processed row holds its predicted place ids as
        one space-separated string.
    """
    print('processing grid..')
    for g_id in range(n_cells):
        if g_id % 10 == 0:
            print('iteration: %s' %(g_id))

        # The per-cell work must happen inside the loop, once per cell.
        pred_labels,row_ids = process_cell(df_train,df_test,g_id,threshold)
        if len(row_ids) == 0:
            # No test rows fall in this cell; nothing to write.
            continue

        # Join the ranked ids with spaces so they stay separable. Building
        # Python strings row by row avoids np.apply_along_axis, which fixes
        # the string width from the first row and truncates longer rows.
        str_labels = np.array(
            [' '.join(str(label) for label in row) for row in pred_labels],
            dtype=object)
        df_sub.loc[row_ids] = str_labels.reshape(-1,1)
    return df_sub


In [5]:
if __name__ == '__main__':
    import os

    # Directory holding train.csv, test.csv and sample_submission.csv.
    # Defaults to the original location (the trailing space is part of that
    # directory name); set FACEBOOK_DATA_DIR to run on another machine.
    data_dir = os.environ.get('FACEBOOK_DATA_DIR',
                              '/home/prajwal/Documents/facebook ')

    print('Loading data ...')
    df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'),
                           usecols=['row_id','x','y','time','place_id'],
                           index_col = 0)

    df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'),
                          usecols=['row_id','x','y','time'],
                          index_col = 0)

    #Defining the size of the grid
    nx_cell = 20
    ny_cell = 40

    df_train, df_test = prep_data(df_train, df_test, nx_cell, ny_cell)

    #Solving classification problems inside each grid cell
    # Threshold on place_id inside each cell: only place_ids with at least
    # `threshold` occurrences inside a grid cell are considered. This avoids
    # classes with very few samples and speeds up the computation.
    threshold = 500
    df_sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'),
                         index_col = 0)

    df_submission = process_grid(df_train, df_test, df_sub, threshold,
                                 nx_cell * ny_cell)
    print('Generating submission file ...')
    df_submission.to_csv("sub.csv", index=True)

Loading data ...
Feature Engineering ..
iteration: 0
iteration: 10
iteration: 20
iteration: 30
iteration: 40
iteration: 50
iteration: 60
iteration: 70
iteration: 80
iteration: 90
iteration: 100
iteration: 110
iteration: 120
iteration: 130
iteration: 140
iteration: 150
iteration: 160
iteration: 170
iteration: 180
iteration: 190
iteration: 200
iteration: 210
iteration: 220
iteration: 230
iteration: 240
iteration: 250
iteration: 260
iteration: 270
iteration: 280
iteration: 290
iteration: 300
iteration: 310
iteration: 320
iteration: 330
iteration: 340
iteration: 350
iteration: 360
iteration: 370
iteration: 380
iteration: 390
iteration: 400
iteration: 410
iteration: 420
iteration: 430
iteration: 440
iteration: 450
iteration: 460
iteration: 470
iteration: 480
iteration: 490
iteration: 500
iteration: 510
iteration: 520
iteration: 530
iteration: 540
iteration: 550
iteration: 560
iteration: 570
iteration: 580
iteration: 590
iteration: 600
iteration: 610
iteration: 620
iteration: 630
iteration: 