In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the cleaned ride-request export. `ts` is parsed into datetimes while
# reading so no separate conversion step is needed afterwards.
# NOTE(review): this is an absolute, machine-specific path — adjust it (or make
# it configurable) before running the notebook elsewhere.
csv_path = 'D:/Projects/Rapido/rapido_clean.csv'
data = pd.read_csv(csv_path, parse_dates=['ts'])
data.head()

Unnamed: 0,ts,number,pick_lat,pick_lng,drop_lat,drop_lng
0,2018-04-07 07:07:17,14626,12.313621,76.658195,12.287301,76.60228
1,2018-04-07 07:32:27,85490,12.943947,77.560745,12.954014,77.54377
2,2018-04-07 07:36:44,5408,12.899603,77.5873,12.93478,77.56995
3,2018-04-07 07:38:00,58940,12.918229,77.607544,12.968971,77.636375
4,2018-04-07 07:39:29,5408,12.89949,77.58727,12.93478,77.56995


In [5]:
# Inspect the column dtypes; `ts` is expected to be a datetime column because
# it was listed in `parse_dates` when the CSV was read.
data.dtypes

ts          datetime64[ns]
number               int64
pick_lat           float64
pick_lng           float64
drop_lat           float64
drop_lng           float64
dtype: object

## Feature Engineering

- We will round the pick-up and drop-off latitudes and longitudes to 3 decimal places

In [6]:
# Round every coordinate column to 3 decimal places (round-to-nearest, not
# round-up) so that nearby pick-up/drop-off points collapse onto the same value.
# The columns are overwritten in `data`; rounding again on a re-run is a no-op.
for coord_col in ['pick_lat', 'pick_lng', 'drop_lat', 'drop_lng']:
    data[coord_col] = data[coord_col].round(3)

In [7]:
# Preview the first rows to confirm the coordinate columns were rounded.
data.head()

Unnamed: 0,ts,number,pick_lat,pick_lng,drop_lat,drop_lng
0,2018-04-07 07:07:17,14626,12.314,76.658,12.287,76.602
1,2018-04-07 07:32:27,85490,12.944,77.561,12.954,77.544
2,2018-04-07 07:36:44,5408,12.9,77.587,12.935,77.57
3,2018-04-07 07:38:00,58940,12.918,77.608,12.969,77.636
4,2018-04-07 07:39:29,5408,12.899,77.587,12.935,77.57


## Predict pick_lat

In [8]:
# How many bookings does the sample user (number 14626) have?
is_sample_user = data['number'] == 14626
len(data.loc[is_sample_user])

4

In [9]:
# Show every booking made by the sample user (number 14626).
data.loc[data['number'] == 14626]

Unnamed: 0,ts,number,pick_lat,pick_lng,drop_lat,drop_lng
0,2018-04-07 07:07:17,14626,12.314,76.658,12.287,76.602
197885,2018-04-28 17:45:54,14626,12.983,77.545,12.952,77.543
197899,2018-04-28 18:20:10,14626,12.972,77.538,12.952,77.543
2802153,2018-11-15 09:54:30,14626,12.927,77.615,12.85,77.669


## Booking frequency of a user from a particular latitude

In [10]:
# One row per (user, pick-up latitude) pair.
# NOTE: after the count, the `ts` column no longer holds timestamps — it holds
# the number of non-null `ts` values in the group, i.e. how many bookings the
# user made from that latitude.
bookings_per_user_lat = data.groupby(['number', 'pick_lat'])['ts'].count()
user_lat = bookings_per_user_lat.reset_index()

In [11]:
# Distinct user identifiers, in the order they appear in `user_lat`.
unique_user = user_lat['number'].unique()

In [12]:
# Predict, for every user, the pick-up latitude they have booked from most
# often.  `user_lat` has one row per (number, pick_lat) pair and its `ts`
# column holds the booking count for that pair.
#
# A single groupby replaces the previous per-user loop, which filtered and
# sorted the whole `user_lat` frame once for every user (quadratic in the
# number of users).  `idxmax` returns the label of the first row holding the
# maximum count, so ties are resolved deterministically in favour of the row
# that appears first in `user_lat`, instead of depending on an unstable sort.
top_row_labels = user_lat.groupby('number')['ts'].idxmax()
top_rows = user_lat.loc[top_row_labels]

# Iterating a numeric Series yields plain Python scalars, so the dict (and the
# pickle written from it later) holds native int keys and float values.
# Lookups with either int or numpy integer keys keep working.
# NOTE(review): the displayed predictions include a key of -1, presumably a
# placeholder for an unknown user — confirm whether it should be excluded.
predictions = dict(zip(top_rows['number'], top_rows['pick_lat']))

In [13]:
# Display the user -> predicted pick-up latitude mapping.
# NOTE(review): this renders the entire dict (one entry per user); consider
# previewing only a handful of entries to keep the notebook output small.
predictions

{-1: 12.928,
 0: 12.958,
 1: 12.92,
 2: 12.955,
 3: 12.984,
 4: 12.995,
 5: 12.975,
 6: 12.938,
 7: 13.05,
 8: 12.935,
 9: 12.957,
 10: 12.888,
 11: 12.906,
 12: 12.929,
 13: 12.959,
 14: 12.974,
 15: 12.952,
 16: 12.947,
 17: 12.91,
 18: 12.979,
 19: 12.95,
 20: 12.315,
 21: 12.959,
 22: 12.906,
 23: 12.956,
 24: 12.919,
 25: 12.975,
 26: 13.032,
 27: 12.982,
 28: 17.47,
 29: 12.964,
 30: 12.905,
 31: 12.925,
 32: 12.971,
 33: 13.025,
 34: 12.931,
 35: 12.929,
 36: 12.948,
 38: 12.972,
 39: 12.922,
 40: 12.953,
 41: 12.91,
 42: 12.954,
 43: 12.92,
 44: 13.066,
 45: 12.94,
 46: 12.995,
 47: 17.434,
 48: 12.956,
 49: 12.972,
 50: 12.906,
 51: 12.921,
 52: 12.931,
 53: 12.888,
 54: 12.971,
 55: 12.981,
 56: 12.926,
 57: 12.947,
 58: 12.921,
 59: 12.992,
 60: 13.007,
 61: 12.956,
 62: 12.926,
 63: 12.907,
 64: 12.974,
 65: 12.853,
 66: 12.92,
 67: 12.95,
 68: 12.993,
 69: 12.324,
 70: 12.932,
 71: 12.993,
 72: 12.951,
 73: 12.955,
 74: 13.017,
 75: 12.96,
 76: 13.011,
 77: 12.949,
 78: 12

## Saving the predictions to a pickle file

In [14]:
import pickle

In [15]:
# Persist the predictions dict to `model_1.pkl` in the current working
# directory, using the newest pickle protocol this interpreter supports.
# NOTE: only unpickle this file from trusted sources — loading a pickle can
# execute arbitrary code.
with open('model_1.pkl', 'wb') as pkl_file:
    pickler = pickle.Pickler(pkl_file, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(predictions)