In [58]:
# Repository coordinates: repo name, branch to track, and owning GitHub account.
project_name = "reco-tut-arr"
branch = "main"
account = "sparsh-ai"

In [59]:
import os

if not os.path.exists('/content/reco-tut-arr'):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "arr@recohut.com"
    !git config --global user.name  "reco-tut-arr"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd '/content/reco-tut-arr'

/content/reco-tut-arr
Initialized empty Git repository in /content/reco-tut-arr/.git/
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 32 (delta 7), reused 26 (delta 3), pack-reused 0[K
Unpacking objects: 100% (32/32), done.
From https://github.com/sparsh-ai/reco-tut-arr
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> origin/main
Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'


In [None]:
# Show the working-tree state, then stage everything, commit and push to `main`.
!git status
!git add . && git commit -m 'commit' && git push origin main

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,classification_report
from sklearn.preprocessing import LabelEncoder

import gc
import datetime
from tqdm.notebook import tqdm

import warnings
# Install a blanket "ignore" filter: every Python warning raised after this
# point (pandas warnings included) is suppressed.
warnings.filterwarnings("ignore")

# Seed NumPy's global random number generator.
np.random.seed(0)

In [None]:
# Map each bronze-layer table name (file name up to the first dot) to its path.
data_path = {}

for dirname, _, filenames in os.walk('./data/bronze'):
    for filename in filenames:
        # Only gzip-compressed parquet files are part of the bronze layer.
        if not filename.endswith('.parquet.gz'):
            continue
        data_path[filename.split('.')[0]] = os.path.join(dirname, filename)

data_path

{'orders': './data/bronze/orders.parquet.gz',
 'test_customers': './data/bronze/test_customers.parquet.gz',
 'test_locations': './data/bronze/test_locations.parquet.gz',
 'train_customers': './data/bronze/train_customers.parquet.gz',
 'train_locations': './data/bronze/train_locations.parquet.gz',
 'vendors': './data/bronze/vendors.parquet.gz'}

In [None]:
# Order table from the bronze layer.
orders = pd.read_parquet(data_path['orders'])

# Vendor table; every column gets a `v_` prefix so vendor fields stay
# distinguishable after being joined with customer fields.
vendors = pd.read_parquet(data_path['vendors']).add_prefix('v_')

In [None]:
# Customer profiles: keep the first row per customer id and rename the id
# column so it matches the locations table.
test_customers = pd.read_parquet(data_path['test_customers'])
test_customers = (
    test_customers[~test_customers.duplicated('akeed_customer_id', keep='first')]
    .reset_index(drop=True)
    .rename(columns={'akeed_customer_id': 'customer_id'})
)

test_locations = pd.read_parquet(data_path['test_locations'])

# One row per customer location, enriched with the profile and prefixed `c_`.
test_customer_detail = (
    test_locations
    .merge(test_customers, on='customer_id', how='left')
    .add_prefix('c_')
)

# Cross join with vendors through a constant helper key, dropped afterwards.
test = (
    test_customer_detail.assign(key=1)
    .merge(vendors.assign(key=1), on='key')
    .drop(columns='key')
)

test_customers.shape, test_locations.shape, vendors.shape, test_customer_detail.shape, test.shape

((9753, 8), (16720, 5), (100, 59), (16720, 12), (1672000, 71))

In [None]:
# Preview the first rows of the test customer-location x vendor table.
test.head()

Unnamed: 0,c_customer_id,c_location_number,c_location_type,c_latitude,c_longitude,c_gender,c_dob,c_status,c_verified,c_language,c_created_at,c_updated_at,v_id,v_authentication_id,v_latitude,v_longitude,v_vendor_category_en,v_vendor_category_id,v_delivery_charge,v_serving_distance,v_is_open,v_OpeningTime,v_OpeningTime2,v_prepration_time,v_commission,v_is_akeed_delivering,v_discount_percentage,v_status,v_verified,v_rank,v_language,v_vendor_rating,v_sunday_from_time1,v_sunday_to_time1,v_sunday_from_time2,v_sunday_to_time2,v_monday_from_time1,v_monday_to_time1,v_monday_from_time2,v_monday_to_time2,v_tuesday_from_time1,v_tuesday_to_time1,v_tuesday_from_time2,v_tuesday_to_time2,v_wednesday_from_time1,v_wednesday_to_time1,v_wednesday_from_time2,v_wednesday_to_time2,v_thursday_from_time1,v_thursday_to_time1,v_thursday_from_time2,v_thursday_to_time2,v_friday_from_time1,v_friday_to_time1,v_friday_from_time2,v_friday_to_time2,v_saturday_from_time1,v_saturday_to_time1,v_saturday_from_time2,v_saturday_to_time2,v_primary_tags,v_open_close_flags,v_vendor_tag,v_vendor_tag_name,v_one_click_vendor,v_country_id,v_city_id,v_created_at,v_updated_at,v_device_type,v_display_orders
0,Z59FTQD,0,,126.032278,-9.106019,,,1.0,1.0,,2020-02-09 21:54:25,2020-02-09 21:54:41,4,118597.0,-0.588596,0.754434,Restaurants,2.0,0.0,6.0,1.0,11:00AM-11:30PM,-,15,0.0,Yes,0.0,1.0,1,11,EN,4.4,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,10:00:00,23:59:00,00:00:00,00:30:00,10:00:00,23:59:00,"{""primary_tags"":""4""}",1.0,2458912212241623,"Arabic,Breakfast,Burgers,Desserts,Free Deliver...",Y,1.0,1.0,2018-01-30 14:42:04,2020-04-07 15:12:43,3,1
1,Z59FTQD,0,,126.032278,-9.106019,,,1.0,1.0,,2020-02-09 21:54:25,2020-02-09 21:54:41,13,118608.0,-0.471654,0.74447,Restaurants,2.0,0.7,5.0,1.0,08:30AM-10:30PM,-,14,0.0,Yes,0.0,1.0,1,11,EN,4.7,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,19:30:00,00:00:00,01:30:00,08:00:00,19:30:00,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,23:59:00,"{""primary_tags"":""7""}",1.0,44151342715241628,"Breakfast,Cakes,Crepes,Italian,Pasta,Pizzas,Sa...",Y,1.0,1.0,2018-05-03 12:32:06,2020-04-05 20:46:03,3,1
2,Z59FTQD,0,,126.032278,-9.106019,,,1.0,1.0,,2020-02-09 21:54:25,2020-02-09 21:54:41,20,118616.0,-0.407527,0.643681,Restaurants,2.0,0.0,8.0,1.0,08:00AM-10:45PM,-,19,0.0,Yes,0.0,1.0,1,1,EN,4.5,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,"{""primary_tags"":""71""}",1.0,489110,"Breakfast,Desserts,Free Delivery,Indian",Y,1.0,1.0,2018-05-04 22:28:22,2020-04-07 16:35:55,3,1
3,Z59FTQD,0,,126.032278,-9.106019,,,1.0,1.0,,2020-02-09 21:54:25,2020-02-09 21:54:41,23,118619.0,-0.585385,0.753811,Restaurants,2.0,0.0,5.0,1.0,10:59AM-10:30PM,-,16,0.0,Yes,0.0,1.0,1,11,EN,4.5,09:00:00,23:30:00,,,09:00:00,23:30:00,,,09:00:00,23:30:00,,,09:00:00,23:30:00,,,09:00:00,23:45:00,,,09:00:00,23:45:00,,,09:00:00,23:45:00,,,"{""primary_tags"":""46""}",1.0,583024,"Burgers,Desserts,Fries,Salads",Y,1.0,1.0,2018-05-06 19:20:48,2020-04-02 00:56:17,3,1
4,Z59FTQD,0,,126.032278,-9.106019,,,1.0,1.0,,2020-02-09 21:54:25,2020-02-09 21:54:41,28,118624.0,0.480602,0.55285,Restaurants,2.0,0.7,15.0,1.0,11:00AM-11:45PM,-,10,0.0,Yes,0.0,1.0,1,11,EN,4.4,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,01:30:00,17:45:00,23:59:00,00:01:00,01:30:00,17:45:00,23:59:00,"{""primary_tags"":""32""}",1.0,5,Burgers,Y,1.0,1.0,2018-05-17 22:12:38,2020-04-05 15:57:41,3,1


In [None]:
# Customer profiles: keep the first row per customer id and rename the id
# column so it matches the locations table.
train_customers = pd.read_parquet(data_path['train_customers'])
train_customers = (
    train_customers[~train_customers.duplicated('akeed_customer_id', keep='first')]
    .reset_index(drop=True)
    .rename(columns={'akeed_customer_id': 'customer_id'})
)

train_locations = pd.read_parquet(data_path['train_locations'])

# One row per customer location, enriched with the profile and prefixed `c_`.
train_customer_detail = (
    train_locations
    .merge(train_customers, on='customer_id', how='left')
    .add_prefix('c_')
)

# Cross join with vendors through a constant helper key, dropped afterwards.
train = (
    train_customer_detail.assign(key=1)
    .merge(vendors.assign(key=1), on='key')
    .drop(columns='key')
)

train_customers.shape, train_locations.shape, train_customer_detail.shape, train.shape

((34523, 8), (59503, 5), (59503, 12), (5950300, 71))

In [None]:
# Preview the first rows of the train customer-location x vendor table.
train.head()

Unnamed: 0,c_customer_id,c_location_number,c_location_type,c_latitude,c_longitude,c_gender,c_dob,c_status,c_verified,c_language,c_created_at,c_updated_at,v_id,v_authentication_id,v_latitude,v_longitude,v_vendor_category_en,v_vendor_category_id,v_delivery_charge,v_serving_distance,v_is_open,v_OpeningTime,v_OpeningTime2,v_prepration_time,v_commission,v_is_akeed_delivering,v_discount_percentage,v_status,v_verified,v_rank,v_language,v_vendor_rating,v_sunday_from_time1,v_sunday_to_time1,v_sunday_from_time2,v_sunday_to_time2,v_monday_from_time1,v_monday_to_time1,v_monday_from_time2,v_monday_to_time2,v_tuesday_from_time1,v_tuesday_to_time1,v_tuesday_from_time2,v_tuesday_to_time2,v_wednesday_from_time1,v_wednesday_to_time1,v_wednesday_from_time2,v_wednesday_to_time2,v_thursday_from_time1,v_thursday_to_time1,v_thursday_from_time2,v_thursday_to_time2,v_friday_from_time1,v_friday_to_time1,v_friday_from_time2,v_friday_to_time2,v_saturday_from_time1,v_saturday_to_time1,v_saturday_from_time2,v_saturday_to_time2,v_primary_tags,v_open_close_flags,v_vendor_tag,v_vendor_tag_name,v_one_click_vendor,v_country_id,v_city_id,v_created_at,v_updated_at,v_device_type,v_display_orders
0,02SFNJH,0,,1.682392,-78.789737,,,1.0,1.0,EN,2019-06-17 00:33:23,2019-10-01 18:46:33,4,118597.0,-0.588596,0.754434,Restaurants,2.0,0.0,6.0,1.0,11:00AM-11:30PM,-,15,0.0,Yes,0.0,1.0,1,11,EN,4.4,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,08:00:00,23:59:00,00:00:00,00:30:00,10:00:00,23:59:00,00:00:00,00:30:00,10:00:00,23:59:00,"{""primary_tags"":""4""}",1.0,2458912212241623,"Arabic,Breakfast,Burgers,Desserts,Free Deliver...",Y,1.0,1.0,2018-01-30 14:42:04,2020-04-07 15:12:43,3,1
1,02SFNJH,0,,1.682392,-78.789737,,,1.0,1.0,EN,2019-06-17 00:33:23,2019-10-01 18:46:33,13,118608.0,-0.471654,0.74447,Restaurants,2.0,0.7,5.0,1.0,08:30AM-10:30PM,-,14,0.0,Yes,0.0,1.0,1,11,EN,4.7,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,19:30:00,00:00:00,01:30:00,08:00:00,19:30:00,00:00:00,01:30:00,08:00:00,23:59:00,00:00:00,01:30:00,08:00:00,23:59:00,"{""primary_tags"":""7""}",1.0,44151342715241628,"Breakfast,Cakes,Crepes,Italian,Pasta,Pizzas,Sa...",Y,1.0,1.0,2018-05-03 12:32:06,2020-04-05 20:46:03,3,1
2,02SFNJH,0,,1.682392,-78.789737,,,1.0,1.0,EN,2019-06-17 00:33:23,2019-10-01 18:46:33,20,118616.0,-0.407527,0.643681,Restaurants,2.0,0.0,8.0,1.0,08:00AM-10:45PM,-,19,0.0,Yes,0.0,1.0,1,1,EN,4.5,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,08:00:00,22:45:00,,,"{""primary_tags"":""71""}",1.0,489110,"Breakfast,Desserts,Free Delivery,Indian",Y,1.0,1.0,2018-05-04 22:28:22,2020-04-07 16:35:55,3,1
3,02SFNJH,0,,1.682392,-78.789737,,,1.0,1.0,EN,2019-06-17 00:33:23,2019-10-01 18:46:33,23,118619.0,-0.585385,0.753811,Restaurants,2.0,0.0,5.0,1.0,10:59AM-10:30PM,-,16,0.0,Yes,0.0,1.0,1,11,EN,4.5,09:00:00,23:30:00,,,09:00:00,23:30:00,,,09:00:00,23:30:00,,,09:00:00,23:30:00,,,09:00:00,23:45:00,,,09:00:00,23:45:00,,,09:00:00,23:45:00,,,"{""primary_tags"":""46""}",1.0,583024,"Burgers,Desserts,Fries,Salads",Y,1.0,1.0,2018-05-06 19:20:48,2020-04-02 00:56:17,3,1
4,02SFNJH,0,,1.682392,-78.789737,,,1.0,1.0,EN,2019-06-17 00:33:23,2019-10-01 18:46:33,28,118624.0,0.480602,0.55285,Restaurants,2.0,0.7,15.0,1.0,11:00AM-11:45PM,-,10,0.0,Yes,0.0,1.0,1,11,EN,4.4,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,00:30:00,11:00:00,23:59:00,00:01:00,01:30:00,17:45:00,23:59:00,00:01:00,01:30:00,17:45:00,23:59:00,"{""primary_tags"":""32""}",1.0,5,Burgers,Y,1.0,1.0,2018-05-17 22:12:38,2020-04-05 15:57:41,3,1


In [None]:
# Composite row key "<customer id> X <location number> X <vendor id>",
# built the same way for both splits.
for frame in (test, train):
    key_parts = (
        frame["c_customer_id"].astype(str),
        frame["c_location_number"].astype(str),
        frame["v_id"].astype(str),
    )
    frame["CID X LOC_NUM X VENDOR"] = key_parts[0] + ' X ' + key_parts[1] + ' X ' + key_parts[2]

In [None]:
# Label every candidate row: 1 when its composite key also occurs in the
# orders table, 0 otherwise.
# `isin` against the orders keys is equivalent to the former explicit set
# intersection. `mask` stays a module-level name because the clean-up cell
# further down deletes it.
mask = train["CID X LOC_NUM X VENDOR"].isin(orders["CID X LOC_NUM X VENDOR"])

train['target'] = 0
# Single .loc assignment instead of chained indexing (`train['target'][mask]`),
# which may write to a temporary copy and is a no-op under copy-on-write.
train.loc[mask, 'target'] = 1

In [None]:
# Release the intermediate frames that are no longer needed, then force a
# garbage-collection pass.
del test_customers, test_locations, test_customer_detail
del train_customers, train_locations, train_customer_detail
del vendors, orders, mask
gc.collect()

202

In [None]:
# Keep the composite keys of the test rows before the column is dropped.
# NOTE(review): `test_id` only lives in memory; it is not written by the
# parquet export cells in this notebook -- confirm that is intended.
test_id = test['CID X LOC_NUM X VENDOR']

# Columns removed from both splits before the silver export.
cols_drop = [
    'v_is_akeed_delivering', 'v_open_close_flags', 'v_one_click_vendor',
    'v_country_id', 'v_city_id', 'v_display_orders',
    'c_customer_id', 'CID X LOC_NUM X VENDOR', 'v_authentication_id',
    'c_language', 'v_language', 'v_vendor_tag',
]

train = train.drop(columns=cols_drop)
test = test.drop(columns=cols_drop)

In [None]:
# Persist both splits to the silver layer as gzip-compressed parquet.
for frame, target_path in (
    (train, './data/silver/train.parquet.gzip'),
    (test, './data/silver/test.parquet.gzip'),
):
    frame.to_parquet(target_path, compression='gzip')

In [None]:
# Reload both splits from the silver layer.
train, test = (
    pd.read_parquet(source_path)
    for source_path in (
        './data/silver/train.parquet.gzip',
        './data/silver/test.parquet.gzip',
    )
)

In [5]:
# Normalise `c_gender` in both splits:
#   1. trim surrounding whitespace,
#   2. turn missing values into the literal string 'NaN',
#   3. fold spelling variants / junk values into the canonical labels.
# Each column is rebuilt in a single assignment; the previous chained
# indexing (`frame.c_gender[isnull] = ...`) may write to a temporary copy and
# does nothing at all under pandas copy-on-write.
gender_fixes = {'male': 'Male', '': 'NaN', '?????': 'NaN'}

for frame in (train, test):
    frame['c_gender'] = (
        frame['c_gender']
        .str.strip()
        .fillna('NaN')
        .replace(gender_fixes)
    )

In [6]:
# In `v_OpeningTime2`, rewrite the '-' placeholder as the literal string 'NaN'.
for frame in (train, test):
    frame['v_OpeningTime2'] = frame['v_OpeningTime2'].replace({'-': 'NaN'})

In [7]:
def label_encoder(collist, old_map=None):
  """Build a value -> integer code mapping in order of first appearance.

  Parameters
  ----------
  collist : iterable
      Values to encode. Each distinct value receives the next free integer
      the first time it is seen.
  old_map : dict, optional
      Existing mapping to extend. Known values keep their codes and new
      values continue numbering after the largest existing code. The dict
      passed in is not modified.

  Returns
  -------
  dict
      The mapping, plus a ``nan -> nan`` entry.
  """
  # Copy so the caller's dict is never mutated; also avoids shadowing the
  # builtin `map` with a local name.
  mapping = dict(old_map) if old_map else {}
  # Skip NaN codes (v == v is False only for NaN): a mapping returned by an
  # earlier call contains the nan -> nan entry, which must not take part in
  # the max().
  next_code = max((v for v in mapping.values() if v == v), default=-1) + 1
  for x in tqdm(collist):
    if x not in mapping:
      mapping[x] = next_code
      next_code += 1
  # `np.nan` instead of the `np.NaN` alias, which NumPy 2.0 removed.
  mapping[np.nan] = np.nan
  return mapping

In [8]:
# Columns to label-encode: customer/vendor categoricals plus the per-weekday
# opening-hour slots.
cols = [
    'c_location_type', 'c_gender', 'v_vendor_category_en',
    'v_OpeningTime', 'v_OpeningTime2',
    'v_sunday_from_time1', 'v_sunday_to_time1', 'v_sunday_from_time2', 'v_sunday_to_time2',
    'v_monday_from_time1', 'v_monday_to_time1', 'v_monday_from_time2', 'v_monday_to_time2',
    'v_tuesday_from_time1', 'v_tuesday_to_time1', 'v_tuesday_from_time2', 'v_tuesday_to_time2',
    'v_wednesday_from_time1', 'v_wednesday_to_time1', 'v_wednesday_from_time2', 'v_wednesday_to_time2',
    'v_thursday_from_time1', 'v_thursday_to_time1', 'v_thursday_from_time2', 'v_thursday_to_time2',
    'v_friday_from_time1', 'v_friday_to_time1', 'v_friday_from_time2', 'v_friday_to_time2',
    'v_saturday_from_time1', 'v_saturday_to_time1', 'v_saturday_from_time2', 'v_saturday_to_time2',
    'v_primary_tags', 'v_vendor_tag_name',
]

# One mapping per column, built from the training split only.
maps = {}
for col in cols:
    column_values = train[col].tolist()
    maps[col] = label_encoder(column_values)

HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950300.0), HTML(value='')))




In [15]:
# Dump every column's value -> code mapping, separated by a 100-character rule.
for k, v in maps.items():
    print('\n' + '=' * 100 + '\n')
    print(f'Mapping for {k}\n')
    print(v)



Mapping for c_location_type

{None: 0, 'Home': 1, 'Work': 2, 'Other': 3, nan: nan}


Mapping for c_gender

{'NaN': 0, 'Female': 1, 'Male': 2, nan: nan}


Mapping for v_vendor_category_en

{'Restaurants': 0, 'Sweets & Bakes': 1, nan: nan}


Mapping for v_OpeningTime

{'11:00AM-11:30PM': 0, '08:30AM-10:30PM': 1, '08:00AM-10:45PM': 2, '10:59AM-10:30PM': 3, '11:00AM-11:45PM': 4, '11:00AM-10:30PM': 5, '09:00AM-11:30PM': 6, '05:00PM-11:00PM': 7, '08:00AM-11:30PM': 8, '08:30AM-09:30PM': 9, '11:00AM-11:00PM': 10, '11:59AM-2:15 am': 11, '08:00AM-12:30PM': 12, '08:00AM-11:59PM': 13, '10:00AM-11:45PM': 14, '11:59AM-11:30PM': 15, '08:30AM-11:45PM': 16, '11:59AM-11:45PM': 17, '11:00AM-10:45PM': 18, '10:59AM-11:30PM': 19, '11:15AM-10:00PM': 20, '10:59AM-10:59PM': 21, '09:59AM-11:45PM': 22, '8:00AM-09:45PM': 23, '04:00PM-11:45PM': 24, '08:00AM-11:45PM': 25, '11:15AM-10:30PM': 26, '10:59AM-3:30PM': 27, '11.30am-11:30PM': 28, '11:00AM-11:59PM': 29, '09:00AM-09:01AM': 30, '11:59AM-10:45PM': 31, '7:58A

In [27]:
# Replace every categorical column by its integer code in both splits.
# The column is reassigned as a whole: `frame.loc[:, col] = ...` writes into
# the existing column in place on recent pandas and keeps its object dtype,
# so the integer cast would not stick.
# A value with no entry in the training-derived mapping becomes NaN and makes
# the integer cast raise, so unseen test categories fail loudly.
for col in cols:
    train[col] = train[col].map(maps[col]).astype('int')
    test[col] = test[col].map(maps[col]).astype('int')
    gc.collect()

In [28]:
# Parse the four created/updated timestamp columns into datetimes in both splits.
for stamp_col in ('c_created_at', 'c_updated_at', 'v_created_at', 'v_updated_at'):
    for frame in (train, test):
        frame[stamp_col] = pd.to_datetime(frame[stamp_col], yearfirst=True)

In [29]:
def timediff(duration):
    """Return the number of whole days in ``duration`` as a float.

    ``duration`` must expose ``total_seconds()`` (e.g. a timedelta). The
    result is the floor of the day count, so negative durations round
    towards minus infinity.
    """
    seconds_per_day = 86400
    whole_days, _remainder = divmod(duration.total_seconds(), seconds_per_day)
    return whole_days

# Whole-day differences between the customer/vendor timestamps, computed
# identically for both splits: (new column, minuend column, subtrahend column).
diff_specs = (
    ('c_diff_update_create', 'c_updated_at', 'c_created_at'),
    ('v_diff_update_create', 'v_updated_at', 'v_created_at'),
    ('c_v_diff_create', 'v_created_at', 'c_created_at'),
    ('c_v_diff_update', 'v_updated_at', 'c_updated_at'),
)

for frame in (train, test):
    for new_col, later_col, earlier_col in diff_specs:
        elapsed = frame[later_col] - frame[earlier_col]
        frame[new_col] = elapsed.apply(timediff)

In [31]:
# Year / month / day-of-year parts for each timestamp column, in both splits.
# New columns are named '<part>_<timestamp column>'.
for frame in (train, test):
    for stamp_col in ('c_created_at', 'c_updated_at', 'v_created_at', 'v_updated_at'):
        stamp = frame[stamp_col].dt
        frame['year_' + stamp_col] = stamp.year
        frame['month_' + stamp_col] = stamp.month
        frame['doy_' + stamp_col] = stamp.dayofyear

In [32]:
# The raw timestamp columns are no longer needed once the derived features exist.
raw_stamp_cols = ['c_created_at', 'c_updated_at', 'v_created_at', 'v_updated_at']
train = train.drop(columns=raw_stamp_cols)
test = test.drop(columns=raw_stamp_cols)

In [44]:
# Persist both splits to the gold layer as gzip-compressed parquet.
# NOTE(review): read top to bottom, this cell precedes the distance/bearing
# feature cell, so a fresh "Run All" writes the files without those columns --
# confirm the intended cell order.
for frame, target_path in (
    (train, './data/gold/train.parquet.gzip'),
    (test, './data/gold/test.parquet.gzip'),
):
    frame.to_parquet(target_path, compression='gzip')

In [None]:
# Stage everything, commit and push to `main`.
!git add . && git commit -m 'commit' && git push origin main

In [38]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Haversine (great-circle) distance in kilometres between two points.

    All four arguments are given in degrees and may be scalars or arrays;
    the computation is element-wise through NumPy.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    lat1, lng1, lat2, lng2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    half_dlat = (lat2 - lat1) * 0.5
    half_dlng = (lng2 - lng1) * 0.5
    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlng/2).
    d = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlng) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate "Manhattan" distance between two points.

    Sum of two haversine legs from the first point: one that changes only
    the longitude and one that changes only the latitude.
    """
    along_longitude = haversine_array(lat1, lng1, lat1, lng2)
    along_latitude = haversine_array(lat1, lng1, lat2, lng1)
    return along_longitude + along_latitude

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial bearing, in degrees, from point 1 towards point 2.

    All four arguments are given in degrees and may be scalars or arrays.
    The result comes from ``arctan2`` converted to degrees, i.e. it lies in
    [-180, 180].
    """
    # Only the longitude difference and the two latitudes enter the formula;
    # the unused earth-radius constant and the unused radian longitudes of the
    # previous version were dropped.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

# Customer-to-vendor geometry features, computed identically for both splits:
# midpoint coordinates, haversine distance, Manhattan-style distance, bearing.
for frame in (train, test):
    c_lat, c_lng = frame['c_latitude'], frame['c_longitude']
    v_lat, v_lng = frame['v_latitude'], frame['v_longitude']

    frame['center_latitude'] = (c_lat.values + v_lat.values) / 2
    frame['center_longitude'] = (c_lng.values + v_lng.values) / 2
    frame['harvesine_dist'] = haversine_array(c_lat, c_lng, v_lat, v_lng)
    frame['manhattan_dist'] = dummy_manhattan_distance(c_lat, c_lng, v_lat, v_lng)
    frame['bearing'] = bearing_array(c_lat, c_lng, v_lat, v_lng)

In [41]:
# Preview the first rows of the fully encoded training frame.
train.head()

Unnamed: 0,c_location_number,c_location_type,c_latitude,c_longitude,c_gender,c_dob,c_status,c_verified,v_id,v_latitude,v_longitude,v_vendor_category_en,v_vendor_category_id,v_delivery_charge,v_serving_distance,v_is_open,v_OpeningTime,v_OpeningTime2,v_prepration_time,v_commission,v_discount_percentage,v_status,v_verified,v_rank,v_vendor_rating,v_sunday_from_time1,v_sunday_to_time1,v_sunday_from_time2,v_sunday_to_time2,v_monday_from_time1,v_monday_to_time1,v_monday_from_time2,v_monday_to_time2,v_tuesday_from_time1,v_tuesday_to_time1,v_tuesday_from_time2,v_tuesday_to_time2,v_wednesday_from_time1,v_wednesday_to_time1,v_wednesday_from_time2,v_wednesday_to_time2,v_thursday_from_time1,v_thursday_to_time1,v_thursday_from_time2,v_thursday_to_time2,v_friday_from_time1,v_friday_to_time1,v_friday_from_time2,v_friday_to_time2,v_saturday_from_time1,v_saturday_to_time1,v_saturday_from_time2,v_saturday_to_time2,v_primary_tags,v_vendor_tag_name,v_device_type,target,c_diff_update_create,v_diff_update_create,c_v_diff_create,c_v_diff_update,year_c_created_at,month_c_created_at,doy_c_created_at,year_c_updated_at,month_c_updated_at,doy_c_updated_at,year_v_created_at,month_v_created_at,doy_v_created_at,year_v_updated_at,month_v_updated_at,doy_v_updated_at,center_latitude,center_longitude,harvesine_dist,manhattan_dist,bearing
0,0,0,1.682392,-78.789737,0,,1.0,1.0,4,-0.588596,0.754434,0,2.0,0.0,6.0,1.0,0,0,15,0.0,0.0,1.0,1,11,4.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,106.0,798.0,-503.0,188.0,2019.0,6.0,168.0,2019.0,10.0,274.0,2018,1,30,2020,4,98,0.546898,-39.017652,8847.430953,9092.859475,90.908648
1,0,0,1.682392,-78.789737,0,,1.0,1.0,13,-0.471654,0.74447,0,2.0,0.7,5.0,1.0,1,0,14,0.0,0.0,1.0,1,11,4.7,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,1,0,0,1,1,0,1,1,3,0,106.0,703.0,-410.0,187.0,2019.0,6.0,168.0,2019.0,10.0,274.0,2018,5,123,2020,4,96,0.605369,-39.022633,8845.913268,9078.748986,90.790117
2,0,0,1.682392,-78.789737,0,,1.0,1.0,20,-0.407527,0.643681,0,2.0,0.0,8.0,1.0,2,0,19,0.0,0.0,1.0,1,1,4.5,1,2,1,1,1,2,1,1,1,2,1,1,1,2,1,2,1,2,1,2,1,2,2,1,1,2,2,1,2,2,3,0,106.0,703.0,-409.0,188.0,2019.0,6.0,168.0,2019.0,10.0,274.0,2018,5,124,2020,4,98,0.637432,-39.073028,8834.488717,9060.419253,90.728136
3,0,0,1.682392,-78.789737,0,,1.0,1.0,23,-0.585385,0.753811,0,2.0,0.0,5.0,1.0,3,0,16,0.0,0.0,1.0,1,11,4.5,2,3,1,1,2,3,1,1,2,3,1,1,2,3,1,2,2,3,1,2,2,3,2,1,2,3,2,1,3,3,3,0,106.0,696.0,-407.0,183.0,2019.0,6.0,168.0,2019.0,10.0,274.0,2018,5,126,2020,4,93,0.548503,-39.017963,8847.350413,9092.433174,90.905405
4,0,0,1.682392,-78.789737,0,,1.0,1.0,28,0.480602,0.55285,0,2.0,0.7,15.0,1.0,4,0,10,0.0,0.0,1.0,1,11,4.4,3,0,2,0,3,0,2,0,3,0,2,0,3,0,2,0,3,0,2,0,3,1,3,0,3,1,3,0,4,4,3,0,106.0,688.0,-396.0,186.0,2019.0,6.0,168.0,2019.0,10.0,274.0,2018,5,137,2020,4,96,1.081497,-39.118443,8821.455686,8951.571321,89.827713


In [42]:
# Preview the first rows of the fully encoded test frame.
test.head()

Unnamed: 0,c_location_number,c_location_type,c_latitude,c_longitude,c_gender,c_dob,c_status,c_verified,v_id,v_latitude,v_longitude,v_vendor_category_en,v_vendor_category_id,v_delivery_charge,v_serving_distance,v_is_open,v_OpeningTime,v_OpeningTime2,v_prepration_time,v_commission,v_discount_percentage,v_status,v_verified,v_rank,v_vendor_rating,v_sunday_from_time1,v_sunday_to_time1,v_sunday_from_time2,v_sunday_to_time2,v_monday_from_time1,v_monday_to_time1,v_monday_from_time2,v_monday_to_time2,v_tuesday_from_time1,v_tuesday_to_time1,v_tuesday_from_time2,v_tuesday_to_time2,v_wednesday_from_time1,v_wednesday_to_time1,v_wednesday_from_time2,v_wednesday_to_time2,v_thursday_from_time1,v_thursday_to_time1,v_thursday_from_time2,v_thursday_to_time2,v_friday_from_time1,v_friday_to_time1,v_friday_from_time2,v_friday_to_time2,v_saturday_from_time1,v_saturday_to_time1,v_saturday_from_time2,v_saturday_to_time2,v_primary_tags,v_vendor_tag_name,v_device_type,c_diff_update_create,v_diff_update_create,c_v_diff_create,c_v_diff_update,year_c_created_at,month_c_created_at,doy_c_created_at,year_c_updated_at,month_c_updated_at,doy_c_updated_at,year_v_created_at,month_v_created_at,doy_v_created_at,year_v_updated_at,month_v_updated_at,doy_v_updated_at,center_latitude,center_longitude,harvesine_dist,manhattan_dist,bearing
0,0,0,126.032278,-9.106019,0,,1.0,1.0,4,-0.588596,0.754434,0,2.0,0.0,6.0,1.0,0,0,15,0.0,0.0,1.0,1,11,4.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.0,798.0,-741.0,57.0,2020.0,2.0,40.0,2020.0,2.0,40.0,2018,1,30,2020,4,98,62.721841,-4.175792,14010.898478,14724.044206,167.779615
1,0,0,126.032278,-9.106019,0,,1.0,1.0,13,-0.471654,0.74447,0,2.0,0.7,5.0,1.0,1,0,14,0.0,0.0,1.0,1,11,4.7,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,1,0,0,1,1,0,1,1,3,0.0,703.0,-648.0,55.0,2020.0,2.0,40.0,2020.0,2.0,40.0,2018,5,123,2020,4,96,62.780312,-4.180774,13998.133826,14710.390678,167.809792
2,0,0,126.032278,-9.106019,0,,1.0,1.0,20,-0.407527,0.643681,0,2.0,0.0,8.0,1.0,2,0,19,0.0,0.0,1.0,1,1,4.5,1,2,1,1,1,2,1,1,1,2,1,1,1,2,1,2,1,2,1,2,1,2,2,1,1,2,2,1,2,2,3,0.0,703.0,-646.0,57.0,2020.0,2.0,40.0,2020.0,2.0,40.0,2018,5,124,2020,4,98,62.812375,-4.231169,13992.442453,14696.683277,167.942978
3,0,0,126.032278,-9.106019,0,,1.0,1.0,23,-0.585385,0.753811,0,2.0,0.0,5.0,1.0,3,0,16,0.0,0.0,1.0,1,11,4.5,2,3,1,1,2,3,1,1,2,3,1,1,2,3,1,2,2,3,1,2,2,3,2,1,2,3,2,1,3,3,3,0.0,696.0,-645.0,52.0,2020.0,2.0,40.0,2020.0,2.0,40.0,2018,5,126,2020,4,93,62.723446,-4.176104,14010.552771,14723.646466,167.780873
4,0,0,126.032278,-9.106019,0,,1.0,1.0,28,0.480602,0.55285,0,2.0,0.7,15.0,1.0,4,0,10,0.0,0.0,1.0,1,11,4.4,3,0,2,0,3,0,2,0,3,0,2,0,3,0,2,0,3,0,2,0,3,1,3,0,3,1,3,0,4,4,3,0.0,688.0,-633.0,55.0,2020.0,2.0,40.0,2020.0,2.0,40.0,2018,5,137,2020,4,96,63.25644,-4.276584,13895.649154,14592.000654,168.186138


In [62]:
# Show the working-tree state before committing.
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mdeleted:    data/silver/test_stage_2.parquet.gzip[m
	[31mdeleted:    data/silver/train_stage_2.parquet.gzip[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdata/gold/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [63]:
# Stage everything, commit the gold data layer and push to `main`.
!git add . && git commit -m 'Add data layer gold' && git push origin main

[main 4530365] Add data layer gold
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename data/{silver/test_stage_2.parquet.gzip => gold/test.parquet.gzip} (100%)
 rename data/{silver/train_stage_2.parquet.gzip => gold/train.parquet.gzip} (100%)
Counting objects: 5, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 512 bytes | 512.00 KiB/s, done.
Total 5 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/sparsh-ai/reco-tut-arr.git
   7e8d9dc..4530365  main -> main
