In [None]:
# !cp /content/drive/MyDrive/TempData/BookingChallenge.zip /content
# !unzip /content/BookingChallenge.zip

## ItemPop Model

In [3]:
import pandas as pd

In [4]:
# Load the training bookings and order them by trip, then by check-in date.
train_set = pd.read_csv('train_set.csv')
train_set = train_set.sort_values(by=['utrip_id', 'checkin'])

print(train_set.shape)
train_set.head()

(1166835, 9)


Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,1000027,2016-08-13,2016-08-14,8183,desktop,7168,Elbonia,Gondal,1000027_1
1,1000027,2016-08-14,2016-08-16,15626,desktop,7168,Elbonia,Gondal,1000027_1
2,1000027,2016-08-16,2016-08-18,60902,desktop,7168,Elbonia,Gondal,1000027_1
3,1000027,2016-08-18,2016-08-21,30628,desktop,253,Elbonia,Gondal,1000027_1
4,1000033,2016-04-09,2016-04-11,38677,mobile,359,Gondal,Cobra Island,1000033_1


In [5]:
# Load the test bookings and order them by trip, then by check-in date.
test_set = pd.read_csv('test_set.csv')
test_set = test_set.sort_values(by=['utrip_id', 'checkin'])

print(test_set.shape)
test_set.head()

(378667, 9)


Unnamed: 0,user_id,checkin,checkout,device_class,affiliate_id,booker_country,utrip_id,city_id,hotel_country
0,1000066,2016-07-21,2016-07-23,desktop,9924,Gondal,1000066_2,56430,Urkesh
1,1000066,2016-07-23,2016-07-25,desktop,9924,Gondal,1000066_2,41971,Urkesh
2,1000066,2016-07-25,2016-07-28,desktop,9924,Gondal,1000066_2,5797,Urkesh
3,1000066,2016-07-28,2016-07-31,mobile,2436,Gondal,1000066_2,0,
4,1000270,2016-02-08,2016-02-09,mobile,9452,The Devilfire Empire,1000270_1,50075,The Devilfire Empire


Generate dummy predictions: recommend the four most frequent cities in the training set to every test trip, as a benchmark.

In [7]:
# The four most frequent cities in the training set, most frequent first.
topcities = train_set.city_id.value_counts().index[:4]

# One row per distinct test trip, re-indexed 0..n-1.
test_trips = test_set[['utrip_id']].drop_duplicates().reset_index(drop=True)

# Every test trip receives the same four recommendations.
prediction_columns = ['city_id_1','city_id_2','city_id_3','city_id_4']
n_trips = test_trips.shape[0]
cities_prediction = pd.DataFrame([topcities] * n_trips, columns=prediction_columns)

Create Submission file according to the format

In [8]:
# Put each trip id next to its four recommended cities; both frames share
# the same 0..n-1 index, so the rows line up.
submission = pd.concat([test_trips, cities_prediction], axis='columns')
print(submission.shape)
submission.head()

(70662, 5)


Unnamed: 0,utrip_id,city_id_1,city_id_2,city_id_3,city_id_4
0,1000066_2,47499,23921,36063,17013
1,1000270_1,47499,23921,36063,17013
2,1000441_1,47499,23921,36063,17013
3,100048_1,47499,23921,36063,17013
4,1000543_1,47499,23921,36063,17013


In [13]:
# index=False keeps the row index out of the file, so utrip_id is the first CSV column.
submission.to_csv('submission.csv',index=False)

Read submission file and ground truth

In [14]:
# index_col=[0] turns the first CSV column of each file into the frame's index.
# Note that `submission` is rebound here to the version read back from disk.
ground_truth = pd.read_csv('ground_truth.csv',index_col=[0])
submission = pd.read_csv('submission.csv',index_col=[0])

In [15]:
# Sanity check: number of ground-truth rows and a preview of the first ones.
print(ground_truth.shape)
ground_truth.head()

(70662, 2)


Unnamed: 0_level_0,city_id,hotel_country
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1038944_1,54085,Sokovia
1068715_1,29319,Cobra Island
1075528_1,55763,Bozatta
1110462_4,11930,Alvonia
1132565_1,58659,Axphain


Evaluate the predictions with accuracy@4: the fraction of trips whose true city is among the four recommended cities.

In [16]:
def evaluate_accuracy_at_4(submission,ground_truth):
    '''Return the fraction of submission rows whose true city was recommended.

    `submission` supplies `utrip_id` (as a column or index level) together with
    the columns `city_id_1` .. `city_id_4`; `ground_truth` is indexed by trip id
    and supplies `city_id`. A row counts as a hit when `city_id` equals any of
    the four recommendations.
    '''
    merged = submission.join(ground_truth, on='utrip_id')
    actual = merged['city_id']

    # Start from the first recommendation and OR in the remaining three.
    matched = actual == merged['city_id_1']
    for rank_col in ('city_id_2', 'city_id_3', 'city_id_4'):
        matched = matched | (actual == merged[rank_col])

    # Cast hits to 0/1 before averaging, as the metric is a plain mean of hits.
    return (matched * 1).mean()

In [17]:
# Score the top-4-cities baseline against the ground truth.
evaluate_accuracy_at_4(submission,ground_truth)

0.05271574537941185

## Data preparation with cuDF (RAPIDS)

In [None]:
# # Install RAPIDS
# !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
# !bash rapidsai-csp-utils/colab/rapids-colab.sh stable

# import sys, os

# dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
# sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
# sys.path
# exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [4]:
import pandas as pd
from sklearn.model_selection import GroupKFold

import cudf
from numba import cuda

In [8]:
# Load both splits with cuDF, ordered by user and then by check-in date.
sort_keys = ['user_id','checkin']
train = cudf.read_csv('train_set.csv').sort_values(by=sort_keys)
test = cudf.read_csv('test_set.csv').sort_values(by=sort_keys)

print(train.shape, test.shape)
train.head()

(1166835, 9) (378667, 9)


Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
413669,29,2016-07-09,2016-07-11,47054,desktop,1601,Elbonia,Elbonia,29_1
413670,29,2016-07-11,2016-07-13,34444,desktop,1601,Elbonia,Elbonia,29_1
413671,29,2016-07-13,2016-07-16,12291,desktop,1601,Elbonia,Elbonia,29_1
413672,29,2016-07-16,2016-07-18,16386,desktop,8132,Elbonia,Elbonia,29_1
1128910,81,2016-05-15,2016-05-16,33665,desktop,9924,Elbonia,Elbonia,81_1


In [11]:
# Peek at 10 random test rows (no random_state is set, so the sample changes on every run).
# NOTE(review): the saved output shows an `istest` column, so this cell was last
# run after the cell that adds it (execution count 11 vs. 9).
test.sample(10)

Unnamed: 0,user_id,checkin,checkout,device_class,affiliate_id,booker_country,utrip_id,city_id,hotel_country,istest
183121,3732737,2016-10-15,2016-10-16,desktop,8436,Gondal,3732737_2,50957,Borginia,1
183162,3733720,2016-08-29,2016-09-01,desktop,8132,Elbonia,3733720_1,26235,Alvonia,1
183234,3734474,2016-08-12,2016-08-13,desktop,384,Gondal,3734474_1,40292,Axphain,1
183198,3734251,2016-06-06,2016-06-08,desktop,7974,The Devilfire Empire,3734251_1,9680,Borginia,1
185432,3768259,2016-07-18,2016-07-19,desktop,6744,The Devilfire Empire,3768259_1,30768,Gondal,1
185396,3767623,2016-03-24,2016-03-25,desktop,384,Gondal,3767623_1,0,,1
185313,3766287,2016-05-14,2016-05-16,mobile,2038,The Devilfire Empire,3766287_1,8766,Patusan,1
185349,3766789,2017-01-01,2017-01-02,desktop,8784,Elbonia,3766789_2,62185,Axphain,1
185142,3762853,2017-02-05,2017-02-07,desktop,7974,The Devilfire Empire,3762853_2,28491,Nova Africa,1
185170,3763741,2016-08-18,2016-08-23,desktop,4541,Gondal,3763741_1,0,,1


In [9]:
# Tag each split (0 = train, 1 = test) so the rows stay distinguishable
# after stacking them into a single frame.
for frame, flag in ((train, 0), (test, 1)):
    frame['istest'] = flag

raw = cudf.concat([train, test], sort=False).sort_values(['user_id','checkin'], ascending=True)
raw.head()

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,istest
413669,29,2016-07-09,2016-07-11,47054,desktop,1601,Elbonia,Elbonia,29_1,0
413670,29,2016-07-11,2016-07-13,34444,desktop,1601,Elbonia,Elbonia,29_1,0
413671,29,2016-07-13,2016-07-16,12291,desktop,1601,Elbonia,Elbonia,29_1,0
413672,29,2016-07-16,2016-07-18,16386,desktop,8132,Elbonia,Elbonia,29_1,0
355509,65,2016-09-26,2016-09-29,36403,desktop,3577,The Devilfire Empire,Cobra Island,65_1,1


In [10]:
# Assign every row to one of 5 cross-validation folds. GroupKFold is driven by
# utrip_id, so all rows of a trip land in the same fold.
raw['fold'] = 0
# Look the column position up by name rather than hard-coding it (it was 10):
# a hard-coded position silently targets another column once the schema changes.
fold_col = raw.columns.get_loc('fold')
group_kfold = GroupKFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(group_kfold.split(X=raw, y=raw, groups=raw['utrip_id'].to_pandas())):
    # test_index holds the row positions held out in this split.
    raw.iloc[test_index, fold_col] = fold

raw['fold'].value_counts()

1    309101
0    309101
3    309100
2    309100
4    309100
Name: fold, dtype: int32

In [12]:
# This flag tells which rows must be part of the submission file: test rows
# whose city_id is 0 (these rows also have an empty hotel_country).

raw['submission'] = 0
# Compare istest explicitly so both sides of `&` are boolean masks, instead of
# relying on an integer 0/1 column being accepted as a row mask.
raw.loc[ (raw.city_id==0)&(raw.istest==1) ,'submission'] = 1

raw.loc[ raw.submission==1 ]

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,istest,fold,submission
355513,65,2016-10-03,2016-10-04,0,mobile,4132,The Devilfire Empire,,65_1,1,2,1
356899,67,2016-08-11,2016-08-14,0,desktop,9924,Tcherkistan,,67_1,1,1,1
10963,115,2016-04-06,2016-04-07,0,desktop,9924,Elbonia,,115_1,1,0,1
120565,279,2016-03-27,2016-04-01,0,desktop,2803,Tcherkistan,,279_1,1,3,1
139366,307,2016-06-02,2016-06-03,0,desktop,8132,Elbonia,,307_1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
353170,6257974,2016-07-05,2016-07-07,0,tablet,7974,The Devilfire Empire,,6257974_1,1,2,1
353174,6258010,2016-06-10,2016-06-12,0,mobile,359,The Devilfire Empire,,6258010_1,1,0,1
353181,6258104,2016-08-25,2016-08-27,0,mobile,359,Gondal,,6258104_4,1,1,1
353186,6258120,2016-07-24,2016-07-25,0,desktop,9924,Gondal,,6258120_1,1,4,1


In [13]:
# Number of places visited in each trip (N), attached to every row of that trip.

# Group without as_index=False: count() is then indexed by utrip_id, and
# reset_index() yields exactly the two columns renamed below. Combining
# as_index=False with reset_index() produces an extra index column wherever
# as_index is honoured, which breaks the two-name column assignment.
aggs = raw.groupby('utrip_id')['user_id'].count().reset_index()
aggs.columns = ['utrip_id', 'N']
# NOTE(review): re-running this cell merges N a second time (suffixing the
# columns); run it once per fresh `raw`.
raw = raw.merge(aggs, on=['utrip_id'], how='inner')

In [14]:
# Encode each trip id as an integer code; `mp` keeps the distinct trip ids
# returned by factorize alongside the codes.
trip_codes, mp = raw['utrip_id'].factorize()
raw['utrip_id_'] = trip_codes

In [16]:
def get_order_in_group(utrip_id_,order):
    # Kernel body handed to cuDF's apply_grouped (see add_cumcount): each thread
    # starts at its own threadIdx.x and advances by blockDim.x, writing the
    # position i into order[i].
    # NOTE(review): presumably apply_grouped passes one group's slice of the
    # columns per call, so i is the row's position inside its trip -- this is
    # consistent with dcount restarting at 0 for every trip in the displayed
    # output; confirm against the cuDF version in use.
    for i in range(cuda.threadIdx.x, len(utrip_id_), cuda.blockDim.x):
        order[i] = i

def add_cumcount(df, sort_col, outputname):
    """Add a column holding each row's running position within its trip.

    Parameters
    ----------
    df : frame containing at least the columns 'utrip_id_' and 'checkin'.
    sort_col : column name(s) used to order the rows before and after numbering.
    outputname : name given to the new position column.

    Returns
    -------
    The frame sorted by `sort_col`, with `outputname` added.
    """
    # Order the rows first; the kernel numbers rows in the order it receives them.
    df = df.sort_values(sort_col, ascending=True)
    # Run get_order_in_group on every 'utrip_id_' group; the kernel fills 'order'.
    tmp = df[['utrip_id_', 'checkin']].groupby(['utrip_id_']).apply_grouped(
        get_order_in_group,incols=['utrip_id_'],
        outcols={'order': 'int32'},
        tpb=32)
    tmp.columns = ['utrip_id_', 'checkin', outputname]
    # Attach the positions back to the full frame.
    # NOTE(review): the join keys are hard-coded to ('utrip_id_', 'checkin'); a
    # trip with two rows sharing the same checkin would match several rows of
    # tmp and be duplicated by this merge -- confirm the pair is unique.
    df = df.merge(tmp, how='left', on=['utrip_id_', 'checkin'])
    # Sort again after the merge so the result comes back ordered by sort_col.
    df = df.sort_values(sort_col, ascending=True)
    return(df)

# dcount: 0-based position of each booking within its trip, ordered by check-in.
raw = add_cumcount(raw, ['utrip_id_','checkin'], 'dcount')

In [17]:
# icount counts down within a trip: N-1 on the first row, 0 on the last one.
raw['icount'] = raw['N'] - (raw['dcount'] + 1)

In [18]:
# Inspect the first rows to check N, dcount and icount for each trip.
raw.head(20)

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,istest,fold,submission,N,utrip_id_,dcount,icount
6880,1000027,2016-08-13,2016-08-14,8183,desktop,7168,Elbonia,Gondal,1000027_1,0,0,0,4,0,0,3
6881,1000027,2016-08-14,2016-08-16,15626,desktop,7168,Elbonia,Gondal,1000027_1,0,0,0,4,0,1,2
6882,1000027,2016-08-16,2016-08-18,60902,desktop,7168,Elbonia,Gondal,1000027_1,0,0,0,4,0,2,1
6883,1000027,2016-08-18,2016-08-21,30628,desktop,253,Elbonia,Gondal,1000027_1,0,0,0,4,0,3,0
6884,1000033,2016-04-09,2016-04-11,38677,mobile,359,Gondal,Cobra Island,1000033_1,0,0,0,5,1,0,4
6885,1000033,2016-04-11,2016-04-12,52089,desktop,384,Gondal,Cobra Island,1000033_1,0,0,0,5,1,1,3
6886,1000033,2016-04-12,2016-04-14,21328,desktop,384,Gondal,Cobra Island,1000033_1,0,0,0,5,1,2,2
6887,1000033,2016-04-14,2016-04-16,27485,desktop,384,Gondal,Cobra Island,1000033_1,0,0,0,5,1,3,1
6888,1000033,2016-04-16,2016-04-19,38677,desktop,384,Gondal,Cobra Island,1000033_1,0,0,0,5,1,4,0
6889,1000045,2016-06-18,2016-06-20,64876,desktop,2790,The Devilfire Empire,Fook Island,1000045_1,0,0,0,7,2,0,6
