In [1]:
import os
import pandas as pd
from sklearn.model_selection import GroupKFold

In [2]:
# Directory that holds the input CSVs read below (train_set.csv, test_set.csv).
data_path = './data'

In [6]:
# Read both splits, then order each one by user and check-in date.
train = pd.read_csv(os.path.join(data_path, 'train_set.csv'))
train = train.sort_values(by=['user_id', 'checkin'])

test = pd.read_csv(os.path.join(data_path, 'test_set.csv'))
test = test.sort_values(by=['user_id', 'checkin'])

# Quick size check of the two splits, followed by a preview of the train rows.
print(train.shape, test.shape)
train.head()

(1166835, 9) (378667, 9)


Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
413669,29,2016-07-09,2016-07-11,47054,desktop,1601,Elbonia,Elbonia,29_1
413670,29,2016-07-11,2016-07-13,34444,desktop,1601,Elbonia,Elbonia,29_1
413671,29,2016-07-13,2016-07-16,12291,desktop,1601,Elbonia,Elbonia,29_1
413672,29,2016-07-16,2016-07-18,16386,desktop,8132,Elbonia,Elbonia,29_1
1128910,81,2016-05-15,2016-05-16,33665,desktop,9924,Elbonia,Elbonia,81_1


In [7]:
# Tag each split so its rows can still be told apart once they are stacked.
train['istest'] = 0
test['istest'] = 1

# Stack train on top of test, then re-order the combined frame by user and
# check-in date so that each user's rows are contiguous.
raw = pd.concat([train, test], sort=False).sort_values(
    by=['user_id', 'checkin'],
    ascending=True,
)
raw.head()

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,istest
413669,29,2016-07-09,2016-07-11,47054,desktop,1601,Elbonia,Elbonia,29_1,0
413670,29,2016-07-11,2016-07-13,34444,desktop,1601,Elbonia,Elbonia,29_1,0
413671,29,2016-07-13,2016-07-16,12291,desktop,1601,Elbonia,Elbonia,29_1,0
413672,29,2016-07-16,2016-07-18,16386,desktop,8132,Elbonia,Elbonia,29_1,0
355509,65,2016-09-26,2016-09-29,36403,desktop,3577,The Devilfire Empire,Cobra Island,65_1,1


In [10]:
# Assign every row to one of 5 cross-validation folds (0-4). Grouping on
# utrip_id keeps all rows of a trip inside the same fold.
raw['fold'] = 0
group_kfold = GroupKFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(group_kfold.split(X=raw, groups=raw['utrip_id'])):
    # split() yields positional row indices, so the write goes through iloc
    # (raw's index labels are not guaranteed unique after the concat above).
    # The column position is looked up by name rather than hard-coded, so the
    # write keeps landing on 'fold' even if the column layout changes.
    raw.iloc[test_index, raw.columns.get_loc('fold')] = fold

raw.head()

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,istest,fold
413669,29,2016-07-09,2016-07-11,47054,desktop,1601,Elbonia,Elbonia,29_1,0,3
413670,29,2016-07-11,2016-07-13,34444,desktop,1601,Elbonia,Elbonia,29_1,0,3
413671,29,2016-07-13,2016-07-16,12291,desktop,1601,Elbonia,Elbonia,29_1,0,3
413672,29,2016-07-16,2016-07-18,16386,desktop,8132,Elbonia,Elbonia,29_1,0,3
355509,65,2016-09-26,2016-09-29,36403,desktop,3577,The Devilfire Empire,Cobra Island,65_1,1,2


In [12]:
# This flag tells which rows must be part of the submission file:
# test rows whose city_id is 0.

raw['submission'] = 0
# Compare istest explicitly so the mask is a plain boolean AND, instead of a
# bitwise AND between a boolean mask and the 0/1 integer flag column.
raw.loc[(raw['city_id'] == 0) & (raw['istest'] == 1), 'submission'] = 1

raw.loc[raw['submission'] == 1].head()

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,istest,fold,submission
355513,65,2016-10-03,2016-10-04,0,mobile,4132,The Devilfire Empire,,65_1,1,2,1
356899,67,2016-08-11,2016-08-14,0,desktop,9924,Tcherkistan,,67_1,1,1,1
10963,115,2016-04-06,2016-04-07,0,desktop,9924,Elbonia,,115_1,1,0,1
120565,279,2016-03-27,2016-04-01,0,desktop,2803,Tcherkistan,,279_1,1,3,1
139366,307,2016-06-02,2016-06-03,0,desktop,8132,Elbonia,,307_1,1,2,1


In [36]:
def add_features(data):
    """Attach per-trip size and position features to every row.

    Rows are numbered in the order in which they appear in ``data``; the frame
    is not sorted here, so the caller decides what "first" and "last" mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``utrip_id`` column. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame (with the fresh 0..n-1 index produced by the merge) that
        carries four extra columns:

        * ``N``         - number of rows sharing the row's ``utrip_id``
        * ``utrip_id_`` - integer code of ``utrip_id``, in order of first appearance
        * ``dcount``    - 0-based position of the row within its trip
        * ``icount``    - number of rows after this one in the trip (0 on the last row)
    """
    # Number of rows in each trip. size() counts rows, whereas count() on one
    # particular column skips that column's nulls and would leave N out of step
    # with the cumcount() used for dcount below.
    trip_sizes = data.groupby('utrip_id').size().rename('N').reset_index()

    # NOTE(review): rows whose utrip_id is null appear to be dropped by this
    # inner merge, since groupby omits null keys - confirm whether such rows occur.
    data = data.merge(trip_sizes, on=['utrip_id'], how='inner')

    # Integer stand-in for the trip id; presumably so the grouping below runs
    # on integers rather than on the original string ids.
    data['utrip_id_'] = data['utrip_id'].factorize()[0]

    # Position of the row counted from the start of its trip ...
    data['dcount'] = data.groupby(['utrip_id_']).cumcount()

    # ... and counted from the end of its trip.
    data['icount'] = data['N'] - data['dcount'] - 1

    return data

In [37]:
# Preview the trip features on two sample trips ('29_1' and '65_1').
add_features(raw.loc[raw['utrip_id'].isin(['29_1', '65_1'])])

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,istest,fold,submission,N,utrip_id_,dcount,icount
0,29,2016-07-09,2016-07-11,47054,desktop,1601,Elbonia,Elbonia,29_1,0,3,0,4,0,0,3
1,29,2016-07-11,2016-07-13,34444,desktop,1601,Elbonia,Elbonia,29_1,0,3,0,4,0,1,2
2,29,2016-07-13,2016-07-16,12291,desktop,1601,Elbonia,Elbonia,29_1,0,3,0,4,0,2,1
3,29,2016-07-16,2016-07-18,16386,desktop,8132,Elbonia,Elbonia,29_1,0,3,0,4,0,3,0
4,65,2016-09-26,2016-09-29,36403,desktop,3577,The Devilfire Empire,Cobra Island,65_1,1,2,0,5,1,0,4
5,65,2016-09-29,2016-10-01,23921,desktop,8223,The Devilfire Empire,Cobra Island,65_1,1,2,0,5,1,1,3
6,65,2016-10-01,2016-10-02,42682,mobile,4132,The Devilfire Empire,Cobra Island,65_1,1,2,0,5,1,2,2
7,65,2016-10-02,2016-10-03,29990,mobile,4132,The Devilfire Empire,Cobra Island,65_1,1,2,0,5,1,3,1
8,65,2016-10-03,2016-10-04,0,mobile,4132,The Devilfire Empire,,65_1,1,2,1,5,1,4,0
