In [12]:
import pandas as pd
import numpy as np
# Stream the training CSV in chunks of one million rows, loading only the
# four columns needed to count bookings and events per
# (user country, destination, hotel cluster) combination.
train = pd.read_csv(
    "./kaggle/expedia/data/train.csv",
    usecols=['user_location_country','srch_destination_id','is_booking','hotel_cluster'],
    dtype={'user_location_country':np.int32,'is_booking':bool,'srch_destination_id':np.int32, 'hotel_cluster':np.int32},
    chunksize=1000000,
)
aggs = []
# Fixed-width ruler; one dot is printed beneath it per processed chunk.
print('-'*38)
for chunk in train:
    grouped = chunk.groupby(['user_location_country','srch_destination_id','hotel_cluster'])
    # 'sum' of the boolean flag = number of bookings, 'count' = number of rows.
    agg = grouped['is_booking'].agg(['sum','count']).reset_index()
    aggs.append(agg)
    print('.',end='')
print('')
# The per-chunk partial results are stacked here; the same key can still
# appear once per chunk, so a further groupby is needed to combine them.
aggs = pd.concat(aggs, axis=0)
aggs.head(40)

--------------------------------------
......................................


Unnamed: 0,user_location_country,srch_destination_id,hotel_cluster,sum,count
0,0,21,30,0,1
1,0,21,78,0,1
2,0,21,82,0,1
3,0,104,82,1,3
4,0,137,53,0,1
5,0,137,61,1,2
6,0,137,62,0,1
7,0,137,81,0,1
8,0,137,82,2,4
9,0,181,26,0,5


In [13]:
# Weight of a non-booking event relative to a booking (which counts as 1).
CLICK_WEIGHT = 0.05
# Combine the per-chunk partial sums into one row per key combination.
agg = aggs.groupby(['user_location_country','srch_destination_id','hotel_cluster']).sum().reset_index()
agg = agg.rename(columns={'sum':'bookings','count':'clicks'})
# 'clicks' still holds the total row count at this point; subtract the
# bookings so that it only counts the non-booking rows.
agg['clicks'] = agg['clicks'] - agg['bookings']
agg['relevance'] = agg['bookings'] + CLICK_WEIGHT * agg['clicks']
agg.head()

Unnamed: 0,user_location_country,srch_destination_id,hotel_cluster,bookings,clicks,relevance
0,0,4,30,1,1,1.05
1,0,4,78,1,2,1.1
2,0,4,81,1,1,1.05
3,0,8,14,0,1,0.05
4,0,8,39,0,1,0.05


In [14]:
def most_popular(group, n_max=5):
    """Return the most relevant hotel clusters of a group as one string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must have 'relevance' and 'hotel_cluster'
        columns.
    n_max : int, optional
        Maximum number of cluster ids to return (default 5).

    Returns
    -------
    str
        Up to ``n_max`` cluster ids ordered by decreasing relevance,
        separated by exactly one space. Empty string for an empty group.
    """
    relevance = group['relevance'].values
    hotel_cluster = group['hotel_cluster'].values
    # argsort sorts ascending, so reverse it to put the highest relevance first.
    top_clusters = hotel_cluster[np.argsort(relevance)[::-1]][:n_max]
    # Join the ids explicitly instead of slicing np.array_str(): the array
    # repr pads every element to a common width, which yields a leading space
    # or double spaces (e.g. '91 42  2') when ids have different digit counts.
    return ' '.join(str(cluster) for cluster in top_clusters)

In [15]:
# Build one space-separated string of top clusters per
# (user country, destination) pair; the pair becomes the index.
most_pop = pd.DataFrame(
    agg.groupby(['user_location_country','srch_destination_id']).apply(most_popular)
)
# The unnamed Series produced by apply() ends up as column 0; give it the
# column name used for the prediction.
most_pop = most_pop.rename(columns={0:'hotel_cluster'})
most_pop.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hotel_cluster
user_location_country,srch_destination_id,Unnamed: 2_level_1
0,4,78 81 30
0,8,76 39 14
0,9,40 60 20 61
0,14,75 20 38
0,19,64 38 75 6


In [19]:
# Load only the test-set columns listed in usecols; the two columns used as
# merge keys are read as int32.
test = pd.read_csv(
    './kaggle/expedia/data/test.csv',
    usecols=['user_location_country','user_location_region','srch_destination_id'],
    dtype={'user_location_country':np.int32,'srch_destination_id':np.int32},
)

In [20]:
# Attach the per-(country, destination) prediction string to every test row.
# A left join keeps all test rows; pairs absent from most_pop get NaN.
# NOTE: this rebinds `test`, so re-running the cell merges a second time.
test = test.merge(
    most_pop,
    left_on=['user_location_country','srch_destination_id'],
    right_index=True,
    how='left',
)
test.head()

Unnamed: 0,user_location_country,user_location_region,srch_destination_id,hotel_cluster
0,66,174,12243,5 55 37 11 22
1,66,174,14474,
2,66,142,11353,0 31 96 25 91
3,66,258,8250,1 79 45 54 24
4,66,467,11812,91 42 2 48 59


In [21]:
# Number of test rows whose hotel_cluster is still missing.
test['hotel_cluster'].isnull().sum()

61166

In [22]:
# Global fallback: the five hotel clusters with the highest total relevance
# summed over every (country, destination) pair.
most_pop_all = agg.groupby('hotel_cluster')['relevance'].sum().nlargest(5).index
# Join the ids explicitly rather than slicing np.array_str(): the array repr
# pads elements to a common width, so a single-digit cluster among two-digit
# ones would produce a leading or doubled space in the prediction string.
most_pop_all = ' '.join(map(str, most_pop_all))
most_pop_all

'91 48 42 59 28'

In [23]:
# Fill rows that have no prediction with the global fallback string.
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column: the in-place form acts on an intermediate object
# and does not update `test` when pandas Copy-on-Write is enabled.
test['hotel_cluster'] = test['hotel_cluster'].fillna(most_pop_all)

In [24]:
# Write the predictions with the row index as an 'id' column and a header row.
test['hotel_cluster'].to_csv(
    'predicted_with_pandas3.csv',
    index_label='id',
    header=True,
)

In [1]:
import pandas as pd
import numpy as np
# Load the complete training set (every column, default dtypes) in one go.
train = pd.read_csv(
    "./kaggle/expedia/data/train.csv"
)

In [None]:
# Order each user's events chronologically, breaking ties by destination.
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...)
# is its replacement and takes the same list of column names.
train.sort_values(by=['user_id','date_time','srch_destination_id'],inplace=True)

In [5]:
# Preview the first 20 rows of the training frame.
train.head(n=20)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
26236762,2014-12-06 01:38:26,2,3,66,174,9045,88.1489,0,1,0,...,2,1,8279,1,0,1,2,50,358,28
26236760,2014-12-03 11:30:21,2,3,66,174,42538,343.7968,0,1,0,...,1,1,8279,1,0,1,2,50,1230,56
26236759,2014-12-03 11:28:58,2,3,66,174,42538,343.9901,0,1,0,...,1,1,8279,1,0,1,2,50,1230,40
26236758,2014-12-03 11:28:11,2,3,66,174,42538,343.7866,0,1,0,...,1,1,8279,1,0,1,2,50,1230,98
26236757,2014-09-03 08:18:41,2,3,66,174,9045,53.6059,0,1,0,...,2,1,24693,6,0,1,2,50,1241,19
26236761,2014-12-03 11:32:43,2,3,66,174,42538,342.2163,0,1,0,...,2,1,8279,1,0,2,2,50,358,72
4645861,2014-04-25 16:15:11,2,3,66,174,37449,5405.4889,1,1,0,...,1,2,62881,4,0,1,4,98,2052,20
4645860,2014-04-25 16:14:46,2,3,66,174,37449,5405.4889,1,1,0,...,1,2,62881,4,0,1,4,98,2052,20
4645859,2014-04-25 16:14:28,2,3,66,174,37449,5405.4889,1,1,0,...,1,1,62881,4,0,1,4,98,2052,20
4645862,2014-04-25 16:15:50,2,3,66,174,37449,5405.7409,1,1,0,...,1,2,62881,4,0,1,4,98,2052,60
