In [None]:
import numpy as np
import pandas as pd

from subprocess import check_output
# Show the contents of the input directory as reported by `ls`.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

In [None]:
# Read the training set lazily, 1,000,000 rows per chunk.
# NOTE: `train` is a one-shot iterator; it is exhausted once the loop below ends.
train = pd.read_csv('../input/train.csv', chunksize=1000000)

aggs = []     # per-chunk sum/count of is_booking by (destination, cluster)
samples = []  # first five rows of every chunk, kept for inspecting the data
print('-'*38)  # ruler to compare the progress dots against (presumably the expected chunk count)
for chunk in train:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call, so gather the pieces here and concatenate once after the loop.
    samples.append(chunk.head())
    # 'sum' adds up is_booking; 'count' is the number of non-null is_booking rows.
    agg = (
        chunk.groupby(['srch_destination_id', 'hotel_cluster'])['is_booking']
        .agg(['sum', 'count'])
        .reset_index()
    )
    aggs.append(agg)
    print('.', end='')
print('')
data = pd.concat(samples, axis=0) if samples else pd.DataFrame()
aggs = pd.concat(aggs, axis=0)
aggs.head()

In [None]:
# Column labels of `data`, the frame built from the head() of each training chunk.
data.columns

In [None]:
# Weight of one click relative to one booking in the relevance score.
CLICK_WEIGHT = 0.05

# Combine the per-chunk partial results, then derive:
#   bookings  = summed is_booking
#   clicks    = counted rows minus bookings
#   relevance = bookings + CLICK_WEIGHT * clicks
agg = (
    aggs.groupby(['srch_destination_id', 'hotel_cluster'])
    .sum()
    .reset_index()
    .assign(count=lambda totals: totals['count'] - totals['sum'])
    .rename(columns={'sum': 'bookings', 'count': 'clicks'})
    .assign(relevance=lambda totals: totals['bookings'] + CLICK_WEIGHT * totals['clicks'])
)
agg.head()

In [None]:
# Inspect the aggregated rows for destination id 1.
agg.loc[agg['srch_destination_id'] == 1]

In [None]:
# `train` was fully consumed by the aggregation loop earlier in the notebook, so
# iterating over it again yields no chunks. Open a fresh reader to report the
# chunk sizes; a single column is enough to count rows.
train_again = pd.read_csv('../input/train.csv',
                          usecols=['is_booking'],
                          chunksize=1000000)
for chunk in train_again:
    print(len(chunk))

In [None]:
def most_popular(group, n_max=5):
    """Return the `n_max` most relevant hotel clusters of `group` as one string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows providing a 'relevance' and a 'hotel_cluster' column.
    n_max : int, default 5
        Maximum number of clusters to include.

    Returns
    -------
    str
        Cluster ids ordered by descending relevance, separated by single
        spaces; the empty string when `group` has no rows.
    """
    relevance = group['relevance'].values
    hotel_cluster = group['hotel_cluster'].values
    # argsort is ascending, so reverse it before keeping the first n_max.
    top_clusters = hotel_cluster[np.argsort(relevance)[::-1]][:n_max]
    # Join explicitly: np.array_str pads entries to a common width (stray
    # leading/double spaces) and wraps long arrays with newlines.
    return ' '.join(str(cluster) for cluster in top_clusters)

Get the most popular hotel clusters for each destination.

In [None]:
# One row per destination: the string returned by most_popular for its rows,
# stored in a single 'hotel_cluster' column.
most_pop = pd.DataFrame(
    agg.groupby(['srch_destination_id']).apply(most_popular)
).rename(columns={0: 'hotel_cluster'})
most_pop.head()

### Predict for test data
Read in the test data and merge most popular hotel clusters.

In [None]:
# Only the destination id is loaded from the test file.
test = pd.read_csv(
    '../input/test.csv',
    usecols=['srch_destination_id'],
    dtype={'srch_destination_id': np.int32},
)

In [None]:
# Left join keeps every test row; destinations missing from `most_pop` get NaN.
test = test.merge(
    most_pop,
    left_on='srch_destination_id',
    right_index=True,
    how='left',
)
test.head()

Check the `hotel_cluster` column in test for null values.

In [None]:
# Count the test rows that have no hotel_cluster value.
test['hotel_cluster'].isnull().sum()

Looks like there are about 14k new destinations in test. Let's fill the NaNs with the hotel clusters that are most popular overall.

In [None]:
# The five clusters with the highest total relevance across all destinations.
most_pop_all = agg.groupby('hotel_cluster')['relevance'].sum().nlargest(5).index
# Join explicitly instead of slicing np.array_str output, which pads entries to
# a common width and therefore leaves stray leading/double spaces in the string.
most_pop_all = ' '.join(str(cluster) for cluster in most_pop_all)
most_pop_all

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on
# `test.hotel_cluster`: that is a chained in-place update, which pandas warns
# about and which leaves `test` unchanged when Copy-on-Write is enabled.
test['hotel_cluster'] = test['hotel_cluster'].fillna(most_pop_all)

Save the submission.

In [None]:
# Write the hotel_cluster column with a header row; the index column is labelled 'id'.
test['hotel_cluster'].to_csv(
    'predicted_with_pandas.csv',
    index_label='id',
    header=True,
)