In [46]:
import pandas as pd

destinations = pd.read_csv("input/destinations.csv")
test = pd.read_csv("input/test.csv")
train = pd.read_csv("input/train.csv", nrows=3000000)
print(test.shape)
print(train.shape)

print(test.head(5))

print(train["hotel_cluster"].value_counts())

##
test_ids = set(test.user_id.unique())
train_ids = set(train.user_id.unique())
intersection_count = len(test_ids & train_ids)
intersection_count == len(test_ids)
##
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month
##
import random

unique_users = train.user_id.unique()

sel_user_ids = random.sample(list(unique_users),10000)
sel_train = train[train.user_id.isin(sel_user_ids)]

## downsampling data, new training set t1, testing set t2
# both from orig training set

t1 = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

## removing click eventsz
t2 = t2[t2.is_booking == True]

most_common_clusters = list(train.hotel_cluster.value_counts().head().index) #head() returns top 5 by default
#index == key of dataframe; cluster_id

predictions = [most_common_clusters for i in range(t2.shape[0])] #actual ones

## evaluating error
import ml_metrics as metrics
target = [[l] for l in t2["hotel_cluster"]] #predictions, from esting set
metrics.mapk(target, predictions, k=5)

(2528243, 22)
(999999, 24)
   id            date_time  site_name  posa_continent  user_location_country  \
0   0  2015-09-03 17:09:54          2               3                     66   
1   1  2015-09-24 17:38:35          2               3                     66   
2   2  2015-06-07 15:53:02          2               3                     66   
3   3  2015-09-14 14:49:10          2               3                     66   
4   4  2015-07-17 09:32:04          2               3                     66   

   user_location_region  user_location_city  orig_destination_distance  \
0                   174               37449                  5539.0567   
1                   174               37449                  5873.2923   
2                   142               17440                  3975.9776   
3                   258               34156                  1508.5975   
4                   467               36345                    66.7913   

   user_id  is_mobile      ...          srch_ci

0.062569133260667709

In [12]:
from sklearn.decomposition import PCA

pd.options.mode.chained_assignment = None

#downsampling the destinations, reducing the no. of latent features - anonymised
pca = PCA(n_components=3)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

##actually generating features

def calc_fast_features(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)
    
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]
    
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    props["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    ret = pd.DataFrame(props)
    
    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)
    return ret

df = calc_fast_features(t1)
df.fillna(-1, inplace=True) #replacing missing values with -1

##assigning scores to clusters

def make_key(items):
    return "_".join([str(i) for i in items])

#aggregating to improve performance
match_cols = ["srch_destination_id"]
cluster_cols = match_cols + ['hotel_cluster']
groups = t1.groupby(cluster_cols)
top_clusters = {}
for name, group in groups:
    clicks = len(group.is_booking[group.is_booking == False])
    bookings = len(group.is_booking[group.is_booking == True])
    
    score = bookings + .15 * clicks #less weightage to clicks than booking
    
    clus_name = make_key(name[:len(match_cols)])
    if clus_name not in top_clusters:
        top_clusters[clus_name] = {}
    top_clusters[clus_name][name[-1]] = score
    
##iterate through dict top_clusters to find top5, make a new dictionary cluster_dict

import operator

cluster_dict = {}
for n in top_clusters:
    tc = top_clusters[n]
    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
    cluster_dict[n] = top
    
##building predictions

preds = []
for index, row in t2.iterrows(): #itr through each row in t2
    key = make_key([row[m] for m in match_cols])
    if key in cluster_dict:
        preds.append(cluster_dict[key])
    else:
        preds.append([])
        
##

metrics.mapk([[l] for l in t2["hotel_cluster"]], preds, k=5)

0.23680621069182389

In [13]:

match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance']

groups = t1.groupby(match_cols)
    
def generate_exact_matches(row, match_cols):
    index = tuple([row[t] for t in match_cols])
    try:
        group = groups.get_group(index)
    except Exception:
        return []
    clus = list(set(group.hotel_cluster))
    return clus

exact_matches = []
for i in range(t2.shape[0]):
    exact_matches.append(generate_exact_matches(t2.iloc[i], match_cols))

##

def f5(seq, idfun=None): 
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result
    
full_preds = [f5(exact_matches[p] + preds[p] + most_common_clusters)[:5] for p in range(len(preds))]
metrics.mapk([[l] for l in t2["hotel_cluster"]], full_preds, k=5)

0.28027319182389937

In [41]:
result = pd.DataFrame()
result["id"]            = t2["user_id"]
result["hotel_cluster"] = full_preds

result.to_csv('hotelClusterPreds.csv', index=False)

print("done")

done
