# Trivago Popular Destination Recommender
> Using the RecSys 2019 Challenge Trivago travel dataset to build a popularity-based model that recommends the most-clicked destinations to users

- toc: true
- badges: true
- comments: true
- categories: [Trivago, Popularity, Travel]
- image:

## Setup

In [1]:
!pip install -q git+https://github.com/sparsh-ai/recochef.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 4.3MB 7.5MB/s 
[?25h  Building wheel for recochef (PEP 517) ... [?25l[?25hdone


In [15]:
import math
import pandas as pd
import numpy as np

from recochef.datasets.trivago import Trivago

## Data loading

In [16]:
# Dataset handle from recochef; its load_train()/load_test() methods are
# called in the cells below.
trivago = Trivago()

In [17]:
# Training interactions; used later to count clickouts per item.
df_train = trivago.load_train()
df_train.head()

Unnamed: 0,USERID,SESSIONID,TIMESTAMP,STEP,EVENTTYPE,REFERENCE,PLATFORM,CITY,DEVICE,FILTERS,IMPRESSIONS,PRICES
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [18]:
# Test interactions; the clickout rows with a missing REFERENCE are the ones
# we have to produce recommendations for.
df_test = trivago.load_test()
df_test.head()

Unnamed: 0,USERID,SESSIONID,TIMESTAMP,STEP,EVENTTYPE,REFERENCE,PLATFORM,CITY,DEVICE,FILTERS,IMPRESSIONS,PRICES
0,004A07DM0IDW,1d688ec168932,1541555614,1,interaction item image,2059240,CO,"Santa Marta, Colombia",mobile,,,
1,004A07DM0IDW,1d688ec168932,1541555614,2,interaction item image,2059240,CO,"Santa Marta, Colombia",mobile,,,
2,004A07DM0IDW,1d688ec168932,1541555696,3,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
3,004A07DM0IDW,1d688ec168932,1541555707,4,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
4,004A07DM0IDW,1d688ec168932,1541555717,5,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...


## Utilities

In [19]:
# Key columns used by calc_recommendation as the leading sort keys and as the
# group-by keys when the ranked impressions are concatenated back into one row.
GR_COLS = ["USERID", "SESSIONID", "TIMESTAMP", "STEP"]

In [20]:
def get_submission_target(df):
    """Select the clickout rows whose clicked item (REFERENCE) is missing.

    :param df: Data frame with at least the columns REFERENCE and EVENTTYPE
    :return: The subset of rows of df that are "clickout item" events with a
        null REFERENCE; the original index and columns are kept
    """

    has_no_reference = df["REFERENCE"].isna()
    is_clickout = df["EVENTTYPE"].eq("clickout item")

    return df.loc[has_no_reference & is_clickout]

In [21]:
def get_popularity(df):
    """Count how many clickouts each item received in df.

    :param df: Data frame with at least the columns EVENTTYPE and REFERENCE
    :return: Data frame with one row per clicked item and the integer columns
        REFERENCE (item id) and NCLICKS (number of "clickout item" rows)
    """

    # Only clickout events count towards popularity.
    clickouts = df.loc[df["EVENTTYPE"] == "clickout item"]

    clicks_per_item = clickouts.groupby("REFERENCE").size()
    df_item_clicks = clicks_per_item.reset_index(name="NCLICKS")

    # Cast both columns (item id and count) to int.
    return df_item_clicks.transform(lambda column: column.astype(int))

In [22]:
def string_to_array(s):
    """Convert a pipe-separated string to a list of its parts.

    :param s: A string such as "1|2|3", or a missing value (None or NaN)
    :return: List of substrings for a string input; an empty list for a
        missing value
    :raises ValueError: If s is neither a string nor a missing value
    """

    if isinstance(s, str):
        return s.split("|")

    if s is None:
        return []

    # math.isnan raises TypeError for non-numeric objects (and OverflowError
    # for huge ints); treat those as "not missing" so they fall through to
    # the ValueError below instead of leaking an unrelated exception type.
    try:
        is_missing = math.isnan(s)
    except (TypeError, OverflowError):
        is_missing = False

    if is_missing:
        return []

    raise ValueError("Value must be either string or nan")

In [23]:
def explode(df_in, col_expl):
    """Explode pipe-separated column col_expl into one row per item.

    Every other column is repeated once per item of the row's list. Rows
    whose col_expl is missing contribute no rows to the output. The input
    frame is not modified.

    :param df_in: Data frame containing the column col_expl
    :param col_expl: Name of the column holding pipe-separated integer ids
        (or missing values)
    :return: New data frame with a fresh 0..n-1 index, the remaining columns
        in their original order, and col_expl as the last column holding one
        int64 item id per row
    """

    # Parse every cell into a list of item strings (missing values become []).
    item_lists = [string_to_array(cell) for cell in df_in[col_expl]]

    # Explicit integer repeat counts keep np.repeat working for empty input.
    repeats = np.fromiter(
        (len(items) for items in item_lists),
        dtype=np.int64,
        count=len(item_lists),
    )
    # Flatten in pure Python so that zero rows / zero items is not a special
    # case (np.concatenate raises on an empty sequence of arrays).
    flat_items = [int(item) for items in item_lists for item in items]

    df_out = pd.DataFrame(
        {col: np.repeat(df_in[col].values, repeats)
         for col in df_in.columns.drop(col_expl)},
        index=pd.RangeIndex(len(flat_items)),
    )

    # Direct column assignment with an int64 array guarantees an integer
    # column regardless of how .loc-based setting treats dtypes.
    df_out[col_expl] = np.asarray(flat_items, dtype=np.int64)

    return df_out

In [24]:
def group_concat(df, gr_cols, col_concat):
    """Collapse each group of rows into a single space-separated string.

    :param df: Data frame to aggregate
    :param gr_cols: Column name(s) to group by
    :param col_concat: Column whose string values are joined with a single
        space, in the row order of df within each group
    :return: Data frame with the group columns and col_concat, one row per
        group
    """

    def _join_with_spaces(values):
        return ' '.join(values)

    joined = df.groupby(gr_cols)[col_concat].apply(_join_with_spaces)

    return joined.to_frame().reset_index()

In [25]:
def calc_recommendation(df_expl, df_pop):
    """Calculate recommendations based on popularity of items.

    The final data frame will have an impression list sorted according to the
    number of clicks per item in a reference data frame. Items that do not
    appear in df_pop get a missing NCLICKS from the left merge.

    :param df_expl: Data frame with exploded impression list (one row per
        impressed item, with the GR_COLS key columns and IMPRESSIONS)
    :param df_pop: Data frame with items (REFERENCE) and number of clicks
        (NCLICKS)
    :return: Data frame with the GR_COLS key columns and ITEM_RECOMMENDATIONS,
        the space-separated impression list sorted by popularity in df_pop
    """

    # Attach the click count of every impressed item; a left join keeps
    # impressions that were never clicked in df_pop.
    df_expl_clicks = (
        df_expl[GR_COLS + ["IMPRESSIONS"]]
        .merge(df_pop,
               left_on="IMPRESSIONS",
               right_on="REFERENCE",
               how="left")
    )

    # Key columns ascending, click count descending. Derived from GR_COLS so
    # the flags cannot fall out of sync if the key columns change.
    sort_ascending = [True] * len(GR_COLS) + [False]

    df_sorted = (
        df_expl_clicks
        # group_concat joins strings, so item ids must be str first.
        .assign(IMPRESSIONS=lambda x: x["IMPRESSIONS"].apply(str))
        .sort_values(GR_COLS + ["NCLICKS"], ascending=sort_ascending)
    )

    df_out = (
        group_concat(df_sorted, GR_COLS, "IMPRESSIONS")
        .rename(columns={'IMPRESSIONS': 'ITEM_RECOMMENDATIONS'})
    )

    return df_out

## Getting popular items

In [29]:
# Popularity is computed from the training interactions only.
print("Get popular items...")
df_popular = get_popularity(df_train)
# Preview the ten most-clicked items; df_popular itself is left unsorted.
df_popular.sort_values(by='NCLICKS', ascending=False).head(10)

Get popular items...


Unnamed: 0,REFERENCE,NCLICKS
256511,8796,822
253386,8561,763
254018,8621,726
253665,8589,652
45275,1455251,648
253635,8586,634
253989,8618,627
284354,9773310,554
535,100227,539
256679,8805412,525


## Identify target users

In [30]:
print("Identify target rows...")
df_target = get_submission_target(df_test)
df_target.head(10)

Identify target rows...


Unnamed: 0,USERID,SESSIONID,TIMESTAMP,STEP,EVENTTYPE,REFERENCE,PLATFORM,CITY,DEVICE,FILTERS,IMPRESSIONS,PRICES
6,004A07DM0IDW,1d688ec168932,1541555799,7,clickout item,,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
8,009RGHI3G9A3,f05ab0de907e2,1541570940,2,clickout item,,IN,"Nathdwara, India",mobile,,10884872|7065316,64|28
10,00Y1Z24X8084,26b6d294d66e7,1541651823,2,clickout item,,PH,"Iloilo City, Philippines",mobile,,2714480|4476010|3843244|3833012|9017890|198100...,74|14|22|38|55|44|28|34|23|27|12|108|19|21|36|...
15,01V3WDTDM5CU,07628a0f5be0b,1541575643,5,clickout item,,PL,"Wisla, Poland",mobile,Sort by Price,3565720|2947584|4115018|2039671|3836538|801409...,16|18|20|21|22|22|28|28|28|30|30|33|33|35|35|3...
61,02AOAVF9PVYH,4a01c3afbc224,1541681278,46,clickout item,,JP,"Yokohama, Japan",desktop,Hotel|Resort|Sort by Price,1451247|559056|1045096|1963879|693596|1967173|...,80|81|81|82|82|82|83|83|83|85|85|88|88|91|92|9...
109,0339C84S24ET,89171d441a304,1541615683,36,clickout item,,TR,"Antalya, Turkey",mobile,,13361|5647680|116764|898719|8276346|9168|19325...,185|84|30|19|46|77|123|23|25|25|26|39|73|56|96...
111,0386OH8JDE1Q,e09591d07cdef,1541620536,2,clickout item,,UK,"John o' Groats, United Kingdom",desktop,,1193320|5488246|3858774|4552034|10620372|22696...,103|88|100|134|109|138|126|86
115,03LTH89QY623,7663406cf586c,1541554183,4,clickout item,,CA,"Koloa, USA",desktop,,241961|906477|991561|353701|1149665|77258|4943...,287|300|261|197|163|263|262|188|540|283|211|22...
138,03VT0ODUTZB0,725e8adf70e86,1541632490,23,clickout item,,UK,"Warrington, United Kingdom",desktop,,109938|164193|632366|1362450|1070666|164220|11...,45|67|78|60|58|57|86|68|57|55|148|92|66|55|61|...
314,03XH0JWCWHAM,73f4c417ff730,1541566143,176,clickout item,,MX,"Puebla, Mexico",mobile,Sort By Popularity,42692|5116230|42876|4342578|42864|3148690|2123...,53|181|60|45|96|60|80|59|41|122|50|43|113|48|5...


## Recommendations

In [31]:
print("Get recommendations...")
# One row per (target row, impressed item) ...
df_expl = explode(df_target, "IMPRESSIONS")
# ... then each impression list is re-ranked by training click counts.
df_out = calc_recommendation(df_expl, df_popular)
df_out.head(10)

Get recommendations...


Unnamed: 0,USERID,SESSIONID,TIMESTAMP,STEP,ITEM_RECOMMENDATIONS
0,000324D9BBUC,89643988fdbfb,1541593942,10,924795 106315 1033140 119494 101758 903037 105...
1,0004Q49X39PY,9de47d9a66494,1541641157,1,3505150 3812004 2227896 2292254 3184842 222702...
2,0004Q49X39PY,beea5c27030cb,1541561202,1,4476010 3505150 3812004 2227896 2292254 222702...
3,00071784XQ6B,9617600e1ba7c,1541630328,2,22854 3067559 22721 22713 16121 22772 22727 22...
4,0008BO33KUQ0,2d0e2102ee0dc,1541636411,6,9857656 5849628 655716 1352530 502066 1405084 ...
5,000GO9NY6P4M,55dbafdbb9bab,1541594662,2,160577 157710 1618677 7231396 483691 1479743 1...
6,000IRHJS2DL9,f6ffffd20d43d,1541605541,12,33191 20144 20166 20154 3054956 346166 102540 ...
7,000JB0UNEH23,7df07dc9fe26e,1541618174,1,3874514 2909994 9503248 2526088 3167136 916982...
8,000OWRCYEHKT,53e84da5c2dad,1541706095,3,8758048 7195486 1153302 7795438 5416250 898083...
9,000VBY1D6BP8,033fddaaa99af,1541587306,4,32230 32233 32256 5411980 32235 6865398 32246 ...
