In [None]:
# !git clone https://github.com/trivago/recsys-challenge-2019-benchmarks.git /content/trivago_benchmark

In [23]:
import numpy as np
import pandas as pd
from src import functions as f
# Show full cell contents (e.g. the long pipe-separated impression lists)
# instead of truncating them in rendered data frames.
pd.set_option('display.max_colwidth', None)
# Reload imported modules before each cell execution so that edits to
# `src.functions` are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [18]:
# Load the small example interaction log used by the cells below and display it.
train_example = pd.read_parquet('./data/train_example.parquet.gzip')
train_example

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,impressions,prices
0,64BL89,3579f89,1,1,interaction item image,5001,,
1,64BL89,3579f89,2,2,clickout item,5002,5014|5002|5010,100|125|120
2,64BL89,3579f89,3,3,interaction item info,5003,,
3,64BL89,3579f89,4,4,filter selection,unknown,,
4,64BLF,4504h9,2,1,interaction item image,5010,,
5,64BLF,4504h9,4,2,clickout item,5001,5001|5023|5040|5005,75|110|65|210
6,64BL89,5504hFL,7,1,filter selection,unknown,,
7,64BL89,5504hFL,8,2,clickout item,5004,5010|5001|5023|5004|5002|5008,120|89|140|126|86|110
8,64BL89,5504hFL,9,3,interaction item image,5001,,
9,64BL89,5504hFL,10,4,clickout item,5001,5010|5001|5023|5004|5002|5008,120|89|140|126|86|110


Build features for the lightGBM and logistic regression model

In [25]:
def build_features(df):
    """Build features for the lightGBM and logistic regression model.

    Every clickout row of ``df`` is expanded to one row per impressed item.

    :param df: Interaction log with the columns user_id, session_id,
        timestamp, step, action_type, reference, impressions and prices.
        ``impressions`` and ``prices`` are pipe separated strings.
    :return: New data frame (a copy, safe to modify) with one row per
        impressed item of each clickout and the columns user_id, session_id,
        timestamp, step, position, prices, interaction_count,
        is_last_interacted, referenced_item and impressed_item
    """

    # Columns that together identify a single action in the log
    action_keys = ["user_id", "session_id", "timestamp", "step"]

    # Select columns that are of interest for this method
    f.print_time("start")
    cols = ['user_id', 'session_id', 'timestamp', 'step',
            'action_type', 'reference', 'impressions', 'prices']
    df_cols = df.loc[:, cols]

    # We are only interested in action types for which the reference is an item ID
    f.print_time("filter interactions")
    item_interactions = [
        'clickout item', 'interaction item deals', 'interaction item image',
        'interaction item info', 'interaction item rating', 'search for item'
    ]
    df_actions = (
        df_cols
        .loc[df_cols.action_type.isin(item_interactions), :]
        .rename(columns={'reference': 'referenced_item'})
    )

    f.print_time("cleaning")
    # Remove non-clickout rows without a reference. Clickouts without a
    # reference are kept: ModelGbmRank.predict selects exactly those rows.
    idx_rm = (df_actions.action_type != "clickout item") & (df_actions.referenced_item.isna())
    # Explicit copy so that the column assignments below never write to a view
    df_actions = df_actions.loc[~idx_rm].copy()

    # Get item ID of the previous interaction of a user. The shift is grouped
    # by user only, with rows ordered by session and then time, so the first
    # action of a session refers to the last item of the user's preceding
    # session in that ordering.
    f.print_time("previous interactions")
    df_actions["previous_item"] = (
        df_actions
        .sort_values(by=action_keys, ascending=True)
        .groupby(["user_id"])["referenced_item"]
        .shift(1)
    )

    # Combine the impressions and item column, they both contain item IDs
    # and we can expand the impression lists in the next step to get the total
    # interaction count for an item
    f.print_time("combining columns - impressions")
    df_actions["interacted_item"] = np.where(
        df_actions.impressions.isna(),
        df_actions.referenced_item,
        df_actions.impressions
    )
    df_actions = df_actions.drop(columns="impressions")

    # Price array expansion will get easier without NAs
    f.print_time("combining columns - prices")
    df_actions["prices"] = np.where(
        df_actions.prices.isna(),
        "",
        df_actions.prices
    )

    # Convert pipe separated lists into columns
    f.print_time("explode arrays")
    df_items = f.explode_mult(df_actions, ["interacted_item", "prices"]).copy()

    # Feature: Number of previous interactions with an item.
    # cumcount follows the current row order of the frame.
    f.print_time("interaction count")
    df_items["interaction_count"] = (
        df_items
        .groupby(["user_id", "interacted_item"])
        .cumcount()
    )

    # Reduce to impression level again
    f.print_time("reduce to impressions")
    df_impressions = (
        df_items[df_items.action_type == "clickout item"]
        .copy()
        .drop(columns="action_type")
        .rename(columns={"interacted_item": "impressed_item"})
    )

    # Feature: Position of item in the original list.
    # Items are in original order after the explode for each index
    f.print_time("position feature")
    df_impressions["position"] = (
        df_impressions
        .groupby(action_keys)
        .cumcount() + 1
    )

    # Feature: Is the impressed item the last interacted item
    f.print_time("last interacted item feature")
    df_impressions["is_last_interacted"] = (
        df_impressions["previous_item"] == df_impressions["impressed_item"]
    ).astype(int)

    # Replace the column instead of assigning through `.loc[:, ...]`: recent
    # pandas versions write `.loc[:, col] = values` into the existing object
    # column, which would leave the prices non-numeric.
    f.print_time("change price datatype")
    df_impressions["prices"] = df_impressions["prices"].astype(int)

    return_cols = [
        "user_id",
        "session_id",
        "timestamp",
        "step",
        "position",
        "prices",
        "interaction_count",
        "is_last_interacted",
        "referenced_item",
        "impressed_item",
    ]

    # Return a copy so that callers can add columns without pandas warning
    # about writing to a slice of an internal frame
    return df_impressions[return_cols].copy()

In [26]:
# Sanity check: build the impression-level feature frame for the example log and display it.
build_features(train_example)

16:53:05 | start
16:53:05 | filter interactions
16:53:05 | cleaning
16:53:05 | previous interactions
16:53:05 | combining columns - impressions
16:53:05 | combining columns - prices
16:53:05 | explode arrays
16:53:05 | interaction count
16:53:05 | reduce to impressions
16:53:05 | position feature
16:53:05 | last interacted item feature
16:53:05 | change price datatype


Unnamed: 0,user_id,session_id,timestamp,step,position,prices,interaction_count,is_last_interacted,referenced_item,impressed_item
1,64BL89,3579f89,2,2,1,100,0,0,5002,5014
2,64BL89,3579f89,2,2,2,125,0,0,5002,5002
3,64BL89,3579f89,2,2,3,120,0,0,5002,5010
6,64BLF,4504h9,4,2,1,75,0,0,5001,5001
7,64BLF,4504h9,4,2,2,110,0,0,5001,5023
8,64BLF,4504h9,4,2,3,65,0,0,5001,5040
9,64BLF,4504h9,4,2,4,210,0,0,5001,5005
10,64BL89,5504hFL,8,2,1,120,1,0,5004,5010
11,64BL89,5504hFL,8,2,2,89,1,0,5004,5001
12,64BL89,5504hFL,8,2,3,140,0,0,5004,5023


Nearest neighbor crunching

In [27]:
from scipy import sparse

def calc_item_sims(df, item_col, reference_col):
    """Calculate similarity of items based on nearest neighbor algorithm.

    Each item is represented by the binary vector of references it occurs
    with; the similarity of two items is the cosine of their vectors. Only
    pairs with a non-zero similarity are returned, including each item paired
    with itself.

    :param df: Data frame of training data
    :param item_col: Name of data frame column that contains the item ID
    :param reference_col: Name of the reference column, depending on the model either
        1. session_id for the similarity based on session co-occurrences
        2. properties for the similarity based on item metadata
    :return: Data frame with the columns item_ref, item_sim and similarity
    """

    # Create data frame with item and reference indices
    f.print_time("item and reference indices")
    unique_items = df[item_col].unique()
    unique_refs = df[reference_col].unique()

    df_items = pd.DataFrame(
        data={item_col: unique_items, 'item_idx': range(len(unique_items))}
    )
    df_refs = pd.DataFrame(
        data={reference_col: unique_refs, 'ref_idx': range(len(unique_refs))}
    )

    # One row per distinct (item, reference) pair, so every matrix entry is 0 or 1
    df_idx = (
        df
        .merge(df_items, how="inner", on=item_col)
        .merge(df_refs, how="inner", on=reference_col)
        .loc[:, ["item_idx", "ref_idx"]]
        .drop_duplicates()
    )

    # Build item co-occurrence matrix
    f.print_time("item co-occurrence matrix")
    # Rows are items, columns are references
    mat_coo = sparse.coo_matrix(
        (
            np.ones(len(df_idx)),
            (df_idx["item_idx"].to_numpy(), df_idx["ref_idx"].to_numpy()),
        ),
        shape=(len(unique_items), len(unique_refs)),
    )
    # (items x refs) . (refs x items) -> items x items; entry (i, j) counts the
    # references shared by item i and item j. The transposed product would be
    # a refs x refs matrix, whose indices do not correspond to item indices.
    mat_item_coo = mat_coo.dot(mat_coo.T)

    # Calculate Cosine similarities
    f.print_time("Cosine similarity")
    # The diagonal holds the number of references per item (>= 1 for every
    # item), i.e. the squared norm of its binary vector
    inv_occ = np.sqrt(1 / mat_item_coo.diagonal())
    # Scale the columns, then transpose and scale again to cover the rows
    cosine_sim = mat_item_coo.multiply(inv_occ)
    cosine_sim = cosine_sim.T.multiply(inv_occ)

    # Create item similarity data frame
    f.print_time("item similarity data frame")
    idx_ref, idx_item, sim = sparse.find(cosine_sim)
    df_item_sim = pd.DataFrame(
        data={'idx_ref': idx_ref, 'idx_item': idx_item, 'similarity': sim}
    )

    # Translate both matrix indices back to item IDs
    ref_lookup = df_items.rename(columns={item_col: "item_ref", "item_idx": "idx_ref"})
    sim_lookup = df_items.rename(columns={item_col: "item_sim", "item_idx": "idx_item"})

    df_item_sim = (
        df_item_sim
        .merge(ref_lookup, how="inner", on="idx_ref")
        .merge(sim_lookup, how="inner", on="idx_item")
        .loc[:, ["item_ref", "item_sim", "similarity"]]
    )

    return df_item_sim


def predict_nn(df, df_item_sim):
    """Calculate predictions based on the item similarity scores.

    The impressions of every target row are ordered by their similarity to
    the reference of the user's previous action (highest similarity first).
    Neither ``df`` nor ``df_item_sim`` is modified.

    :param df: Interaction log with the columns user_id, session_id,
        timestamp, step, action_type, reference and impressions
    :param df_item_sim: Data frame with the columns item_ref, item_sim and
        similarity, e.g. as returned by calc_item_sims
    :return: Data frame with the columns user_id, session_id, timestamp,
        step and item_recommendations
    """

    # Select columns that are of interest for this function
    f.print_time("start")
    cols = ['user_id', 'session_id', 'timestamp', 'step',
            'action_type', 'reference', 'impressions']
    # Copy so that adding a column below does not write to a slice of the input
    df_cols = df.loc[:, cols].copy()

    # Get previous reference per user. `step` is part of the ordering so that
    # actions sharing a timestamp are ordered the same way as in build_features.
    f.print_time("previous reference")
    df_cols["previous_reference"] = (
        df_cols
        .sort_values(by=["user_id", "session_id", "timestamp", "step"],
                     ascending=True)
        .groupby(["user_id"])["reference"]
        .shift(1)
    )

    # Target row, withheld item ID that needs to be predicted
    f.print_time("target rows")
    df_target = f.get_target_rows(df_cols)

    # Explode to impression level
    f.print_time("explode impression array")
    df_impressions = f.explode_string(df_target, "impressions")

    # The merge keys on the left are compared against string item IDs. Cast
    # on a local copy; writing the casted columns back into `df_item_sim`
    # would silently change the caller's data frame.
    item_sims = df_item_sim.assign(
        item_ref=df_item_sim["item_ref"].astype(str),
        item_sim=df_item_sim["item_sim"].astype(str),
    )

    # Get similarities
    f.print_time("get similarities")
    df_impressions = (
        df_impressions
        .merge(
            item_sims,
            how="left",
            left_on=["previous_reference", "impressions"],
            right_on=["item_ref", "item_sim"]
        )
        # Impressions without a known similarity to the previous reference rank last
        .fillna(value={'similarity': 0})
        .sort_values(by=["user_id", "timestamp", "step", "similarity"],
                     ascending=[True, True, True, False])
    )

    # Summarize recommendations
    f.print_time("summarize recommendations")
    df_rec = f.group_concat(
        df_impressions, ["user_id", "session_id", "timestamp", "step"],
        "impressions"
    )

    df_rec = (
        df_rec
        .rename(columns={'impressions': 'item_recommendations'})
        .loc[:, ["user_id", "session_id", "timestamp", "step", "item_recommendations"]]
    )

    return df_rec

In [41]:
import lightgbm as lgb

class ModelGbmRank():
    """
    Model class for the lightGBM model.

    Methods
        fit(df): Fit the model on training data
        predict(df): Calculate recommendations for test data

    Attributes
        gbm: The trained lightgbm.LGBMRanker, available after fit()
    """

    # Feature columns produced by build_features that are fed to the ranker
    FEATURES = [
        "position",
        "prices",
        "interaction_count",
        "is_last_interacted",
    ]

    # Columns that together identify one clickout, i.e. one ranking query
    QUERY_KEYS = ["user_id", "session_id", "timestamp", "step"]

    def fit(self, df):
        """Train the lightGBM model.

        :param df: Training interaction log as expected by build_features
        """

        df_impressions = build_features(df)

        # Target column, item that was clicked
        f.print_time("target column")
        df_impressions = df_impressions.assign(
            is_clicked=(
                df_impressions["referenced_item"] == df_impressions["impressed_item"]
            ).astype(int)
        )

        # Bring to format suitable for lightGBM
        f.print_time("lightGBM format")
        # `group` is a list of query sizes that lightGBM applies to consecutive
        # blocks of rows. Sort the rows by the query keys first so that each
        # query is one contiguous block and the sizes computed below are in the
        # same order as the rows; sizes taken from a key-sorted groupby over
        # unsorted rows would assign impressions to the wrong query.
        df_impressions = df_impressions.sort_values(
            by=self.QUERY_KEYS, kind="mergesort"
        )
        X = df_impressions[self.FEATURES]
        y = df_impressions["is_clicked"]

        q = (
            df_impressions
            .groupby(self.QUERY_KEYS, sort=False)
            .size()
            .to_numpy()
        )

        # Training the actual model
        f.print_time("training lightGBM model")
        self.gbm = lgb.LGBMRanker()
        # No `verbose` argument: it only controlled evaluation logging, which
        # needs an eval_set, and newer lightGBM releases no longer accept it.
        self.gbm.fit(X, y, group=q)


    def predict(self, df):
        """Calculate item ranking based on trained lightGBM model.

        :param df: Test interaction log as expected by build_features
        :return: Result of f.summarize_recs for the scored target impressions
        """

        df_impressions = build_features(df)

        # Target row, withheld item ID that needs to be predicted.
        # Copy so that the score column is not written to a slice.
        df_impressions = df_impressions[df_impressions.referenced_item.isna()].copy()

        df_impressions["click_propensity"] = self.gbm.predict(
            df_impressions[self.FEATURES]
        )

        # Summarize recommendations
        f.print_time("summarize recommendations")
        df_rec = f.summarize_recs(df_impressions, "click_propensity")

        return df_rec

In [42]:
def main(train_file, test_file, subm_file, model_name):
    """
    Fit a single model on the training file and write its recommendations
    for the test file to a CSV file.

    Model names of the benchmark:
    - gbm_rank: lightGBM model
    - log_reg: Logistic regression
    - nn_interaction: kNN w/ session co-occurrence
    - nn_item: kNN w/ metadata similarity
    - pop_abs: Popularity - total clicks
    - pop_user: Popularity - distinct users
    - position: Original display position
    - random: Random order

    Only gbm_rank is registered in this notebook; the remaining entries of
    the registry below are commented out.

    :param train_file: Path of the training data, read with f.read_data
    :param test_file: Path of the test data, read with f.read_data
    :param subm_file: Path of the CSV file the recommendations are written to
    :param model_name: Key of the model to run
    """

    f.validate_model_name(model_name)

    # Registry of runnable models, keyed by model name
    registry = {
        'gbm_rank': ModelGbmRank(),
        # 'log_reg': ModelLogReg(),
        # 'nn_interaction': ModelNNInteraction(),
        # 'nn_item': ModelNNItem(),
        # 'pop_abs': ModelPopAbs(),
        # 'pop_user': ModelPopUsers(),
        # 'position': ModelPosition(),
        # 'random': ModelRandom()
    }
    selected = registry[model_name]

    f.print_header(f"Run model {model_name}")

    # Training phase
    print(f"Reading {train_file} ...")
    train_frame = f.read_data(train_file)
    print("Fit model ...")
    selected.fit(train_frame)

    # Prediction phase; the test data is only read after the fit has finished
    print(f"Reading {test_file} ...")
    test_frame = f.read_data(test_file)
    print("Calculate recommendations ...")
    recommendations = selected.predict(test_frame)

    # Persist the submission without the data frame index
    print(f"Writing {subm_file}...")
    recommendations.to_csv(subm_file, index=False)

    print("Finished calculating recommendations.")

In [43]:
# End-to-end run on the example files: fit gbm_rank on the training example,
# score the test example and write the submission CSV.
main('./data/train_example.parquet.gzip', 
     './data/test_example.parquet.gzip', 
     './data/submission_example.csv',
     'gbm_rank')


######################
# Run model gbm_rank #
######################

Reading ./data/train_example.parquet.gzip ...
Fit model ...
17:23:34 | start
17:23:34 | filter interactions
17:23:34 | cleaning
17:23:34 | previous interactions
17:23:34 | combining columns - impressions
17:23:34 | combining columns - prices
17:23:34 | explode arrays
17:23:34 | interaction count
17:23:34 | reduce to impressions
17:23:34 | position feature
17:23:34 | last interacted item feature
17:23:34 | change price datatype
17:23:34 | target column
17:23:34 | lightGBM format
17:23:34 | training lightGBM model
Reading ./data/test_example.parquet.gzip ...
Calculate recommendations ...
17:23:34 | start
17:23:34 | filter interactions
17:23:34 | cleaning
17:23:34 | previous interactions
17:23:34 | combining columns - impressions
17:23:34 | combining columns - prices
17:23:34 | explode arrays
17:23:34 | interaction count
17:23:34 | reduce to impressions
17:23:34 | position feature
17:23:34 | last interacted item featur