# Imports

In [None]:
import polars as pl
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from matplotlib import pyplot as plt
from typing import List
import pandas as pd
from sklearn.metrics import ndcg_score
import joblib

# Parameters

In [None]:
# Column used as the training label (see the `train_y` / `val_y` selection).
label_column = "has_seen_venue_in_this_session"
# Column that receives the raw model score.
pred_label = f"pred_{label_column}"
# Column that identifies a ranking group.
group_column = "session_id_hashed"
# Column used as the secondary sort key inside each group.
rank_column = "popularity"
rank_pos_column = "position_in_list"
# Column that receives the per-group rank derived from the model score.
predicted_rank_column = f"predicted_{rank_column}"
# Model input columns.
# NOTE(review): the list contains the group id (`session_id_hashed`) and
# `position_in_list` -- confirm that feeding them to the model is intended.
features = [
    'venue_id',
    'conversions_per_impression',
    'price_range',
    'rating',
    'popularity',
    'retention_rate',
    'session_id_hashed',
    'position_in_list',
    #  'has_seen_venue_in_this_session',
    #  'is_new_user',
    'is_from_order_again',
    'is_recommended']

# Utils

In [None]:
def convert_boolean_to_int(ranking_data):
    """Return ``ranking_data`` with every Boolean column cast to Int8.

    Column names are preserved; columns of any other dtype are not touched.
    """
    boolean_columns = ranking_data.select(pl.col(pl.Boolean)).columns
    int8_casts = []
    for name in boolean_columns:
        int8_casts.append(pl.col(name).cast(pl.Int8, strict=False).alias(name))
    return ranking_data.with_columns(int8_casts)
def plot_lgb_report(lgb_model, evals_logs: dict, eval_at=(10, 20, 40)):
    """Plot the NDCG learning curves and the feature importances of a model.

    Args:
        lgb_model: Trained model handed to ``lgb.plot_importance``.
        evals_logs: Evaluation history handed to ``lgb.plot_metric``.
        eval_at: Cut-offs ``k``; one ``ndcg@k`` curve is drawn per value.
            The default is a tuple so it cannot be mutated between calls.
    """
    for k in eval_at:
        _, ax = plt.subplots(figsize=(12, 8))
        lgb.plot_metric(evals_logs, ax=ax, metric=f"ndcg@{k}")

    # One chart per importance type.
    for importance_type in ("gain", "split"):
        lgb.plot_importance(
            lgb_model, importance_type=importance_type, figsize=(14, 8)
        )
        
def qa_features(model: lgb, features: List[str]):
    """Print a feature-usage report for ``model`` and return the used features.

    A feature counts as "used" when the importance reported by the model for
    it is non-zero. Entries of ``features`` that are not among the used ones
    are reported as dropped.

    Args:
        model: Object exposing ``feature_name()`` and ``feature_importance()``.
        features: Names of the features that were passed to training.

    Returns:
        The names of the features with non-zero importance, in model order.
    """
    importance_by_name = dict(
        zip(model.feature_name(), model.feature_importance())
    )
    print(importance_by_name)

    used_features = [
        name for name, score in importance_by_name.items() if score != 0
    ]
    dropped_features = [name for name in features if name not in used_features]

    print("-------------------------------------------------------")
    print("Used features: ")
    print(used_features)
    print("-------------------------------------------------------")
    print(f"passed {len(features)} features and model used {len(used_features)} of them")
    print("-------------------------------------------------------")
    print(f"these features were dropped: {dropped_features}")
    return used_features

def generate_predictions(
    test_set: pl.DataFrame, model: lgb, features_to_use: List[str]
) -> pd.DataFrame:
    """Score ``test_set`` with ``model`` and rank the rows inside each group.

    The frame is sorted by ``group_column`` then ``rank_column``, converted to
    pandas, and nulls in the feature columns are replaced by 0 before scoring.
    The raw score is stored in ``pred_label``; ``predicted_rank_column`` holds
    the per-group rank of that score (rank 1 = highest score, ties broken by
    row order).
    """
    ordered = test_set.sort(by=[group_column, rank_column], reverse=False)
    scored = ordered.to_pandas()
    scored[features_to_use] = scored[features_to_use].fillna(0)

    scored[pred_label] = model.predict(scored[features_to_use])
    scores_by_group = scored.groupby(group_column)[pred_label]
    scored[predicted_rank_column] = scores_by_group.rank(
        method="first", ascending=False
    )
    return scored

# Read data

In [None]:
%%time
# Load the session and venue tables; rows containing any null are dropped.
sessions = pl.read_csv("../sessions.csv")
sessions = sessions.drop_nulls()
venues = pl.read_csv("../venues.csv")
venues = venues.drop_nulls()

In [None]:
def add_purchase_indicator_for_session(sessions):
    """Attach a per-session ``purchased_in_session`` column to ``sessions``.

    The new column is the maximum of ``purchased`` over all rows sharing a
    ``session_id``. The result is sorted by the new column.
    """
    purchased_per_session = (
        sessions.groupby("session_id")
        .agg(pl.col('purchased').max().alias('purchased_in_session'))
        .select('session_id', 'purchased_in_session')
    )
    with_indicator = sessions.join(purchased_per_session, on='session_id')
    return with_indicator.sort('purchased_in_session')

# Join venue data with search data

In [None]:
# Attach the venue attributes to every session row, then turn Boolean
# columns into Int8.
ranking_data = sessions.join(venues, on="venue_id")
ranking_data = convert_boolean_to_int(ranking_data)
# Example of a raw session id:
# hex_string = "0a21dde9-1495-417c-bb9d-9922b81f2e6a"


# Derive the group key `session_id_hashed` by hashing the session id string.
# NOTE(review): `str.replace` presumably removes only the first "-" (not all
# of them) -- confirm; the value is hashed afterwards either way.
ranking_data = ranking_data.with_column(
    pl.col("session_id").str.replace("-","").alias("session_id_hashed").hash(seed=0)
)



# Split data into train, val and test

In [None]:
# Split by session instead of by row: a ranking group has to stay whole, so
# every row of a session must land in exactly one of train / val / test.
# The proportions (20% train, then 20% / 80% of the remainder for val / test)
# therefore apply to sessions. Sorting the ids and fixing the seed makes the
# split reproducible.
session_ids = ranking_data[group_column].unique().sort().to_numpy()
train_sessions, unseen_sessions = train_test_split(
    session_ids, train_size=0.2, test_size=0.8, random_state=0
)
val_sessions, test_sessions = train_test_split(
    unseen_sessions, train_size=0.2, test_size=0.8, random_state=0
)

# The ids are wrapped in a Series so their unsigned 64-bit dtype is kept.
train_set = ranking_data.filter(
    pl.col(group_column).is_in(pl.Series(values=train_sessions))
)
unseen_set = ranking_data.filter(
    pl.col(group_column).is_in(pl.Series(values=unseen_sessions))
)
val_set = ranking_data.filter(
    pl.col(group_column).is_in(pl.Series(values=val_sessions))
)
test_set = ranking_data.filter(
    pl.col(group_column).is_in(pl.Series(values=test_sessions))
)

In [None]:
# Show the shape of each split.
train_set.shape, val_set.shape, test_set.shape

In [None]:
# Preview only: the sorted frame is displayed, `train_set` is not reassigned.
train_set.sort(by=[group_column, rank_column], reverse=True)

# Prepare datasets

In [None]:
# Order the rows by group so that rows of one group are contiguous, then
# count the rows per group. The counts are sorted by the same group key, so
# they line up with the row order of the sorted frame.
train_set = train_set.sort(by=[group_column, rank_column], reverse=False)
train_set_group_sizes = (
    train_set.groupby(group_column)
    .agg(pl.col(group_column).count().alias("count"))
    .sort(group_column)
    .select("count")
)

# Same preparation for the validation split.
val_set = val_set.sort(by=[group_column, rank_column], reverse=False)
val_set_group_sizes = (
    val_set.groupby(group_column)
    .agg(pl.col(group_column).count().alias("count"))
    .sort(group_column)
    .select("count")
)

In [None]:
%%time

# Separate label and feature columns for each split.
train_y = train_set[[label_column]]
train_x = train_set[features]

val_y = val_set[[label_column]]
val_x = val_set[features]

test_x = test_set[features]

# Build the LightGBM datasets; `group` carries the per-group row counts
# computed from the sorted splits.
lgb_train_set = lgb.Dataset(
    train_x.to_pandas(),
    label=train_y.to_pandas(),
    group=train_set_group_sizes.to_numpy(),
    free_raw_data=True
).construct()

# The validation set references the training set.
lgb_valid_set = lgb.Dataset(
    val_x.to_pandas(),
    label=val_y.to_pandas(),
    group=val_set_group_sizes.to_numpy(),
    reference=lgb_train_set,
    free_raw_data=True
).construct()

# some memory management
# del train_set
# del val_set
del train_y
del train_x

import gc

gc.collect()

In [None]:
from collections import Counter
# Count how often each label value occurs in the full data set.
Counter(ranking_data[label_column].to_numpy())

In [None]:
import os
import pathlib

# Location of the binary dump of the training dataset.
data_path: str = "/tmp/lgb_train_set.binary"
my_path = pathlib.Path(data_path)

n_rows: int = lgb_train_set.num_data()
n_features: int = lgb_train_set.num_feature()

print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_features}")

# Remove a previous dump before saving. This is done through pathlib rather
# than by building an `rm -rf` shell command.
if my_path.exists():
    my_path.unlink()
lgb_train_set.save_binary(data_path)

print(my_path)

# Round-trip check: reload the dataset from the binary file and verify that
# it has the same dimensions as the in-memory one.
dataset = lgb.Dataset(my_path, free_raw_data=False).construct()

print(f"Number of rows: {dataset.num_data()}")
print(f"Number of columns: {dataset.num_feature()}")
assert dataset.num_data() == n_rows
assert dataset.num_feature() == n_features

# Train

In [None]:
# Training parameters for the LambdaRank objective; NDCG is evaluated at
# cut-offs 10, 20 and 40.
# NOTE(review): `num_iterations` is 10 while the training call uses
# `early_stopping_rounds=25` -- early stopping presumably can never trigger;
# confirm the intended values.
lgb_params = {
    "objective": "lambdarank",
    "num_leaves": 100,
    "min_sum_hessian_in_leaf": 10,
    "metric": "ndcg",
    "ndcg_eval_at": [10, 20, 40],
    "learning_rate": 0.8,
    "force_row_wise": True,
    "num_iterations": 10,
}


In [None]:
%%time

# Filled by `lgb.train` with the evaluation history of both sets below.
evals_logs = {}
# NOTE(review): the bare expression on the next line has no effect.
lgb_params
# Train while evaluating on the validation and the training set.
lgb_model = lgb.train(
    params=lgb_params,
    train_set=lgb_train_set,
    valid_sets=[lgb_valid_set, lgb_train_set],
    valid_names=["val", "train"],
    verbose_eval=25,
    evals_result=evals_logs,
    early_stopping_rounds=25
)

# QA

In [None]:
# Draw the NDCG curves and the feature-importance charts.
plot_lgb_report(lgb_model, evals_logs)

In [None]:
# Print the feature-usage report and keep the features with non-zero importance.
used_features = qa_features(lgb_model, features)

In [None]:
# Display the list of used features.
used_features

# Save the trained model

In [None]:
%%time
# Persist the model twice: via LightGBM's own `save_model` and via joblib.
# NOTE(review): the first file has a ".pkl" extension although it is written
# by `save_model`, not by pickle -- confirm the extension is intended.
lgb_model.save_model("/tmp/venues_ranking.pkl")
joblib.dump(lgb_model, 'rate_venues.joblib')

In [None]:
# Exploration: after sorting by group and rank column, take the rows at
# indices 0, 1 and 2 of every session and explode the `rating` list.
(
    ranking_data
    # .filter(pl.col("session_id_hashed")==2697534841382868)
    .sort(by=[group_column, rank_column], reverse=False)
    .groupby("session_id")
    .agg(
        pl.all().take([0,1, 2])
    )
    .explode("rating")
)

In [None]:
def calculate_ndcg_score(y_true: pl.Series, y_pred:pl.Series):
    """Return the NDCG of one ranking as a single-element Float64 Series.

    Args:
        y_true: True relevance of each item of the ranking.
        y_pred: Predicted score of each item, in the same order.

    Returns:
        A ``pl.Series`` holding the NDCG value.
    """
    # `ndcg_score` expects 2-D input of shape (n_samples, n_labels); a Series
    # is 1-D, so the whole Series is passed as one sample.
    true_relevance = y_true.to_numpy().reshape(1, -1)
    predicted_scores = y_pred.to_numpy().reshape(1, -1)
    return pl.Series(
        [ndcg_score(y_true=true_relevance, y_score=predicted_scores)],
        dtype=pl.Float64,
    )

In [None]:
# Score the test split, then convert the pandas result back to polars.
predictions = generate_predictions(test_set, lgb_model, features_to_use=features)
predictions_pl = pl.DataFrame(predictions)

In [None]:
# Scratch cell: displays this string and has no other effect.
"has_seen_venue_in_this_session"

In [None]:
def drop_sessions_with_no_interactions(ranking_data):
    """Keep only the rows of sessions with more than one positive-label row.

    A session qualifies when it has at least two rows whose ``label_column``
    equals 1. All rows of qualifying sessions are returned, not only the
    positive ones.
    """
    positive_rows = ranking_data.filter(pl.col(label_column)==1)
    positives_per_session = (
        positive_rows.select("session_id","position_in_list", "popularity")
        .sort("session_id")
        .groupby("session_id")
        .count()
        .sort("count")
    )
    active_sessions = positives_per_session.filter(
        pl.col("count")>1
    ).select("session_id")
    return ranking_data.join(active_sessions, on="session_id")

In [None]:
# Restrict the predictions to sessions with more than one positive-label row.
active_ranking_data = drop_sessions_with_no_interactions(predictions_pl)

In [None]:
import numpy as np
import numpy as np
from sklearn.metrics import ndcg_score

def my_ndcg(y_true, y_pred):
    """Return the NDCG of a single ranking.

    Both inputs are wrapped in an outer list so that ``ndcg_score`` receives
    them as one sample.
    """
    single_sample_true = np.asarray([y_true])
    single_sample_score = np.asarray([y_pred])
    score = ndcg_score(y_true=single_sample_true, y_score=single_sample_score)
    return score
def my_relative_percentage_diff(baseline_value, model_value):
    """Return ``|baseline - model| / baseline``, element-wise.

    Both inputs are converted to numpy arrays and printed before the
    computation. The result is a ratio; it is not multiplied by 100.
    """
    baseline_vector, model_vector = (
        np.asarray(values) for values in (baseline_value, model_value)
    )
    for vector in (baseline_vector, model_vector):
        print(vector)
    absolute_gap = abs(baseline_vector - model_vector)
    return absolute_gap / baseline_vector


In [None]:
%%time
# Baseline NDCG per session, computed on positive-label rows only:
# `position_in_list` is passed as y_true and `popularity` as y_score.
# NOTE(review): y_true is a list position -- confirm that a larger value is
# meant to express higher relevance.
baseline_ndcg_per_sessions = (
    active_ranking_data
    .filter(pl.col(label_column)==1)
    .groupby("session_id")
    .agg([
        pl.apply(
            [pl.col('position_in_list'), pl.col("popularity")], lambda s: my_ndcg(s[0],s[1]) ).alias('baseline_ndcg')
    ])
)

In [None]:
%%time
# Model NDCG per session, computed on positive-label rows only:
# `position_in_list` is passed as y_true and `predicted_popularity` as y_score.
# NOTE(review): `predicted_popularity` is a rank where 1 is the highest
# model score, but it is used here as a score -- confirm the direction.
model_ndcg_per_sessions = (
    active_ranking_data
    .filter(pl.col(label_column)==1)
    .groupby("session_id")
    .agg([
        pl.apply(
            [pl.col('position_in_list'), pl.col("predicted_popularity")], lambda s: my_ndcg(s[0],s[1]) ).alias('model_ndcg')
    ])
)

In [None]:
# Names of the per-session metric columns produced by the two cells above.
metric_columns = ["baseline_ndcg", "model_ndcg"]

In [None]:
# Attach both per-session NDCG values to the prediction rows, keep the used
# features plus the metric columns, add the model-minus-baseline difference
# and average everything per venue.
# NOTE(review): the final group-by needs `venue_id` to be part of
# `used_features` -- confirm it always is.
metric_dataframe = predictions_pl.join(baseline_ndcg_per_sessions, on="session_id").join(model_ndcg_per_sessions, on="session_id")[ used_features + metric_columns].with_columns(
    [
        (pl.col("model_ndcg")-pl.col("baseline_ndcg")).alias("ndcg_diff")
    ]
).groupby("venue_id").mean()
metric_dataframe

# Calculate the percentage increase in NDCG

In [None]:
# Relative NDCG difference per venue, in percent, sorted descending.
# NOTE(review): because of `abs`, a decrease is reported with the same sign
# as an increase although the column is named "perc_increase" -- confirm
# that the magnitude is what is wanted.
metric_dataframe.with_columns(
    

    (abs(pl.col('baseline_ndcg') - pl.col("model_ndcg"))/pl.col('baseline_ndcg') * 100).alias('perc_increase_in_ndcg')
    
).groupby('venue_id').agg(
    pl.col('perc_increase_in_ndcg').mean()
).sort('perc_increase_in_ndcg', reverse=True)

In [None]:
# Ids of the five venues with the largest relative NDCG difference.
# NOTE(review): the ranking uses the absolute difference, so venues whose
# NDCG decreased can be selected as well -- confirm this is intended.
highest_rating_venues = metric_dataframe.with_columns(
    

    (abs(pl.col('baseline_ndcg') - pl.col("model_ndcg"))/pl.col('baseline_ndcg') * 100).alias('perc_increase_in_ndcg')
    
).groupby('venue_id').agg(
    pl.col('perc_increase_in_ndcg').mean()
).sort('perc_increase_in_ndcg', reverse=True).head(5).select('venue_id')

In [None]:
# Export the feature columns of the first ten rows belonging to the selected
# venues as a JSON list of records.
ranking_data.join(highest_rating_venues, on ='venue_id')[features].head(10).to_pandas().to_json(orient='records')

In [None]:
[
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":3352618370338455358,"position_in_list":0,"is_from_order_again":1,"is_recommended":0},{"venue_id":-8608196287932575311,"conversions_per_impression":0.1206581353,"price_range":1,"rating":9.2,"popularity":0.2022771056,"retention_rate":0.18,"session_id_hashed":4664838061955502305,"position_in_list":0,"is_from_order_again":0,"is_recommended":0},
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":1006495267592422768,"position_in_list":0,"is_from_order_again":0,"is_recommended":0},{"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":16271107337218474123,"position_in_list":31,"is_from_order_again":0,"is_recommended":0},{"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":12992628493413309367,"position_in_list":0,"is_from_order_again":1,"is_recommended":0},{"venue_id":8968794542286256815,"conversions_per_impression":0.4036363636,"price_range":1,"rating":8.8,"popularity":0.8977682883,"retention_rate":0.272727,"session_id_hashed":11792925231034451836,"position_in_list":13,"is_from_order_again":1,"is_recommended":1},{"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":2327279187342959944,"position_in_list":0,"is_from_order_again":1,"is_recommended":0},{"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":6669153405411707628,"position_in_list":33,"is_from_order_again":1,"is_recommended":0},
    {"venue_id":8968794542286256815,"conversions_per_impression":0.4036363636,"price_range":1,"rating":8.8,"popularity":0.8977682883,"retention_rate":0.272727,"session_id_hashed":3159537071444654512,"position_in_list":5,"is_from_order_again":0,"is_recommended":0},{"venue_id":8968794542286256815,"conversions_per_impression":0.4036363636,"price_range":1,"rating":8.8,"popularity":0.8977682883,"retention_rate":0.272727,"session_id_hashed":13008284017370400506,"position_in_list":31,"is_from_order_again":1,"is_recommended":1}]

# Explore predicted rankings on test data

In [None]:
# Number of distinct venues in the predictions.
venues_total = predictions_pl.select('venue_id').unique()
venues_total.shape[0]

In [None]:
# Preview the rows whose model score is greater than zero.
predictions_pl[features + [pred_label, predicted_rank_column]].filter(pl.col(pred_label)>0)

In [None]:
# Keep the rows whose model score is greater than zero.
positive_predictions = predictions_pl[features + [pred_label, predicted_rank_column]].filter(pl.col(pred_label)>0)

In [None]:
# Per venue, over all predictions: max, min and 0.8 quantile of the rank
# column and of the predicted rank.
predictions_pl.groupby('venue_id').agg(
    [
        pl.col(rank_column).max().alias(f"max_{rank_column}"),
        pl.col(rank_column).min().alias(f"min_{rank_column}"),
        pl.col(rank_column).quantile(0.8).alias(f"q80_{rank_column}"),
        pl.col(predicted_rank_column).max().alias(f"max_{predicted_rank_column}"),
        pl.col(predicted_rank_column).min().alias(f"min_{predicted_rank_column}"),
        pl.col(predicted_rank_column).quantile(0.8).alias(f"q80_{predicted_rank_column}")]
)

In [None]:
# Same per-venue statistics, restricted to rows with a positive model score.
positive_predictions.groupby('venue_id').agg(
    [
        pl.col(rank_column).max().alias(f"max_{rank_column}"),
        pl.col(rank_column).min().alias(f"min_{rank_column}"),
        pl.col(rank_column).quantile(0.8).alias(f"q80_{rank_column}"),
        pl.col(predicted_rank_column).max().alias(f"max_{predicted_rank_column}"),
        pl.col(predicted_rank_column).min().alias(f"min_{predicted_rank_column}"),
        pl.col(predicted_rank_column).quantile(0.8).alias(f"q80_{predicted_rank_column}")]
)

In [None]:
# Value range of the rank column over the full data set.
ranking_data[rank_column].min(), ranking_data[rank_column].max()

In [None]:
# Largest per-venue 0.8 quantile of the predicted rank and of the rank
# column, over rows with a positive model score.
positive_predictions.groupby('venue_id').agg(
    [
        pl.col(rank_column).max().alias(f"max_{rank_column}"),
        pl.col(rank_column).min().alias(f"min_{rank_column}"),
        pl.col(rank_column).quantile(0.8).alias(f"q80_{rank_column}"),
        pl.col(predicted_rank_column).max().alias(f"max_{predicted_rank_column}"),
        pl.col(predicted_rank_column).min().alias(f"min_{predicted_rank_column}"),
        pl.col(predicted_rank_column).quantile(0.8).alias(f"q80_{predicted_rank_column}")]
).select(f"q80_{predicted_rank_column}", f"q80_{rank_column}").to_pandas().max()

# percentage of venues where we predict popularity

In [None]:
# NOTE(review): hard-coded counts, presumably copied from earlier cell
# outputs (venues with a prediction / total venues) -- confirm, and derive
# them from the data so the figure stays current.
926/1043 * 100

In [None]:
# Distinct (venue, predicted rank) pairs among rows with a positive model score.
predictions_pl[features + [pred_label, predicted_rank_column]].filter(pl.col(pred_label)>0).select("venue_id", predicted_rank_column).unique()

In [None]:
def generate_predictions(
    test_set: pl.DataFrame, model: lgb, features_to_use: List[str]
) -> pd.DataFrame:
    """Score ``test_set`` with ``model`` and rank the rows inside each group.

    Args:
        test_set: Rows to score; must contain ``group_column``,
            ``rank_column`` and every column in ``features_to_use``.
        model: Trained model exposing ``predict``.
        features_to_use: Feature columns passed to the model, in order.

    Returns:
        A pandas frame sorted by group and rank column. The raw score is
        stored in ``pred_label``; ``predicted_rank_column`` holds the
        per-group rank of that score (rank 1 = highest score, ties broken by
        row order).
    """
    # This redefinition used `test_set_pandas` without ever binding it, so
    # `test_set` was ignored and the call failed. The conversion is restored.
    test_set_pandas = test_set.sort(
        by=[group_column, rank_column], reverse=False
    ).to_pandas()

    # Nulls in the feature columns are replaced by 0 before scoring.
    test_set_pandas[features_to_use] = test_set_pandas[features_to_use].fillna(0)
    test_x = test_set_pandas[features_to_use]

    test_set_pandas[pred_label] = model.predict(test_x)
    test_set_pandas[predicted_rank_column] = (
        test_set_pandas.groupby(group_column)[pred_label]
        .rank(method="first", ascending=False)
    )
    return test_set_pandas

In [None]:
# Display the name of the group column.
group_column

In [None]:
from typing import Any
import json
def generate_model_ratings(
    test_incoming_inference_features: str, lgb_model: Any, pred_label="has_seen_venue_in_this_session", predicted_rank_column="predicted_popularity", group_column="session_id_hashed"
    ) -> str:
    """Score a JSON payload and return per-venue rank summaries as JSON.

    Args:
        test_incoming_inference_features: JSON list of records whose keys are
            the model's feature names, in training order.
        lgb_model: Trained model exposing ``feature_name`` and ``predict``.
        pred_label: Column that receives the raw model score.
        predicted_rank_column: Column that receives the per-group rank of the
            score (rank 1 = highest score).
        group_column: Column identifying the ranking group.

    Returns:
        A JSON list of at most five records with ``venue_id`` and the 0.8
        quantile of the predicted rank for that venue.

    Raises:
        ValueError: If the payload columns differ from the model's feature
            names in name, number or order.
    """
    inference_dataframe = pl.DataFrame(json.loads(test_incoming_inference_features))
    incoming_features = list(inference_dataframe.columns)
    expected_features = list(lgb_model.feature_name())
    # Compare the full lists: a pairwise `zip` stops at the shorter one and
    # would let missing or extra columns through. An explicit raise also
    # keeps the check active when Python runs with assertions disabled.
    if incoming_features != expected_features:
        raise ValueError(
            "the inference feature do not have the same order as the training features, "
            "this can lead to poorer performance: "
            f"expected {expected_features}, got {incoming_features}"
        )

    # `rank_column` is read from module scope, as in the original code.
    inference_dataframe_pd = inference_dataframe.sort(
        by=[group_column, rank_column], reverse=False
    ).to_pandas()
    inference_dataframe_pd[pred_label] = lgb_model.predict(inference_dataframe_pd)
    inference_dataframe_pd[predicted_rank_column] = (
        inference_dataframe_pd.groupby(group_column)[pred_label]
        .rank(method="first", ascending=False)
    )

    q80_column = f"q80_{predicted_rank_column}"
    ranked = pl.DataFrame(inference_dataframe_pd)
    per_venue = (
        ranked.groupby('venue_id')
        .agg([pl.col(predicted_rank_column).quantile(0.8).alias(q80_column)])
        .select("venue_id", q80_column)
    )
    # NOTE(review): the group-by result is not sorted before `head(5)`, so
    # which five venues are returned presumably depends on group order --
    # confirm whether an explicit sort is wanted.
    return per_venue.to_pandas().head(5).to_json(orient='records')


In [None]:
test_incoming_inference_features = '''[
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":3352618370338455358,"position_in_list":0,"is_from_order_again":1,"is_recommended":0},
    {"venue_id":-8608196287932575311,"conversions_per_impression":0.1206581353,"price_range":1,"rating":9.2,"popularity":0.2022771056,"retention_rate":0.18,"session_id_hashed":4664838061955502305,"position_in_list":0,"is_from_order_again":0,"is_recommended":0},
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":1006495267592422768,"position_in_list":0,"is_from_order_again":0,"is_recommended":0},
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":16271107337218474123,"position_in_list":31,"is_from_order_again":0,"is_recommended":0},
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":12992628493413309367,"position_in_list":0,"is_from_order_again":1,"is_recommended":0},
    {"venue_id":8968794542286256815,"conversions_per_impression":0.4036363636,"price_range":1,"rating":8.8,"popularity":0.8977682883,"retention_rate":0.272727,"session_id_hashed":11792925231034451836,"position_in_list":13,"is_from_order_again":1,"is_recommended":1},
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":2327279187342959944,"position_in_list":0,"is_from_order_again":1,"is_recommended":0},
    {"venue_id":-4202398962129790175,"conversions_per_impression":0.3556765815,"price_range":1,"rating":8.6,"popularity":4.4884057024,"retention_rate":0.5884095,"session_id_hashed":6669153405411707628,"position_in_list":33,"is_from_order_again":1,"is_recommended":0},
    {"venue_id":8968794542286256815,"conversions_per_impression":0.4036363636,"price_range":1,"rating":8.8,"popularity":0.8977682883,"retention_rate":0.272727,"session_id_hashed":3159537071444654512,"position_in_list":5,"is_from_order_again":0,"is_recommended":0},
    {"venue_id":8968794542286256815,"conversions_per_impression":0.4036363636,"price_range":1,"rating":8.8,"popularity":0.8977682883,"retention_rate":0.272727,"session_id_hashed":13008284017370400506,"position_in_list":31,"is_from_order_again":1,"is_recommended":1}]
'''.replace('\n','').replace(' ','')
# Smoke-test the inference helper on the sample payload defined above.
# NOTE(review): the payload contains ids above the signed 64-bit range
# (e.g. 16271107337218474123) -- confirm they survive JSON parsing and
# DataFrame construction unchanged.
generate_model_ratings(test_incoming_inference_features, lgb_model)

In [None]:
def test_predict():
    """Send four sample rows to a local ``/predict`` endpoint.

    NOTE(review): this test looks unfinished -- `expected_response` is built
    but never compared with `real_response`, `server` is not used after
    construction, and neither `InferenceServer` nor `requests` is imported in
    the visible part of the notebook. Confirm before relying on it.
    """
    server = InferenceServer(name="test-server")
    # Four rows that differ only in their first value.
    data = [[1424193000929084737, 0.403492, 1, 8.6, 5.537811, 0.384965, 0, 1, 0],
            [1424193000929084736, 0.403492, 1, 8.6, 5.537811, 0.384965, 0, 1, 0],
            [1424193000929084735, 0.403492, 1, 8.6, 5.537811, 0.384965, 0, 1, 0],
            [1424193000929084734, 0.403492, 1, 8.6, 5.537811, 0.384965, 0, 1, 0]]

    # The payload is sent as the JSON body of a GET request.
    real_response = requests.get("http://localhost:8000/predict", json=data)
    expected_response = {
    "venue_id": [1424193000929084737, 1424193000929084736, 1424193000929084735, 1424193000929084734],
    "has_seen_venue_in_this_session": [0.0, 0.0, 0.0, 0.0]
    }
    
