# Generate and Evaluate Lenskit Model Stability and Select Scores for Predictions

## Load libraries and data

In [1]:
# Make sure LensKit is importable; if it is not, install it into the environment
# of the interpreter that is running this notebook.
# NOTE(review): the install is unpinned, while later cells import
# `lenskit.algorithms` and `lenskit.crossfold` — consider pinning a release that
# still ships those modules (TODO confirm the version used for this analysis).
try:
    import lenskit
    print("Lenskit is already installed.")
except ImportError:
    print("Lenskit is not installed. Installing...")
    import subprocess
    import sys

    # `sys.executable -m pip` targets the running kernel's interpreter; a bare
    # `pip` is resolved through PATH and may belong to a different environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lenskit"])

Lenskit is already installed.


In [2]:
# Standard library imports
import os
import sys
from typing import List

# Third party imports
import altair as alt  # For data visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
import scipy
from dateutil.relativedelta import relativedelta  # For calendar-aware date offsets
from scipy.stats import zscore  # For statistical computations
from sklearn.preprocessing import MinMaxScaler  # For data preprocessing
from tqdm.notebook import tqdm  # For progress bars

# LensKit imports
from lenskit import Recommender, topn, util, batch, crossfold as xf  # For recommendation systems
from lenskit.algorithms import als, basic  # For recommendation algorithms

# Local application/library specific imports
sys.path.append("..")
from utils.missing_data_processing import *  # For handling missing data

In [3]:
# Read the four source tables returned by the project's loader.
events_df, members_df, books_df, borrow_overrides_df = load_initial_data()

# Run the project's event preprocessing before anything is derived from events.
events_df = preprocess_events_data(events_df)

# Short identifiers are the second-to-last "/"-separated segment of a URI.
# Books without a URI get None instead of an identifier.
books_df["item_id"] = books_df.uri.apply(
    lambda uri: None if pd.isna(uri) else uri.split("/")[-2]
)

# item_id -> title lookup (for repeated item ids the last row wins).
title_lookup = dict(zip(books_df.item_id, books_df.title))

# Members get the same kind of short identifier; every member row is
# expected to carry a URI (a missing one raises here).
members_df["id"] = members_df.uri.apply(lambda uri: uri.split("/")[-2])

# Member/book interactions: events that reference an item and are borrows.
interactions_df = events_df[
    events_df.item_uri.notna() & (events_df.event_type == 'Borrow')
].copy()

# One row per distinct (member, item) pair.
unique_interactions_df = interactions_df[["member_id", "item_id"]].drop_duplicates()

# LensKit expects 'user' / 'item' columns; every observed pair gets a
# constant rating of 1 to mark that the interaction happened.
ratings = unique_interactions_df.rename(
    columns={'member_id': 'user', 'item_id': 'item'}
).assign(rating=1)

# Copy of the events with parsed start/end dates; unparseable values become NaT.
dated_events_df = events_df.assign(
    start_date_dt=pd.to_datetime(events_df['start_date'], errors='coerce'),
    end_date_dt=pd.to_datetime(events_df['end_date'], errors='coerce'),
)

## Lenskit Recommendations

In [4]:
# Read the stored model scores CSV.
lenskit_model = pd.read_csv('../speculative_reading/data/lenskit_model100_scores.csv')

# Label each row as "<member_id>: <subscription_start>/<subscription_end>".
lenskit_model['member_period'] = (
    lenskit_model['member_id']
    + ': '
    + lenskit_model['subscription_start'].astype(str)
    + '/'
    + lenskit_model['subscription_end'].astype(str)
)

In [5]:
# Number of score rows per member and subscription period.
lenskit_model.value_counts(subset=['member_id', 'subscription_start', 'subscription_end'])

member_id         subscription_start  subscription_end
hemingway-ernest  1924-03-28          1925-03-28          128500
                  1921-12-28          1922-11-08           65400
dtype: int64

In [6]:
# Number of distinct items scored in each member period.
lenskit_model.groupby('member_period').item_id.nunique()

member_period
hemingway-ernest: 1921-12-28/1922-11-08     654
hemingway-ernest: 1924-03-28/1925-03-28    1285
Name: item_id, dtype: int64

In [7]:
def get_formatted_titles(row, books=None):
    """Build a display label for the book or periodical referenced by ``row``.

    Parameters
    ----------
    row : object with an ``item_id`` attribute
        Typically one row of a DataFrame passed in by ``DataFrame.apply(axis=1)``.
    books : pandas.DataFrame, optional
        Table with ``item_id``, ``title`` and ``author`` columns. Defaults to the
        notebook-level ``books_df``.

    Returns
    -------
    str
        ``"<title> by <name>"`` when an author is recorded, where the
        comma-separated parts of the author string are put in reverse order
        (``"Abercrombie, Lascelles"`` -> ``"Lascelles Abercrombie"``), or
        ``"<title> (Periodical)"`` when the author is missing.

    Raises
    ------
    IndexError
        If no row of ``books`` matches ``row.item_id``.
    """
    if books is None:
        books = books_df

    item = books[books.item_id == row.item_id]
    if item.empty:
        raise IndexError(f"no book found for item_id {row.item_id!r}")

    title = item.title.values[0]

    # A missing author on any matching row is treated as a periodical.
    if item.author.isna().any():
        return title + ' (Periodical)'

    # Strip each name part so the separator is always exactly one space,
    # including for single-name authors that contain no comma.
    raw_parts = item.author.str.split(',').values[0]
    name_parts = [part.strip() for part in raw_parts if part.strip()]
    return title + ' by ' + ' '.join(reversed(name_parts))

# get_formatted_titles only reads row.item_id, so the label is computed once per
# distinct item and mapped back onto every score row, rather than filtering
# books_df again for each row of lenskit_model.
unique_items = lenskit_model[['item_id']].drop_duplicates()
formatted_title_by_item = dict(
    zip(unique_items['item_id'], unique_items.apply(get_formatted_titles, axis=1))
)
lenskit_model['formatted_title'] = lenskit_model['item_id'].map(formatted_title_by_item)

In [9]:
# Columns that together form the grouping key for every statistic below.
score_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']

# Build "<start>/<end>" straight from the subscription columns. Splitting
# member_period on ':' left a leading space and would break on any ':' inside
# a member id or date string.
lenskit_model['period'] = (
    lenskit_model.subscription_start.astype(str) + '/' + lenskit_model.subscription_end.astype(str)
)

# z-score of every score within its key group (sample standard deviation).
lenskit_model['zscore'] = lenskit_model.groupby(score_keys)['score'].transform(
    lambda x: zscore(x, ddof=1)
)

grouped_scores = lenskit_model.groupby(score_keys)['score']

# Highest score per group, joined back to pick up the remaining columns
# (including the z-score) of the row that achieved it. When the maximum occurs
# in more than one row only the first is kept, so each group yields one row.
top_scores = grouped_scores.max().reset_index()
top_scores = (
    pd.merge(top_scores, lenskit_model, on=score_keys + ['score'], how='inner')
    .drop_duplicates(subset=score_keys)
    .rename(columns={'score': 'top_score', 'zscore': 'top_zscore'})
)

# Mean, sample standard deviation and median per group.
avg_scores = grouped_scores.mean().reset_index(name='avg_score')
std_scores = grouped_scores.std().reset_index(name='std_score')
median_scores = grouped_scores.median().reset_index(name='median_score')

scores_df = (
    top_scores
    .merge(avg_scores, on=score_keys)
    .merge(std_scores, on=score_keys)
    .merge(median_scores, on=score_keys)
)

# Most frequent score per group. np.atleast_1d makes the lookup work whether
# scipy.stats.mode returns a length-1 array (older SciPy) or a scalar
# (SciPy >= 1.11 for 1-D input), where indexing with [0][0] fails.
mode_scores = grouped_scores.agg(
    lambda x: np.atleast_1d(scipy.stats.mode(x)[0])[0]
).reset_index()

# A mode that occurs more than once matches several rows on the join; they all
# share the same score and z-score, so one row per group is kept.
mode_scores = (
    pd.merge(mode_scores, lenskit_model, on=score_keys + ['score'], how='inner')
    .drop_duplicates(subset=score_keys)
    .rename(columns={'score': 'mode_score', 'zscore': 'mode_zscore'})
)

# One row per group: mode columns first, then the top/avg/std/median columns.
lenskit_scores = pd.merge(
    mode_scores[score_keys + ['mode_score', 'mode_zscore']],
    scores_df,
    on=score_keys,
)

In [79]:
# Preview the merged per-item score table.
# NOTE(review): the output saved with this cell is a NameError, i.e. it was last
# executed in a kernel where `lenskit_scores` had not been built yet; re-run it
# after the cell that creates `lenskit_scores`.
lenskit_scores.head()

NameError: name 'lenskit_scores' is not defined

## Collaborative Filtering

In [59]:
# Read the saved collaborative-filtering predictions CSV (path is relative to
# the notebook's working directory).
collaborative_filtering_model = pd.read_csv("../speculative_reading/data/full_collaborative_filtering_predictions.csv")

In [78]:
# Preview the first rows of the collaborative-filtering predictions.
collaborative_filtering_model.head()

Unnamed: 0,member_id,period,item_uri,formatted_table_title,formatted_chart_title,mode_score,mode_zscore,subscription_start,subscription_end,top_score,...,skew,std,var,kurtosis,metric,top_zscore,avg_score,median_score,std_score,coef_var
0,hemingway-ernest,1921-12-28/1922-11-08,a-e-candle-vision,The Candle of Vision by Æ,"*The Candle of Vision*,<br>Æ",0.0,-0.57735,1921-12-28,1922-11-08,0.224766,...,1.732051,0.129769,0.01684,,pearson,1.154701,0.074922,0.0,0.129769,inf
1,hemingway-ernest,1921-12-28/1922-11-08,a-e-collected-poems,Collected Poems by Æ,"*Collected Poems*,<br>Æ",0.0,-0.57735,1921-12-28,1922-11-08,0.224766,...,1.732051,0.129769,0.01684,,pearson,1.154701,0.074922,0.0,0.129769,inf
2,hemingway-ernest,1921-12-28/1922-11-08,a-e-imaginations-reveries,Imaginations and Reveries by Æ,"*Imaginations and Reveries*,<br>Æ",0.008704,-0.651694,1921-12-28,1922-11-08,0.228245,...,1.687123,0.121761,0.014826,,pearson,1.15136,0.088055,0.027215,0.121761,4.473995
3,hemingway-ernest,1921-12-28/1922-11-08,abercrombie-speculative-dialogues,Speculative Dialogues by Lascelles Abercrombie,"*Speculative Dialogues*,<br> Lascelles Abercro...",0.0,-0.57735,1921-12-28,1922-11-08,0.224766,...,1.732051,0.129769,0.01684,,pearson,1.154701,0.074922,0.0,0.129769,inf
4,hemingway-ernest,1921-12-28/1922-11-08,abercrombie-thomas-hardy-critical,Thomas Hardy: A Critical Study by Lascelles A...,"*Thomas Hardy: A Critical Study*,<br> Lascelle...",0.166476,-1.102686,1921-12-28,1922-11-08,0.272727,...,-1.071395,0.054466,0.002967,,euclidean,0.848097,0.226535,0.240401,0.054466,0.226563


## Popular Titles

In [66]:
# Implicit-feedback matrix factorisation with 50 features (per the original
# note: the algorithm that worked best in the eval script), trained on all of
# the interaction data in `ratings`.
rec = Recommender.adapt(
    als.ImplicitMF(features=50, use_ratings=False)
)
rec.fit(ratings)

# Popularity-based recommender fitted on the same data, for comparison.
popular = Recommender.adapt(
    basic.Popular()
)
popular.fit(ratings)

<lenskit.algorithms.basic.Popular at 0x31de78df0>

In [70]:
# One row per (member, subscription period), with the period bounds also parsed
# as datetimes; values that cannot be parsed become NaT.
popular_periods = (
    lenskit_model[['member_id', 'subscription_start', 'subscription_end']]
    .drop_duplicates()
    .assign(
        subscription_start_dt=lambda periods: pd.to_datetime(
            periods['subscription_start'], errors='coerce'
        ),
        subscription_end_dt=lambda periods: pd.to_datetime(
            periods['subscription_end'], errors='coerce'
        ),
    )
)
popular_periods

Unnamed: 0,member_id,subscription_start,subscription_end,subscription_start_dt,subscription_end_dt
0,hemingway-ernest,1921-12-28,1922-11-08,1921-12-28,1922-11-08
65400,hemingway-ernest,1924-03-28,1925-03-28,1924-03-28,1925-03-28


In [74]:
# `relativedelta` is imported in the notebook's import cell.
# The circulation window opens six months before each subscription starts.
sixmonths = relativedelta(months=6)

last_member = None  # not read anywhere in this cell

# One combined frame (all-time + current popularity) per subscription period.
popularity_results = []

for _, row in popular_periods.iterrows():

    circulation_start = row.subscription_start_dt - sixmonths
    user_id = row.member_id
    period_label = row.subscription_start + '/' + row.subscription_end

    # Events whose start date or end date falls inside
    # [circulation_start, subscription end], bounds included.
    starts_in_window = dated_events_df.start_date_dt.between(circulation_start, row.subscription_end_dt)
    ends_in_window = dated_events_df.end_date_dt.between(circulation_start, row.subscription_end_dt)
    circulating_book_events = dated_events_df[starts_in_window | ends_in_window]

    # get a *unique* list of item ids; events without an item are skipped so a
    # null id is neither offered as a candidate nor counted in `n`
    item_ids = circulating_book_events.item_id.dropna().unique()
    pop_predictions = popular.recommend(user_id, n=len(item_ids), candidates=item_ids)

    # Scores from the popularity recommender fitted on the full `ratings` data.
    pop_predictions['type'] = 'popular (all time)'
    pop_predictions['subscription_period'] = period_label
    pop_predictions = pop_predictions.rename(columns={'item': 'item_id'})

    # Number of in-window events per item, most frequent first.
    popular_current = circulating_book_events.groupby(['item_id'])['item_id'].count().reset_index(name='score').sort_values(['score'], ascending=False)
    popular_current['type'] = 'popular (current)'
    popular_current['subscription_period'] = period_label

    combined = pd.concat([pop_predictions, popular_current])
    popularity_results.append(combined)



In [77]:
# Stack the per-period frames collected in `popularity_results` into one table.
# The frames keep their own row indices (no ignore_index), so index values may repeat.
popularity_results_df = pd.concat(popularity_results)
popularity_results_df.head()

Unnamed: 0,item_id,score,type,subscription_period
0,joyce-portrait-artist-young,50.0,popular (all time),1921-12-28/1922-11-08
1,joyce-dubliners,45.0,popular (all time),1921-12-28/1922-11-08
2,richardson-pointed-roofs,40.0,popular (all time),1921-12-28/1922-11-08
3,mansfield-garden-party-stories,34.0,popular (all time),1921-12-28/1922-11-08
4,mansfield-bliss-short-stories,31.0,popular (all time),1921-12-28/1922-11-08
