# Generate and Evaluate Lenskit Model Stability and Select Scores for Predictions

## Load libraries and data

In [1]:
# Make sure LensKit can be imported, installing it on the fly when it is missing.
import subprocess
import sys

try:
    import lenskit
    print("Lenskit is already installed.")
except ImportError:
    print("Lenskit is not installed. Installing...")
    # Run pip through the interpreter that backs this kernel; a bare "pip" on PATH
    # can belong to a different Python environment than the one running the notebook.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lenskit"])

Lenskit is already installed.


In [2]:
# Standard library imports
import os
import sys
from typing import List

# Third party imports
import altair as alt  # For data visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
import scipy
from scipy.stats import zscore  # For statistical computations
from sklearn.preprocessing import MinMaxScaler  # For data preprocessing
from tqdm.notebook import tqdm  # For progress bars
from textwrap import wrap

# LensKit imports
from lenskit import Recommender, topn, util, batch, crossfold as xf  # For recommendation systems
from lenskit.algorithms import als, basic  # For recommendation algorithms

# Local application/library specific imports
sys.path.append("..")
from utils.missing_data_processing import *  # For handling missing data

In [3]:
# Read the four source tables returned by the project loader.
events_df, members_df, books_df, borrow_overrides_df = load_initial_data()

# Run the project's event preprocessing step before any filtering.
events_df = preprocess_events_data(events_df)

# A book's item ID is the second-to-last "/"-separated segment of its URI.
# Books without a URI get None.
books_df["item_id"] = books_df.uri.apply(
    lambda uri: None if pd.isna(uri) else uri.split("/")[-2]
)

# Item ID -> title lookup (for a repeated item ID the last row wins).
title_lookup = dict(zip(books_df.item_id, books_df.title))

# Members get a short ID derived from their URI in the same way.
# NOTE(review): unlike the books case there is no guard for a missing URI here.
members_df["id"] = members_df.uri.apply(
    lambda uri: uri.split("/")[-2]
)

# Member-book interactions: events that reference an item, whose item is a book
# with a known author, and that are borrow events.
authored_item_ids = books_df[books_df.author.notna()].item_id
references_authored_book = events_df.item_uri.notna() & events_df.item_id.isin(authored_item_ids)
is_borrow_event = events_df.event_type == 'Borrow'
interactions_df = events_df[references_authored_book & is_borrow_event].copy()

# Keep one row per (member, item) pair.
unique_interactions_df = interactions_df[["member_id", "item_id"]].drop_duplicates()

# LensKit expects 'user' and 'item' columns. Every observed interaction gets a
# constant rating of 1, i.e. the data is treated as implicit feedback.
ratings = unique_interactions_df.rename(
    columns={'member_id': 'user', 'item_id': 'item'}
).assign(rating=1)

# Copy of all events with the start/end dates parsed to datetimes;
# values that cannot be parsed become NaT.
dated_events_df = events_df.assign(
    start_date_dt=pd.to_datetime(events_df['start_date'], errors='coerce'),
    end_date_dt=pd.to_datetime(events_df['end_date'], errors='coerce'),
)

## Lenskit Recommendations

In [4]:
# Load the saved LensKit scores and label every row with
# "<member_id>: <subscription_start>/<subscription_end>".
lenskit_model = pd.read_csv('../speculative_reading/data/redo_lenskit_model100_scores.csv').assign(
    member_period=lambda scores: (
        scores.member_id
        + ': '
        + scores.subscription_start.astype(str)
        + '/'
        + scores.subscription_end.astype(str)
    )
)

In [5]:
# Number of score rows per member and subscription period.
lenskit_model.value_counts(subset=['member_id', 'subscription_start', 'subscription_end'])

member_id         subscription_start  subscription_end
hemingway-ernest  1924-03-28          1925-03-28          116200
                  1921-12-28          1922-11-08           59500
dtype: int64

In [6]:
# Number of distinct items scored in each member period.
lenskit_model.groupby('member_period').item_id.nunique()

member_period
hemingway-ernest: 1921-12-28/1922-11-08     595
hemingway-ernest: 1924-03-28/1925-03-28    1162
Name: item_id, dtype: int64

In [7]:
def get_formatted_titles(row):
    """Build a display title for the book referenced by ``row.item_id``.

    The book is looked up in the notebook-level ``books_df``. When the author
    is present, a "Surname, Forename" value is flipped to "Forename Surname"
    and the result is "<title> by <author>". When the author is missing the
    result is "<title> (Periodical)".

    Parameters:
        row: any object with an ``item_id`` attribute, e.g. a DataFrame row.

    Returns:
        The formatted title string.

    Raises:
        IndexError: if no row of ``books_df`` has this ``item_id``.
    """
    item = books_df[books_df.item_id == row.item_id]
    title = item.title.values[0]

    if item.author.isna().any():
        return title + ' (Periodical)'

    # Strip each comma-separated part before reversing, so that a name without
    # a comma (e.g. "Æ") is still separated from "by" by exactly one space.
    name_parts = [part.strip() for part in item.author.values[0].split(',')]
    author = ' '.join(part for part in reversed(name_parts) if part)
    return title + ' by ' + author

# get_formatted_titles only reads item_id, so format each distinct item once and
# map the result onto every row, rather than filtering books_df once per row.
formatted_title_by_item = {
    item_id: get_formatted_titles(pd.Series({'item_id': item_id}))
    for item_id in lenskit_model.item_id.unique()
}
lenskit_model['formatted_title'] = lenskit_model.item_id.map(formatted_title_by_item)

In [8]:
# Columns that identify one (member, subscription period, item) combination.
# Every statistic below is computed across the rows sharing these keys.
group_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']

# member_period is "<member_id>: <start>/<end>". Keep the part after the colon and
# strip the separator's space so the label is exactly "<start>/<end>".
lenskit_model['period'] = lenskit_model.member_period.str.split(':').str[1].str.strip()

# Z-score of every score within its group (sample standard deviation, ddof=1).
lenskit_model['zscore'] = lenskit_model.groupby(group_keys)['score'].transform(lambda x: zscore(x, ddof=1))

score_groups = lenskit_model.groupby(group_keys)['score']

# Highest score per group, joined back to the model so the full row that achieved
# it (including its z-score) comes along.
# NOTE(review): the join matches on the float score, so tied rows yield duplicates.
top_scores = score_groups.max().reset_index()
top_scores = pd.merge(top_scores, lenskit_model, on=top_scores.columns.tolist(), how='inner')
top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore': 'top_zscore'})

# Mean, sample standard deviation and median per group.
avg_scores = score_groups.mean().reset_index(name='avg_score')
std_scores = score_groups.std().reset_index(name='std_score')
median_scores = score_groups.median().reset_index(name='median_score')

scores_df = pd.merge(top_scores, avg_scores, on=group_keys)
scores_df = pd.merge(scores_df, std_scores, on=group_keys)
scores_df = pd.merge(scores_df, median_scores, on=group_keys)

# Most frequent score per group. Depending on the SciPy version, mode() returns
# either a length-1 array or a scalar for 1-D input; atleast_1d handles both
# (the original `[0][0]` indexing raises on the scalar form).
mode_scores = score_groups.agg(lambda x: np.atleast_1d(scipy.stats.mode(x)[0])[0]).reset_index()
mode_scores = pd.merge(mode_scores, lenskit_model, on=mode_scores.columns.tolist(), how='inner')
mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore': 'mode_zscore'})

# One frame with the mode columns first, followed by everything in scores_df.
lenskit_scores = pd.merge(mode_scores[group_keys + ['mode_score', 'mode_zscore']], scores_df, on=group_keys)

In [9]:
# Preview the first five rows of the combined score table.
lenskit_scores.head(n=5)

Unnamed: 0,member_id,subscription_start,subscription_end,item_id,mode_score,mode_zscore,top_score,median,skew,std,...,kurtosis,model_loops,model_run,member_period,formatted_title,period,top_zscore,avg_score,std_score,median_score
0,hemingway-ernest,1921-12-28,1922-11-08,a-e-candle-vision,-0.211456,-2.496513,0.362358,0.106474,-0.220694,0.122719,...,-0.040707,100,22,hemingway-ernest: 1921-12-28/1922-11-08,The Candle of Vision byÆ,1921-12-28/1922-11-08,2.179319,0.094914,0.122719,0.106474
1,hemingway-ernest,1921-12-28,1922-11-08,a-e-collected-poems,-0.322248,-2.206841,0.399307,-0.019966,0.190475,0.132462,...,0.131728,100,64,hemingway-ernest: 1921-12-28/1922-11-08,Collected Poems byÆ,1921-12-28/1922-11-08,3.240428,-0.029926,0.132462,-0.019966
2,hemingway-ernest,1921-12-28,1922-11-08,a-e-imaginations-reveries,-0.09073,-1.863097,0.327839,0.082864,0.207887,0.096638,...,-0.525446,100,22,hemingway-ernest: 1921-12-28/1922-11-08,Imaginations and Reveries byÆ,1921-12-28/1922-11-08,2.46822,0.089315,0.096638,0.082864
3,hemingway-ernest,1921-12-28,1922-11-08,abercrombie-speculative-dialogues,-0.693151,-3.2357,0.154087,-0.197518,-0.325633,0.152323,...,0.735525,100,17,hemingway-ernest: 1921-12-28/1922-11-08,Speculative Dialogues by Lascelles Abercrombie,1921-12-28/1922-11-08,2.326402,-0.200279,0.152323,-0.197518
4,hemingway-ernest,1921-12-28,1922-11-08,abercrombie-thomas-hardy-critical,-0.053308,-2.433895,0.245264,0.087059,0.055321,0.058677,...,0.016082,100,34,hemingway-ernest: 1921-12-28/1922-11-08,Thomas Hardy: A Critical Study by Lascelles Ab...,1921-12-28/1922-11-08,2.654514,0.089505,0.058677,0.087059


In [10]:
# Break each formatted title into a list of lines at most 50 characters wide.
lenskit_model['formatted_wrapped_title'] = lenskit_model['formatted_title'].apply(
    lambda title: wrap(title, 50)
)

## Collaborative Filtering

There are a number of permutations available with collaborative filtering:

- Keep all data including periodicals
- Exclude periodicals
- Limit to circulation period when making predictions

In [11]:
# Load the memory-based collaborative filtering results. The item column is renamed
# to item_id so that get_formatted_titles can look each item up.
collaborative_filtering_model = (
    pd.read_csv("../speculative_reading/data/memorycf_full_results.csv")
    .rename(columns={'item_uri': 'item_id'})
)
collaborative_filtering_model['formatted_title'] = collaborative_filtering_model.apply(
    get_formatted_titles, axis=1
)

# Load the collaborative filtering predictions as a separate frame.
collaborative_filtering_scores = pd.read_csv(
    "../speculative_reading/data/full_collaborative_filtering_predictions.csv"
)

# Show which columns the results frame now has.
collaborative_filtering_model.columns

Index(['item_id', 'formatted_chart_title', 'formatted_table_title',
       'member_id', 'period', 'subscription_start', 'subscription_end',
       'metric', 'score', 'formatted_title'],
      dtype='object')

In [12]:
# Keep only the columns used from here on. The explicit copy makes this an
# independent frame, so the assignments below cannot raise SettingWithCopyWarning.
collaborative_filtering_model = collaborative_filtering_model[
    ['item_id', 'formatted_title', 'member_id', 'period', 'subscription_start', 'subscription_end', 'metric', 'score']
].copy()

# Replace the generated title for this work with a hand-written one that names both authors.
is_instigations = collaborative_filtering_model.formatted_title.str.contains('Instigations of Ezra Pound', regex=False)
collaborative_filtering_model.loc[is_instigations, 'formatted_title'] = 'Instigations of Ezra Pound: Together with an Essay on the Chinese Written Character by Ezra Pound and Ernest Fenollosa'

# Break each title into a list of lines at most 50 characters wide.
collaborative_filtering_model['formatted_wrapped_title'] = collaborative_filtering_model.formatted_title.apply(wrap, args=[50])

# Swap the short metric codes for display labels; any other value is left unchanged.
metric_labels = {
    'cosine': 'Cosine Distance',
    'pearson': 'Pearson Correlation Coefficient',
    'euclidean': 'Euclidean Distance',
}
collaborative_filtering_model['metric'] = collaborative_filtering_model['metric'].replace(metric_labels)


In [13]:
# Clicking a legend entry selects that subscription period.
selection = alt.selection_point(fields=['period'], bind='legend')

# Free-text search box; its value is used as a case-insensitive regular expression.
search_box = alt.param(
    value='',
    bind=alt.binding(
        input='search',
        placeholder="Search title",
        name='Search ',
    )
)

# Plot the first 100 rows of the collaborative filtering results.
alt.Chart(collaborative_filtering_model[0:100]).mark_point(opacity=0.8, filled=True, thickness=3).encode(
    # y: formatted title, ordered by descending score
    y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title='', labelFontSize=9)),
    # x: prediction score
    x='score',
    # colour: subscription period
    color=alt.Color('period:N', legend=alt.Legend(title=["Subscription Period"])),
    # points outside the selected period are faded
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    # shape: similarity metric used by the collaborative filtering method
    shape=alt.Shape('metric:N', legend=alt.Legend(title='Method for Collaborative Filtering')),
).add_params(
    search_box, selection
).transform_filter(
    # Match against the field that is actually plotted. The frame no longer has a
    # formatted_chart_title column, so filtering on it could never match a title.
    alt.expr.test(alt.expr.regexp(search_box, 'i'), alt.datum.formatted_title)
).properties(
    width=300,
    title='Top Predictions by Collaborative Filtering Memory-Based Methods'
)

## Popular Titles

In [14]:
# Define a recommender using the algorithm that worked best in the eval script:
# implicit-feedback ALS matrix factorization. use_ratings=False is passed, and every
# row of `ratings` carries the same constant rating of 1 anyway.
# NOTE(review): 50 is presumably the number of latent features — confirm against the
# installed LensKit version.
rec = Recommender.adapt(als.ImplicitMF(50, use_ratings=False))
# Fit on the full set of member/item interactions (no train/test split here).
rec.fit(ratings)

# Define a popularity recommender for comparison, fitted on the same interactions.
popular = Recommender.adapt(basic.Popular())
# This call is the cell's last expression, so its return value is displayed as output.
popular.fit(ratings)

Numba is using threading layer workqueue - consider TBB
found 1 potential runtime problems - see https://boi.st/lkpy-perf


<lenskit.algorithms.basic.Popular at 0x16db19130>

In [15]:
# One row per distinct (member, subscription period) in the LensKit results, with
# the period boundaries also parsed to datetimes (unparseable values become NaT).
popular_periods = (
    lenskit_model[['member_id', 'subscription_start', 'subscription_end']]
    .drop_duplicates()
    .assign(
        subscription_start_dt=lambda periods: pd.to_datetime(periods['subscription_start'], errors='coerce'),
        subscription_end_dt=lambda periods: pd.to_datetime(periods['subscription_end'], errors='coerce'),
    )
)
popular_periods

Unnamed: 0,member_id,subscription_start,subscription_end,subscription_start_dt,subscription_end_dt
0,hemingway-ernest,1921-12-28,1922-11-08,1921-12-28,1922-11-08
59500,hemingway-ernest,1924-03-28,1925-03-28,1924-03-28,1925-03-28


In [16]:
from dateutil.relativedelta import relativedelta

# Look-back applied before each subscription start when collecting circulating events.
sixmonths = relativedelta(months=6)

last_member = None

# One combined frame (all-time + current popularity) per subscription period.
popularity_results = []

for _, row in popular_periods.iterrows():
    user_id = row.member_id
    circulation_start = row.subscription_start_dt - sixmonths
    period_label = row.subscription_start + '/' + row.subscription_end

    # An event counts as circulating when its start date or its end date falls
    # inside [circulation_start, subscription end], bounds included.
    # NOTE(review): an event that starts before and ends after this window is not picked up.
    starts_in_window = dated_events_df.start_date_dt.between(circulation_start, row.subscription_end_dt)
    ends_in_window = dated_events_df.end_date_dt.between(circulation_start, row.subscription_end_dt)
    circulating_book_events = dated_events_df[starts_in_window | ends_in_window]

    # All-time popularity, restricted to the distinct items circulating in the window.
    item_ids = circulating_book_events.item_id.unique()
    pop_predictions = popular.recommend(user_id, n=len(item_ids), candidates=item_ids)
    pop_predictions = pop_predictions.rename(columns={'item': 'item_id'}).assign(
        type='popular (all time)',
        subscription_period=period_label,
    )

    # Current popularity: number of circulating events per item, most frequent first.
    popular_current = (
        circulating_book_events.groupby(['item_id'])['item_id']
        .count()
        .reset_index(name='score')
        .sort_values(['score'], ascending=False)
        .assign(type='popular (current)', subscription_period=period_label)
    )

    combined = pd.concat([pop_predictions, popular_current])
    popularity_results.append(combined)



In [17]:
# Stack the per-period popularity frames; each piece keeps its own row labels.
popularity_results_df = pd.concat(popularity_results, axis=0)
popularity_results_df.head(n=5)

Unnamed: 0,item_id,score,type,subscription_period
0,joyce-portrait-artist-young,50.0,popular (all time),1921-12-28/1922-11-08
1,joyce-dubliners,45.0,popular (all time),1921-12-28/1922-11-08
2,richardson-pointed-roofs,40.0,popular (all time),1921-12-28/1922-11-08
3,mansfield-garden-party-stories,34.0,popular (all time),1921-12-28/1922-11-08
4,mansfield-bliss-short-stories,31.0,popular (all time),1921-12-28/1922-11-08
