# Interactive Charts for Publication

## Load libraries and data

In [1]:
# Make sure the lenskit library is importable, installing it on demand if it is not.
try:
    import lenskit
    print("Lenskit is already installed.")
except ImportError:
    print("Lenskit is not installed. Installing...")
    import importlib
    import subprocess
    import sys

    # Run pip through the kernel's own interpreter so the package is installed
    # into the environment this notebook imports from; a bare "pip" found on
    # PATH may belong to a different Python installation.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lenskit"])
    # Let the import system notice the freshly installed package so the
    # lenskit imports in the following cell succeed without a kernel restart.
    importlib.invalidate_caches()

Lenskit is already installed.


In [2]:
# Standard library imports
import os
import sys
from typing import List

# Third party imports
import altair as alt  # For data visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
import scipy
from scipy.stats import zscore  # For statistical computations
from sklearn.preprocessing import MinMaxScaler  # For data preprocessing
from tqdm.notebook import tqdm  # For progress bars
from textwrap import wrap

# LensKit imports
from lenskit import Recommender, topn, util, batch, crossfold as xf  # For recommendation systems
from lenskit.algorithms import als, basic  # For recommendation algorithms

# Local application/library specific imports
sys.path.append("..")
from utils.missing_data_processing import *  # For handling missing data

In [3]:
# Load the initial data into four DataFrames: events_df, members_df, books_df, and borrow_overrides_df.
events_df, members_df, books_df, borrow_overrides_df = load_initial_data()

# Process the events data to clean it and prepare it for analysis.
events_df = preprocess_events_data(events_df)


def _second_to_last_uri_segment(uri):
    """Return the second-to-last '/'-separated part of a URI, or None when the URI is missing."""
    return uri.split("/")[-2] if pd.notna(uri) else None


# Extract the item ID from the URI in the books DataFrame.
# The item ID is the second to last part of the URI.
books_df["item_id"] = books_df.uri.apply(_second_to_last_uri_segment)

# Create a lookup table for item titles (item_id -> title).
title_lookup = dict(zip(books_df.item_id, books_df.title))

# Generate short IDs for the members in the members DataFrame.
# The ID is the second to last part of the URI. The same helper is used as for
# books, so a missing member URI yields None instead of raising AttributeError.
members_df["id"] = members_df.uri.apply(_second_to_last_uri_segment)

# Get all member-book interactions from the events DataFrame.
# Only include rows where the item URI is not null AND the item is a book
# whose author is not null.
authored_item_ids = books_df.loc[books_df.author.notna(), "item_id"]
interactions_df = events_df[
    events_df.item_uri.notna() & events_df.item_id.isin(authored_item_ids)
].copy()

# Restrict the interactions to borrow events only.
interactions_df = interactions_df[interactions_df.event_type == 'Borrow'].copy()

# Reduce the interactions DataFrame to the minimum user/item interaction fields and drop duplicate rows.
unique_interactions_df = interactions_df[
    ["member_id", "item_id"]
].drop_duplicates(subset=["member_id", "item_id"])

# Rename the columns to the names expected by LensKit.
# The DataFrame is renamed to 'ratings' for use with the tutorial.
ratings = unique_interactions_df.rename(columns={'member_id': 'user', 'item_id': 'item'})

# The example assumes a rating. Use a 1/0 rating and set all to 1 to confirm interaction.
ratings['rating'] = 1

# Keep a copy of the events with parsed start/end dates.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
dated_events_df = events_df.copy()
dated_events_df['start_date_dt'] = pd.to_datetime(dated_events_df['start_date'], errors='coerce')
dated_events_df['end_date_dt'] = pd.to_datetime(dated_events_df['end_date'], errors='coerce')

## Popular Titles

In [12]:
# Read the precomputed popularity results from CSV (the file name indicates periodicals are excluded).
popularity_results = pd.read_csv("../speculative_reading/data/lenskit_results/popularity_results_without_periodicals.csv")

## Lenskit Recommendations

In [7]:
def _load_lenskit_predictions(csv_path):
  """Read one LensKit prediction CSV and add a 'member_period' label column.

  The label has the form '<member_id>: <subscription_start>/<subscription_end>'.
  """
  predictions = pd.read_csv(csv_path)
  predictions['member_period'] = (
    predictions.member_id
    + ': '
    + predictions.subscription_start.astype(str)
    + '/'
    + predictions.subscription_end.astype(str)
  )
  return predictions


# Load the four LensKit prediction tables (full / sampled, each raw and aggregated).
full_lenskit_model = _load_lenskit_predictions('../speculative_reading/data/lenskit_results/full_predictions_model100_without_periodicals.csv')
aggregated_lenskit_model = _load_lenskit_predictions('../speculative_reading/data/lenskit_results/aggregated_full_predictions_model100_without_periodicals.csv')
sampled_lenskit_model = _load_lenskit_predictions('../speculative_reading/data/lenskit_results/sampled_predictions_model100_without_periodicals.csv')
aggregated_sampled_lenskit_model = _load_lenskit_predictions('../speculative_reading/data/lenskit_results/aggregated_sampled_predictions_model100_without_periodicals.csv')

In [11]:
def get_formatted_titles(row):
  """Build a display title for the book referenced by ``row.item_id``.

  The book is looked up in the module-level ``books_df`` (the frame is filtered
  on every call).

  Args:
    row: Any object exposing an ``item_id`` attribute, e.g. a DataFrame row.

  Returns:
    ``'<title> by <author>'`` where a comma-separated author such as
    ``'Pound, Ezra'`` is reordered to ``'Ezra Pound'``, or
    ``'<title> (Periodical)'`` when any matching book row has no author.

  Raises:
    IndexError: If no row of ``books_df`` has the given ``item_id``.
  """
  item = books_df[books_df.item_id == row.item_id]
  if item.empty:
    raise IndexError(f"No book found in books_df for item_id {row.item_id!r}")

  title = item.title.values[0]
  if item.author.isna().any():
    # Keep a space between the title and the marker.
    return title + ' (Periodical)'

  # Strip each comma-separated part before reversing so the result has exactly
  # one space between words, whether or not the source had a space after the
  # comma (and also when the author contains no comma at all).
  name_parts = [part.strip() for part in item.author.values[0].split(',')]
  author = ' '.join(part for part in reversed(name_parts) if part)
  return title + ' by ' + author

# Register tqdm's pandas integration so progress_apply shows a labelled progress bar.
tqdm.pandas(desc="Formatting Titles")

# Add 'formatted_title' to every LensKit frame that does not already have it.
for lenskit_frame in (
  full_lenskit_model,
  aggregated_lenskit_model,
  sampled_lenskit_model,
  aggregated_sampled_lenskit_model,
):
  if 'formatted_title' in lenskit_frame.columns:
    continue
  lenskit_frame['formatted_title'] = lenskit_frame.progress_apply(get_formatted_titles, axis=1)


Formatting Titles:   0%|          | 0/175700 [00:00<?, ?it/s]

In [13]:
# Register tqdm's pandas integration so progress_apply shows a labelled progress bar.
tqdm.pandas(desc="Wrapping Titles")

# Add 'formatted_wrapped_title' (the formatted title split by textwrap.wrap at
# width 50) to every LensKit frame that does not already have it.
for lenskit_frame in (
  full_lenskit_model,
  aggregated_lenskit_model,
  sampled_lenskit_model,
  aggregated_sampled_lenskit_model,
):
  if 'formatted_wrapped_title' in lenskit_frame.columns:
    continue
  lenskit_frame['formatted_wrapped_title'] = lenskit_frame.formatted_title.progress_apply(wrap, args=[50])


Wrapping Titles:   0%|          | 0/175700 [00:00<?, ?it/s]

Wrapping Titles:   0%|          | 0/1757 [00:00<?, ?it/s]

Wrapping Titles:   0%|          | 0/6300 [00:00<?, ?it/s]

Wrapping Titles:   0%|          | 0/72 [00:00<?, ?it/s]

## Collaborative Filtering

There are several permutations available with collaborative filtering:

- Keep all data including periodicals
- Exclude periodicals
- Limit to circulation period when making predictions

In [22]:
def _read_cf_predictions(csv_path):
  """Read a collaborative filtering CSV and rename its 'item_uri' column to 'item_id'."""
  return pd.read_csv(csv_path).rename(columns={'item_uri': 'item_id'})


# Load the four collaborative filtering tables (full / top200, each raw and aggregated).
full_collaborative_filtering_model = _read_cf_predictions("../speculative_reading/data/collaborative_filtering_results/full_predictions_without_periodicals_circulation_limited.csv")
aggregated_full_collaborative_filtering_model = _read_cf_predictions("../speculative_reading/data/collaborative_filtering_results/aggregated_full_predictions_without_periodicals_circulation_limited.csv")
sampled_collaborative_filtering_model = _read_cf_predictions("../speculative_reading/data/collaborative_filtering_results/top200_predictions_without_periodicals_circulation_limited.csv")
aggregated_sampled_collaborative_filtering_model = _read_cf_predictions("../speculative_reading/data/collaborative_filtering_results/aggregated_top200_predictions_without_periodicals_circulation_limited.csv")

# Register tqdm's pandas integration so progress_apply shows a labelled progress bar.
tqdm.pandas(desc="Formatting Titles")

# Add 'formatted_title' to every frame that does not already have it.
for cf_frame in (
  full_collaborative_filtering_model,
  aggregated_full_collaborative_filtering_model,
  sampled_collaborative_filtering_model,
  aggregated_sampled_collaborative_filtering_model,
):
  if 'formatted_title' in cf_frame.columns:
    continue
  cf_frame['formatted_title'] = cf_frame.progress_apply(get_formatted_titles, axis=1)


Formatting Titles:   0%|          | 0/2838 [00:00<?, ?it/s]

Formatting Titles:   0%|          | 0/946 [00:00<?, ?it/s]

Formatting Titles:   0%|          | 0/180 [00:00<?, ?it/s]

Formatting Titles:   0%|          | 0/60 [00:00<?, ?it/s]

In [23]:
# Columns kept for the two non-aggregated collaborative filtering frames.
cf_columns = ['item_id', 'formatted_title', 'member_id', 'period', 'subscription_start', 'subscription_end', 'metric', 'score']

# .copy() makes each trimmed frame an independent DataFrame, so the assignments
# below modify it directly and do not trigger pandas' SettingWithCopyWarning.
full_collaborative_filtering_model = full_collaborative_filtering_model[cf_columns].copy()
sampled_collaborative_filtering_model = sampled_collaborative_filtering_model[cf_columns].copy()

# Replace the generated title of this one work with a hand-written version.
# na=False treats rows with a missing title as non-matches so the mask stays
# boolean; regex=False matches the text literally.
is_instigations = full_collaborative_filtering_model.formatted_title.str.contains(
  'Instigations of Ezra Pound', na=False, regex=False
)
full_collaborative_filtering_model.loc[is_instigations, 'formatted_title'] = 'Instigations of Ezra Pound: Together with an Essay on the Chinese Written Character by Ezra Pound and Ernest Fenollosa'

# Add 'formatted_wrapped_title' (the formatted title split by textwrap.wrap at
# width 50) to every frame that does not already have it.
tqdm.pandas(desc="Wrapping Titles")
for cf_frame in (
  full_collaborative_filtering_model,
  sampled_collaborative_filtering_model,
  aggregated_full_collaborative_filtering_model,
  aggregated_sampled_collaborative_filtering_model,
):
  if 'formatted_wrapped_title' not in cf_frame.columns:
    cf_frame['formatted_wrapped_title'] = cf_frame.formatted_title.progress_apply(wrap, args=[50])

# Swap the short metric codes for the labels shown in the chart legend.
# Values that are not keys of the mapping are left unchanged.
metric_labels = {
  'cosine': 'Cosine Distance',
  'pearson': 'Pearson Correlation Coefficient',
  'euclidean': 'Euclidean Distance',
}
for cf_frame in (full_collaborative_filtering_model, sampled_collaborative_filtering_model):
  cf_frame['metric'] = cf_frame['metric'].replace(metric_labels)


Wrapping Titles:   0%|          | 0/2838 [00:00<?, ?it/s]

Wrapping Titles:   0%|          | 0/180 [00:00<?, ?it/s]

Wrapping Titles:   0%|          | 0/946 [00:00<?, ?it/s]

Wrapping Titles:   0%|          | 0/60 [00:00<?, ?it/s]

In [34]:
# Create legend-bound selections for the scatter plot: clicking a legend entry
# selects the points with that period / metric.
period_selection = alt.selection_point(fields=['period'], bind='legend')
# selection_point is the current replacement for the deprecated
# alt.selection_multi and matches the period selection above.
metric_selection = alt.selection_point(fields=['metric'], bind='legend')

# Create a search input selection
search_box = alt.param(
    value='',
    bind=alt.binding(
        input='search',
        placeholder="Search title",
        name='Search ',
    )
)

alt.Chart(sampled_collaborative_filtering_model).mark_point(opacity=0.8, filled=True, thickness=3).encode(
    # The y-axis is the formatted title, sorted by descending x-axis values
    y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title='', labelFontSize=9, labelLimit=1000, labelPadding=0, titlePadding=0, titleFontSize=9, titleAngle=0, titleAlign='left', titleY=-10, titleX=100)),
    # The x-axis is the score
    x='score',
    # The color represents the subscription period
    color=alt.Color('period:N', legend=alt.Legend(title=["Subscription Period"])),
    # Points matching both legend selections are fully opaque; the rest are faded
    opacity=alt.condition(period_selection & metric_selection, alt.value(1), alt.value(0.2)),
    # The shape represents the method for collaborative filtering
    shape=alt.Shape('metric:N', legend=alt.Legend(title='Method for Collaborative Filtering')),
    tooltip=['formatted_title', 'score', 'period', 'metric'],
).add_params(
    search_box, period_selection, metric_selection
).transform_filter(
    # Keep only rows whose title matches the search text (case-insensitive regex)
    alt.expr.test(alt.expr.regexp(search_box, 'i'), alt.datum.formatted_title)
    ).properties(
    # Set the width and height of the plot
    width=300,
    # height=1800,
    # Set the title of the plot
    title='Top Predictions by Collaborative Filtering Memory-Based Methods'
)