# Generate and Evaluate LensKit Model Stability and Select Scores for Predictions

## Load libraries and data

In [1]:
# Make sure the lenskit library can be imported; install it if the import fails.
try:
    import lenskit
    print("Lenskit is already installed.")
except ImportError:
    print("Lenskit is not installed. Installing...")
    import subprocess
    import sys
    # Invoke pip through the interpreter that is running this code, so the
    # package is installed into the same environment that will import it
    # (a bare "pip" on PATH may belong to a different Python).
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lenskit"])

Lenskit is already installed.


In [50]:
# Standard library imports
import os
import sys
from typing import List

# Third party imports
import altair as alt  # For data visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from scipy.stats import zscore  # For statistical computations
from sklearn.preprocessing import MinMaxScaler  # For data preprocessing
from tqdm.notebook import tqdm  # For progress bars

# LensKit imports
from lenskit import Recommender, topn, util, batch, crossfold as xf  # For recommendation systems
from lenskit.algorithms import als, basic  # For recommendation algorithms

# Local application/library specific imports
sys.path.append("..")
from utils.missing_data_processing import *  # For handling missing data

In [9]:
# Read the four source tables: events, members, books, and borrow overrides.
events_df, members_df, books_df, borrow_overrides_df = load_initial_data()

# Run the shared preprocessing step on the events table.
events_df = preprocess_events_data(events_df)

# Short book ID: the second-to-last "/"-separated segment of the URI
# (None when the URI is missing).
books_df["item_id"] = books_df.uri.apply(
    lambda uri: uri.split("/")[-2] if pd.notna(uri) else None
)

# Short member ID, taken from the same position in the member URI.
members_df["id"] = members_df.uri.apply(lambda uri: uri.split("/")[-2])

# Member-book interactions: events that reference an item ...
has_item = events_df.item_uri.notna()
interactions_df = events_df[has_item].copy()

# ... restricted to borrow events.
is_borrow = interactions_df.event_type == 'Borrow'
interactions_df = interactions_df[is_borrow].copy()

# One row per distinct (member, item) pair.
interaction_columns = ["member_id", "item_id"]
unique_interactions_df = interactions_df[interaction_columns].drop_duplicates(
    subset=interaction_columns
)

# LensKit expects 'user' and 'item' columns; the frame is called 'ratings'
# to line up with the tutorial.
ratings = unique_interactions_df.rename(
    columns={'member_id': 'user', 'item_id': 'item'}
)

# Every observed pair gets a constant rating of 1 (interaction confirmed).
ratings['rating'] = 1

# Preview the first 10 rows.
ratings.head(10)

Unnamed: 0,user,item,rating
2,rhys,conrad-typhoon,1
8,lanux-eyre-de,woolf-night-day,1
12,tery,james-joyce,1
13,tery,freeman-portrait-george-moore,1
22,macleish-ada,stern-tents-israel,1
38,alvear,yeats-later-poems,1
46,joyce-james,mantzius-history-theatrical-art,1
51,joyce-james,scott-poems-walter-scott,1
52,joyce-james,chekhov-horse-stealers-stories,1
53,joyce-james,stephens-crock-gold,1


In [11]:
# Read the previously computed partial borrowers list (sequential / near
# sequential subscriptions collapsed), ordered by known borrows, highest first.
partial_borrowers = pd.read_csv(
    '../appendix/speculative_reading/data/partial_borrowers_collapsed.csv'
).sort_values('known_borrows', ascending=False)

# Subscription dates are parsed so they can be compared against event dates
# when identifying circulating books.
for date_column in ('subscription_start', 'subscription_end'):
    partial_borrowers[date_column] = pd.to_datetime(partial_borrowers[date_column])

partial_borrowers.head(1)

Unnamed: 0,member_id,subscription_start,subscription_end,subscription_events,subscription_volumes,subscription_days,internal_gaps,known_borrows
23,raphael-france,1920-04-30,1921-11-17,Subscription;Renewal;Renewal;Renewal,1.0,566,0;0;0,1008


In [12]:
# Copy of the events dataset with parsed date columns, used to identify books
# in circulation during and before these subscriptions.
# Values that cannot be parsed become NaT (errors='coerce').
dated_events_df = events_df.copy()
for source_column in ('start_date', 'end_date'):
    dated_events_df[f'{source_column}_dt'] = pd.to_datetime(
        dated_events_df[source_column], errors='coerce'
    )

## Fit initial model

### Run Model Comparisons

In [15]:

# Define constants
# Presumably the intended length of each recommendation list.
# NOTE(review): nothing visible in this file reads this constant
# (get_predictions calls rec.recommend without a list size) -- confirm
# whether it is still needed.
N_RECOMMENDATIONS = 20

def get_item_ids(user_id: str, bookless_sub: pd.Series, dated_events_df: pd.DataFrame, events_df: pd.DataFrame) -> List[str]:
    """
    Get the candidate item IDs for one subscription period.

    Candidates are the items that appear in any event whose start or end date
    falls before the end of the subscription, minus every item that already
    appears in an event associated with the member.

    Parameters
    ----------
    user_id : str
        The user ID for the member. It is matched as a literal substring of
        ``events_df.member_id``.
    bookless_sub : pd.Series
        Any object exposing a ``subscription_end`` attribute (a Series or an
        ``itertuples`` row both work).
    dated_events_df : pd.DataFrame
        Events with parsed ``start_date_dt`` / ``end_date_dt`` columns and an
        ``item_id`` column.
    events_df : pd.DataFrame
        Events with ``member_id`` and ``item_id`` columns.

    Returns
    -------
    List[str]
        Item IDs in circulation by the end of the subscription that the member
        has no recorded event for. The order of the list is not defined.
    """
    cutoff = bookless_sub.subscription_end
    seen_before_cutoff = (
        (dated_events_df.start_date_dt < cutoff)
        | (dated_events_df.end_date_dt < cutoff)
    )
    circulating_ids = dated_events_df.loc[seen_before_cutoff, 'item_id'].dropna().unique()

    # regex=False: the ID is data, not a pattern, so characters such as "."
    # must not match other members. na=False: events without a member ID are
    # treated as "not this member" instead of producing NaN in the mask.
    is_member_event = events_df.member_id.str.contains(user_id, regex=False, na=False)
    member_book_ids = events_df.loc[is_member_event, 'item_id'].dropna().unique()

    return list(set(circulating_ids) - set(member_book_ids))

def get_predictions(user_id: str, bookless_sub: pd.Series, rec: Recommender, subset_item_ids: List[str]) -> pd.DataFrame:
    """
    Score the candidate items for one member and label the result.

    Parameters
    ----------
    user_id : str
        The user ID for the member
    bookless_sub : pd.Series
        Object exposing ``subscription_start`` and ``subscription_end``
    rec : Recommender
        The fitted recommender model
    subset_item_ids : List[str]
        Candidate item IDs passed to the recommender

    Returns
    -------
    pd.DataFrame
        The recommender's output with ``member_id``, ``subscription_start``
        and ``subscription_end`` columns added and ``item`` renamed to
        ``item_id``
    """
    recs = rec.recommend(user_id, candidates=subset_item_ids)

    # Tag every recommendation with the member and subscription it belongs to.
    labels = {
        'member_id': user_id,
        'subscription_start': bookless_sub.subscription_start,
        'subscription_end': bookless_sub.subscription_end,
    }
    for column, value in labels.items():
        recs[column] = value

    recs.rename(columns={'item': 'item_id'}, inplace=True)
    return recs

def _summarize_run_scores(all_recs_df: pd.DataFrame, run_length: int, return_scores: bool) -> pd.DataFrame:
    """Collapse the scores of one run length into per-item distribution metrics."""
    group_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']
    grouped_scores = all_recs_df.groupby(group_keys)['score']

    metrics_df = grouped_scores.agg(['median', 'skew', 'std', 'var']).reset_index()
    kurt_df = grouped_scores.apply(pd.Series.kurt).reset_index(name='kurtosis')

    summary_df = pd.merge(metrics_df, kurt_df, on=group_keys)
    summary_df['model_loops'] = run_length
    if return_scores:
        # Attach every individual score (one row per model fit) to the metrics.
        summary_df = pd.merge(summary_df, all_recs_df, on=group_keys, how='left')
    return summary_df


def run_model_comparisons(number_of_runs: List[int], return_scores: bool, output_path: str, members: List[str]) -> pd.DataFrame:
    """
    Run model comparisons for a list of run lengths

    For each run length, an ImplicitMF model is re-fitted that many times on
    the module-level ``ratings`` frame. Each fit scores the candidate items of
    every subscription in the module-level ``partial_borrowers`` frame whose
    member is listed in ``members``. The scores are then summarized per
    member / subscription / item (median, skew, std, var, kurtosis).

    If ``output_path`` already exists, it is loaded and returned without
    fitting anything. Otherwise the result is written there once all run
    lengths have finished, so an interrupted run never leaves a partial file
    that a later call would mistake for a complete cache.

    Parameters
    ----------
    number_of_runs : List[int]
        A list of run lengths (number of model fits per comparison)
    return_scores : bool
        If True, the individual score of every fit is merged onto the
        summary rows (adding ``score`` and ``model_run`` columns)
    output_path : str
        CSV path used both as cache and as output location
    members : List[str]
        A list of member IDs

    Returns
    -------
    pd.DataFrame
        One frame with the summaries of all run lengths, tagged with
        ``model_loops``

    Raises
    ------
    ValueError
        If ``number_of_runs`` is empty or no subscription matches ``members``
    """
    if os.path.exists(output_path):
        return pd.read_csv(output_path)

    if not number_of_runs:
        raise ValueError("number_of_runs must contain at least one run length")

    # Candidate items depend only on the subscription, not on the model fit,
    # so they are computed once instead of once per fit.
    wanted_members = set(members)
    subscriptions = [
        (sub, get_item_ids(sub.member_id, sub, dated_events_df, events_df))
        for sub in partial_borrowers.itertuples()
        if sub.member_id in wanted_members
    ]
    if not subscriptions:
        raise ValueError("None of the requested members has a subscription in partial_borrowers")

    # Create the output directory up front so a missing folder fails before
    # the expensive model fitting rather than after it.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model_runs = []
    for run_length in number_of_runs:
        all_recs = []
        for index in tqdm(range(run_length)):
            rec = Recommender.adapt(als.ImplicitMF(50, use_ratings=False))
            rec.fit(ratings)
            for bookless_sub, subset_item_ids in subscriptions:
                predictions = get_predictions(bookless_sub.member_id, bookless_sub, rec, subset_item_ids)
                predictions['model_run'] = index
                all_recs.append(predictions)
        all_recs_df = pd.concat(all_recs)
        model_runs.append(_summarize_run_scores(all_recs_df, run_length, return_scores))

    compare_models = pd.concat(model_runs)
    compare_models.to_csv(output_path, index=False)
    return compare_models

In [17]:
# Run sizes to compare, expressed as number of model runs
number_of_runs = [10, 20, 50, 100, 200]
# Members to generate scores for
members = [
    'kittredge-eleanor-hayden',
    'colens-fernand',
    'raphael-france',
    'hemingway-ernest',
]
compare_models = run_model_comparisons(
    number_of_runs=number_of_runs,
    return_scores=False,
    output_path='./data/lenskit_comparison_model_runs.csv',
    members=members,
)

  0%|          | 0/10 [00:00<?, ?it/s]

Numba is using threading layer workqueue - consider TBB
found 1 potential runtime problems - see https://boi.st/lkpy-perf


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

### Visualize stability of model scores

In [19]:
# Label each row "<member>: <start>/<end>" so a member's subscription period
# can be used as a single key.
subscription_span = (
    compare_models.subscription_start.astype(str)
    + '/'
    + compare_models.subscription_end.astype(str)
)
compare_models['member_period'] = compare_models.member_id + ': ' + subscription_span

In [21]:
def sample_scores(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> pd.DataFrame:
    """
    Select the rows of a limited set of books for every member period.

    For each ``member_period`` and each ``model_loops`` value within it, up to
    ``numb_of_books`` distinct items are picked. The picks of all loops are
    combined, and every row of the period that belongs to a picked item is
    returned. If a loop has fewer distinct items than requested, all of them
    are picked.

    Parameters:
    df (pd.DataFrame): Scores with ``member_period``, ``model_loops``,
        ``item_id`` and (when ``get_top`` is True) ``median`` columns.
    get_top (bool): If True, pick the items with the highest median score.
        If False, pick distinct items at random.
    numb_of_books (int): The number of books to pick per period and loop.
        Values below 1 select nothing.

    Returns:
    pd.DataFrame: The rows of ``df`` for the selected books. Empty (same
        columns) when ``df`` has no periods.
    """
    limit = max(numb_of_books, 0)
    sampled_frames = []

    # sort=False keeps periods and loops in order of first appearance.
    for _, period_rows in df.groupby('member_period', sort=False):
        selected_books = set()

        for _, loop_rows in period_rows.groupby('model_loops', sort=False):
            if get_top:
                # An item may occupy many rows (one per model run), so the
                # ranking is de-duplicated before taking the top entries.
                ranked = loop_rows.sort_values(by='median', ascending=False)
                picks = ranked.item_id.drop_duplicates().head(limit)
            else:
                # Sample from distinct items so the draw cannot come up short
                # because the same item was drawn twice.
                distinct_items = loop_rows.item_id.drop_duplicates()
                picks = distinct_items.sample(n=min(limit, len(distinct_items)))
            selected_books.update(picks)

        sampled_frames.append(period_rows[period_rows.item_id.isin(selected_books)])

    if not sampled_frames:
        return df.iloc[0:0].copy()

    return pd.concat(sampled_frames)

In [22]:
def visualize_model_stability(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> alt.Chart:
    """
    Chart how the score distribution metrics vary with the number of model loops.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the scores.
    get_top (bool): Passed to sample_scores; True selects the top-scoring books, False a random selection.
    numb_of_books (int): Passed to sample_scores; the number of books to sample scores for.

    Returns:
    alt.Chart: Box plots and scatter plots of the metrics, concatenated horizontally.
    """
    metric_columns = ['median', 'skew', 'std', 'var', 'kurtosis']
    id_columns = [
        'member_id', 'subscription_start', 'subscription_end',
        'item_id', 'model_loops', 'member_period',
    ]

    # Pick the books to plot, then rescale every metric with MinMaxScaler.
    scores = sample_scores(df, get_top, numb_of_books)
    scores[metric_columns] = MinMaxScaler().fit_transform(scores[metric_columns])

    # Long format: one row per (book, metric) pair.
    long_scores = scores.melt(id_vars=id_columns, value_vars=metric_columns)

    def shared_encodings():
        # Fresh encoding objects per chart; both charts use the same axes and facets.
        return {
            'x': alt.X('model_loops:O', axis=alt.Axis(title='')),
            'y': alt.Y('value', axis=alt.Axis(title='')),
            'column': alt.Column('variable', title=''),
        }

    # Box-and-whisker view of each metric per run length.
    boxplot = (
        alt.Chart(long_scores)
        .mark_boxplot()
        .encode(**shared_encodings())
        .properties(title="Variability with Box and Whiskers")
    )

    # Individual values of each metric per run length, colored by metric.
    metric_color = alt.Color('variable', legend=alt.Legend(title=['Measure of', 'Score Variability']))
    points = (
        alt.Chart(long_scores)
        .mark_circle()
        .encode(color=metric_color, **shared_encodings())
        .properties(title="Variability with Score Distributions")
    )

    combined = alt.hconcat(boxplot, points)
    return combined.properties(title='Variability in Predicted Scores By Resampling Implicit Matrix Factorization Model ')

In [25]:
# Plot the metrics for the 10 top-scoring books per period and run length,
# with horizontal x-axis labels and centered titles.
chart = visualize_model_stability(compare_models, get_top=True, numb_of_books=10)
chart.configure_axisX(labelAngle=0).configure_title(anchor='middle')

### Select Optimal Model and Generate Item Scores

In [27]:
# Final configuration: a single run length of 100 fits for one member,
# keeping the individual scores of every fit.
final_run = [100]
members = ['hemingway-ernest']
final_scores_path = f'./data/lenskit_model{final_run[0]}_scores.csv'
final_model = run_model_comparisons(
    number_of_runs=final_run,
    return_scores=True,
    output_path=final_scores_path,
    members=members,
)

  0%|          | 0/100 [00:00<?, ?it/s]

In [28]:
# Distinct member / subscription-period combinations present in the final model.
subscription_columns = ['member_id', 'subscription_start', 'subscription_end']
member_subscriptions = final_model.loc[:, subscription_columns].drop_duplicates()

In [29]:
# Number of score rows per member and subscription period.
final_model.value_counts(subset=['member_id', 'subscription_start', 'subscription_end'])

member_id         subscription_start  subscription_end
hemingway-ernest  1924-03-28          1925-03-28          128500
                  1921-12-28          1922-11-08           65400
dtype: int64

In [30]:
# Label each row "<member>: <start>/<end>" so a member's subscription period
# can be used as a single key.
final_span = (
    final_model.subscription_start.astype(str)
    + '/'
    + final_model.subscription_end.astype(str)
)
final_model['member_period'] = final_model.member_id + ': ' + final_span

In [31]:
# Distinct items scored per period, counted on model run 0 only so each item appears once.
first_run_mask = (final_model.member_id == 'hemingway-ernest') & (final_model.model_run == 0)
final_model.loc[first_run_mask].groupby('member_period')['item_id'].nunique()

member_period
hemingway-ernest: 1921-12-28/1922-11-08     654
hemingway-ernest: 1924-03-28/1925-03-28    1285
Name: item_id, dtype: int64

In [32]:
# Restrict the final model scores to a single member.
is_hemingway = final_model.member_id == 'hemingway-ernest'
subset_model = final_model[is_hemingway]

In [33]:
# Rows of the 36 highest-median books per subscription period; preview the first row.
top_results = sample_scores(subset_model, get_top=True, numb_of_books=36)
top_results.iloc[0:1]

Unnamed: 0,member_id,subscription_start,subscription_end,item_id,median,skew,std,var,kurtosis,model_loops,score,model_run,member_period
7200,hemingway-ernest,1921-12-28,1922-11-08,burney-evelina-history-young,0.320107,-0.048081,0.18916,0.035781,0.023806,100,0.809843,0,hemingway-ernest: 1921-12-28/1922-11-08


In [35]:
def get_formatted_titles(row):
    """
    Build a display label for the book referenced by ``row.item_id``.

    The book is looked up in the module-level ``books_df``. Books with an
    author are labelled ``"<title> by <First Last>"``: the author string is
    split on commas, each part is stripped, and the parts are joined in
    reverse order, so ``"Hemingway, Ernest"`` becomes ``"Ernest Hemingway"``.
    Books without an author are labelled ``"<title> (Periodical)"``.

    Parameters
    ----------
    row : pd.Series
        A row exposing an ``item_id`` attribute.

    Returns
    -------
    str
        The formatted label.

    Raises
    ------
    IndexError
        If no book in ``books_df`` has the given ``item_id``.
    """
    matches = books_df[books_df.item_id == row.item_id]
    if matches.empty:
        raise IndexError(f"No book found in books_df for item_id {row.item_id!r}")

    book = matches.iloc[0]
    if pd.isna(book['author']):
        # Other cells filter on the substring 'Periodical'; keep that word intact.
        byline = ' (Periodical)'
    else:
        # Strip each part so the label is spaced correctly whether or not the
        # author string contains a comma.
        name_parts = [part.strip() for part in book['author'].split(',')]
        byline = ' by ' + ' '.join(reversed(name_parts))

    return book['title'] + byline

In [36]:
# Add a human-readable "<title> by <author>" label to every row.
formatted_titles = top_results.apply(get_formatted_titles, axis=1)
top_results['formatted_title'] = formatted_titles

In [37]:
# Keep the text after the first ':' of member_period (the "<start>/<end>" part,
# including its leading space).
period_parts = top_results.member_period.str.split(':')
top_results['period'] = period_parts.str.get(1)

In [38]:
# Distinct items overall vs. distinct items whose label does not mention 'Periodical'.
without_periodicals = top_results[top_results.formatted_title.str.contains('Periodical') == False]
(len(top_results.item_id.unique()), len(without_periodicals.item_id.unique()))

(59, 52)

In [39]:
# Distinct items per subscription period.
top_results.groupby('period')['item_id'].nunique()

period
 1921-12-28/1922-11-08    36
 1924-03-28/1925-03-28    36
Name: item_id, dtype: int64

In [40]:
# Distinct items per subscription period, leaving out labels that mention 'Periodical'.
mentions_periodical = top_results.formatted_title.str.contains('Periodical')
top_results[mentions_periodical == False].groupby('period')['item_id'].nunique()

period
 1921-12-28/1922-11-08    34
 1924-03-28/1925-03-28    29
Name: item_id, dtype: int64

In [41]:
# Average score per period and item for one member, highest average first.
hemingway_results = top_results[top_results.member_id == 'hemingway-ernest']
mean_scores = hemingway_results.groupby(['period', 'item_id'])['score'].mean()
items = mean_scores.reset_index(name='avg').sort_values(by='avg', ascending=False)

In [42]:
# Number of averaged items per subscription period.
items.groupby('period').size()

period
 1921-12-28/1922-11-08    36
 1924-03-28/1925-03-28    36
dtype: int64

In [43]:
# Build one tick plot per member: one tick per model-run score, one row per
# predicted book, colored by subscription period.
charts = []
members = ['hemingway-ernest']
for member in members:
  member_results = top_results[top_results.member_id == member]
  tickplot = alt.Chart(member_results).mark_tick(opacity=0.7).encode(
    y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title="Predicted Book")),
    x='score',
    color=alt.Color('period:N', legend=alt.Legend(title="Missing Borrowing Records Period")),
    # facet='member_period:N'
  ).properties(
      title='Top Predictions by Implicit Matrix Factorization Model',
      width=300
  )
  charts.append(tickplot)


In [45]:
# Y-axis styling: horizontal, left-aligned title with pixel offsets, and a
# label limit of 1000.
y_axis_style = {
    'titleAngle': 0,
    'titleAlign': "left",
    'titleY': -10,
    'titleX': -100,
    'labelLimit': 1000,
}
charts[0].configure_axisY(**y_axis_style)

In [47]:
# alt.hconcat(*charts).configure_axisY(
#         titleAngle=0,
#         titleAlign="left",
#         titleY=-10,
#         titleX=-10,
#         labelLimit=1000
#     )

In [None]:
# # [top_results.member_period == 'colens-fernand: 1920-04-01/1920-07-07']
# tickplot = alt.Chart(top_results).mark_tick(opacity=0.7).encode(
#     y=alt.Y('item_id', sort='-x'),
#     x='score',
#     color='member_period:N',
#     # facet='member_period:N'
# )
# tickplot

In [48]:
# Save the top results, leaving out rows whose label mentions 'Periodical'.
non_periodical_results = top_results[top_results.formatted_title.str.contains('Periodical') == False]
non_periodical_results.to_csv('./data/top_scores_lenskit_model100.csv', index=False)

In [51]:
# Z-score of each score within its member / subscription / item group
# (ddof=1 is passed to scipy's zscore).
scores_by_item = final_model.groupby(
    ['member_id', 'subscription_start', 'subscription_end', 'item_id']
)['score']
final_model['zscore'] = scores_by_item.transform(lambda scores: zscore(scores, ddof=1))

In [52]:
# Highest score per member / subscription / item ...
top_score_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']
top_scores = final_model.groupby(top_score_keys, as_index=False)['score'].max()

# ... joined back on keys + score to recover the remaining columns of the
# matching row(s), including its zscore.
top_scores = top_scores.merge(final_model, on=top_scores.columns.tolist(), how='inner')

top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore': 'top_zscore'})

In [53]:
# Mean score per member / subscription / item, added to the top scores.
avg_score_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']
avg_scores = final_model.groupby(avg_score_keys)['score'].mean().rename('avg_score').reset_index()
scores_df = top_scores.merge(avg_scores, on=avg_score_keys)

In [54]:
# Standard deviation of the scores per member / subscription / item.
std_score_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']
std_scores = final_model.groupby(std_score_keys)['score'].std().reset_index(name='std_score')
scores_df = scores_df.merge(std_scores, on=std_score_keys)

In [55]:
# Median score per member / subscription / item.
median_score_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']
median_scores = (
    final_model.groupby(median_score_keys)['score']
    .median()
    .rename('median_score')
    .reset_index()
)
scores_df = scores_df.merge(median_scores, on=median_score_keys)

In [58]:
import numpy as np
import scipy.stats


def _modal_score(scores):
    # scipy.stats.mode returns a length-1 array for 1-D input in older SciPy
    # releases and a scalar in newer ones; np.atleast_1d accepts both shapes.
    return np.atleast_1d(scipy.stats.mode(scores.to_numpy()).mode)[0]


# Modal score per member / subscription / item ...
mode_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].agg(_modal_score).reset_index()
# ... joined back on keys + score to recover the remaining columns of the
# matching row(s), including its zscore.
mode_scores = pd.merge(mode_scores, final_model, on=mode_scores.columns.tolist(), how='inner')
mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})

In [59]:
# Combine the mode columns with the top / average / std / median scores.
final_score_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']
mode_columns = final_score_keys + ['mode_score', 'mode_zscore']
final_scores = mode_scores[mode_columns].merge(scores_df, on=final_score_keys)

In [None]:
# for index, group in member_subscriptions.iterrows():
#   print(group.to_dict())
#   rows = final_scores[(final_scores.member_id == group.member_id) & (final_scores.subscription_start == group.subscription_start) & (final_scores.subscription_end == group.subscription_end)]
#   print('top_scores:', rows.sort_values(by=['top_score'], ascending=False)[0:5][['item_id', 'top_score']].to_dict())
#   print('avg_scores:', rows.sort_values(by=['avg_score'], ascending=False)[0:5][['item_id', 'avg_score']].to_dict())
#   print('mode_scores:', rows.sort_values(by=['mode_score'], ascending=False)[0:5][['item_id', 'mode_score']].to_dict())

In [60]:
# Save the collapsed scores; the file name carries the run length.
collapsed_scores_path = f'./data/collapsed_lenskit_model{final_run[0]}_scores.csv'
final_scores.to_csv(collapsed_scores_path, index=False)