# S&Co Combine Recommendations

This notebook combines the outputs from our previous recommendation notebooks and compares the outputs. It also contains the code to generate figure 14 from our paper

## Load Libraries & Data

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
from IPython.display import display, Markdown, HTML
import altair as alt
alt.data_transformers.disable_max_rows()
from textwrap import wrap
from great_tables import GT, style, loc, md
# Local application/library specific imports
import sys
sys.path.append("..")
from utils.missing_data_processing import *  # For handling missing data

In [3]:
# Load the initial data into four DataFrames: events_df, members_df, books_df, and borrow_overrides_df.
events_df, members_df, books_df, borrow_overrides_df  = load_initial_data()

# Process the events data to clean it and prepare it for analysis.
events_df = preprocess_events_data(events_df)

# Extract the item ID from the URI in the books DataFrame.
# The item ID is the second to last part of the URI.
books_df["item_id"] = books_df.uri.apply(
    lambda x: x.split("/")[-2] if pd.notna(x) else None
)

# Generate short IDs for the members in the members DataFrame.
# The ID is the second to last part of the URI.
members_df["id"] = members_df.uri.apply(
    lambda x: x.split("/")[-2]
)

events_df['start_datetime'] = pd.to_datetime(events_df.start_date, format='%Y-%m-%d', errors='coerce')
events_df['end_datetime'] = pd.to_datetime(events_df.end_date, format='%Y-%m-%d', errors='coerce')

# get all member-book interactions from events
# shorten URIs for readability in output
borrow_events = events_df[events_df.item_uri.notna()].copy()

# restrict to borrow events only
borrow_events = borrow_events[borrow_events.event_type == 'Borrow'].copy()
# borrow_df = borrow_events[(borrow_events.start_date.isna() == False) & (borrow_events.end_date.isna() == False)]
borrow_events = borrow_events[borrow_events.start_datetime < '1942-01-01']

partial_borrowers = pd.read_csv('../data/partial_borrowers_collapsed.csv')

partial_members = ['raphael-france', 'hemingway-ernest',
                       'colens-fernand', 'kittredge-eleanor-hayden']
partial_df = partial_borrowers[partial_borrowers.member_id.isin(partial_members)]
partial_df['subscription_starttime'] = pd.to_datetime(partial_df['subscription_start'], errors='coerce')
partial_df['subscription_endtime'] = pd.to_datetime(
    partial_df['subscription_end'], errors='coerce')

In [5]:
lenskit_results = pd.read_csv('./data/aggregated_sampled_scores_lenskit_model100_without_periodicals.csv')
lenskit_results['result_type'] = 'lenskit'

In [6]:
memory_cf_results = pd.read_csv('./data/aggregated_full_scores_collaborative_filtering_without_periodicals_circulation_limited.csv')
memory_cf_results['result_type'] = 'memory_cf'

In [7]:
memory_cf_results = memory_cf_results.rename(columns={'item_uri':'item_id', })

In [8]:
memory_cf_results['member_period'] = memory_cf_results.member_id + ': ' + memory_cf_results.period

In [9]:
cols = ['member_id', 'subscription_start', 'subscription_end', 'item_id',
        'mode_score', 'mode_zscore', 'top_score', 'top_zscore',
        'avg_score', 'member_period', 'result_type', 'std_score', 'median_score']
concat_results = pd.concat([lenskit_results[cols], memory_cf_results[cols]])
concat_results['coef_variation'] = abs((concat_results.std_score / concat_results.median_score ) )

In [10]:
title_lookup = {row.item_id: row.title for row in books_df.itertuples()}

In [11]:
popular_current = pd.read_csv('./data/popular_books_by_subscription_circulation_period.csv')

In [12]:
popular_all = pd.read_csv('./data/popular_books_by_library_duration.csv')

In [13]:
def sample_scores(df: pd.DataFrame, get_top: bool, numb_of_books: int, metrics: List[str]) -> pd.DataFrame:
    """
    This function samples scores from a DataFrame for a given number of books and periods.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the scores.
    get_top (bool): If True, the function will return the top scores. If False, it will return random scores.
    numb_of_books (int): The number of books to sample scores for.
    metrics (List[str]): The metrics to sort by when getting the top scores.
    
    Returns:
    pd.DataFrame: A DataFrame containing the sampled scores.
    """
    
    # Get the unique periods from the DataFrame.
    periods = df.member_period.unique().tolist()
    
    # Initialize an empty list to store the DataFrames for each period.
    visualize_df = []
    
    # For each period...
    for period in periods:
        # Initialize an empty list to store the books for this period.
        final_books = []
        
        # Get the rows from the DataFrame for this period.
        rows = df[df.member_period == period]
        
        # Get the unique result types from the rows.
        results = rows.result_type.unique().tolist()
        
        # For each result type...
        for result in results:
            # Get the rows for this result type.
            final_rows = rows[rows.result_type == result]
            
            # If get_top is True, sort the rows by the specified metrics and get the top books.
            # Otherwise, get a random sample of books.
            if get_top:
                final_rows = final_rows.sort_values(by=metrics, ascending=[False, True])
                books = final_rows[0:numb_of_books].item_id.unique().tolist()
            else:
                books = rows.item_id.sample(n=numb_of_books).reset_index()
                books = books.item_id.unique().tolist()
            
            # If the number of books is less than the specified number, add more books until the number is reached.
            increment = numb_of_books
            while len(books) < numb_of_books:
                increment = increment + 1
                books = final_rows[0:increment].item_id.unique().tolist()
            
            # Add the books to the list of books for this period.
            final_books.extend(books)
        
        # Add the rows for the books in the list to the list of DataFrames.
        visualize_df.append(rows[rows.item_id.isin(set(final_books))])
    
    # Concatenate the DataFrames in the list into a single DataFrame.
    final_df = pd.concat(visualize_df)
    
    return final_df

In [14]:
comparisons = [['median_score', 'std_score'], ['mode_score', 'mode_zscore'], ['top_score', 'top_zscore']]
get_top = True
df = concat_results.copy()
numb_of_books = 10
final_rows = []
charts = []
for metrics in comparisons:
  rows = sample_scores(df, get_top, numb_of_books, metrics)
  rows['comparison_metric'] = metrics[0]
  chart = alt.Chart(rows).mark_circle().encode(
    x=metrics[0],
    y=metrics[1],
    color='member_period:N',
    tooltip=['result_type', 'member_id', 'item_id', 'coef_variation'],
    row='result_type'
  )
  final_rows.append(rows)
  charts.append(chart)

In [15]:
alt.hconcat(*charts)

In [16]:
def get_formatted_titles(sorted_values: pd.DataFrame, numb_of_preds: int) -> List[str]:
    """
    This function formats the titles of the top predicted books.
    
    Parameters:
    sorted_values (pd.DataFrame): The DataFrame containing the sorted book predictions.
    numb_of_preds (int): The number of top predictions to format.
    
    Returns:
    List[str]: A list of formatted titles.
    """
    
    # Initialize an empty list to store the formatted titles.
    titles = []
    
    # For each of the top predicted books...
    for i in sorted_values[0:numb_of_preds].item_id.tolist():
        # Get the row from the books DataFrame for this book.
        item = books_df[books_df.item_id == i]
        
        # If the author is not null, format the author's name.
        # Otherwise, set the author to '(Periodical)'.
        if item.author.isna().any() == False:
            author = ' '.join(item.author.str.split(',').values[0][::-1])
        else: 
            author = '(Periodical)'
        
        # Get the title of the book.
        title = item.title.values[0]
        
        # Format the title and author and add them to the list of formatted titles.
        titles.append(f"*{title}*,<br>{author}")
    
    return titles

In [17]:
# Define the result types to consider
result_types = ['lenskit', 'memory_cf']

# Filter the concatenated results for a specific member and result types
subset_results = concat_results[(concat_results.member_id == 'hemingway-ernest') & (concat_results.result_type.isin(result_types))]

# Get the number of results
n = len(subset_results)

# Get formatted titles for the results
formatted_titles = get_formatted_titles(subset_results, n)

# Add the formatted titles to the results DataFrame
subset_results['formatted_title'] = formatted_titles

# Get the unique periods from the results
periods = subset_results.member_period.unique().tolist()

# Define the number of top results to consider
numb = 10

# Define whether to include periodicals
periodical_flag = False

# For each period...
for period in periods:
    # Filter the results for this period and result type, and sort by median score and coefficient of variation
    cf_rows = subset_results[(subset_results.member_period == period) & (subset_results.result_type == 'memory_cf') & (subset_results.formatted_title.str.contains('Periodical') == periodical_flag)].sort_values(by=['median_score', 'coef_variation'], ascending=[False, True])
    
    # Filter the results for this period and result type, and sort by coefficient of variation and median score
    lk_rows = subset_results[(subset_results.member_period == period) & (subset_results.result_type == 'lenskit') & (subset_results.formatted_title.str.contains('Periodical') == periodical_flag)].sort_values(by=['coef_variation', 'median_score'], ascending=[True, False])
    
    # Merge the top CF results with the books DataFrame to get borrow count and event count
    merged_cf = pd.merge(cf_rows.head(numb)[['median_score', 'item_id', 'coef_variation', 'formatted_title', 'member_period']], books_df[['item_id', 'borrow_count', 'event_count' ]], on='item_id', how='left')
    
    # Print the CF results
    print('cf', merged_cf[['formatted_title', 'borrow_count', 'event_count']])
    
    # Merge the top LensKit results with the books DataFrame to get borrow count and event count
    merged_lk = pd.merge(lk_rows.head(numb)[['median_score', 'item_id', 'coef_variation', 'formatted_title', 'member_period']], books_df[['item_id', 'borrow_count', 'event_count' ]], on='item_id', how='left')
    
    # Print the LensKit results
    print('lk', merged_lk[['formatted_title', 'borrow_count', 'event_count']])

cf                                      formatted_title  borrow_count  \
0             *Shakespeare*,<br> William Shakespeare             7   
1  *Thomas Hardy: A Critical Study*,<br> Lascelle...             2   
2                    *Since Cézanne*,<br> Clive Bell             4   
3                     *The Clash*,<br> Storm Jameson             3   
4           *Samuel Butler*,<br> Henry Festing Jones             3   
5  *Washington and the Hope of Peace*,<br> H. G. ...             2   
6      *The Life of Walter Pater*,<br> Thomas Wright             4   
7  *The Stiff Lip: A Novel*,<br> Walter Lionel Ge...             7   
8  *Instigations of Ezra Pound: Together with an ...             8   
9               *Totem and Taboo*,<br> Sigmund Freud             9   

   event_count  
0            7  
1            2  
2            4  
3            3  
4            4  
5            2  
6            4  
7            7  
8            8  
9            9  
lk                                     

In [18]:
# Get the unique book IDs from the books DataFrame
book_ids = books_df.item_id.unique().tolist()

# Filter the popular_all DataFrame for a specific member and reset the index
subset_popular_all = popular_all[popular_all.member_id == 'hemingway-ernest'].reset_index()

# Further filter the DataFrame to only include books that are in the books DataFrame
subset_popular_all = subset_popular_all[subset_popular_all.item_id.isin(book_ids)]

# Get the number of rows in the DataFrame
n = len(subset_popular_all)

# Get formatted titles for the books in the DataFrame
formatted_titles = get_formatted_titles(subset_popular_all, n)

# Add the formatted titles to the DataFrame
subset_popular_all['formatted_title'] = formatted_titles

# Filter the popular_current DataFrame for a specific member
subset_popular_current = popular_current[popular_current.member_id == 'hemingway-ernest']

# Further filter the DataFrame to only include books that are in the books DataFrame
subset_popular_current = subset_popular_current[subset_popular_current.item_id.isin(book_ids)]

# Get the number of rows in the DataFrame
n = len(subset_popular_current)

# Get formatted titles for the books in the DataFrame
formatted_titles = get_formatted_titles(subset_popular_current, n)

# Add the formatted titles to the DataFrame
subset_popular_current['formatted_title'] = formatted_titles

In [19]:
# Define the result types to consider
result_types = ['lenskit', 'memory_cf']

# Define the number of predictions to consider
numb_of_preds = 10

# Define the number of popular predictions to consider
popular_preds = 20

# Initialize an empty list to store the top rows
top_rows = []

# For each row in the partial DataFrame for a specific member...
for index, row in partial_df[partial_df.member_id == 'hemingway-ernest'].iterrows():
    # Display the member ID and the number of known borrows
    display(HTML("<h3>%s</h3>" % row.member_id))
    display(HTML("<p>%d known borrows</p>" % row.known_borrows))
    
    # Display the subscription period
    display(HTML("<p>subscription: %s – %s</p>" % (row.subscription_starttime.strftime("%Y-%m-%d"), row.subscription_endtime.strftime("%Y-%m-%d"))))
    
    # Initialize an empty dictionary to store the result data
    result_data = {}
    
    # Initialize an empty dictionary to store the popular data
    popular_data = {}
    
    # Get the popular all time predictions for this member and subscription period, excluding periodicals
    popular_all_rows = subset_popular_all[(subset_popular_all.member_id == row.member_id) &   (subset_popular_all.subscription_start == row.subscription_start) &   (subset_popular_all.subscription_end == row.subscription_end) &   (subset_popular_all.formatted_title.str.contains('Periodical') == False)].sort_values(by='score', ascending=False)
    
    # Add the popular all time predictions and scores to the popular data
    popular_data['popular (all time)'] = get_formatted_titles(popular_all_rows, numb_of_preds)
    popular_data['popular scores (all time)'] = popular_all_rows.head(numb_of_preds).score.tolist()
    
    # Get the popular current predictions for this member and subscription period, excluding periodicals
    popular_current_rows = subset_popular_current[(subset_popular_current.member_id == row.member_id) & (subset_popular_current.subscription_start == row.subscription_start) & (subset_popular_current.subscription_end == row.subscription_end) & (subset_popular_current.formatted_title.str.contains('Periodical') == False)].sort_values(by='score', ascending=False)
    
    # Add the popular current predictions and scores to the popular data
    popular_data['popular (current)'] = get_formatted_titles(popular_current_rows, numb_of_preds)
    popular_data['popular scores (current)'] = popular_current_rows.head(numb_of_preds).score.tolist()
    
    # For each result type...
    for result in result_types:
        # Get the rows for this member, subscription period, and result type
        selected_rows = subset_results[(subset_results.member_id == row.member_id) & (subset_results.subscription_start == row.subscription_start) & (subset_results.subscription_end == row.subscription_end) & (subset_results.result_type == result)]
        
        # Sort the rows by median score and coefficient of variation for 'bipartite' and 'memory_cf', or by coefficient of variation and median score for other result types
        if ((result == 'bipartite' ) or (result == 'memory_cf')):
            sorted_values = selected_rows.sort_values(by=['median_score', 'coef_variation'], ascending=[False, True])
        else:
            sorted_values = selected_rows.sort_values(by=['coef_variation', 'median_score'], ascending=[True, False])
        
        # Exclude periodicals from the sorted values
        selected_values = sorted_values[sorted_values.formatted_title.str.contains('Periodical') == False]
        
        # Add the predicted items, coefficient of variation, and median score to the result data
        result_data[result + '_predicted_item'] = selected_values.head(numb_of_preds).formatted_title.tolist()
        result_data[result + '_coef_variation'] = selected_values.head(numb_of_preds)['coef_variation'].tolist()
        result_data[result + '_median_score'] = selected_values.head(numb_of_preds)['median_score'].tolist()
    
    # Combine the result data and popular data into a single dictionary
    table_data = {**result_data, **popular_data}
    
    # Convert the dictionary to a DataFrame
    table = pd.DataFrame(data=table_data)
    
    # Add the member period, member ID, and period to the DataFrame
    table['member_period'] = row.member_id + ': ' + row.subscription_start + '/' + row.subscription_end
    table['member_id'] = row.member_id
    table['period'] = row.subscription_start + '/' + row.subscription_end
    
    # Add the DataFrame to the list of top rows
    top_rows.append(table)
    
    # Display the DataFrame
    display(HTML(table.to_html(index=False)))

lenskit_predicted_item,lenskit_coef_variation,lenskit_median_score,memory_cf_predicted_item,memory_cf_coef_variation,memory_cf_median_score,popular (all time),popular scores (all time),popular (current),popular scores (current),member_period,member_id,period
"*Enjoyment of Poetry*,<br> Max Eastman",0.296177,0.390322,"*Shakespeare*,<br> William Shakespeare",1.836347,0.2,"*A Portrait of the Artist as a Young Man*,<br> James Joyce",50.0,"*Dubliners*,<br> James Joyce",13,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*The Moon of the Caribbees, and Six Other Plays of the Sea*,<br> Eugene O'Neill",0.316464,0.308374,"*Thomas Hardy: A Critical Study*,<br> Lascelles Abercrombie",1.892066,0.2,"*Dubliners*,<br> James Joyce",45.0,"*A Portrait of the Artist as a Young Man*,<br> James Joyce",10,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*Mountain Blood: A Novel*,<br> Joseph Hergesheimer",0.350455,0.182915,"*Since Cézanne*,<br> Clive Bell",1.892066,0.2,"*Pointed Roofs (Pilgrimage 1)*,<br> Dorothy M. Richardson",40.0,"*Erewhon*,<br> Samuel Butler",8,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*The Shadow Line: A Confession*,<br> Joseph Conrad",0.369546,0.527434,"*The Clash*,<br> Storm Jameson",1.892066,0.2,"*The Garden Party and Other Stories*,<br> Katherine Mansfield",33.0,"*Exiles*,<br> James Joyce",8,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*Beyond the Horizon*,<br> Eugene O'Neill",0.373329,0.529142,"*Samuel Butler*,<br> Henry Festing Jones",1.892066,0.2,"*Bliss and Other Stories*,<br> Katherine Mansfield",31.0,"*Heartbreak House: A Fantasia in the Russian Manner on English Themes*,<br> George Bernard Shaw",7,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*Joanna Godden*,<br> Sheila Kaye-Smith",0.391094,0.353218,"*Washington and the Hope of Peace*,<br> H. G. Wells",1.892066,0.2,"*Exiles*,<br> James Joyce",30.0,"*Plays: Pleasant and Unpleasant*,<br> George Bernard Shaw",7,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*Henry James*,<br> Rebecca West",0.403754,0.31148,"*The Life of Walter Pater*,<br> Thomas Wright",1.892066,0.2,"*Women in Love*,<br> D. H. Lawrence",29.0,"*Typhoon*,<br> Joseph Conrad",6,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*Rahab*,<br> Waldo Frank",0.422392,0.298885,"*The Stiff Lip: A Novel*,<br> Walter Lionel George",2.167911,0.197437,"*South Wind*,<br> Norman Douglas",27.0,"*The Way of All Flesh*,<br> Samuel Butler",6,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*The Story of an African Farm*,<br> Olive Schreiner",0.429111,0.376603,"*Instigations of Ezra Pound: Together with an Essay on the Chinese Written Character, by Ernest Fenollosa*,<br> Ernest Ezra;Fenollosa Pound",3.423488,0.133707,"*Ulysses*,<br> James Joyce",26.0,"*Cashel Byron's Profession*,<br> George Bernard Shaw",6,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08
"*The Adventures of Huckleberry Finn*,<br> Mark Twain",0.437368,0.428899,"*Totem and Taboo*,<br> Sigmund Freud",2.885327,0.133333,"*Honeycomb (Pilgrimage 3)*,<br> Dorothy M. Richardson",25.0,"*Under Western Eyes*,<br> Joseph Conrad",6,hemingway-ernest: 1921-12-28/1922-11-08,hemingway-ernest,1921-12-28/1922-11-08


lenskit_predicted_item,lenskit_coef_variation,lenskit_median_score,memory_cf_predicted_item,memory_cf_coef_variation,memory_cf_median_score,popular (all time),popular scores (all time),popular (current),popular scores (current),member_period,member_id,period
"*The Sentry and Other Stories*,<br> Nikola Semyonovich Leskov",0.151999,0.387138,"*Works of Francis Thompson*,<br> Francis Thompson",1.570092e-16,1.0,"*A Portrait of the Artist as a Young Man*,<br> James Joyce",50.0,"*Dubliners*,<br> James Joyce",24,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*English Diaries*,<br> Arthur Ponsonby",0.201775,0.443691,"*Works of Francis Thompson*,<br> Francis Thompson",1.570092e-16,1.0,"*Dubliners*,<br> James Joyce",45.0,"*A Portrait of the Artist as a Young Man*,<br> James Joyce",20,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*The Collected Essays and Papers of George Saintsbury, 1875 – 1920*,<br> George Saintsbury",0.274802,0.231165,"*The Theory of Beauty*,<br> E. F. Carritt",0.7095249,0.4,"*Pointed Roofs (Pilgrimage 1)*,<br> Dorothy M. Richardson",40.0,"*The Garden Party and Other Stories*,<br> Katherine Mansfield",14,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*Enjoyment of Poetry*,<br> Max Eastman",0.296177,0.390322,"*The Last Chronicle of Barset*,<br> Anthony Trollope",1.396802,0.296449,"*A Passage to India*,<br> E. M. Forster",34.0,"*Under Western Eyes*,<br> Joseph Conrad",11,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*The House of Souls*,<br> Arthur Machen",0.309951,0.368593,"*Avowals*,<br> George Moore",1.396549,0.296265,"*The Garden Party and Other Stories*,<br> Katherine Mansfield",33.0,"*Main Street: The Story of Carol Kennicott*,<br> Sinclair Lewis",11,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*The Hairy Ape*,<br> Eugene O'Neill",0.311481,0.599975,"*One of Ours*,<br> Willa Cather",1.960499,0.200612,"*Bliss and Other Stories*,<br> Katherine Mansfield",31.0,"*Lord Jim*,<br> Joseph Conrad",11,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*The Moon of the Caribbees, and Six Other Plays of the Sea*,<br> Eugene O'Neill",0.316464,0.308374,"*A Story Teller's Story*,<br> Sherwood Anderson",1.892066,0.2,"*Exiles*,<br> James Joyce",30.0,"*Babbitt*,<br> Sinclair Lewis",10,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*Lummox*,<br> Fannie Hurst",0.324407,0.215282,"*Lady Susan*,<br> Jane Austen",1.892066,0.2,"*Women in Love*,<br> D. H. Lawrence",29.0,"*Erewhon*,<br> Samuel Butler",10,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*The Amazing Marriage*,<br> George Meredith",0.329141,0.423971,"*Lord Byron's Correspondence*,<br> George Gordon Byron Byron",1.892066,0.2,"*South Wind*,<br> Norman Douglas",27.0,"*Night and Day*,<br> Virginia Woolf",10,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28
"*God's Orchid*,<br> Hjalmar Bergman",0.333261,0.047389,"*The Threshold of Quiet*,<br> Daniel Corkery",1.892066,0.2,"*Ulysses*,<br> James Joyce",26.0,"*The Way of All Flesh*,<br> Samuel Butler",10,hemingway-ernest: 1924-03-28/1925-03-28,hemingway-ernest,1924-03-28/1925-03-28


In [23]:
final_df = pd.concat(top_rows)
final_df = final_df[['member_period','period','lenskit_predicted_item', 'lenskit_median_score','lenskit_coef_variation', 'memory_cf_predicted_item', 'memory_cf_median_score','memory_cf_coef_variation','popular (current)', 'popular scores (current)', 'popular (all time)', 'popular scores (all time)', 'member_id']]
final_df.loc[final_df.memory_cf_predicted_item.str.contains('Ezra'), 'memory_cf_predicted_item'] = "*Instigations of Ezra Pound: Together with an Essay on the Chinese Written Character*,<br>Ezra Pound and Ernest Fenollosa"
final_df.to_csv('./data/final_top_results.csv', index=False)

In [3]:
top_results_lenskit = pd.read_csv('./data/top_scores_lenskit_model100.csv')

In [4]:
top_scores_memorycf = pd.read_csv('./data/memorycf_top_results.csv')

In [5]:
subset_lenskit = top_results_lenskit[['member_id', 'subscription_start', 'subscription_end', 'item_id','score','formatted_title', 'period', 'model_run']]

In [27]:
subset_lenskit = subset_lenskit[subset_lenskit.member_id == 'hemingway-ernest']

In [28]:
subset_mcf = top_scores_memorycf[['item_uri', 'formatted_chart_title', 'member_id', 'period', 'subscription_start', 'subscription_end', 'metric', 'score']]

In [29]:
subset_lenskit.groupby('period')['item_id'].nunique()

period
 1921-12-28/1922-11-08    34
 1924-03-28/1925-03-28    29
Name: item_id, dtype: int64

In [30]:
subset_mcf.groupby('period')['item_uri'].nunique()

period
1921-12-28/1922-11-08    30
1924-03-28/1925-03-28    47
Name: item_uri, dtype: int64

In [31]:
subset_lenskit['period'] = subset_lenskit.period.str.split(' ').str[1]

In [32]:
subset_mcf.loc[subset_mcf.formatted_chart_title.str.contains('Instigations of Ezra Pound'), 'formatted_chart_title'] = 'Instigations of Ezra Pound: Together with an Essay on the Chinese Written Character by Ezra Pound and Ernest Fenollosa'

In [33]:
subset_mcf['formatted_wrapped_title'] = subset_mcf.formatted_chart_title.apply(wrap, args=[50])
subset_lenskit['formatted_wrapped_title'] = subset_lenskit.formatted_title.apply(wrap, args=[50])

In [34]:
subset_mcf.loc[subset_mcf.metric == 'cosine', 'metric'] = 'Cosine Distance'
subset_mcf.loc[subset_mcf.metric == 'pearson', 'metric'] = 'Pearson Correlation Coefficient'
subset_mcf.loc[subset_mcf.metric == 'euclidean', 'metric'] = 'Euclidean Distance'

## Figure - Distributions of Prediction Scores for the Top Thirty-Three Books for Ernest Hemingway.

In [35]:
# Create a scatter plot of the top predictions by collaborative filtering memory-based methods
chart = alt.Chart(subset_mcf).mark_point(opacity=0.8, filled=True, thickness=3).encode(
    # The y-axis is the formatted chart title, sorted by the x-axis values
    y=alt.Y('formatted_chart_title', sort='-x', axis=alt.Axis(title='', labelFontSize=9)),
    # The x-axis is the score
    x='score',
    # The color represents the subscription period
    color=alt.Color('period:N', legend=alt.Legend(title=["Subscription Period"])),
    # The shape represents the method for collaborative filtering
    shape=alt.Shape('metric:N', legend=alt.Legend(title='Method for Collaborative Filtering')),
).properties(
    # Set the width and height of the plot
    width=300,
    height=1800,
    # Set the title of the plot
    title='Top Predictions by Collaborative Filtering Memory-Based Methods'
)

# Get the full name of the member
full_name = members_df[members_df.id == 'hemingway-ernest'].name.values[0]

# Create a tick plot of the top predictions by the implicit matrix factorization model
tickplot = alt.Chart(subset_lenskit).mark_tick(opacity=0.7).encode(
  # The y-axis is the formatted title, sorted by the x-axis values
  y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title="", orient='right', labelFontSize=9)),
  # The x-axis is the score
  x='score',
  # The color represents the subscription period
  color=alt.Color('period:N', legend=alt.Legend(title=["Subscription Period"], orient='left')),
).properties(
    # Set the title of the plot
    title=f'Top Predictions by Implicit Matrix Factorization Model',
    # Set the width and height of the plot
    width=300,
    height=1800
)

# Concatenate the tick plot and scatter plot horizontally and configure the y-axis
alt.hconcat(tickplot, chart).configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=-10,
    titleX=100,
    labelLimit=1000
)

## Figure - Top Book Recommendations for Ernest Hemingway.

In [101]:
filtered_df = final_df.copy()
filtered_df['blank_rowname'] = ['&nbsp;' * 14 for _ in range(len(filtered_df))]

In [102]:
def process_subscription_period(rows):
    member_period = rows.member_period.unique()[0]
    start_period = member_period.split(': ')[1].split('/')[0]
    end_period = member_period.split(': ')[1].split('/')[1]
    # turn into dates
    start_period = pd.to_datetime(start_period)
    end_period = pd.to_datetime(end_period)
    # Turn into Subscription Period from start and end dates, using Month Day, Year format
    subscription_period = f"Subscription Period from {start_period.strftime('%B %d %Y')} to {end_period.strftime('%B %d %Y')}"
    rows['subscription_formatted'] = subscription_period
    return rows
    
filtered_df = filtered_df.groupby(['member_period']).apply(process_subscription_period)

In [103]:
ignore_columns = ['member_period', 'period', 'member_id',]
filtered_df = filtered_df.drop(columns=ignore_columns)

In [104]:
filtered_df = filtered_df.rename(columns={
    "popular (all time)": "popular_all_time",
    "popular scores (all time)": "popular_scores_all_time",
    "popular (current)": "popular_current",
    "popular scores (current)": "popular_scores_current",})

In [105]:
final_table = (
    GT(filtered_df, rowname_col="blank_rowname", groupname_col="subscription_formatted").cols_width(
        cases={
			"lenskit_predicted_item": "300px",
			"memory_cf_predicted_item": "300px",
			"popular_current": "200px",
			"popular_all_time": "200px",
            "lenskit_coef_variation":"50px",
            "lenskit_median_score":"50px",
            "memory_cf_coef_variation":"50px",
            "memory_cf_median_score":"50px",
            "popular_scores_all_time":"30px",
            "popular_scores_current":"30px",
		}
	).fmt_percent(
        columns=[
			"lenskit_coef_variation",
			"memory_cf_coef_variation",
		]
	).fmt_markdown(
        columns=[
			"lenskit_predicted_item",
			"memory_cf_predicted_item",
            "popular_all_time",
            "popular_current",
            "subscription_formatted",
		]
	).fmt_number(
        columns=[
			"lenskit_median_score",
			"memory_cf_median_score",
		],
        decimals = 3
	).cols_label(
        lenskit_predicted_item="Predicted Book by Implicit Matrix Factorization Model (IMF)",
        lenskit_coef_variation="CV (IMF)",
        lenskit_median_score="Score (IMF)",
        memory_cf_predicted_item="Predicted Book by Memory-Based Collaborative Filtering (CF)",
        memory_cf_coef_variation="CV (CF)",
        memory_cf_median_score="Score (CF)",
        popular_all_time=md("Most Popular Books<br>(1919-1942)"),
        popular_scores_all_time="Total Borrows",
        popular_current=md("Most Popular Books<br>in Subscription Period"),
        popular_scores_current="Total Borrows",
	).tab_style(
        # select only the groupname column. Should be using the LOCRowGroups but not available yet
        locations=loc.body(columns="subscription_formatted"),
        # make it bold
        style=[
            style.text(weight="bold")
		]
	).tab_style(
        #select columns
        locations=[
            loc.body(
            columns=[
                "lenskit_coef_variation",
                "memory_cf_coef_variation",
        	]),
            ## Currently column labels is not supported but seems like the code will be aded soon
			# loc.column_labels(
			# columns=[
			# 	"lenskit_coef_variation",
			# 	"memory_cf_coef_variation",
			# ]
			# )
        ],
        style=[
            style.fill(color = "#F0F8FF")
		]
	).tab_style(
        locations=[
            loc.body(
			columns=[
				"lenskit_median_score",
				"memory_cf_median_score",
			]),
            # loc.column_labels(
			# columns=[
			# 	"lenskit_median_score",
			# 	"memory_cf_median_score",
			# ]
		],
        style=[
			style.fill(color = "#F0F0F0")
		]
	).cols_align(align="center").tab_header(
    	title = 'Predicted Books for Ernest Hemingway',
      	subtitle = "Predictions based on periods of known subscriptions with no extant borrowing records."
	).tab_source_note(
		source_note = "Score based on median of predictions from model or method.",
        # locations=loc.column_labels(columns=["lenskit_median_score", "memory_cf_median_score"])
        
	).tab_source_note(
        source_note = "Ranked by coefficient of variation (CV) and then median scores (Score)."
	)
)

final_table

Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway,Predicted Books for Ernest Hemingway
Predictions based on periods of known subscriptions with no extant borrowing records.,Predictions based on periods of known subscriptions with no extant borrowing records..1,Predictions based on periods of known subscriptions with no extant borrowing records..2,Predictions based on periods of known subscriptions with no extant borrowing records..3,Predictions based on periods of known subscriptions with no extant borrowing records..4,Predictions based on periods of known subscriptions with no extant borrowing records..5,Predictions based on periods of known subscriptions with no extant borrowing records..6,Predictions based on periods of known subscriptions with no extant borrowing records..7,Predictions based on periods of known subscriptions with no extant borrowing records..8,Predictions based on periods of known subscriptions with no extant borrowing records..9,Predictions based on periods of known subscriptions with no extant borrowing records..10
,,,,,,,,,,
Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922,Subscription Period from December 28 1921 to November 08 1922
,"The Moon of the Caribbees, and Six Other Plays of the Sea,  Eugene O'Neill",0.308,31.65%,"Since Cézanne,  Clive Bell",0.285,68.94%,"A Portrait of the Artist as a Young Man,  James Joyce",10,"Dubliners,  James Joyce",45.0
,"Mountain Blood: A Novel,  Joseph Hergesheimer",0.183,35.05%,"The Clash,  Storm Jameson",0.285,68.94%,"Erewhon,  Samuel Butler",8,"Pointed Roofs (Pilgrimage 1),  Dorothy M. Richardson",40.0
,"The Shadow Line: A Confession,  Joseph Conrad",0.527,36.95%,"Samuel Butler,  Henry Festing Jones",0.285,68.94%,"Exiles,  James Joyce",8,"The Garden Party and Other Stories,  Katherine Mansfield",33.0
,"Beyond the Horizon,  Eugene O'Neill",0.529,37.33%,"The Daniel Jazz and Other Poems,  Vachel Lindsay",0.285,68.94%,"Heartbreak House: A Fantasia in the Russian Manner on English Themes,  George Bernard Shaw",7,"Bliss and Other Stories,  Katherine Mansfield",31.0
,"Joanna Godden,  Sheila Kaye-Smith",0.353,39.11%,"George Gissing: A Critical Study,  Frank Swinnerton",0.285,68.94%,"Plays: Pleasant and Unpleasant,  George Bernard Shaw",7,"Exiles,  James Joyce",30.0
,"Henry James,  Rebecca West",0.311,40.38%,"Washington and the Hope of Peace,  H. G. Wells",0.285,68.94%,"Typhoon,  Joseph Conrad",6,"Women in Love,  D. H. Lawrence",29.0
,"Rahab,  Waldo Frank",0.299,42.24%,"The Life of Walter Pater,  Thomas Wright",0.285,68.94%,"The Way of All Flesh,  Samuel Butler",6,"South Wind,  Norman Douglas",27.0
,"The Story of an African Farm,  Olive Schreiner",0.377,42.91%,"Shakespeare,  William Shakespeare",0.285,71.05%,"Cashel Byron's Profession,  George Bernard Shaw",6,"Ulysses,  James Joyce",26.0


In [113]:
# final_table.save("../figures/fig13-hemingway-predictions.png", scale=2.0, window_size=[2000, 1000])