# S&Co Collaborative Filtering Notebook

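This notebook implements memory-based collaborative filtering methods for S&Co data: it uses Euclidean, cosine, and Pearson similarity between members' borrowing records to predict books a member may have borrowed during subscription periods with missing records.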

## Load Libraries and Datasets

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import altair as alt
alt.data_transformers.disable_max_rows()
import sys
sys.path.append("..")
from speculative_reading.load_datasets import get_updated_shxco_data
from typing import List, Tuple, Dict

from scipy import spatial, stats
from scipy.stats import zscore, mode
# from scipy.stats import pearsonr

In [4]:
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)
partial_df = pd.read_csv('../appendix/speculative_reading/data/partial_borrowers_collapsed.csv')

## Define Functions

In [5]:
def get_author_title(item: pd.DataFrame) -> Tuple[str, str]:
    """
    Get author and title from a book item

    Args:
    item: pd.DataFrame: a DataFrame containing a book item

    Returns:
    Tuple[str, str]: a tuple containing the author and title of the book
    """
    if item.author.isna().any() == False:
        author = ' '.join(item.author.str.split(',').values[0][::-1])
    else: 
        author = '(Periodical)'
    title = item.title.values[0]
    return author, title

def get_formatted_titles(sorted_values: pd.DataFrame, numb_of_preds: int, is_table: bool = False) -> List[str]:
    """
    Get a list of formatted titles

    Args:
    sorted_values: pd.DataFrame: a DataFrame containing the sorted values
    numb_of_preds: int: the number of predictions to return
    is_table: bool: whether to return the titles in table format

    Returns:
    List[str]: a list of formatted titles
    """
    titles = []
    for i in sorted_values[0:numb_of_preds].item_uri.tolist():
        item = books_df[books_df.id == i]
        author, title = get_author_title(item)
        if is_table:
            titles.append(f"*{title}*,<br>{author}")
        else:
            titles.append(f"{title} by {author}")
    return titles

In [16]:
def euc_distance(prefs: Dict[str, Dict[str, float]], person1: str, person2: str) -> float:
    """
    Calculate the Euclidean distance between two people based on their preferences.
    """
    # Find common items
    si = {item: 1 for item in prefs[person1] if item in prefs[person2]}
    
    # If they have no common items, return 0
    if len(si) == 0: 
        return 0

    # Calculate the sum of squares of differences
    sum_of_squares = sum(pow(prefs[person1][item] - prefs[person2][item], 2) for item in si)
    
    # Return the inverse of Euclidean distance to give a higher value to people who are more similar
    return 1 / (1 + sum_of_squares)

def calculate_rankings(prefs: Dict[str, Dict[str, float]], person: str, other: str, score: float, totals: Dict[str, float], simSums: Dict[str, float], query_books: List[str]) -> List[Tuple[float, str]]:
    """
    Calculate the rankings of books for a person based on the preferences of another person and a similarity score.
    """
    for item in prefs[other]:
        # Only consider books that the person hasn't read or rated yet and are in the query books
        if item not in prefs[person] or prefs[person][item] == 0:
            if item in query_books:
                # Calculate the total score for each book
                totals.setdefault(item, 0) 
                totals[item] += prefs[other][item] * score
                # Calculate the total similarity for each book
                simSums.setdefault(item, 0)
                simSums[item] += score

    # Calculate the average score for each book
    rankings = [(total / simSums[item], item) for item, total in totals.items()]
    # Sort the rankings in descending order
    rankings.sort()
    rankings.reverse()
    return rankings

def get_predictions(person: str, prefs: Dict[str, Dict[str, float]], weighted: pd.DataFrame,  query_books: List[str]) -> Tuple[List[Tuple[float, str]], List[Tuple[float, str]], List[Tuple[float, str]]]:
    """
    Get book recommendations for a person based on the preferences of all other people.
    """
    totals = {} 
    simSums = {}

    cos_totals = {} 
    cos_simSums = {}

    pear_totals = {} 
    pear_simSums = {}

    for other in prefs:
        # Don't compare the person to themselves
        if other == person:
            continue

        # Get the preference vectors for the person and the other person
        member_vector = weighted.loc[person]
        other_vector = weighted.loc[other]

        # Calculate the similarity scores
        euc_score = euc_distance(prefs, person, other)
        cosine_score = spatial.distance.cosine(member_vector, other_vector)
        pearson_score, pearson_p = stats.pearsonr(member_vector, other_vector)

        # Ignore people who have a similarity score of 0 with the person
        if euc_score <= 0:
            continue

        # Calculate the rankings based on each similarity score
        rankings = calculate_rankings(prefs, person, other, euc_score, totals, simSums, query_books)
        cos_rankings = calculate_rankings(prefs, person, other, cosine_score, cos_totals, cos_simSums, query_books)
        pear_rankings = calculate_rankings(prefs, person, other, pearson_score, pear_totals, pear_simSums, query_books)

    return rankings, cos_rankings, pear_rankings

In [18]:
def create_rankings_dataframe(rankings: List[Tuple[float, str]], score_type: str, n: int) -> pd.DataFrame:
    """
    Create a DataFrame from the rankings of books.
    
    Parameters:
    rankings: A list of tuples where each tuple contains a score and an item URI.
    score_type: A string indicating the type of score used in the rankings.
    n: The number of top items to include in the DataFrame.

    Returns:
    A DataFrame with the top n items from the rankings.
    """
    # Create a DataFrame from the top n items in the rankings.
    df = pd.DataFrame(rankings[0:n])
    # Set the column names.
    df.columns = ['score', 'item_uri']
    
    # Get the formatted titles for the chart and add them to the DataFrame.
    chart_titles = get_formatted_titles(df, n, True)
    df['formatted_chart_title'] = chart_titles
    
    # Get the formatted titles for the table and add them to the DataFrame.
    table_titles = get_formatted_titles(df, n, False)
    df['formatted_table_title'] = table_titles
    
    # Add the score type to the DataFrame.
    df['type'] = score_type
    
    return df

In [20]:
def scale_col(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """
    Scale specified columns in a DataFrame using MinMaxScaler.

    Parameters:
    df: A pandas DataFrame.
    cols: A list of column names to scale.

    Returns:
    A DataFrame with the scaled columns.
    """
    # For each column in the list of columns...
    for col in cols:
        # Create a MinMaxScaler.
        scaler = MinMaxScaler()
        # Fit the scaler to the column and transform the column.
        # The 'reshape' function is used to reshape the column to a 2D array, which is required by the 'fit_transform' function.
        # The transformed column is added to the DataFrame with the suffix '_scaled'.
        df[col + '_scaled'] = scaler.fit_transform(df[col].values.reshape(-1, 1))
    # Return the DataFrame with the scaled columns.
    return df

## Process Data

In [17]:
# Group the borrow events by member ID and item URI, and count the number of events for each combination.
# This can help us understand how many times each member borrowed each item.
grouped_borrows = borrow_events[['member_id', 'item_uri']].groupby(['member_id', 'item_uri']).size().reset_index(name='counts')

# Pivot the grouped borrow events to create a matrix where the rows are member IDs, the columns are item URIs, and the values are counts.
# Fill missing values with 0 and convert the counts to integers.
# This creates a user-item matrix that can be used for collaborative filtering.
pivoted_borrows = grouped_borrows.pivot(
    index='member_id', columns='item_uri', values='counts').fillna(0).astype(int)

# Standardize the user-item matrix by subtracting the mean and dividing by the standard deviation.
# This can help improve the performance of the collaborative filtering algorithm.
scaler = StandardScaler()
weighted = scaler.fit_transform(pivoted_borrows)
weighted = pd.DataFrame(weighted, index=pivoted_borrows.index, columns=pivoted_borrows.columns)

# Define a list of members to query.
partial_members = ['hemingway-ernest']

# Get the unique member IDs from the grouped borrow events that are in the query members list.
query_members = grouped_borrows.loc[grouped_borrows.member_id.isin(partial_members)].member_id.unique().tolist()

# Filter the partial_df DataFrame to include only rows where the member ID is in the query members list.
partial_df = partial_df[partial_df.member_id.isin(query_members)]

# Convert the subscription start and end times to datetime objects.
partial_df['subscription_starttime'] = pd.to_datetime(partial_df['subscription_start'])
partial_df['subscription_endtime'] = pd.to_datetime(partial_df['subscription_end'])

# Create a dictionary where the keys are member IDs and the values are dictionaries of item URIs and counts.
# This can be used to quickly look up the count for a specific member and item.
member_item_counts = (grouped_borrows.groupby(['member_id'])['item_uri', 'counts'].apply(
     lambda x: dict(x.values)).to_dict())

In [19]:
# Set the number of top items to include in the DataFrame.
n = 200
# Initialize an empty list to store the final DataFrames.
final_df = []
# Initialize variables to store the previous books and narrow query books.
previous_books = None
narrow_query_books = None
# Set a flag to limit the books to those in circulation.
limit_to_circulation = False

# For each row in the partial_df DataFrame...
for index, row in partial_df.iterrows():
    print(f"Processing index: {index}")
    print(f"Subscription period: {row.subscription_start} to {row.subscription_end}")

    # Get the events that occurred before the end of the subscription.
    circulation_events = events_df[(events_df.start_datetime < row.subscription_endtime) | (events_df.end_datetime < row.subscription_endtime)]
    print(f"Identified events: {len(circulation_events)}")

    # Get the unique books from the events.
    query_books = circulation_events[circulation_events.item_uri.notna()].item_uri.unique().tolist()
    print(f"Identified books: {len(query_books)}")

    # Get the unique books that the member has read.
    member_book_ids = events_df[(events_df.item_uri.notna()) & (events_df.member_id == row.member_id)].item_uri.unique()
    # Remove the books that the member has read from the query books.
    query_books = list(set(query_books) - set(member_book_ids))
    print(f"Identified books without those read by member: {len(query_books)}")

    if limit_to_circulation:
        if index == 0:
            previous_books = query_books
        else:
            # Remove the books that were read in the previous subscription from the query books.
            query_books = list(set(query_books) - set(previous_books))
            print(f"Identified books without those read by member and those read in previous subscription: {len(query_books)}")

    # Get the book recommendations based on Euclidean distance, cosine similarity, and Pearson correlation.
    rankings, cos_rankings, pear_rankings = get_predictions(row.member_id, member_item_counts, weighted, query_books)
    print(f"Euclidean: {len(rankings)}, Cosine: {len(cos_rankings)}, Pearson: {len(pear_rankings)}")

    # Create DataFrames from the rankings.
    euc_df = create_rankings_dataframe(rankings, 'euclidean', n)
    cos_df = create_rankings_dataframe(cos_rankings, 'cosine', n)
    pear_df = create_rankings_dataframe(pear_rankings, 'pearson', n)

    # Concatenate the DataFrames.
    dfs = pd.concat([euc_df, cos_df, pear_df])
    # Add the member ID, subscription period, and subscription start and end times to the DataFrame.
    dfs['member_id'] = row.member_id
    dfs['period'] = row.subscription_start + '/' + row.subscription_end
    dfs['subscription_start'] = row.subscription_start
    dfs['subscription_end'] = row.subscription_end
    # Add the DataFrame to the final list.
    final_df.append(dfs)

Processing index: 25
Subscription period: 1921-12-28 to 1922-11-08
Identified events: 2602
Identified books: 666
Identified books without those read by member: 660
Euclidean: 551, Cosine: 551, Pearson: 551
Processing index: 26
Subscription period: 1924-03-28 to 1925-03-28
Identified events: 6112
Identified books: 1292
Identified books without those read by member: 1280
Euclidean: 1097, Cosine: 1097, Pearson: 1097


In [21]:
# Concatenate all the DataFrames in 'final_df' into a single DataFrame.
# This creates a DataFrame with all the book recommendations.
final_preds = pd.concat(final_df)

# Pivot 'final_preds' to create a matrix where the rows are item URIs, formatted chart titles, formatted table titles, member IDs, periods, subscription start times, and subscription end times, the columns are score types, and the values are scores.
# Fill missing values with 0.
# This creates a user-item matrix that can be used for collaborative filtering.
pivoted_predictions = pd.pivot(final_preds, index=['item_uri', 'formatted_chart_title', 'formatted_table_title', 'member_id', 'period', 'subscription_start', 'subscription_end'], columns='type', values='score').reset_index().fillna(0)

# Scale the scores in 'final_preds' within each period and score type.
# This can help improve the performance of the collaborative filtering algorithm.
final_preds['score_scaled'] = final_preds.groupby(['period', 'type'])['score'].apply(lambda x: (x-min(x))/(max(x)-min(x)))

# Define a list of metrics to scale.
metrics = ['cosine', 'pearson', 'euclidean']

# Scale the metrics in 'pivoted_predictions'.
scaled_predictions = scale_col(pivoted_predictions, metrics)

# Melt 'scaled_predictions' to create a DataFrame where each row represents a book recommendation, with columns for the item URI, formatted chart title, formatted table title, member ID, period, subscription start time, subscription end time, metric, and score.
preds_df = pd.melt(scaled_predictions, id_vars=['item_uri', 'formatted_chart_title', 'formatted_table_title', 'member_id', 'period', 'subscription_start', 'subscription_end'], value_vars=['cosine_scaled', 'pearson_scaled', 'euclidean_scaled'], var_name='metric', value_name='score')

# Replace the scaled metric names with the original metric names.
preds_df.loc[preds_df.metric == 'cosine_scaled', 'metric'] = 'cosine'
preds_df.loc[preds_df.metric == 'pearson_scaled', 'metric'] = 'pearson'
preds_df.loc[preds_df.metric == 'euclidean_scaled', 'metric'] = 'euclidean'

# Sort 'preds_df' by score in descending order.
# This orders the book recommendations by score, with the highest scores first.
preds_df = preds_df.sort_values(by=['score'] ,ascending=False)

# Get the unique item URIs from the top 100 rows in 'preds_df'.
# This gives us the top 100 recommended books.
items = preds_df[0:100].item_uri.unique()

# Print the number of unique recommended books.
print(len(items))

63


In [22]:
# Get the unique periods from 'preds_df'.
periods = preds_df.period.unique().tolist()

# Set the number of top items to include for each period.
n = 23

# Initialize an empty list to store the final items for each period.
final_items = []

# For each period...
for index, period in enumerate(periods):
    # Initialize an empty list to store the items for this period.
    items = []
    
    # Get the rows from 'preds_df' for this period that are not periodicals, and sort them by score in descending order.
    rows = preds_df[(preds_df.period == period) & (preds_df.formatted_chart_title.str.contains('Periodical') == False)].sort_values(by=['score'] ,ascending=False)
    
    # If this is the first period...
    if index == 0:
        # While the number of items is less than or equal to n...
        while len(items) <= n:
            # Get the item URI of the first row.
            item = rows.item_uri.iloc[0]
            # If the item is not already in the list of items, add it.
            if item not in items:
                items.append(item)
            # Remove the first row from 'rows'.
            rows = rows.drop(rows.index[0])
        # Add the list of items to 'final_items'.
        final_items.append(items)
    # If this is not the first period...
    else:
        # While the number of items is less than or equal to n...
        while len(items) <= n:
            # Get the item URI of the first row.
            item = rows.item_uri.iloc[0]
            # If the item is not already in the list of items and it was not in the list of items for the previous period, add it.
            if (item not in items) and (item not in final_items[index-1]):
                items.append(item)
            # Remove the first row from 'rows'.
            rows = rows.drop(rows.index[0])
        # Limit the list of items to the top n items.
        items = items[0:n]
        # Add the list of items to 'final_items'.
        final_items.append(items)

# Print the number of items in the list for the second period.
print(f"The number of items in the list for the second period is: {len(final_items[1])}")

The number of items in the list for the second period is: 23


In [25]:
# Concatenate the lists of top items for the first and second periods.
# This gives us a list of all the top items across both periods.
top_items = final_items[0] + final_items[1] 

# Filter 'preds_df' to include only rows where the item URI is in the list of top items.
# Then, group the rows by period and count the number of unique item URIs for each period.
# This gives us the number of unique recommended books for each period.
unique_items_per_period = preds_df[preds_df.item_uri.isin(top_items)].groupby(['period'])['item_uri'].nunique()
print(unique_items_per_period)

top_results = preds_df[preds_df.item_uri.isin(top_items)]
# top_results.to_csv('./public_data/memorycf_top_results.csv', index=False)

period
1921-12-28/1922-11-08    24
1924-03-28/1925-03-28    47
Name: item_uri, dtype: int64


In [26]:
chart = alt.Chart(preds_df[preds_df.item_uri.isin(top_items)]).mark_point(opacity=0.8, filled=True, thickness=3).encode(
    y=alt.Y('formatted_chart_title', sort='-x',  axis=alt.Axis(title='Predicted Book', orient='right')),
    x='score',
    color=alt.Color('period:N', legend=alt.Legend(title='Subscription Periods With Missing Records', orient='left')),
    # column='member_id:N',
    shape=alt.Shape('metric:N'),
).properties(
    width=300,
    title='Top Predictions by Collaborative Filtering Memory-Based Methods'
)
chart = chart.configure_axisY(
        titleAngle=0,
        titleAlign="left",
        titleY=-10,
        titleX=-100,
        labelLimit=1000
    )

chart

In [29]:
# Group 'preds_df' by member ID, subscription start and end times, period, item URI, and formatted table and chart titles.
# Calculate the median, skewness, standard deviation, and variance of the scores for each group.
# Reset the index of the resulting DataFrame and flatten the multi-level column names.
metrics_df = preds_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title']).agg({'score': [np.median, 'skew', 'std', 'var']}).reset_index()
metrics_df.columns = list(map(''.join, metrics_df.columns.values))
metrics_df.columns = [col if 'score' not in col else col.split('score')[1] for col in metrics_df.columns ]

# Calculate the kurtosis of the scores for each group and add it to 'metrics_df'.
kurt_df = preds_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].apply(pd.DataFrame.kurt).reset_index(name='kurtosis')
final_df = pd.merge(metrics_df, kurt_df, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])

# Merge 'final_df' with 'preds_df' to add the original scores.
final_df = pd.merge(final_df, preds_df, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'], how='left')

# Calculate the z-scores of the scores for each group.
final_df['zscore'] = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].transform(lambda x : zscore(x,ddof=1))

# Calculate the maximum score for each group and merge it with 'final_df'.
top_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title']).agg({'score':'max'})[['score']].reset_index()
top_scores = pd.merge(top_scores, final_df, on=top_scores.columns.tolist(), how='inner')
top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore' : 'top_zscore'})

# Calculate the average score for each group and merge it with 'top_scores'.
avg_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].mean().reset_index(name='avg_score')
scores_df = pd.merge(top_scores, avg_scores, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])

# Calculate the median score for each group and merge it with 'scores_df'.
median_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].median().reset_index(name='median_score')
scores_df = pd.merge(scores_df, median_scores, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])

# Calculate the standard deviation of the scores for each group and merge it with 'scores_df'.
std_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title']).agg({'score': 'std'}).reset_index()
std_scores = std_scores.rename(columns={'score': 'std_score'})
scores_df = pd.merge(scores_df, std_scores, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])

# Calculate the mode of the scores for each group and merge it with 'final_df'.
mode_scores = preds_df.groupby(['member_id', 'period', 'subscription_start', 'subscription_end','item_uri'])['score'].agg(lambda x: pd.Series.mode(x).iat[0]).reset_index()
mode_scores = pd.merge(mode_scores, final_df, on=mode_scores.columns.tolist(), how='inner')
mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})

# Merge 'mode_scores' with 'scores_df' to create the final DataFrame of scores.
final_scores = pd.merge(mode_scores[['member_id', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title', 'mode_score', 'mode_zscore', 'subscription_start', 'subscription_end']], scores_df, on=['member_id', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title', 'subscription_start', 'subscription_end'])

In [30]:
final_scores.head(5)

Unnamed: 0,member_id,period,item_uri,formatted_table_title,formatted_chart_title,mode_score,mode_zscore,subscription_start,subscription_end,top_score,median,skew,std,var,kurtosis,metric,top_zscore,avg_score,median_score,std_score
0,hemingway-ernest,1921-12-28/1922-11-08,a-e-imaginations-reveries,Imaginations and Reveries by Æ,"*Imaginations and Reveries*,<br>Æ",0.024169,-1.11149,1921-12-28,1922-11-08,0.221125,0.166048,-1.177452,0.101616,0.010326,,euclidean,0.826748,0.137114,0.166048,0.101616
1,hemingway-ernest,1921-12-28/1922-11-08,abercrombie-thomas-hardy-critical,Thomas Hardy: A Critical Study by Lascelles A...,"*Thomas Hardy: A Critical Study*,<br> Lascelle...",0.03954,-1.07561,1921-12-28,1922-11-08,0.428571,0.285434,-0.759614,0.196765,0.038716,,euclidean,0.901532,0.251182,0.285434,0.196765
2,hemingway-ernest,1921-12-28/1922-11-08,beerbohm-zuleika-dobson,Zuleika Dobson by Max Beerbohm,"*Zuleika Dobson*,<br> Max Beerbohm",0.026132,-1.097906,1921-12-28,1922-11-08,0.226531,0.163079,-1.014785,0.102421,0.01049,,euclidean,0.858713,0.13858,0.163079,0.102421
3,hemingway-ernest,1921-12-28/1922-11-08,bell-since-cezanne,Since Cézanne by Clive Bell,"*Since Cézanne*,<br> Clive Bell",0.03954,-1.07561,1921-12-28,1922-11-08,0.428571,0.285434,-0.759614,0.196765,0.038716,,euclidean,0.901532,0.251182,0.285434,0.196765
4,hemingway-ernest,1921-12-28/1922-11-08,bennett-married-life,Married Life by Arnold Bennett,"*Married Life*,<br> Arnold Bennett",0.02436,-1.113959,1921-12-28,1922-11-08,0.225564,0.170788,-1.207611,0.104023,0.010821,,euclidean,0.820269,0.140237,0.170788,0.104023


In [34]:
# Select the columns for member ID, period, item URI, formatted table and chart titles, and subscription start and end times from 'final_scores'.
# Drop duplicate rows to get a DataFrame of unique member subscriptions.
member_subscriptions = final_scores[['member_id', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title', 'subscription_start', 'subscription_end']].drop_duplicates()

# Drop duplicate rows from 'final_scores' to get a DataFrame of unique scores.
final_scores_dedup = final_scores.drop_duplicates()

# Calculate the coefficient of variation (standard deviation divided by median) for each score in 'final_scores_dedup'.
# Add the coefficient of variation to 'final_scores_dedup' as a new column.
final_scores_dedup['coef_var'] = (final_scores_dedup.std_score/ final_scores_dedup.median_score)

print(f"Deduping final scores: {final_scores.shape} -> {final_scores_dedup.shape}")

Deduping final scores: (480, 20) -> (441, 21)


In [35]:
final_scores_dedup.to_csv('./data/collaborative_filtering_predictions.csv', index=False)