# S&Co Collaborative Filtering Notebook

This notebook implements collaborative filtering methods for S&Co data.

## Load Libraries and Datasets

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import altair as alt
alt.data_transformers.disable_max_rows()
import sys
sys.path.append("..")
from speculative_reading.load_datasets import get_updated_shxco_data


from scipy import spatial, stats
# from scipy.stats import pearsonr

In [2]:
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)
partial_df = pd.read_csv('../data/partial_borrowers_collapsed.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/partial_borrowers_collapsed.csv'

## Define Functions

In [59]:
def get_author_title(item):
    if item.author.isna().any() == False:
        author = ' '.join(item.author.str.split(',').values[0][::-1])
    else: 
        author = '(Periodical)'
    title = item.title.values[0]
    return author, title

def get_formatted_titles(sorted_values, numb_of_preds, is_table=False):
    titles = []
    for i in sorted_values[0:numb_of_preds].item_uri.tolist():
        item = books_df[books_df.id == i]
        author, title = get_author_title(item)
        if is_table:
            titles.append(f"*{title}*,<br>{author}")
        else:
            titles.append(f"{title} by {author}")
    return titles

In [60]:
def euc_distance(prefs,person1,person2):
    si = {} 
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1    
    if len(si) == 0: 
        return 0
    sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item],2) 
                          for item in prefs[person1] if item in prefs[person2]])
    return 1/(1+sum_of_squares)

def calculate_rankings(prefs, person, other, score, totals, simSums, query_books):
    for item in prefs[other]:
        if item not in prefs[person] or prefs[person][item] == 0:
            if item in query_books:
                totals.setdefault(item,0) 
                totals[item] += prefs[other][item] * score
                simSums.setdefault(item,0)
                simSums[item] += score
    rankings = [(total/simSums[item], item) for item, total in totals.items()]
    rankings.sort()
    rankings.reverse()
    return rankings

def get_predictions(person, prefs, weighted, query_books):
    totals = {} 
    simSums = {}

    cos_totals = {} 
    cos_simSums = {}

    pear_totals = {} 
    pear_simSums = {}

    for other in prefs:
        if other == person:
            continue
        member_vector = weighted.loc[person]
        other_vector = weighted.loc[other]

        euc_score = euc_distance(prefs, person, other)
        cosine_score = spatial.distance.cosine(member_vector,other_vector)
        pearson_score, pearson_p = stats.pearsonr(member_vector, other_vector)

        if euc_score <= 0:
            continue

        rankings = calculate_rankings(prefs, person, other, euc_score, totals, simSums, query_books)
        cos_rankings = calculate_rankings(prefs, person, other, cosine_score, cos_totals, cos_simSums, query_books)
        pear_rankings = calculate_rankings(prefs, person, other, pearson_score, pear_totals, pear_simSums, query_books)

    return rankings, cos_rankings, pear_rankings

In [None]:
# Returns a distance-based similarity score for person1 and person2
def euc_distance(prefs,person1,person2):
    si = {} 
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1    
    if len(si) == 0: 
        return 0
    sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item],2) 
                          for item in prefs[person1] if item in prefs[person2]])
    return 1/(1+sum_of_squares)


def get_predictions(person, prefs, weighted, query_books):

    totals = {} 
    simSums = {}

    cos_totals = {} 
    cos_simSums = {}

    pear_totals = {} 
    pear_simSums = {}
    for other in prefs:

        if other == person:
            continue
        member_vector = weighted.loc[person]
        other_vector = weighted.loc[other]

        euc_vector = euc_distance(prefs, person, other)
        cosine_score = spatial.distance.cosine(member_vector,other_vector)
        pearson_score, pearson_p = stats.pearsonr(member_vector, other_vector)
        if euc_vector <= 0:
            continue
        for item in prefs[other]:
            if item not in prefs[person] or prefs[person][item] == 0:
                if item in query_books:

                    totals.setdefault(item,0) 
                    totals[item] += prefs[other][item] * euc_vector
                    simSums.setdefault(item,0)
                    simSums[item] += euc_vector

                    cos_totals.setdefault(item,0) 
                    cos_totals[item] += prefs[other][item] * cosine_score
                    cos_simSums.setdefault(item,0)
                    cos_simSums[item] += cosine_score

                    pear_totals.setdefault(item,0) 
                    pear_totals[item] += prefs[other][item] * pearson_score
                    pear_simSums.setdefault(item,0)
                    pear_simSums[item] += pearson_score
                    pear_simSums.setdefault(item,0)
                    pear_simSums[item] += pearson_score
    rankings = [(total/simSums[item], item) for item, total in totals.items()]
    rankings.sort()
    rankings.reverse()

    cos_rankings = [(total/cos_simSums[item], item) for item, total in cos_totals.items()]
    cos_rankings.sort()
    cos_rankings.reverse()

    pear_rankings = [(total/pear_simSums[item], item) for item, total in pear_totals.items()]
    pear_rankings.sort()
    pear_rankings.reverse()

    return rankings, cos_rankings, pear_rankings

## Process Data

In [61]:
grouped_borrows = borrow_events[['member_id', 'item_uri']].groupby(['member_id', 'item_uri']).size().reset_index(name='counts')
pivoted_borrows = grouped_borrows.pivot(
    index='member_id', columns='item_uri', values='counts').fillna(0).astype(int)
scaler = StandardScaler()
weighted = scaler.fit_transform(pivoted_borrows)
weighted = pd.DataFrame(weighted, index=pivoted_borrows.index, columns=pivoted_borrows.columns)

partial_members = ['hemingway-ernest']
query_members = grouped_borrows.loc[grouped_borrows.member_id.isin(partial_members)].member_id.unique().tolist()
partial_df = partial_df[partial_df.member_id.isin(query_members)]
partial_df['subscription_starttime'] = pd.to_datetime(partial_df['subscription_start'])
partial_df['subscription_endtime'] = pd.to_datetime(partial_df['subscription_end'])

d = (grouped_borrows.groupby(['member_id'])['item_uri', 'counts'].apply(
     lambda x: dict(x.values)).to_dict())

In [63]:
def create_df(rankings, score_type, n):
    df = pd.DataFrame(rankings[0:n])
    df.columns = ['score', 'item_uri']
    chart_titles = get_formatted_titles(df, n, True)
    df['formatted_chart_title'] = chart_titles
    table_titles = get_formatted_titles(df, n, False)
    df['formatted_table_title'] = table_titles
    df['type'] = score_type
    return df

n = 200
final_df = []
previous_books = None
narrow_query_books = None
limit_to_circulation = False
for index, row in partial_df.iterrows():
    print(index)
    print(row.subscription_start, row.subscription_end)
    circulation_events = events_df[(events_df.start_datetime < row.subscription_endtime) | (events_df.end_datetime < row.subscription_endtime)]
    print('identified events', len(circulation_events))
    query_books = circulation_events[circulation_events.item_uri.notna()].item_uri.unique().tolist()
    print('identified books', len(query_books))
    member_book_ids = events_df[(events_df.item_uri.notna()) & (events_df.member_id == row.member_id)].item_uri.unique()
    query_books = list(set(query_books) - set(member_book_ids))
    print('identified books without those read by member', len(query_books))
    if limit_to_circulation:
        if index == 0:
            previous_books = query_books
        else:
            query_books = list(set(query_books) - set(previous_books))
            print('identified books without those read by member and those read in previous subscription', len(query_books))
    rankings, cos_rankings, pear_rankings = get_predictions(row.member_id, d, weighted, query_books)
    print('euclidean:',len(rankings), 'cosine:', len(cos_rankings), 'pearson:',len(pear_rankings))

    euc_df = create_df(rankings, 'euclidean', n)
    cos_df = create_df(cos_rankings, 'cosine', n)
    pear_df = create_df(pear_rankings, 'pearson', n)

    dfs = pd.concat([euc_df, cos_df, pear_df])
    dfs['member_id'] = row.member_id
    dfs['period'] = row.subscription_start + '/' + row.subscription_end
    dfs['subscription_start'] = row.subscription_start
    dfs['subscription_end'] = row.subscription_end
    final_df.append(dfs)

25
1921-12-28 1922-11-08
identified events 2602
identified books 666
identified books without those read by member 660
euclidean: 551 cosine: 551 pearson: 551
26
1924-03-28 1925-03-28
identified events 6112
identified books 1292
identified books without those read by member 1280
euclidean: 1097 cosine: 1097 pearson: 1097


In [13]:
final_preds = pd.concat(final_df)
pivoted_predictions =pd.pivot(final_preds, index=['item_uri', 'formatted_chart_title', 'formatted_table_title', 'member_id', 'period', 'subscription_start', 'subscription_end'], columns='type', values='score').reset_index().fillna(0)

In [15]:
final_preds['score_scaled'] = final_preds.groupby(['period', 'type'])['score'].apply(lambda x: (x-min(x))/(max(x)-min(x)))

In [16]:
def scale_col(df, cols):
    for col in cols:
        df[col + '_scaled'] = MinMaxScaler().fit_transform(df[col].values.reshape(-1, 1))
    return df
metrics = ['cosine', 'pearson', 'euclidean']
scaled_predictions = scale_col(pivoted_predictions, metrics)
preds_df = pd.melt(scaled_predictions, id_vars=['item_uri', 'formatted_chart_title', 'formatted_table_title', 'member_id', 'period', 'subscription_start', 'subscription_end'], value_vars=['cosine_scaled', 'pearson_scaled', 'euclidean_scaled'], var_name='metric', value_name='score')

In [17]:
preds_df.loc[preds_df.metric == 'cosine_scaled', 'metric'] = 'cosine'
preds_df.loc[preds_df.metric == 'pearson_scaled', 'metric'] = 'pearson'
preds_df.loc[preds_df.metric == 'euclidean_scaled', 'metric'] = 'euclidean'

In [18]:
preds_df = preds_df.sort_values(by=['score'] ,ascending=False)
items = preds_df[0:100].item_uri.unique()
print(len(items))

62


In [19]:
periods = preds_df.period.unique().tolist()
n =23
final_items = []
for index, period in enumerate(periods):
    if index == 0:
        items = []
        rows = preds_df[(preds_df.period == period) & (preds_df.formatted_chart_title.str.contains('Periodical') == False)].sort_values(by=['score'] ,ascending=False)
        while len(items) <= n:
            item = rows.item_uri.iloc[0]
            if item not in items:
                items.append(item)
            rows = rows.drop(rows.index[0])
        final_items.append(items)
    else:
        items = []
        rows = preds_df[(preds_df.period == period) & (preds_df.formatted_chart_title.str.contains('Periodical') == False)].sort_values(by=['score'] ,ascending=False)
        while len(items) <= n:
            item = rows.item_uri.iloc[0]
            if (item not in items) and (item not in final_items[index-1]):
                items.append(item)
            rows = rows.drop(rows.index[0])
        items = items[0:n]
        # print(len(items))
        final_items.append(items)

In [20]:
len(final_items[1])

23

In [21]:
top_items = final_items[0] + final_items[1] 

In [22]:
preds_df[preds_df.item_uri.isin(top_items)].groupby(['period'])['item_uri'].nunique()

period
1921-12-28/1922-11-08    31
1924-03-28/1925-03-28    47
Name: item_uri, dtype: int64

In [24]:
top_results = preds_df[preds_df.item_uri.isin(top_items)]
# top_results.to_csv('./public_data/memorycf_top_results.csv', index=False)

In [25]:
preds_df[(preds_df.formatted_chart_title.str.contains('Trollope')) & (preds_df.item_uri.isin(top_items))]

Unnamed: 0,item_uri,formatted_chart_title,formatted_table_title,member_id,period,subscription_start,subscription_end,metric,score


In [26]:
chart = alt.Chart(preds_df[preds_df.item_uri.isin(top_items)]).mark_point(opacity=0.8, filled=True, thickness=3).encode(
    y=alt.Y('formatted_chart_title', sort='-x',  axis=alt.Axis(title='Predicted Book', orient='right')),
    x='score',
    color=alt.Color('period:N', legend=alt.Legend(title='Subscription Periods With Missing Records', orient='left')),
    # column='member_id:N',
    shape=alt.Shape('metric:N'),
).properties(
    width=300,
    title='Top Predictions by Collaborative Filtering Memory-Based Methods'
)
chart = chart.configure_axisY(
        titleAngle=0,
        titleAlign="left",
        titleY=-10,
        titleX=-100,
        labelLimit=1000
    )

In [27]:
chart

In [28]:
# preds_df = final_preds.copy()
# preds_df['score'] = preds_df.score_scaled

In [29]:
import numpy as np
metrics_df = preds_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title']).agg({'score': [np.median, 'skew', 'std', 'var']}).reset_index()
metrics_df.columns = list(map(''.join, metrics_df.columns.values))
metrics_df.columns = [col if 'score' not in col else col.split('score')[1] for col in metrics_df.columns ]
kurt_df = preds_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].apply(pd.DataFrame.kurt).reset_index(name='kurtosis')
final_df = pd.merge(metrics_df, kurt_df, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])
final_df = pd.merge(final_df, preds_df, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'], how='left')

In [30]:
from scipy.stats import zscore, mode
final_df['zscore'] = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].transform(lambda x : zscore(x,ddof=1))

In [31]:
top_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title']).agg({'score':'max'})[['score']].reset_index()
top_scores = pd.merge(top_scores, final_df, on=top_scores.columns.tolist(), how='inner')

top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore' : 'top_zscore'})

In [32]:
avg_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].mean().reset_index(name='avg_score')
scores_df = pd.merge(top_scores, avg_scores, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])

In [33]:
median_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])['score'].median().reset_index(name='median_score')
scores_df = pd.merge(scores_df, median_scores, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])

In [34]:
std_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title']).agg({'score': 'std'}).reset_index()
std_scores = std_scores.rename(columns={'score': 'std_score'})
scores_df = pd.merge(scores_df, std_scores, on=['member_id', 'subscription_start', 'subscription_end', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title'])

In [35]:
mode_scores = preds_df.groupby(['member_id', 'period', 'subscription_start', 'subscription_end','item_uri'])['score'].agg(lambda x: pd.Series.mode(x).iat[0]).reset_index()
mode_scores = pd.merge(mode_scores, final_df, on=mode_scores.columns.tolist(), how='inner')
mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})

In [36]:
final_scores = pd.merge(mode_scores[['member_id', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title', 'mode_score', 'mode_zscore', 'subscription_start', 'subscription_end']], scores_df, on=['member_id', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title', 'subscription_start', 'subscription_end'])

In [37]:
final_scores

Unnamed: 0,member_id,period,item_uri,formatted_table_title,formatted_chart_title,mode_score,mode_zscore,subscription_start,subscription_end,top_score,median,skew,std,var,kurtosis,metric,top_zscore,avg_score,median_score,std_score
0,hemingway-ernest,1921-12-28/1922-11-08,a-e-collected-poems,"*Collected Poems*,<br>Æ",Collected Poems byÆ,0.000000,-0.577350,1921-12-28,1922-11-08,0.019770,0.000000,1.732051,0.011414,0.000130,,pearson,1.154701,0.006590,0.000000,0.011414
1,hemingway-ernest,1921-12-28/1922-11-08,a-e-collected-poems,"*Collected Poems*,<br>Æ",Collected Poems byÆ,0.000000,-0.577350,1921-12-28,1922-11-08,0.019770,0.000000,1.732051,0.011414,0.000130,,pearson,1.154701,0.006590,0.000000,0.011414
2,hemingway-ernest,1921-12-28/1922-11-08,a-e-imaginations-reveries,"*Imaginations and Reveries*,<br>Æ",Imaginations and Reveries byÆ,0.024169,-1.111490,1921-12-28,1922-11-08,0.221125,0.166048,-1.177452,0.101616,0.010326,,euclidean,0.826748,0.137114,0.166048,0.101616
3,hemingway-ernest,1921-12-28/1922-11-08,abercrombie-thomas-hardy-critical,"*Thomas Hardy: A Critical Study*,<br> Lascelle...",Thomas Hardy: A Critical Study by Lascelles Ab...,0.039540,-1.075610,1921-12-28,1922-11-08,0.428571,0.285434,-0.759614,0.196765,0.038716,,euclidean,0.901532,0.251182,0.285434,0.196765
4,hemingway-ernest,1921-12-28/1922-11-08,arnim-caravaners,"*The Caravaners*,<br> Elizabeth von Arnim",The Caravaners by Elizabeth von Arnim,0.000000,-0.577350,1921-12-28,1922-11-08,0.019770,0.000000,1.732051,0.011414,0.000130,,pearson,1.154701,0.006590,0.000000,0.011414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,hemingway-ernest,1924-03-28/1925-03-28,yeats-cutting-agate,"*The Cutting of an Agate*,<br> William Butler ...",The Cutting of an Agate by William Butler Yeats,0.000000,-0.577350,1924-03-28,1925-03-28,0.019770,0.000000,1.732051,0.011414,0.000130,,pearson,1.154701,0.006590,0.000000,0.011414
505,hemingway-ernest,1924-03-28/1925-03-28,yeats-plays,"*Plays*,<br> William Butler Yeats",Plays by William Butler Yeats,0.026220,-1.111089,1924-03-28,1925-03-28,0.220779,0.166144,-1.172574,0.100347,0.010069,,euclidean,0.827777,0.137715,0.166144,0.100347
506,hemingway-ernest,1924-03-28/1925-03-28,yeats-plays-controversies,"*Plays and Controversies*,<br> William Butler ...",Plays and Controversies by William Butler Yeats,0.031707,-1.110428,1924-03-28,1925-03-28,0.285714,0.213894,-1.164532,0.130939,0.017145,,euclidean,0.829463,0.177105,0.213894,0.130939
507,hemingway-ernest,1924-03-28/1925-03-28,yeats-responsibilities-poems,"*Responsibilities and Other Poems*,<br> Willia...",Responsibilities and Other Poems by William Bu...,0.034166,-1.131807,1924-03-28,1925-03-28,0.230769,0.189674,-1.431100,0.103702,0.010754,,euclidean,0.764045,0.151536,0.189674,0.103702


In [38]:
member_subscriptions = final_scores[['member_id', 'period', 'item_uri', 'formatted_table_title', 'formatted_chart_title', 'subscription_start', 'subscription_end']].drop_duplicates()

In [39]:
final_scores_dedup = final_scores.drop_duplicates()

In [40]:
final_scores_dedup['coef_var'] = (final_scores_dedup.std_score/ final_scores_dedup.median_score)

In [41]:
# for index, group in member_subscriptions.iterrows():
#   print(group.to_dict())
#   rows = final_scores_dedup[(final_scores_dedup.member_id == group.member_id) & (final_scores_dedup.period == group.period) ]
#   print('coef_scores:', rows.sort_values(by=['coef_var'], ascending=True)[0:5][['item_uri', 'coef_var', 'median_score']].to_dict())
#   print('median_scores:', rows.sort_values(by=['median_score'], ascending=False)[0:5][['item_uri', 'median_score', 'coef_var']].to_dict())

In [42]:
len(final_scores), len(final_scores_dedup)

(509, 456)

In [43]:
for period in final_scores_dedup.period.unique().tolist():

    rows = final_scores_dedup[final_scores_dedup.period == period].sort_values(by=['median_score', 'coef_var'], ascending=[False, True])
    print(rows[['formatted_chart_title']].head(10))

                                 formatted_chart_title
84                             The Nation (Periodical)
133         The Times Literary Supplement (Periodical)
165                    The London Mercury (Periodical)
117            Specimens of Early English (Periodical)
3    Thomas Hardy: A Critical Study by Lascelles Ab...
7                          Since Cézanne by Clive Bell
66                                Howells (Periodical)
74                          The Clash by Storm Jameson
79                Samuel Butler by Henry Festing Jones
93   The Daniel Jazz and Other Poems by Vachel Lindsay
                             formatted_chart_title
358                        The Nation (Periodical)
465  Works of Francis Thompson by Francis Thompson
426     The Times Literary Supplement (Periodical)
388              The American Mercury (Periodical)
278          The Theory of Beauty by E. F. Carritt
288                     The Criterion (Periodical)
445                The London Mercury 

In [44]:
final_scores_dedup[final_scores_dedup.formatted_chart_title.str.contains('Trollope')]

Unnamed: 0,member_id,period,item_uri,formatted_table_title,formatted_chart_title,mode_score,mode_zscore,subscription_start,subscription_end,top_score,median,skew,std,var,kurtosis,metric,top_zscore,avg_score,median_score,std_score,coef_var
468,hemingway-ernest,1924-03-28/1925-03-28,trollope-bertrams,"*The Bertrams*,<br> Anthony Trollope",The Bertrams by Anthony Trollope,0.035482,-1.154517,1924-03-28,1925-03-28,0.241071,0.236885,-1.729578,0.117508,0.013808,,euclidean,0.59507,0.171146,0.236885,0.117508,0.496052
469,hemingway-ernest,1924-03-28/1925-03-28,trollope-claverings,"*The Claverings*,<br> Anthony Trollope",The Claverings by Anthony Trollope,0.026898,-1.12698,1924-03-28,1925-03-28,0.222527,0.177871,-1.36971,0.102517,0.01051,,euclidean,0.78129,0.142432,0.177871,0.102517,0.576352
470,hemingway-ernest,1924-03-28/1925-03-28,trollope-last-chronicle-barset,"*The Last Chronicle of Barset*,<br> Anthony Tr...",The Last Chronicle of Barset by Anthony Trollope,0.080533,-1.104105,1924-03-28,1925-03-28,0.35527,0.272727,-1.088332,0.140968,0.019872,,cosine,0.844822,0.236177,0.272727,0.140968,0.516883
471,hemingway-ernest,1924-03-28/1925-03-28,trollope-prime-minister,"*The Prime Minister*,<br> Anthony Trollope",The Prime Minister by Anthony Trollope,0.02705,-1.128214,1924-03-28,1925-03-28,0.221198,0.177798,-1.385334,0.1019,0.010384,,euclidean,0.777062,0.142015,0.177798,0.1019,0.573126
472,hemingway-ernest,1924-03-28/1925-03-28,trollope-small-house-allington,"*The Small House at Allington*,<br> Anthony Tr...",The Small House at Allington by Anthony Trollope,0.052491,-1.154649,1924-03-28,1925-03-28,0.236686,0.234694,-1.73136,0.105775,0.011188,,cosine,0.586741,0.174624,0.234694,0.105775,0.450692


In [158]:
final_scores_dedup.to_csv('./public_data/collaborative_filtering_predictions.csv', index=False)

In [60]:
final_scores

Unnamed: 0,member_id,period,item_uri,formatted_table_title,formatted_chart_title,mode_score,mode_zscore,top_score,median,skew,std,var,kurtosis,metric,top_zscore,avg_score,median_score,std_score
0,hemingway-ernest,1921-12-28/1922-11-08,a-e-imaginations-reveries,"*Imaginations and Reveries*,<br>Æ",Imaginations and Reveries byÆ,0.024169,-1.111490,0.221125,0.166048,-1.177452,0.101616,0.010326,,euclidean,0.826748,0.137114,0.166048,0.101616
1,hemingway-ernest,1921-12-28/1922-11-08,abercrombie-thomas-hardy-critical,"*Thomas Hardy: A Critical Study*,<br> Lascelle...",Thomas Hardy: A Critical Study by Lascelles Ab...,0.039540,-1.075610,0.428571,0.285434,-0.759614,0.196765,0.038716,,euclidean,0.901532,0.251182,0.285434,0.196765
2,hemingway-ernest,1921-12-28/1922-11-08,barrie-kiss-cinderella,"*A Kiss for Cinderella: A Comedy*,<br> J. M. B...",A Kiss for Cinderella: A Comedy by J. M. Barrie,0.000000,-0.577350,0.019770,0.000000,1.732051,0.011414,0.000130,,pearson,1.154701,0.006590,0.000000,0.011414
3,hemingway-ernest,1921-12-28/1922-11-08,barrie-kiss-cinderella,"*A Kiss for Cinderella: A Comedy*,<br> J. M. B...",A Kiss for Cinderella: A Comedy by J. M. Barrie,0.000000,-0.577350,0.019770,0.000000,1.732051,0.011414,0.000130,,pearson,1.154701,0.006590,0.000000,0.011414
4,hemingway-ernest,1921-12-28/1922-11-08,beerbohm-zuleika-dobson,"*Zuleika Dobson*,<br> Max Beerbohm",Zuleika Dobson by Max Beerbohm,0.026132,-1.097906,0.226531,0.163079,-1.014785,0.102421,0.010490,,euclidean,0.858713,0.138580,0.163079,0.102421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,hemingway-ernest,1924-03-28/1925-03-28,woolf-jacobs-room,"*Jacob's Room*,<br> Virginia Woolf",Jacob's Room by Virginia Woolf,0.021104,-1.067627,0.234127,0.152253,-0.671771,0.107457,0.011547,,euclidean,0.914773,0.135828,0.152253,0.107457
513,hemingway-ernest,1924-03-28/1925-03-28,woolf-night-day,"*Night and Day*,<br> Virginia Woolf",Night and Day by Virginia Woolf,0.023264,-1.101864,0.217897,0.158259,-1.061608,0.099718,0.009944,,euclidean,0.849967,0.133140,0.158259,0.099718
514,hemingway-ernest,1924-03-28/1925-03-28,wright-life-walter-pater,"*The Life of Walter Pater*,<br> Thomas Wright",The Life of Walter Pater by Thomas Wright,0.039540,-1.075610,0.428571,0.285434,-0.759614,0.196765,0.038716,,euclidean,0.901532,0.251182,0.285434,0.196765
515,hemingway-ernest,1924-03-28/1925-03-28,yeats-plays,"*Plays*,<br> William Butler Yeats",Plays by William Butler Yeats,0.026220,-1.111089,0.220779,0.166144,-1.172574,0.100347,0.010069,,euclidean,0.827777,0.137715,0.166144,0.100347
