In [2]:
import implicit
import numpy as np
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
import os
import csv
import numpy.matlib

# Get Data

We'll need to go from a CSV file to a `scipy.sparse.csr_matrix`. This code should be moved into a preprocessing script within the pipeline.

In [3]:
# Load the raw kudos interactions (one row per kudos left on a work).
# NOTE(review): in the preview below, the column labelled 'user' holds numeric
# work ids and the column labelled 'work_id' holds usernames, so the header
# labels appear to be swapped — confirm against the CSV before relying on them.
df = pd.read_csv("../data/swr_fanworks/fanworks_kudos.csv")
df.head()

Unnamed: 0,user,work_id
0,3104510,klutzyelf
1,3104510,Ello_its_meme
2,3104510,JadedBlade
3,3104510,Moony_PirateKing
4,3104510,EgoBang07


In [4]:
# Size the dense interaction matrix: one row per work, one column per user.
# NOTE(review): the column lookups look swapped, but they match the data shown
# by df.head(): the column labelled 'work_id' holds usernames and the column
# labelled 'user' holds work ids — confirm against the CSV header.
num_users = df['work_id'].unique().size
num_works = df['user'].unique().size
# np.zeros allocates a dense float64 array of num_works * num_users entries.
data = np.zeros(shape=(num_works, num_users))
print(data.shape)

(4224, 18096)


In [5]:
# Assign every distinct work key and user key a contiguous integer position
# (in order of first appearance) and mark each (work, user) pair in `data`.
# The first CSV field is used as the work key and the second as the user key.
indices = {'work_id': {}, 'user': {}}
work_count, user_count = 0, 0
with open('../data/swr_fanworks/fanworks_kudos.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the header row
    for record in reader:
        # setdefault returns the existing position, or stores and returns the
        # next free one when the key has not been seen before.
        work_pos = indices['work_id'].setdefault(record[0], len(indices['work_id']))
        user_pos = indices['user'].setdefault(record[1], len(indices['user']))
        data[work_pos, user_pos] = 1

In [6]:
# Convert the dense works-by-users array into SciPy's compressed sparse row
# format, which is what gets passed to the model's fit() below.
sparse_matrix = sp.csr_matrix(data)
print(type(sparse_matrix))

<class 'scipy.sparse.csr.csr_matrix'>


In [7]:
# Reverse lookups (matrix position -> original key) for both works and users,
# so model output can be translated back into work ids / usernames.
inverted_indices = {
    kind: {position: key for key, position in indices[kind].items()}
    for kind in ('work_id', 'user')
}


# Get Recs

In [8]:
# Matrix row position of the query work, looked up by its work id string.
work_indice = indices['work_id']['11997396']
# Number of similar works requested from each model (passed to similar_items).
num_to_return = 20

### ALS results

### BPR

In [9]:
# Train a Bayesian Personalized Ranking model on the works-by-users matrix.
modelBPR = implicit.bpr.BayesianPersonalizedRanking(
    factors=50,
    verify_negative_samples=True,
)
modelBPR.fit(sparse_matrix)

# Ask the trained model for the works most similar to the query work and print
# an AO3 URL for each; element [0] of every result is a matrix row position.
# In the recorded output the first hit is the query work itself.
related_BPR = modelBPR.similar_items(work_indice, num_to_return)
for suggestion in related_BPR:
    matrix_row = suggestion[0]
    work_id = inverted_indices['work_id'][matrix_row]
    print(f"http://www.archiveofourown.org/works/{work_id}")

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


http://www.archiveofourown.org/works/11997396
http://www.archiveofourown.org/works/13210923
http://www.archiveofourown.org/works/10703883
http://www.archiveofourown.org/works/13241082
http://www.archiveofourown.org/works/19170541
http://www.archiveofourown.org/works/10546950
http://www.archiveofourown.org/works/14495193
http://www.archiveofourown.org/works/14517879
http://www.archiveofourown.org/works/11573010
http://www.archiveofourown.org/works/14542257
http://www.archiveofourown.org/works/11103651
http://www.archiveofourown.org/works/13904244
http://www.archiveofourown.org/works/14085450
http://www.archiveofourown.org/works/13964358
http://www.archiveofourown.org/works/11758140
http://www.archiveofourown.org/works/14559507
http://www.archiveofourown.org/works/13345785
http://www.archiveofourown.org/works/13130430
http://www.archiveofourown.org/works/13967031
http://www.archiveofourown.org/works/14311677


### Logistic Matrix Factorization

In [51]:
# Load per-work metadata (title, rating, fandom, tags, ...) keyed by 'work_id';
# used below to show a human-readable title next to each recommended URL.
df_meta = pd.read_csv("../data/fanfics_metadata.csv")
df_meta.head()

Unnamed: 0,work_id,title,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits
0,3104510,Second Chances,Teen And Up Audiences,"M/M, F/F","Star Wars - All Media Types, Star Wars Prequel...","Obi-Wan Kenobi/Anakin Skywalker, Obi-Wan Kenob...","Leia Organa, Luke Skywalker, Anakin Skywalker,...","Age Regression/De-Aging, Soul Bond, The Force,...",English,2015-01-05,Completed,2016-04-16,121358.0,42/42,803.0,1917.0,446.0,74424.0
1,6423526,hurricane on the edge of oblivion (with nowher...,Mature,Gen,Star Wars: The Wrath of Darth Maul - Ryder Win...,"Obi-Wan Kenobi & Xanatos, Qui-Gon Jinn & Feemo...","Obi-Wan Kenobi, Xanatos (Star Wars), Qui-Gon J...","minor OC's - Freeform, at least I'm pretty sur...",English,2016-04-01,Updated,2019-12-23,144982.0,24/?,744.0,1815.0,380.0,28728.0
2,9552773,time to change the road you're on,General Audiences,Gen,"Star Wars - All Media Types, Star Wars: The Cl...","Anakin Skywalker & Ahsoka Tano, Ahsoka Tano an...","Ahsoka Tano, Anakin Skywalker, Kanan Jarrus, E...","AU, Time Travel Fix-It, possibly more of a tim...",English,2017-02-02,Completed,2017-12-27,37400.0,8/8,234.0,1446.0,340.0,25348.0
3,5162474,Twin Sunrise,General Audiences,Gen,"Star Wars Original Trilogy, Star Wars: Rebels,...","Luke Skywalker & Darth Vader, Darth Vader & Ap...","Anakin Skywalker | Darth Vader, Luke Skywalker...","Grey Jedi, Alternate Universe, Sith Shenanigan...",English,2015-11-07,Completed,2018-01-14,302486.0,55/55,1003.0,1418.0,382.0,52527.0
4,4417469,On the Edge of the Devil's Backbone,Teen And Up Audiences,"F/M, Multi","Star Wars: Rebels, Star Wars - All Media Types",Kanan Jarrus/Hera Syndulla,"Hera Syndulla, Kanan Jarrus, Sabine Wren, Gara...","Alternate Universe, Canon-Typical Violence",English,2015-07-25,Completed,2018-10-20,361502.0,30/30,675.0,1395.0,255.0,39386.0


In [57]:
# Spot check: select the metadata row(s) for one known work id and display its
# title, to confirm that df_meta['work_id'] can be matched against an integer id.
x = df_meta.loc[df_meta['work_id'] == 13278054]
x['title'].values[0]

'Things to Come'

In [60]:
def lookup_meta(related):
    """Print the AO3 URL and title for each recommended work.

    Args:
        related: iterable of recommendation entries whose first element is a
            position in ``inverted_indices['work_id']`` (for example the
            result of ``model.similar_items(...)``).

    A work that has no row in ``df_meta`` is still listed, with a placeholder
    title, instead of aborting the whole listing with an IndexError.
    """
    for work in related:
        work_id = inverted_indices['work_id'][work[0]]
        # The stored work id is converted with int() so it can be compared
        # against the 'work_id' column of the metadata frame.
        titles = df_meta.loc[df_meta['work_id'] == int(work_id), 'title']
        title = titles.iloc[0] if len(titles) else "<title not found>"
        print(f"http://www.archiveofourown.org/works/{work_id}\t{title}")

In [61]:
# Show the BPR recommendations again, this time with each work's title.
lookup_meta(related_BPR)

http://www.archiveofourown.org/works/11997396	The Recruiter
http://www.archiveofourown.org/works/13210923	He Started It
http://www.archiveofourown.org/works/10703883	Dismantled
http://www.archiveofourown.org/works/13241082	It's Been Awhile
http://www.archiveofourown.org/works/19170541	Blaster Bolts and Banter
http://www.archiveofourown.org/works/10546950	Before the Hour
http://www.archiveofourown.org/works/14495193	If At First You Don't Succeed
http://www.archiveofourown.org/works/14517879	The Silence of Growing Things
http://www.archiveofourown.org/works/11573010	Broken Chains
http://www.archiveofourown.org/works/14542257	We're going to Palpatinopia!
http://www.archiveofourown.org/works/11103651	Stay
http://www.archiveofourown.org/works/13904244	After
http://www.archiveofourown.org/works/14085450	Fortune
http://www.archiveofourown.org/works/13964358	Gone
http://www.archiveofourown.org/works/11758140	Paradise
http://www.archiveofourown.org/works/14559507	The Mask
http://www.archiveofou

In [18]:
def storeData(model, modelname):
    """Pickle ``model`` to ``../models/<modelname>.pkl``.

    Args:
        model: any picklable object (a trained model, the index dicts, ...).
        modelname: file name stem; ``.pkl`` is appended.

    Raises:
        OSError: if the target file cannot be opened for writing, e.g. when
            the ``../models`` directory does not exist.
    """
    # Pickle needs binary mode. The context manager closes the file even if
    # pickle.dump raises, so no handle is leaked on failure.
    with open('../models/' + modelname + '.pkl', 'wb') as dbfile:
        # source, destination
        pickle.dump(model, dbfile)

In [15]:
def loadData(modelname):
    """Load and return the object pickled at ``../models/<modelname>.pkl``.

    Args:
        modelname: file name stem; ``.pkl`` is appended.

    Returns:
        The unpickled object.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only load
    # files that this project wrote itself.
    # Pickle needs binary mode. The context manager closes the file once the
    # object has been read, so the handle is no longer left open.
    with open('../models/' + modelname + '.pkl', 'rb') as dbfile:
        return pickle.load(dbfile)

In [16]:
def printRecs(related):
    """Print one archiveofourown.org URL per entry in ``related``.

    Each entry's first element is translated back to a work id through
    ``inverted_indices['work_id']``.
    """
    # The generator is lazy, so each lookup happens right before its print.
    urls = (
        f"http://www.archiveofourown.org/works/{inverted_indices['work_id'][entry[0]]}"
        for entry in related
    )
    for url in urls:
        print(url)

In [19]:
# Persist the trained BPR model as ../models/bpr270220.pkl.
storeData(modelBPR, 'bpr270220')

In [None]:

# Round-trip check: reload the BPR model that was pickled as 'bpr270220' and
# confirm it still serves recommendations for the same query work.
# (This previously loaded 'test', a name this notebook never writes.)
pickled_model = loadData('bpr270220')
# find related items
related_pickled = pickled_model.similar_items(work_indice, num_to_return)
printRecs(related_pickled)

In [21]:
# Persist the key -> matrix position mappings as ../models/indices270220.pkl so
# a reloaded model's output can be translated back to work ids / usernames.
storeData(indices, 'indices270220')