In [38]:
import implicit
import numpy as np
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from pathvalidate import replace_symbol
import pickle
import os
import csv
import numpy.matlib

RAW_PATH = '../data/raw/'
TEST_FANDOM = 'Star Wars: The Clone Wars (2008) - All Media Types'
KUDO_FILE = RAW_PATH + replace_symbol(TEST_FANDOM) + '/kudos.csv'
META_FILE = RAW_PATH + replace_symbol(TEST_FANDOM) + '/meta.csv'
TEST_REBELS = 'Star Wars: Rebels'
KUDO_REBELS = RAW_PATH + replace_symbol(TEST_REBELS) + '/kudos.csv'


# Get Data

We'll need to go from csv to a scipy.sparse.csr_matrix. This code should be put into a preprocessing script w/in the pipelie.

In [41]:
def init_path(fandom):
    fandom_dir = replace_symbol(fandom)
    data_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), RAW_PATH+fandom_dir)
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    os.chdir(data_path)

#init_path(TEST_FANDOM)   
#df = pd.read_csv("kudos.csv")

df_TCW = pd.read_csv(KUDO_FILE)
df_REB = pd.read_csv(KUDO_REBELS)
print(f" df_TCW: {type(df_TCW)} df_REB: {df_REB}")
#print(f"dims TCW: {df_TCW. })
frames = [df_TCW, df_REB]
df = pd.concat(frames)
df.head()

df_TCW: <class 'pandas.core.frame.DataFrame'> df_REB:          work_id                    user
0       22186015             ChillyCorgi
1       22186015                Numbuh_7
2       22186015  Padawan_Of_Ahsoka_Tano
3       22186015             stiltsrosko
4       22186015                 igofuzz
...          ...                     ...
112948  10629384               Death7559
112949  10629384              ICurlyHead
112950  10629384          Thousandsmiles
112951  10629384         BurningOffARoad
112952  10629384               Glamdroid

[112953 rows x 2 columns]


Unnamed: 0,work_id,user
0,23657317,christinedieae
1,23657317,AngelInLeather
2,23657317,Mayosandwich
3,23657317,littlecloudy
4,23657317,T3llurian


In [42]:
num_works = len(df['work_id'].unique())
num_users = len(df['user'].unique())
data = np.zeros((num_works, num_users))
print(data.shape)

(3472, 19331)


In [17]:
indices = {'work_id':{}, 'user':{}}
with open(KUDO_FILE, newline='') as csvfile:
    interactions = csv.reader(csvfile, delimiter=',')
    next(interactions)
    for row in interactions:
        indices['work_id'].setdefault(row[0], len(indices['work_id']))
        indices['user'].setdefault(row[1], len(indices['user']))
        data[indices['work_id'][row[0]]][indices['user'][row[1]]] = 1

In [18]:
sparse_matrix = sp.csr_matrix(data)
print(type(sparse_matrix))

<class 'scipy.sparse.csr.csr_matrix'>


In [19]:
inverted_indices = {'work_id':{}, 'user':{}}
inverted_indices['work_id'] = {v: k for k, v in indices['work_id'].items()}
inverted_indices['user'] = {v: k for k, v in indices['user'].items()}


# Get Recs

In [21]:
work_indice = indices['work_id']['23657317']
num_to_return = 20

In [22]:
# initialize a model
modelBPR = implicit.bpr.BayesianPersonalizedRanking(factors=50, verify_negative_samples=True)
# train the model on a sparse matrix of item/user/confidence weights
modelBPR.fit(sparse_matrix)

# find related items
related_BPR = modelBPR.similar_items(work_indice, num_to_return)
for suggestion in related_BPR:
    work_id = inverted_indices['work_id'][suggestion[0]]
    print(f"http://www.archiveofourown.org/works/{work_id}")

100%|██████████| 100/100 [00:01<00:00, 57.11it/s, correct=94.80%, skipped=9.84%]
http://www.archiveofourown.org/works/23657317
http://www.archiveofourown.org/works/23662951
http://www.archiveofourown.org/works/23644651
http://www.archiveofourown.org/works/23668639
http://www.archiveofourown.org/works/23640064
http://www.archiveofourown.org/works/23654458
http://www.archiveofourown.org/works/23660587
http://www.archiveofourown.org/works/23688109
http://www.archiveofourown.org/works/23613976
http://www.archiveofourown.org/works/23638108
http://www.archiveofourown.org/works/23652616
http://www.archiveofourown.org/works/23642665
http://www.archiveofourown.org/works/23587408
http://www.archiveofourown.org/works/23425900
http://www.archiveofourown.org/works/23666200
http://www.archiveofourown.org/works/23663839
http://www.archiveofourown.org/works/23684605
http://www.archiveofourown.org/works/23651674
http://www.archiveofourown.org/works/23659255
http://www.archiveofourown.org/works/23054122

In [28]:
print(META_FILE)
df_meta = pd.read_csv(META_FILE)
df_meta.head()

../data/raw/StarWarsTheCloneWars2008AllMediaTypes/meta.csv


Unnamed: 0,work_id,title,author,gifted,rating,warnings,category,status,fandom,relationship,...,chapters,collections,comments,kudos,bookmarks,hits,series_part,series_name,updated,scrape_date
0,23657317,'cause i'm going to make this place your home,MageOfCole,,General Audiences,No Archive Warnings Apply,"F/M, Gen, M/M, Multi",Work in Progress,"Star Wars - All Media Types, Star Wars: The Cl...","Ezra Bridger & Kanan Jarrus, Depa Billaba & Ka...",...,5/?,,44.0,57.0,6.0,492.0,29.0,Children of Hope and Glory,17 Apr 2020,2020Apr16
1,23671393,When the World Wasn't at War,jewelofmandalore,,Mature,No Archive Warnings Apply,"F/M, M/M",Work in Progress,Star Wars: The Clone Wars (2008) - All Media T...,"CT-7567 | Rex/Ahsoka Tano, CC-2224 | Cody/Obi-...",...,2/3,,1.0,14.0,1.0,123.0,3.0,Clone Wars/Rebels Tumblr prompts,16 Apr 2020,2020Apr16
2,23497945,Undercurrent of Remorse,Oppo Rancisis (NowThatsDedication),,Not Rated,No Archive Warnings Apply,F/M,Work in Progress,Star Wars: The Clone Wars (2008) - All Media T...,Obi-Wan Kenobi/Satine Kryze,...,9/?,,,39.0,4.0,304.0,,,16 Apr 2020,2020Apr16
3,23084485,Unnamed project,W01FS0NG,,General Audiences,Choose Not To Use Archive Warnings,"F/F, Gen, Multi, Other",Work in Progress,"Maximum Ride, Star Wars: The Clone Wars (2008)...",,...,8/?,,,6.0,,36.0,,,16 Apr 2020,2020Apr16
4,22822012,When the Force and Stars Become One,taffee23,,Mature,No Archive Warnings Apply,F/M,Work in Progress,"Star Wars - All Media Types, Star Wars Prequel...",Obi-Wan Kenobi/Original Female Character(s),...,5/?,,9.0,11.0,3.0,290.0,1.0,A Dyad Written in the Stars,16 Apr 2020,2020Apr16


In [30]:
x = df_meta.loc[df_meta['work_id'] == 23657317]
x['title'].values[0]

"'cause i'm going to make this place your home"

In [31]:
def lookup_meta(related):
    for work in related:
        work_id = inverted_indices['work_id'][work[0]]
        meta = df_meta.loc[df_meta['work_id'] == int(work_id)]
        title = meta['title'].values[0]
        print(f"http://www.archiveofourown.org/works/{work_id}\t{title}")


In [32]:
lookup_meta(related_BPR)

http://www.archiveofourown.org/works/23657317	'cause i'm going to make this place your home
http://www.archiveofourown.org/works/23662951	gold and diamond, jewels behind the throne
http://www.archiveofourown.org/works/23644651	Long enough to see the sunrise
http://www.archiveofourown.org/works/23668639	Blubberbird
http://www.archiveofourown.org/works/23640064	When all else falls apart
http://www.archiveofourown.org/works/23654458	speak your truths to me, share my love to you
http://www.archiveofourown.org/works/23660587	These Boots are Made for Walking
http://www.archiveofourown.org/works/23688109	Joyful Sound
http://www.archiveofourown.org/works/23613976	a study in heartbeat
http://www.archiveofourown.org/works/23638108	Walk Tall, an Aside
http://www.archiveofourown.org/works/23652616	Amidala vs. Taa
http://www.archiveofourown.org/works/23642665	we made houses out of cardboard boxes
http://www.archiveofourown.org/works/23587408	TARFU
http://www.archiveofourown.org/works/23425900	The L

In [33]:
def storeData(model, modelname): 

    #this_folder = os.path.dirname(os.path.abspath(__file__))
    #my_file = os.path.join(this_folder, modelname)
    # Its important to use binary mode 
    dbfile = open('../models/'+modelname+'.pkl', 'wb') 
      
    # source, destination 
    pickle.dump(model, dbfile)                      
    dbfile.close() 

In [34]:
def loadData(modelname): 
    # for reading also binary mode is important 
    dbfile = open('../models/'+modelname+'.pkl', 'rb')      
    db = pickle.load(dbfile) 
    return db

In [35]:
def printRecs(related):
    for suggestion in related:
        work_id = inverted_indices['work_id'][suggestion[0]]
        print(f"http://www.archiveofourown.org/works/{work_id}")

In [36]:
storeData(modelBPR, 'bpr270220')

In [37]:

pickled_model = loadData('test')
# find related items
related_pickled = pickled_model.similar_items(work_indice, num_to_return)
printRecs(related_pickled)

FileNotFoundError: [Errno 2] No such file or directory: '../models/test.pkl'

In [None]:
storeData(indices, 'indices270220')