In [None]:
import implicit
import numpy as np
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from pathvalidate import replace_symbol
import pickle
import os
import csv
import numpy.matlib

# Folder that holds one sub-directory of CSV files per fandom.
RAW_PATH = '../data/raw/'

# Fandom titles used in this notebook.
TEST_FANDOM = 'Star Wars: The Clone Wars (2008) - All Media Types'
TEST_REBELS = 'Star Wars: Rebels'


def _raw_file(fandom, suffix):
    """Return RAW_PATH + replace_symbol(fandom) + suffix as a single string."""
    return ''.join([RAW_PATH, replace_symbol(fandom), suffix])


# The fandom title is passed through replace_symbol to get its directory name.
KUDO_FILE = _raw_file(TEST_FANDOM, '/kudos.csv')
META_FILE = _raw_file(TEST_FANDOM, '/meta.csv')
KUDO_REBELS = _raw_file(TEST_REBELS, '/kudos.csv')


# Get Data

We need to convert the kudos CSV data into a `scipy.sparse.csr_matrix`. This code should eventually be moved into a preprocessing script within the pipeline.

In [None]:
def init_path(fandom):
    """Create (if needed) and change into the raw-data directory of ``fandom``.

    The directory is ``RAW_PATH + replace_symbol(fandom)``. It is resolved
    relative to this file when ``__file__`` is defined, and relative to the
    current working directory otherwise. Notebook kernels typically do not
    define ``__file__``, so reading it unguarded raises ``NameError`` there.

    Args:
        fandom: Fandom title; passed through ``replace_symbol`` to obtain the
            directory name.

    Side effects:
        Creates the directory when missing and changes the process working
        directory to it via ``os.chdir``. When the working-directory fallback
        is used, a second call therefore resolves relative to the directory
        the first call switched into.
    """
    fandom_dir = replace_symbol(fandom)
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # Interactive session: there is no module file to anchor on.
        base_dir = os.getcwd()
    data_path = os.path.join(base_dir, RAW_PATH + fandom_dir)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(data_path, exist_ok=True)
    os.chdir(data_path)

# Alternative loading path, kept for reference:
#init_path(TEST_FANDOM)
#df = pd.read_csv("kudos.csv")

# Load the kudos table of each fandom.
df_TCW = pd.read_csv(KUDO_FILE)
df_REB = pd.read_csv(KUDO_REBELS)
# Report types and dimensions only, so that a whole frame is never dumped
# into the cell output.
print(f" df_TCW: {type(df_TCW)} df_REB: {type(df_REB)}")
print(f"dims TCW: {df_TCW.shape} REB: {df_REB.shape}")
# Stack both tables into one frame; the original row labels are kept.
frames = [df_TCW, df_REB]
df = pd.concat(frames)
df.head()

In [None]:
# Count the distinct works and users in the combined frame, then allocate a
# dense works x users array of zeros to hold the interactions.
num_works, num_users = (
    len(df[column].unique()) for column in ('work_id', 'user')
)
data = np.zeros((num_works, num_users))
print(data.shape)

In [None]:
# Assign every raw id (a string, as read by csv.reader) a 0-based position in
# order of first appearance, and mark each (work, user) pair in `data`.
# NOTE(review): only KUDO_FILE is read here, while `data` is sized from the
# combined frame of both fandoms -- confirm whether KUDO_REBELS should be
# loaded into the matrix as well.
indices = {'work_id': {}, 'user': {}}
work_index, user_index = indices['work_id'], indices['user']
with open(KUDO_FILE, newline='') as csvfile:
    interactions = csv.reader(csvfile, delimiter=',')
    next(interactions)  # skip the header row
    for row in interactions:
        # First column is the work, second column is the user.
        work_pos = work_index.setdefault(row[0], len(work_index))
        user_pos = user_index.setdefault(row[1], len(user_index))
        data[work_pos, user_pos] = 1

In [None]:
# Convert the dense works x users array to CSR form and show the result type.
sparse_matrix = sp.csr_matrix(data)
matrix_type = type(sparse_matrix)
print(matrix_type)

In [None]:
# Reverse lookup tables: matrix position -> raw id, for works and for users.
inverted_indices = {
    field: {position: raw_id for raw_id, position in indices[field].items()}
    for field in ('work_id', 'user')
}


# Get Recs

In [None]:
# Number of similar works to request from the model.
num_to_return = 20
# Matrix row of the query work; ids in `indices` are keyed as strings.
query_work_id = '23657317'
work_indice = indices['work_id'][query_work_id]

In [None]:
# Set up a Bayesian Personalized Ranking model with 50 factors.
modelBPR = implicit.bpr.BayesianPersonalizedRanking(
    factors=50,
    verify_negative_samples=True,
)
# Train it on the sparse interaction matrix.
modelBPR.fit(sparse_matrix)

# Ask for the works most similar to the query work and print one URL per hit.
# The first element of each hit is used as the matrix row of the work.
related_BPR = modelBPR.similar_items(work_indice, num_to_return)
related_urls = (
    f"http://www.archiveofourown.org/works/{inverted_indices['work_id'][hit[0]]}"
    for hit in related_BPR
)
for url in related_urls:
    print(url)

In [None]:
# Show which metadata file is read, load it, and preview the first rows.
print(META_FILE)
df_meta = pd.read_csv(filepath_or_buffer=META_FILE)
df_meta.head()

In [None]:
# Spot check: select the metadata row(s) of the query work and show the title
# of the first match.
is_query_work = df_meta['work_id'] == 23657317
x = df_meta.loc[is_query_work]
x['title'].values[0]

In [None]:
def lookup_meta(related, index_to_work=None, meta_frame=None):
    """Print the URL and title of every recommended work.

    Args:
        related: Iterable of recommendations; the first element of each entry
            is the matrix row of the recommended work.
        index_to_work: Mapping from matrix row to raw work id. Defaults to
            the notebook-level ``inverted_indices['work_id']``.
        meta_frame: DataFrame with ``work_id`` and ``title`` columns.
            Defaults to the notebook-level ``df_meta``.

    Output:
        One line per recommendation, ``<url>\\t<title>``. A work without a
        metadata row is printed with the placeholder title ``<no metadata>``
        instead of raising ``IndexError``.
    """
    if index_to_work is None:
        index_to_work = inverted_indices['work_id']
    if meta_frame is None:
        meta_frame = df_meta
    for work in related:
        work_id = index_to_work[work[0]]
        # The index keeps ids as strings; cast for the comparison against
        # the metadata column.
        meta = meta_frame.loc[meta_frame['work_id'] == int(work_id)]
        title = '<no metadata>' if meta.empty else meta['title'].values[0]
        print(f"http://www.archiveofourown.org/works/{work_id}\t{title}")


In [None]:
# Print URL and title for each BPR recommendation.
lookup_meta(related=related_BPR)

In [None]:
def storeData(model, modelname, model_dir='../models/'):
    """Pickle ``model`` to ``<model_dir>/<modelname>.pkl``.

    Args:
        model: Any picklable object (a trained model, an index dict, ...).
        modelname: File name without the ``.pkl`` extension.
        model_dir: Directory to write into; defaults to ``'../models/'``.

    Raises:
        OSError: If the target file cannot be opened for writing, for
            example because ``model_dir`` does not exist.
    """
    target = os.path.join(model_dir, modelname + '.pkl')
    # Pickle needs binary mode. The context manager closes the handle even
    # when pickle.dump raises.
    with open(target, 'wb') as dbfile:
        pickle.dump(model, dbfile)

In [None]:
def loadData(modelname, model_dir='../models/'):
    """Load and return the object pickled at ``<model_dir>/<modelname>.pkl``.

    Args:
        modelname: File name without the ``.pkl`` extension.
        model_dir: Directory to read from; defaults to ``'../models/'``.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    source = os.path.join(model_dir, modelname + '.pkl')
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # come from a trusted source.
    # Pickle needs binary mode. The context manager makes sure the handle is
    # closed after reading.
    with open(source, 'rb') as dbfile:
        return pickle.load(dbfile)

In [None]:
def printRecs(related):
    """Print one archive URL per recommendation in ``related``.

    The first element of each entry is looked up in
    ``inverted_indices['work_id']`` to recover the raw work id.
    """
    urls = (
        f"http://www.archiveofourown.org/works/{inverted_indices['work_id'][hit[0]]}"
        for hit in related
    )
    for url in urls:
        print(url)

In [None]:
# Persist the trained BPR model as ../models/bpr270220.pkl.
storeData(model=modelBPR, modelname='bpr270220')

In [None]:

# Round-trip check: reload the model that this notebook pickled as
# 'bpr270220', so the cell does not depend on a file left over from an
# earlier session.
pickled_model = loadData('bpr270220')
# find related items
related_pickled = pickled_model.similar_items(work_indice, num_to_return)
printRecs(related_pickled)

In [None]:
# Persist the raw-id -> matrix-position tables as ../models/indices270220.pkl.
storeData(model=indices, modelname='indices270220')