<h3>Importing</h3>

Most of the code was adapted using implict documentation https://benfred.github.io/implicit/ and their Github Repository https://github.com/benfred/implicit/tree/main/implicit/datasets

In [1]:
import implicit
from implicit.datasets.lastfm import get_lastfm
from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares

import pandas
import random
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix

<h3>Data Processing</h3>
<b>artist_user_plays</b> is a sparse matrix. </br>
A Sparse matrix is a datastructure that contains coordinates of 2D list/array (list inside of lists) to non-zero values.</br>
Example: if, 2D list = [[1 0 0 0 0 0][0 0 2 0 0 1][0 0 0 2 0 0]], then sparse matrix =   (0, 0)	1, (1, 2)	2, (1, 5)	1, (2, 3) 2</br>
Explanation taken from https://www.educative.io/answers/sparse-matrices-in-python.</br>
</br>
<b>artists and users</b></br>
are arrays of string labels for each row and column in the sparse matrix.

In [2]:
#artists, users, artist_user_plays = get_lastfm()

In [3]:
def load_data(path, product_column_name, quantity_column_name):
    file = pandas.read_csv(path + ".csv")
    #print(file[product_column_name], file[bought_column_name])
    fake_users = []
    for i in range(10):
        fake_users.append("User " + str(i))
    
    users = []
    items = []
    sizes = []
    for i in range(200):
        
        user = random.choice(fake_users)
        item = random.choice(file[product_column_name])
        size = random.choice(file[quantity_column_name])
        
        amount = ''
        for c in size:
            if(c != ',' and c != '.'):
                amount += c
        
        users.append(user)
        items.append(item)
        sizes.append(random.randint(1, int(amount)))
    
    df = pandas.DataFrame({'user': users, 'item': items, 'plays': sizes})
    df["user"] = df["user"].astype("category")
    df["item"] = df["item"].astype("category")
    
    data = df
    
    plays = coo_matrix(
        (
            data["plays"].astype(np.float32),
            (data["user"].cat.codes.copy(), data["item"].cat.codes.copy()),
        )
    ).tocsr()
    
    return np.array(data["item"][:]), np.array(data["user"][:]), plays
    

In [4]:
class User:
    
    def __init__(self):
        self.items = {}
    
    def purchase(self, item):
        if item in self.items:
            self.items[item] += 1
        else:
            self.items[item] = 0
    
    def get_lists(self):
        items = []
        sizes = []
        for key in self.items:
            items.append(key)
            sizes.append(self.items[key])
        return items, sizes
            

In [5]:
def read_file(filename):
    dataframe = pandas.read_excel(filename + '.xlsx')
    return dataframe

def load_synth_data(path_training_data):
    
    df = read_file(path_training_data)
    users = {}
    for index, line in df.iterrows():
        c_id = line[' customer_id']
        product = line['product_id']
        
        if c_id in users:
            users[c_id].purchase(product)
        else:
            user = User()
            user.purchase(product)
            users[c_id] = user
    
    users1 = []
    items1 = []
    sizes1 = []
    
    for c_id in users:
        items, sizes = users[c_id].get_lists()
        items1 += items
        sizes1 += sizes
        for i in range(len(sizes)):
            users1.append(c_id)
    
    df = pandas.DataFrame({'user': users1, 'item': items1, 'plays': sizes1})
    df["user"] = df["user"].astype("category")
    df["item"] = df["item"].astype("category")
    
    data = df
    
    plays = coo_matrix(
        (
            data["plays"].astype(np.float32),
            (data["user"].cat.codes.copy(), data["item"].cat.codes.copy()),
        )
    ).tocsr()
    
    return np.array(data["item"][:]), np.array(data["user"][:]), plays

def test(path_training_data, path_testing_data, num_recommendations):
    
    products, users, products_user_purchased = load_synth_data(path_training_data)
    products_purchased = products_user_purchased.T.tocsr()
    model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
    model.fit(products_purchased)
    
    df = read_file(path_testing_data)
    testing_data = {}
    
    for index, line in df.iterrows():
        user_id = line[' customer_id']
        product = line['product_id']
        
        if user_id in testing_data:
            testing_data[user_id].append(product)
        else:
            testing_data[user_id] = [product]
    
    correct = 0
    for user_id in testing_data:
        try:
            ids, scores = model.recommend(user_id, products_purchased[user_id], N=num_recommendations, filter_already_liked_items=False)
        except:
            continue
        for recommendation in products[ids]:
            if recommendation in testing_data[user_id]:
                correct += 1
    
    return correct

def test_random(interaction_data, path_testing_data, num_recommendations):
    
    interaction = read_file(interaction_data)
    df = read_file(path_testing_data)
    products = []
    testing_data = {}
    for index, line in interaction.iterrows():
        product = line['product_id']
        products.append(product)
        
    for index, line in df.iterrows():
        
        user_id = line[' customer_id']
        product = line['product_id']
        
        if user_id in testing_data:
            testing_data[user_id].append(product)
        else:
            testing_data[user_id] = [product]
    correct = 0
    for user in testing_data:
        recommendations = []
        for i in range(num_recommendations):
            rand_product = random.choice(products)
            if rand_product in testing_data[user]:
                correct += 1
    
    return correct
        
        
        

In [6]:
artists, users, artist_user_plays = load_synth_data("interaction")

In [7]:
#artists, users, artist_user_plays = load_data("topyoutube", 'Artist', 'Total Views')

In [8]:
user_plays = artist_user_plays.T.tocsr()

In [9]:
model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
model.fit(user_plays)

  0%|          | 0/15 [00:00<?, ?it/s]

In [10]:
# Get recommendations for the a single user
userid = 2
ids, scores = model.recommend(userid, user_plays[userid], N=10, filter_already_liked_items=False)

In [11]:
# Use pandas to display the output in a table, pandas isn't a dependency of implicit otherwise
import numpy as np
import pandas as pd
print(userid)
pd.DataFrame({"product_id": artists[ids], "score": scores, "already_liked": np.in1d(ids, user_plays[userid].indices)})

2


Unnamed: 0,product_id,score,already_liked
0,23,1.661757e-10,True
1,176,1.610143e-11,False
2,26,1.244384e-11,False
3,51,8.623748e-12,False
4,66,6.472392e-12,False
5,112,3.654008e-12,False
6,176,1.445087e-12,False
7,179,1.477453e-19,False
8,181,9.829877e-20,False
9,62,9.115072e-20,False


In [12]:
# get related items for the beatles (itemid = 25512)
ids, scores= model.similar_items(5)

# display the results using pandas for nicer formatting
pd.DataFrame({"artist": artists[ids], "score": scores})

Unnamed: 0,artist,score
0,40,1.0
1,130,0.733506
2,203,0.733478
3,197,0.725888
4,191,0.724396
5,174,0.722207
6,1,0.721743
7,107,0.721673
8,104,0.718919
9,115,0.717818


In [13]:
t = "test_data"
train = "train_data"

In [18]:
print("Correct: ", test(train, t, 10))

  0%|          | 0/15 [00:00<?, ?it/s]

Correct:  18


In [19]:
print("Correct: ", test_random(train, t, 10))

Correct:  11
