In [109]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Read data into a dataframe and 
# df = pd.read_csv('ratings.csv', dtype={"vote": str})
# counts = df.groupby('reviewerID')['reviewerID'].size()
# counts = counts[(counts >=5) & (counts <= 10)]
# densityPlot = sns.kdeplot(counts, bw_adjust=3)
# yLabels = densityPlot.get_yticks()
# densityPlot.set_yticklabels('{:,.0%}'.format(y) for y in yLabels)
# plt.xlabel("Number of reviews")
# plt.ylabel("Percentage of users")
# plt.savefig(fname="densityPlot")
# plt.show()

In [110]:
# import pandas as pd

# # Read ratings into a dataframe and narrow down reviewers to desired review count
# ratings = pd.read_csv('ratings.csv', dtype={"vote": str})
# counts = ratings.groupby('reviewerID')['reviewerID'].size()
# counts = counts[(counts >=5) & (counts <= 10)]

# # Create and apply mask that removes all other reviewers.
# mask = ratings['reviewerID'].isin(counts.index)
# ratings = ratings[mask]

# # Read metadata into dataframe and define asins that are used by making a set
# metadata = pd.read_csv("metadata.csv")
# used_asins = set(ratings["asin"])

# # Create and apply mask that removes unreferenced items
# mask = metadata["asin"].isin(used_asins)
# metadata = metadata[mask]
# metadata.to_pickle("metadata.pickle")
# ratings.to_pickle("ratings.pickle")


In [111]:
# import pandas as pd
# import numpy as np
# from scipy.sparse import csr_matrix, save_npz
# ratings : pd.DataFrame = pd.read_pickle("ratings.pickle")
# metadata : pd.DataFrame = pd.read_pickle("metadata.pickle")
# ratings = ratings[["reviewerID", "asin", "overall"]]

# # Series of each unique value of reviewerID and asin 
# reviewerIDs = ratings['reviewerID'].unique()
# asins = ratings["asin"].unique()
# # Dictionary that maps reviewers to an index
# reviewerMap = {reviewerID: i for i, reviewerID in enumerate(reviewerIDs)}

# # Dictionary that maps asins to an index
# asinMap = {asin: i for i, asin in enumerate(asins)}

# # Initialize the utility matrix with zeros
# utilityMatrixArray = np.zeros((len(reviewerIDs), len(asins)), "int8")

# # Fill in the utility matrix with data from the records array
# for row in ratings.itertuples():
#     reviewerIndex = reviewerMap[row.reviewerID]
#     asinIndex = asinMap[row.asin]
#     utilityMatrixArray[reviewerIndex, asinIndex] = row.overall
# utilityMatrix = csr_matrix(utilityMatrixArray)
# save_npz("utilityMatrix.npz", utilityMatrix)

In [112]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.neighbors import NearestNeighbors

# Get all data needed
ratings : pd.DataFrame = pd.read_pickle("ratings.pickle")
metadata : pd.DataFrame = pd.read_pickle("metadata.pickle")
utilityMatrix = load_npz("utilityMatrix.npz")

asins = ratings["asin"].unique()
asinMap = {asin: i for i, asin in enumerate(asins)}

# Creation of test data
hoodies = metadata[metadata['category'].apply(lambda c: "Hoodie" in c)]
hoodieAsins = hoodies["asin"]
testAsins = hoodieAsins[5:9]
testIndices = testAsins.index.to_list()

# Create a NearestNeighbors model with cosine similarity
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)

# Fit the model to the item vectors
model.fit(utilityMatrix.T)

# Get the nearest neighbors of test items
_, indices = model.kneighbors(utilityMatrix.T[testIndices], n_neighbors=20)

# The asinDict contains a pair of an asin and its predictions in a set.
asinDict = {}

# Iterate through the test items
for i, idx in enumerate(testIndices):
    neighborIndices = indices[i][1:]

    # Get the current asin
    currentAsin = testAsins[idx] 
    predictionsList = []

    # Iterate through the nearest neighbors and add their asins to predictionsList
    for neighborIndex in neighborIndices:
        predictedAsin = metadata.iloc[neighborIndex]['asin']
        predictionsList.append(predictedAsin)
    asinDict[currentAsin] = predictionsList

# Assessing accuracy based on percentage of hoodies in recommmendations.
for asin in testAsins:
    asinPredictions = asinDict[asin]
    counter = 0
    for predictedAsin in asinPredictions:
        if predictedAsin in hoodieAsins:
            counter += 1
        correctness = (counter / len(predictionsList)) * 100
    print(f"The predictions for {asin} have {counter} / {len(predictionsList)} predictions in the hoodie category.\nThat is a {correctness}% correctness")

The predictions for B0008IVI3M have 0 / 19 predictions in the hoodie category.
That is a 0.0% correctness
The predictions for B0008MF4QK have 0 / 19 predictions in the hoodie category.
That is a 0.0% correctness
The predictions for B0009G9QWI have 0 / 19 predictions in the hoodie category.
That is a 0.0% correctness
The predictions for B000B6A5HG have 0 / 19 predictions in the hoodie category.
That is a 0.0% correctness
