In [63]:
import os
import time
import re
import heapq
import pandas as pd
import numpy as np
from joblib import dump, load # to store matrix
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import NormalPredictor, SVD, AlgoBase, PredictionImpossible

# from recommender_metrics import RecommenderMetrics
# from evaluator import Evaluator
from fuzzywuzzy import process

In [34]:
from utility.recommender_metrics import RecommenderMetrics
from utility.evaluation_data import EvaluationData
from utility.evaluated_algorithm import EvaluatedAlgorithm
from utility.evaluator import Evaluator

### Loading Dataset

In [55]:
# file paths for saving/loading 
lsa_matrix_file = 'lsa_matrix.joblib'
product_data_file = '../newData/flipkart_cleaned.csv'
purchase_history_file = '../newData/synthetic_v2.csv'

### content based rec sys aims to recommend items based on similarity between items
df_products = pd.read_csv(product_data_file)
df_purchase = pd.read_csv(purchase_history_file)

# merge df_purchase with df_products on Product ID
df = pd.merge(df_purchase, df_products, left_on='Product ID', right_on='uniq_id')

print("df_products shape", df_products.shape)
print("df_purchase shape", df_purchase.shape)
print("merged df shape", df.shape)
print("df.head", df.info())


df_products shape (19906, 12)
df_purchase shape (124968, 14)
merged df shape (124956, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124956 entries, 0 to 124955
Data columns (total 26 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Order ID                     124956 non-null  object 
 1   Product ID                   124956 non-null  object 
 2   Product Quantity             124956 non-null  int64  
 3   Product Price Each           124956 non-null  float64
 4   Order Total                  124956 non-null  float64
 5   Order Date                   124956 non-null  object 
 6   Purchase Address             124956 non-null  object 
 7   User rating for the product  124956 non-null  float64
 8   User ID                      124956 non-null  object 
 9   User Age                     124956 non-null  int64  
 10  User Occupation              124956 non-null  object 
 11  User Income                

### Configuring Reader

In [56]:
# keep users with >= ratings
# Keep users with at least N ratings
min_user_ratings = 10  # threshold
user_counts = df['User ID'].value_counts()
valid_users = user_counts[user_counts >= min_user_ratings].index
df = df[df['User ID'].isin(valid_users)]
print('valid_users',len(valid_users))

# Keep items with at least N ratings
min_item_ratings = 5  # example threshold
item_counts = df['Product ID'].value_counts()
valid_items = item_counts[item_counts >= min_item_ratings].index
df = df[df['Product ID'].isin(valid_items)]
print('valid_items',len(valid_items))

valid_users 6597
valid_items 11904


In [29]:
# randomly sample 10,000 rows from df
sampled_df = df.sample(n=10000, random_state=42)  

# check for users with no ratings
user_counts = sampled_df['User ID'].value_counts()
print(user_counts[user_counts == 0])

# Check for items with no ratings
item_counts = sampled_df['Product ID'].value_counts()
print(item_counts[item_counts == 0])

Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)


In [50]:
### Configuring Reader
reader = Reader(rating_scale=(0,5))
evaluation_data = Dataset.load_from_df(df[['User ID', 'Product ID', 'User rating for the product']], reader)

evaluation_data


<surprise.dataset.DatasetAutoFolds at 0x375076d90>

In [51]:
# popularity rankings calculations
average_ratings = df.groupby('Product ID')['User rating for the product'].mean()

df['average_ratings'] = df['Product ID'].map(average_ratings)

In [52]:
# evaluation_data = df_purchase
# products_data = df_products
popularity_rankings = df[['Product ID', 'average_ratings']]

# Remove duplicate rows based on 'Product ID'
popularity_rankings = popularity_rankings.drop_duplicates(subset='Product ID')

print(popularity_rankings)

                             Product ID  average_ratings
159    17087d2ddd19e19929c2ef485dd8c8e7         3.060000
230    446876ebd53141d997ef790e6dd42d67         3.214286
335    ed2b9ac3bd209d106366832fcea6f520         1.860000
369    63f2dd3d90ff0c352b2a133c5ecaefcd         3.600000
370    fb187233117b2eae554f47d2745fa954         2.440000
...                                 ...              ...
67546  31604a5b5c9b3399031b238a192996d6         2.560000
68480  52de97eb4157cbe2dc1e9bd650b5348f         2.833333
69670  1e36eb37f6c83aca2a2aab1ab0d90cd8         2.614286
69981  0ae6a105ab4768e5da39dbe2399de9f1         1.540000
77872  72fc51e3949799c0c694534325ad2cac         3.160000

[389 rows x 2 columns]


### Combining product features to form content for each product

In [53]:
df['content'] = df['product_name'].astype(str) + ' ' + df['product_category_tree'].astype(str) + ' ' + df['description'].astype(str) + ' ' + df['brand'].astype(str)

### Prepare evaluator
Use evaluation data to test algorithms. Evaluator works with:
- EvaluationData
- EvaluatedAlgorithm

In [54]:
evaluator = Evaluator(evaluation_data, popularity_rankings)

Number of full trainset users: 1048
Number of full trainset items: 389
Number of trainset users: 898
Number of trainset items: 389
Size of testset: 513
Estimating biases using als...
Computing the cosine similarity matrix...


ZeroDivisionError: float division

# Trying Filler Algo to see if functions work

In [61]:
class MyOwnAlgorithm(AlgoBase):
    def __init__(self):

        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def estimate(self, u, i):

        return 3

In [62]:
algo = MyOwnAlgorithm()
cross_validate(algo, evaluation_data, verbose=True)

Evaluating RMSE, MAE of algorithm MyOwnAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4345  1.4803  1.4599  1.4603  1.4505  1.4571  0.0149  
MAE (testset)     1.1927  1.2453  1.2100  1.2449  1.2305  1.2247  0.0205  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.43445608, 1.48033336, 1.45994487, 1.46028731, 1.45047511]),
 'test_mae': array([1.19270073, 1.24525547, 1.21      , 1.24487805, 1.2304878 ]),
 'fit_time': (9.059906005859375e-06,
  9.799003601074219e-05,
  9.298324584960938e-05,
  0.00011420249938964844,
  9.489059448242188e-05),
 'test_time': (0.0019421577453613281,
  0.0003390312194824219,
  0.0004930496215820312,
  0.000621795654296875,
  0.0003409385681152344)}

# CBF

In [64]:
class ContentBasedRecSysKNN(AlgoBase):

    def __init__(self, k=40, movie_data=None, sim_options={}):
        AlgoBase.__init__(self)
        self.k = k
        self.movie_data = movie_data

    def fit(self, trainset):
        """
        Fit function - calculate genre similarities
        Args
            trainset: the training set
        Returns
            self
        """
        AlgoBase.fit(self, trainset)
        
        self.compute_genre_similarities()
    
        return self
    
    def estimate(self, u, i):
        """
        Estimate function: evaluate the ratings for an user & item
        Process is:
            - gather the ratings for the user u
            - from the similarities (on items) between item i and the items
            from the ratings gathered (neighboring items)
            - order these neighbors 
            - calculate the average simmilarities score weighted by user ratings
        Args
            u: current user
            i: current item
        Returns
            predicted ratings
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')
        
        # Build up similarity scores between this item and everything the user rated
        neighbors = []
        for rating in self.trainset.ur[u]:
            genre_similarity = self.genre_similarities[i,rating[0]]
            neighbors.append( (genre_similarity, rating[1]) )
        
        # Extract the top-K most-similar ratings (k is set at init)
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
        
        # Compute average sim score of K neighbors weighted by user ratings
        sim_total = weighted_sum = 0
        for (sim_score, rating) in k_neighbors:
            if (sim_score > 0):
                sim_total += sim_score
                weighted_sum += sim_score * rating
            
        if (sim_total == 0):
            raise PredictionImpossible('No neighbors')

        predicted_rating = weighted_sum / sim_total

        return predicted_rating
    
    def compute_genre_similarities(self):
        # compute similarities
        genre_columns = self.movie_data.columns[6:-1]
        genres = self.movie_data[genre_columns]
        # calculate cosine similarity using linear_kernel from sklearn
        self.genre_similarities = linear_kernel(genres, genres)
        return self

In [65]:
algo = ContentBasedRecSysKNN()
cross_validate(algo, evaluation_data, verbose=True)

AttributeError: 'NoneType' object has no attribute 'columns'

### Building User Profile
Aggregate features of the products a user has interacted with, based on their purchase history and available ratings.

In [None]:
# Combining product features to form content for each product
df['product_content'] = df['product_name'].astype(str) + ' ' + df['product_category_tree'].astype(str) + ' ' + df['description'].astype(str) + ' ' + df['brand'].astype(str)

# group purchases by userID and aggregate product content and raings
# user_data: purchases made by a specific user
def aggregate_user_profile(user_data):
    # weight product content by user ratings, normalize ratings
    weighted_content = []
    for index, row in user_data.iterrows():
        content = row['product_content']
        rating_weight = row['User rating for the product'] if pd.notna(row['User rating for the product']) else 0.1 # if rating not available, default to 0.1
        weighted_content.append((content, rating_weight))

    print("weighted content", weighted_content)
    
    # aggregate product content: repeat each product's content according to rating weifht
    all_content = ' '.join([content * round(weight) for content, weight in weighted_content])
    print("all_content", all_content)
    return all_content

# aggregate product contents for each user
user_profiles = df.groupby('User ID').apply(aggregate_user_profile).reset_index()
user_profiles.columns = ['User ID', 'user_profile_content']

# vectorize the aggregated user profiles using CountVectorizer or TfidfTransformer
vectorizer = CountVectorizer(max_df=0.85, stop_words='english')
user_profiles_matrix = vectorizer.fit_transform(user_profiles['user_profile_content'])

# apply TF-IDF transformation
tfidf_transformer = TfidfTransformer()
user_profiles_tfidf = tfidf_transformer.fit_transform(user_profiles_matrix)

# At this point, `user_profiles_tfidf` is a matrix where each row is a user's profile,
# represented as a vector of their aggregated product interactions.


### Checking if LSA Matrix needs to be recalculated (if there is a modification to flipkart)

In [8]:
recalculate_lsa = False

if os.path.exists(lsa_matrix_file):
    # Compare modification times
    lsa_matrix_mtime = os.path.getmtime(lsa_matrix_file)
    product_data_mtime = os.path.getmtime(product_data_file)

    if product_data_mtime > lsa_matrix_mtime:
        print("Product information database was updated, recalculating lsa_matrix...")
        recalculate_lsa = True
else:
    print("lsa_matrix does not exist, computing...")
    recalculate_lsa = True

# Check if there is a need to compute LSA matrix
if recalculate_lsa:
    print("Computing LSA matrix...")

    # Create bag of words
    vectorizer = CountVectorizer()
    bow = vectorizer.fit_transform(df['content'])

    # Convert bag of words to TF-IDF
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(bow)

    # Apply LSA 
    lsa = TruncatedSVD(n_components=100, algorithm='arpack')
    lsa.fit(tfidf) # train lsa model
    lsa_matrix = lsa.transform(tfidf) # project data onto learned components

    # Save the computed LSA matrix to file
    dump(lsa_matrix, lsa_matrix_file)
    print("LSA matrix saved to file.")
else:
    print("loading lsa_matrix from file...")
    lsa_matrix=load(lsa_matrix_file)

lsa_matrix does not exist, computing...
Computing LSA matrix...
LSA matrix saved to file.


### Getting user input to put into recommendation system

In [9]:
# Get the user input
user_product = input("Enter a product ")

In [10]:
# Start timer after user input
start_time = time.time()

# Use fuzzy matching to find the closest product name
match = process.extractOne(user_product, df['product_name'])
closest_match = match[0]
score = match[1]

print("closest match and score: ", closest_match, score)

if score < 70:
    print("No close match found")
else:
    # find the index of the closes product
    product_index = df[df['product_name'] == closest_match].index[0]

     # Compute the cosine similarities using the lsa_matrix
    similarity_scores = cosine_similarity(lsa_matrix[product_index].reshape(1, -1), lsa_matrix)

    # Get the top 10 most similar products
    similar_products = list(enumerate(similarity_scores[0]))
    sorted_similar_products = sorted(similar_products, key=lambda x: x[1], reverse=True)[1:10]

    # Print the top 10 similar products
    for i, score in sorted_similar_products:
        print("{}: {}".format(i, df.loc[i, 'product_name']))
        # print(f"Product: {df.loc[i, 'product_name']} | Price: {df.loc[i, 'retail_price']} | Rating: {df.loc[i, 'overall_rating']} | Similarity: {score}")


# End timer for the entire program
end_time = time.time()

# Print time taken
print("Time taken to find recommendations: {:.2f} seconds".format(end_time - start_time))

closest match and score:  hello dolly women's leggings 90
1772: hello dolly women's leggings
1777: hello dolly women's leggings
9343: hello dolly women's leggings
9366: hello dolly women's leggings
11367: hello dolly women's leggings
11467: hello dolly women's leggings
11604: hello dolly women's leggings
14605: hello dolly women's leggings
16236: hello dolly women's leggings
Time taken to find recommendations: 3.31 seconds
