In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw product-ratings table; the relative path means data.csv must sit
# in the notebook's working directory.
data_file = pd.read_csv('data.csv')
# Preview the first rows of the raw table.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index that
# was written into the CSV when it was saved - confirm against the file.
data_file.head(20)

Unnamed: 0,Product ID,Title,Category,Brand,Price,Rating,User ID
0,101,Laptop,Electronics,HP,800,4.5,123
1,101,Laptop,Electronics,HP,800,4.7,567
2,101,Laptop,Electronics,HP,800,4.2,890
3,102,Smartphone,Electronics,Samsung,600,4.8,456
4,102,Smartphone,Electronics,Samsung,600,4.6,678
5,102,Smartphone,Electronics,Samsung,600,4.3,432
6,103,Running Shoes,Sports,Nike,100,4.2,789
7,103,Running Shoes,Sports,Nike,100,4.5,987
8,104,Headphones,Electronics,Sony,50,4.0,123
9,104,Headphones,Electronics,Sony,50,4.3,675


In [3]:
def preprocessing(initial_data):
    """Clean the raw ratings table before it is used for recommendations.

    Steps, in order:
      1. drop rows with a missing value in any of 'Rating', 'User ID',
         'Product ID', 'Title', 'Category' or 'Price';
      2. drop duplicate rows, keeping the first occurrence;
      3. keep only rows whose 'Rating' lies in the inclusive range 1..5.

    Parameters
    ----------
    initial_data : pandas.DataFrame
        Raw table containing at least the columns listed in step 1.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with all original columns and row labels.
        ``initial_data`` itself is left unmodified.
    """
    required_columns = ['Rating', 'User ID', 'Product ID', 'Title', 'Category', 'Price']

    # Handle missing values
    filtered_data = initial_data.dropna(subset=required_columns)

    # Remove duplicates.
    # pandas names a header-less CSV column "Unnamed: <n>" (typically a saved row
    # index). Such a column is unique per row, so including it in the comparison
    # would mean no two rows are ever considered duplicates. Compare on the real
    # content columns only; the column itself is still kept in the result.
    content_columns = [
        column for column in filtered_data.columns
        if not str(column).startswith('Unnamed:')
    ]
    filtered_data = filtered_data.drop_duplicates(subset=content_columns)

    # Filter out ratings outside the range of 1 to 5
    rating_in_range = filtered_data['Rating'].between(1, 5)
    filtered_data = filtered_data[rating_in_range]

    # Return the preprocessed DataFrame
    return filtered_data


In [4]:
def newUser(data, top_n=10):
    """Print the best-rated products of every category.

    Used when there is no rating history to personalise on: products are
    ranked inside each category by their mean 'Rating' and the first
    ``top_n`` of each category are printed together with their
    'Title', 'Price' and 'Brand'.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table with the columns 'Category', 'Product ID', 'Rating',
        'Title', 'Price' and 'Brand'.
    top_n : int, optional
        Maximum number of products printed per category (default 10).

    Returns
    -------
    None
        The result is written to standard output only.
    """
    # Mean rating of every product, keyed by (Category, Product ID)
    popular_products = (
        data.groupby(['Category', 'Product ID'])['Rating']
        .mean()
        .reset_index()
    )

    # Best-rated products first inside each category
    popular_products = popular_products.sort_values(
        ['Category', 'Rating'], ascending=[True, False]
    )

    # One descriptive row per product; the first row seen for a Product ID wins
    selected_columns = data[['Product ID', 'Title', 'Price', 'Brand']].drop_duplicates(subset='Product ID')

    # The default inner merge keeps the order of the left frame, so the ranking survives
    popular_products = pd.merge(popular_products, selected_columns, on='Product ID')

    # Keep only the leading rows of each category
    top_products = popular_products.groupby('Category').head(top_n)

    # Print the top items of each category
    for category, group in top_products.groupby('Category'):
        print(f"Top items in {category}:")
        print(group.drop('Category', axis=1))
        print()


In [5]:
def extract_user_purchases(user_id, data):
    """Return the product IDs found on the rows that belong to one user.

    Parameters
    ----------
    user_id
        Value compared against the 'User ID' column.
    data : pandas.DataFrame
        Ratings table with 'User ID' and 'Product ID' columns.

    Returns
    -------
    list
        'Product ID' values of the matching rows, in row order. A product
        appears once per matching row, so repeats are possible; the list is
        empty when the user has no rows.
    """
    # Boolean mask selecting this user's rows
    belongs_to_user = data['User ID'] == user_id

    # Pull the product column for those rows straight into a plain list
    return data.loc[belongs_to_user, 'Product ID'].tolist()

In [6]:
def sim_matrix(data):
    """Build the item-item similarity matrix from the ratings table.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table with 'User ID', 'Product ID' and 'Rating' columns.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by 'Product ID'. Each cell is the
        Pearson correlation between two products' rating columns, with
        negative correlations replaced by 0. Cells whose correlation is
        undefined stay NaN.
    """
    # Generate the user-item matrix: one row per user, one column per product.
    # Repeated (user, product) pairs are combined by pivot_table's default
    # aggregation (mean); missing pairs are filled with 0.
    # NOTE(review): filling with 0 makes "not rated" count as a rating of 0 in
    # the correlation - confirm that this is the intended modelling choice.
    user_item_matrix = data.pivot_table(
        index='User ID', columns='Product ID', values='Rating', fill_value=0
    )

    # Calculate the similarity matrix (column-wise Pearson correlation)
    similarity_matrix = user_item_matrix.corr(method='pearson')

    # Negative correlations carry no recommendation signal here, so floor them
    # at 0. clip() is vectorised, keeps NaN as NaN, and avoids the deprecated
    # element-wise DataFrame.applymap.
    return similarity_matrix.clip(lower=0)


In [7]:
def existingUser(data, user_id, top_n=5):
    """Print recommendations for a user who already has rating history.

    Every product that is positively correlated (per ``sim_matrix``) with
    something the user has already rated becomes a candidate; the user's own
    products are then removed. If no candidate is left the function falls
    back to ``newUser(data)``. Otherwise, for each category, the ``top_n``
    candidates with the highest mean rating are printed with their 'Title',
    'Brand' and 'Price'.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table with the columns 'User ID', 'Product ID', 'Title',
        'Brand', 'Price', 'Category' and 'Rating'.
    user_id
        Value looked up in the 'User ID' column.
    top_n : int, optional
        Maximum number of products selected per category (default 5).

    Returns
    -------
    None
        The result is written to standard output only.
    """
    product_ids = extract_user_purchases(user_id, data)
    similarity_matrix = sim_matrix(data)

    # Collect every product positively correlated with one the user rated.
    # A product rated several times appears several times in product_ids, so
    # iterate over the distinct IDs only.
    recommendations = set()
    for product_id in set(product_ids):
        correlated_row = similarity_matrix[product_id]
        recommendations.update(correlated_row.index[correlated_row > 0])

    # Remove the products the user already has. difference_update tolerates IDs
    # that are absent from the set, whereas set.remove raised KeyError when an
    # ID was repeated in product_ids or when a product's self-correlation was
    # NaN and it therefore never entered the set.
    recommendations.difference_update(product_ids)

    if not recommendations:
        # Nothing to personalise on: show the per-category popularity list
        newUser(data)
        return

    # Get other information about the recommended products
    filtered_data = data[data['Product ID'].isin(recommendations)][
        ['Product ID', 'Title', 'Brand', 'Price', 'Category', 'Rating']
    ]

    # Report the candidates category by category
    for category, group in filtered_data.groupby('Category'):
        # Mean rating of each candidate product in this category
        product_ratings_mean = group.groupby('Product ID')['Rating'].mean()

        # Highest mean rating first, then keep the leading top_n products
        top_products = product_ratings_mean.sort_values(ascending=False).head(top_n)

        # Details of the selected products. Rows are shown in their original
        # row order (not rating order), one row per distinct
        # Title/Brand/Price combination.
        top_products_details = group[group['Product ID'].isin(top_products.index)][
            ['Title', 'Brand', 'Price', 'Category']
        ]
        top_products_details = top_products_details.drop_duplicates()

        # Display the results
        print(f"Category: {category}")
        print(top_products_details.drop('Category', axis=1))
        print()
            

In [8]:
# Clean the raw ratings before producing any recommendations
data = preprocessing(data_file)

# Hard-coded demo user; replace with int(input("Enter your user ID: ")) for interactive use
user_id = 123

# Users with rating history get personalised suggestions;
# everyone else gets the per-category popularity list.
is_known_user = (data['User ID'] == user_id).any()
if not is_known_user:
    newUser(data)
else:
    existingUser(data, user_id)

Category: Appliances
             Title   Brand  Price
32  Vacuum Cleaner   Dyson    400
58  Vacuum Cleaner  Roomba    350

Category: Electronics
         Title      Brand  Price
3   Smartphone    Samsung    600
25     Monitor       Dell    300
33      Laptop       Dell    700
43      Tablet    Samsung    400
46    Keyboard  Microsoft     60

Category: Kitchen
           Title       Brand  Price
24       Blender       Ninja     70
42  Coffee Maker      Keurig    100
52       Blender    Blendtec    300
64  Coffee Maker  Mr. Coffee     50

Category: Sports
            Title        Brand  Price
27  Running Shoes       Reebok     80
54  Running Shoes  New Balance     70

