# Stores Recommendation System

# Library Imports

In [None]:
# Suppressing warnings for cleaner output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Importing essential data handling libraries
import pandas as pd
import numpy as np

# Importing visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning utilities
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

## Importing Dataset

In [None]:
# Read the raw ratings file into `store_ratings` and keep an untouched copy
import pandas as pd

# Location of the ratings CSV
file_path = '/content/drive/MyDrive/ratings_stores.csv'

# The file is read with no header row, so the columns are labelled here
store_ratings = pd.read_csv(file_path, header=None)
store_ratings.columns = ['UserID', 'StoreID', 'Rating', 'Timestamp']

# Timestamp plays no part in the analysis, so discard it in place
del store_ratings['Timestamp']

# Independent copy of the three-column frame for later reuse
ratings_backup = store_ratings.copy()


## Exploratory Data Analysis Overview

In this section, we delve into preliminary data exploration to understand our dataset's structure and quality better. Key areas of focus include:

- **Dataset Structure**: Examining the dimensions of the data (rows and columns).
- **Data Types**: Identifying the type of data each column holds.
- **Missing Values**: Investigating the presence of any incomplete or missing data in the dataset.

After this initial analysis, we extract the following summaries:

- **Rating Distribution Analysis**: Studying how ratings are distributed across the dataset.
- **Count of Users and Stores**: Quantifying the total number of unique users and stores in our dataset.
- **Top Reviewers**: Identifying users who have provided the most ratings.

This exploratory phase is crucial for guiding further data processing and analysis steps.

### Shape

In [None]:
# Report the dimensions of the loaded ratings table

# `store_ratings` is the DataFrame created when the dataset is loaded
row_count, column_count = store_ratings.shape

# Displaying the number of rows and columns
print(f"Number of Rows: {row_count}")
print(f"Number of Columns: {column_count}")

No of rows =  7824482
No of columns =  3


### Datatypes

In [None]:
# Column dtypes and non-null counts of the loaded ratings table
store_ratings.info()

### Missing value analysis

In [None]:
# Number of missing values in each column of the loaded ratings table
store_ratings.isna().sum()

### Summary

In [None]:
# Summary statistics of the 'Rating' column (labelled when the file is loaded)
store_ratings['Rating'].describe()

### Rating distribution

In [None]:
# Bar plot of the share of ratings at each rating value

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))  # Setting the figure size for the plot

# normalize=True turns the counts into fractions of all ratings;
# `store_ratings` / 'Rating' are the names assigned when the file is loaded
store_ratings['Rating'].value_counts(normalize=True).plot(kind='bar', color='skyblue')

# Displaying the plot
plt.show()


### Number of unique users and stores

In [None]:
# Count the distinct users and stores in the loaded ratings table

# Column labels are the ones assigned when the file is loaded
unique_users = store_ratings['UserID'].nunique()
unique_stores = store_ratings['StoreID'].nunique()

# Displaying the number of unique users and stores
print(f"Number of Unique Users in the Data: {unique_users}")
print(f"Number of Unique Stores in the Data: {unique_stores}")

Number of unique USERS in Raw data = 4201696
Number of unique STORES in Raw data = 476002


### Users with the most ratings

In [None]:
# Top 10 users by number of ratings submitted
most_rated = store_ratings.groupby('UserID').size().sort_values(ascending=False)[:10]
most_rated

## Pre-Processing

In [None]:
# Keep only the ratings made by users with at least 50 ratings

# Counting how many ratings each UserID has in the loaded table
user_interaction_counts = store_ratings['UserID'].value_counts()

# Filtering to include only users with 50 or more interactions
active_users = user_interaction_counts[user_interaction_counts >= 50].index
filtered_data = store_ratings[store_ratings['UserID'].isin(active_users)]

# The notes and the rating-matrix cell refer to this filtered frame as
# `df_final`, so expose it under that name as well
df_final = filtered_data


In [None]:
# Report the size of the filtered data: rows, distinct users, distinct stores

# `df_final` is the filtered frame the notes below describe; column labels
# are the ones assigned when the ratings file is loaded
observations_final_count = len(df_final)
unique_users_final_count = df_final['UserID'].nunique()
unique_stores_final_count = df_final['StoreID'].nunique()

# Displaying the information
print(f"Number of Observations in the Final Data: {observations_final_count}")
print(f"Number of Unique Users in the Final Data: {unique_users_final_count}")
print(f"Number of Unique Stores in the Final Data: {unique_stores_final_count}")

- The dataframe **df_final contains only users who have rated 50 or more stores**
- **We will use df_final to build recommendation systems**

### Checking the density of the rating matrix

In [None]:
# Build the user x store interaction matrix from the ratings, replacing the
# NaN of unrated pairs with 0. Column labels are the ones assigned when the
# ratings file is loaded ('UserID', 'StoreID', 'Rating').
final_ratings_matrix = df_final.pivot(index='UserID', columns='StoreID', values='Rating').fillna(0)
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

# Number of non-zero entries in the interaction matrix
given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings = ', given_num_of_ratings)

# Number of possible ratings: every user paired with every store
possible_num_of_ratings = final_ratings_matrix.shape[0] * final_ratings_matrix.shape[1]
print('possible_num_of_ratings = ', possible_num_of_ratings)

# Density of ratings, expressed as a percentage
density = (given_num_of_ratings/possible_num_of_ratings)
density *= 100
print ('density: {:4.2f}%'.format(density))

final_ratings_matrix.head()

# Collaborative Filtering based Recommendation System

## User based collaborative filtering

In [None]:
# Preview the first rows of final_ratings_matrix before re-indexing
final_ratings_matrix.head()

Here, the index (user_id) has the object data type. We replace it with consecutive integers so that every user is identified by an integer index in a uniform format.

In [None]:
# Replace the index with consecutive integers 0..n_rows-1 named 'user_index'
final_ratings_matrix.index = np.arange(final_ratings_matrix.shape[0])
final_ratings_matrix.index.name = 'user_index'

# Ratings matrix, now addressed by integer user index
final_ratings_matrix.head()

### Function to find Similar users and their similarity scores

In [None]:
def find_similar_users(user_id, matrix):
    """Rank every other user by cosine similarity to ``user_id``.

    Args:
        user_id: Index label of the target user's row in ``matrix``.
        matrix: Ratings DataFrame with one row per user. Index labels are
            assumed to be unique.

    Returns:
        A ``(similar_user_ids, scores)`` tuple of two parallel lists: the
        index labels of all users except the target, most similar first,
        and each one's cosine similarity to the target.

    Raises:
        KeyError: If ``user_id`` is not a label in ``matrix.index``.
    """
    labels = matrix.index
    ratings = matrix.to_numpy()
    target_pos = labels.get_loc(user_id)

    # One call scores the target row against every row at once, instead of
    # one cosine_similarity call per user.
    scores = cosine_similarity(ratings[[target_pos]], ratings)[0]

    # Exclude the target by position. Dropping "whatever sorted first" would
    # misalign ids and scores whenever another user ties with, or by rounding
    # outranks, the target's self-similarity.
    others = np.flatnonzero(np.arange(len(labels)) != target_pos)

    # Stable sort keeps tied users in their original row order.
    ranked = others[np.argsort(-scores[others], kind="stable")]

    return labels[ranked].tolist(), scores[ranked].tolist()

#### Finding out top 10 similar users to the user index 3 and their similarity score

In [None]:
# IDs of the ten users most similar to user index 3
similar = find_similar_users(3, final_ratings_matrix)[0][:10]
similar

In [None]:
# Similarity scores of the ten users most similar to user index 3
find_similar_users(3, final_ratings_matrix)[1][0:10]

#### Finding out top 10 similar users to the user index 1289 and their similarity score

In [None]:
# IDs of the ten users most similar to user index 1289
similar = find_similar_users(1289, final_ratings_matrix)[0][:10]
similar

In [None]:
# Similarity scores of the ten users most similar to user index 1289
find_similar_users(1289, final_ratings_matrix)[1][:10]

### Function to recommend products

In [None]:
def get_recommendations(user_id, number_of_items, user_item_matrix):
    """Recommend items rated by the most similar users but not by ``user_id``.

    Similar users are visited from most to least similar. Each contributes
    the items they rated (value > 0) that the target user has not rated and
    that were not already collected, until enough items are gathered.

    Args:
        user_id: Index label of the target user's row in ``user_item_matrix``.
        number_of_items: Maximum number of items to return.
        user_item_matrix: Ratings DataFrame with one row per user and one
            column per item; a value > 0 means the user rated the item.

    Returns:
        A list of at most ``number_of_items`` column labels. Items from one
        similar user appear in the matrix's column order, so the result is
        reproducible from run to run.
    """
    similar_users_list = find_similar_users(user_id, user_item_matrix)[0]
    item_labels = user_item_matrix.columns

    # Items the target user already rated must never be recommended
    seen_items = set(item_labels[user_item_matrix.loc[user_id] > 0])
    recommended_items = []

    for user in similar_users_list:
        if len(recommended_items) >= number_of_items:
            break

        # Walk the neighbour's rated items in column order; extending from a
        # set would make the order, and so the final cut, vary between runs.
        for item in item_labels[user_item_matrix.loc[user] > 0]:
            if item not in seen_items:
                seen_items.add(item)
                recommended_items.append(item)

    # The last neighbour may have overshot the requested count
    return recommended_items[:number_of_items]

#### Recommend 5 products to user index 3 based on similarity based collaborative filtering

In [None]:
# Five recommendations for user index 3
get_recommendations(3, 5, final_ratings_matrix)

['B001TAAVP4', 'B0016E5X5Q', 'B0054U6CEE', 'B00006IW1X', 'B000HWVOFQ']

#### Recommend 5 products to user index 1289 based on similarity based collaborative filtering

In [None]:
# Five recommendations for user index 1289
get_recommendations(1289, 5, final_ratings_matrix)

['B007X3VEUW', 'B005TDWUII', 'B0040XQ7PK', 'B009O7XGCY', 'B00A7PPLP2']