In [2]:
import nbformat

# Load the Jupyter notebook file
file_path = 'D:/Work/Gre/UTD/Courses/Winter/Projects/Recommendation Systems/Kaggle/Recommendation_model_ecommerce.ipynb'
with open(file_path, 'r') as file:
    notebook = nbformat.read(file, as_version=4)

# Extract the code cells and their outputs
code_cells = [cell for cell in notebook.cells if cell.cell_type == 'code']
code_cells

[{'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ''}]

## Import Libraries and Load Data
#### The code imports necessary libraries such as pandas for data manipulation, numpy for numerical operations, and scikit-learn modules for machine learning.

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

## Load Data
#### The code loads the dataset from a CSV file and drops rows with missing values in the product_name column.

In [10]:
data = pd.read_csv('D:/Work/Gre/UTD/Courses/Winter/Projects/Recommendation Systems/Kaggle/ratings_Beauty.csv')
data = data.dropna(subset=['ProductId'])
data.reset_index(drop=True, inplace=True)

## Text Vectorization
#### Converts the product_name text data into a matrix of TF-IDF features.

In [11]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['ProductId'])

## Clustering
#### Initializes and fits a K-Means clustering model with true_k clusters to the TF-IDF feature matrix.

In [12]:
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

## Defining a Clustering Function
#### A function to print the top terms in each cluster.

In [13]:
def print_cluster(i):
    print("Cluster %d:" % i)
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

## Determine Optimal Clusters and Display Top Terms
#### Fits the K-Means model with 'true_k' clusters and prints the top terms for each cluster.

In [14]:
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print_cluster(i)

Top terms per cluster:
Cluster 0:
 b001ma0qy2
 b0009v1yr8
 b0043oyfku
 b0000yuxi0
 b003v265qw
 b000zmbspe
 b003bq6qxk
 b004ohqr1q
 b00121uvu0
 b000fs05vg
Cluster 1:
 b001dkk7y0
 b00lu0ltou
 b002rcxs1c
 b002rcxl3m
 b002rcxjei
 b002rcxj0c
 b002rcw6qa
 b002rcqj2w
 b002rcqifu
 b002rcoyvk
Cluster 2:
 b0006b66d8
 b00lu0ltou
 b002rcz6p8
 b002rcxs1c
 b002rcxr9u
 b002rcxl3m
 b002rcxjei
 b002rcxj0c
 b002rcw6qa
 b002rcqj2w
Cluster 3:
 b0009oai8g
 b00lu0ltou
 b002rcz6p8
 b002rcxs1c
 b002rcxr9u
 b002rcxl3m
 b002rcxjei
 b002rcxj0c
 b002rcw6qa
 b002rcqj2w
Cluster 4:
 b0002z8hai
 b00lu0ltou
 b002rch4zi
 b002rcxs1c
 b002rcxr9u
 b002rcxl3m
 b002rcxjei
 b002rcxj0c
 b002rcw6qa
 b002rcqj2w
Cluster 5:
 b00ae078tm
 b00lu0ltou
 b002rcf6mq
 b002rcxjei
 b002rcxj0c
 b002rcw6qa
 b002rcqj2w
 b002rcqifu
 b002rcoyvk
 b002rcoh7g
Cluster 6:
 b00d6edgye
 b00lu0ltou
 b002rcf6mq
 b002rcxl3m
 b002rcxjei
 b002rcxj0c
 b002rcw6qa
 b002rcqj2w
 b002rcqifu
 b002rcoyvk
Cluster 7:
 b0038baxjg
 b00lu0ltou
 b002rcz6i0
 b002rcxr9u
 

## Defining Recommendation Function
####  A function that takes a product name, predicts its cluster, and prints the top terms of that cluster.

In [15]:
def show_recommendations(product):
    Y = vectorizer.transform([product])
    prediction = model.predict(Y)
    print_cluster(prediction[0])

## Testing Recommendations
#### Calls show_recommendations with different product names to see the cluster and its top terms.

In [18]:
show_recommendations("cutting tool")
show_recommendations("spray paint")
show_recommendations("steel drill")
show_recommendations("water")

Cluster 0:
 b001ma0qy2
 b0009v1yr8
 b0043oyfku
 b0000yuxi0
 b003v265qw
 b000zmbspe
 b003bq6qxk
 b004ohqr1q
 b00121uvu0
 b000fs05vg
Cluster 0:
 b001ma0qy2
 b0009v1yr8
 b0043oyfku
 b0000yuxi0
 b003v265qw
 b000zmbspe
 b003bq6qxk
 b004ohqr1q
 b00121uvu0
 b000fs05vg
Cluster 0:
 b001ma0qy2
 b0009v1yr8
 b0043oyfku
 b0000yuxi0
 b003v265qw
 b000zmbspe
 b003bq6qxk
 b004ohqr1q
 b00121uvu0
 b000fs05vg
Cluster 0:
 b001ma0qy2
 b0009v1yr8
 b0043oyfku
 b0000yuxi0
 b003v265qw
 b000zmbspe
 b003bq6qxk
 b004ohqr1q
 b00121uvu0
 b000fs05vg


# Conclusion

### Explaination of Key Steps:

#### Data Loading and Preprocessing:

***Load the dataset and clean it by removing missing value***

#### Vectorization:

***Convert the product names into numerical features using TF-IDF vectorization.***

#### Clustering:

***Use K-Means clustering to group similar products based on their names.***

#### Cluster Analysis:

***Define functions to print the clusters and recommend products based on their clusters.***

#### Recommendations Systems

***Given a product name, predict which cluster it belongs to and display the top terms of that cluster to give similar product recommendations***