In [1]:
import numpy as np
import pandas as pd
import zipfile

In [2]:
# NOTE(review): absolute, machine-specific path; point it at your local copy of the archive.
zip_path = "D:/LLM_Based_Recommendation_System/products.zip"

with zipfile.ZipFile(zip_path, 'r') as z:
    # List the archive members once so the reader can see which file feeds which frame.
    member_names = z.namelist()
    print(member_names)

    # Members are taken by position: the first becomes df1, the second df2.
    csv_filename1 = member_names[0]
    csv_filename2 = member_names[1]

    # Open each member in its own `with` block so the file handle is closed
    # as soon as pandas has finished reading it.
    with z.open(csv_filename1) as first_csv:
        df1 = pd.read_csv(first_csv)
    with z.open(csv_filename2) as second_csv:
        df2 = pd.read_csv(second_csv)

['amazon_categories.csv', 'amazon_products.csv']


In [3]:
# Preview the first three rows of df1 (the first CSV read from the archive).
df1.head(3)

Unnamed: 0,id,category_name
0,1,Beading & Jewelry Making
1,2,Fabric Decorating
2,3,Knitting & Crochet Supplies


In [4]:
# Preview the first three rows of df2 (the second CSV read from the archive).
df2.head(3)

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.0,104,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,209.99,104,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,429.99,104,False,300


In [5]:
# Replace the numeric category ids in df2 with the matching category names from df1.
category_names = df1.set_index('id')['category_name']

# Only map while the column still holds numeric ids. Running the lookup a second
# time would search for names in an id-keyed index and turn every value into NaN.
if pd.api.types.is_numeric_dtype(df2['category_id']):
    df2['category_id'] = df2['category_id'].map(category_names)
df2.head(3)

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.0,Suitcases,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,209.99,Suitcases,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,429.99,Suitcases,False,300


In [6]:
# Drop the image and product URL columns.
print("Columns before dropping")
print(df2.columns)
# errors='ignore' keeps this cell re-runnable: on a second run the two columns
# are already gone, and a plain drop() would raise KeyError.
df2 = df2.drop(columns=['imgUrl', 'productURL'], errors='ignore')
print("Columns afters dropping")
print(df2.columns)
df2.head(3)

Columns before dropping
Index(['asin', 'title', 'imgUrl', 'productURL', 'stars', 'reviews', 'price',
       'listPrice', 'category_id', 'isBestSeller', 'boughtInLastMonth'],
      dtype='object')
Columns afters dropping
Index(['asin', 'title', 'stars', 'reviews', 'price', 'listPrice',
       'category_id', 'isBestSeller', 'boughtInLastMonth'],
      dtype='object')


Unnamed: 0,asin,title,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",4.5,0,139.99,0.0,Suitcases,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,4.5,0,169.99,209.99,Suitcases,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,4.6,0,365.49,429.99,Suitcases,False,300


In [7]:
# Rename 'asin' to 'product_id' and 'category_id' to 'category'.
column_renames = {'asin': 'product_id', 'category_id': 'category'}
df2 = df2.rename(columns=column_renames)
df2.head(3)

Unnamed: 0,product_id,title,stars,reviews,price,listPrice,category,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",4.5,0,139.99,0.0,Suitcases,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,4.5,0,169.99,209.99,Suitcases,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,4.6,0,365.49,429.99,Suitcases,False,300


In [8]:
# Count the missing values in each column of df2.
df2.isna().sum()

product_id           0
title                1
stars                0
reviews              0
price                0
listPrice            0
category             0
isBestSeller         0
boughtInLastMonth    0
dtype: int64

In [9]:
# Replace missing titles with an empty string, then re-check the per-column
# missing-value counts.
title_filled = df2['title'].fillna("")
df2['title'] = title_filled
df2.isna().sum()

product_id           0
title                0
stars                0
reviews              0
price                0
listPrice            0
category             0
isBestSeller         0
boughtInLastMonth    0
dtype: int64

In [10]:
# Work with a subset of the products: keep only the first MAX_PRODUCTS rows.
# NOTE: head() takes the leading rows in file order; it is not a random sample.
MAX_PRODUCTS = 10_000
df2 = df2.head(MAX_PRODUCTS)
print(len(df2))

10000


In [11]:
!pip install python-dotenv



In [12]:
# Initializing Pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import os
from dotenv import load_dotenv

# Get API key
load_dotenv()  # Loads variables from .env into environment
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail early with an actionable message instead of an opaque client error later.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "products-list"

# Create an index if it doesn't exist.
# list_indexes() yields index description objects rather than plain strings, so
# the name has to be checked against .names(). A bare `in` test never matches
# and would attempt to create the index even when it already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=384,  # must equal the embedding size of the model loaded below
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Connect to the index
index = pc.Index(index_name)

# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Small & fast model


  from tqdm.autonotebook import tqdm


In [13]:
# Show the current column names of df2.
df2.columns

Index(['product_id', 'title', 'stars', 'reviews', 'price', 'listPrice',
       'category', 'isBestSeller', 'boughtInLastMonth'],
      dtype='object')

In [14]:
def format_product_data(row):
    """Render one product record as a multi-line text block for embedding.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by the keys
            'title', 'stars', 'reviews', 'price', 'listPrice', 'category',
            'isBestSeller' and 'boughtInLastMonth'.

    Returns:
        A string with one "Label: value" line per field, joined by newlines
        and with no trailing newline.
    """
    # Fields are read in the same order as they appear in the output text.
    lines = [
        f"Title: {row['title']}",
        f"Stars: {row['stars']}",
        f"Reviews: {row['reviews']}",
        f"Price: ${row['price']}",
        f"List Price: ${row['listPrice']}",
        f"Category: {row['category']}",
        f"Best Seller: {'Yes' if row['isBestSeller'] else 'No'}",
        f"Bought in Last Month: {row['boughtInLastMonth']} units",
    ]
    return "\n".join(lines)

# Convert each product into an embedding and store it in Pinecone.
# Encoding and upserting in batches replaces one model call and one network
# request per product (the recorded row-by-row run ended in a KeyboardInterrupt).
METADATA_FIELDS = [
    "title", "stars", "reviews", "price",
    "listPrice", "category", "isBestSeller", "boughtInLastMonth",
]
UPSERT_BATCH_SIZE = 100  # vectors sent per upsert request

# One plain dict per product; format_product_data only needs key lookup.
records = df2.to_dict("records")
texts = [format_product_data(record) for record in records]

# Passing the whole list lets the model batch internally; the result holds one
# embedding per input text, in the same order.
embeddings = embedding_model.encode(texts, show_progress_bar=True)

for start in range(0, len(records), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    vectors = [
        (
            record["product_id"],  # Unique ID for each product
            embedding.tolist(),  # Convert to list
            {field: record[field] for field in METADATA_FIELDS},
        )
        for record, embedding in zip(records[start:stop], embeddings[start:stop])
    ]
    # Store in Pinecone
    index.upsert(vectors=vectors)


KeyboardInterrupt: 