In [None]:
import dask.dataframe as dd
import pandas as pd
import gzip
import json


# Dataset location: a gzip-compressed file, given relative to the working directory,
# that the loader below opens in text mode and parses one JSON document per line.
file_path = 'Clothing_Shoes_and_Jewelry_5.json.gz'


def read_json_gz_in_chunks(file_path, chunk_size=100000, max_chunks=1):
    """Stream a gzip-compressed JSON-lines file as a sequence of DataFrames.

    Parameters
    ----------
    file_path : str or path-like
        Path to a gzip file containing one JSON document per line.
    chunk_size : int, default 100000
        Maximum number of records placed in each yielded DataFrame.
    max_chunks : int or None, default 1
        Upper bound on the number of DataFrames yielded. The default of 1
        keeps the previous behaviour of stopping after the first chunk, so
        at most ``chunk_size`` records are loaded. Pass ``None`` to read the
        whole file.

    Yields
    ------
    pandas.DataFrame
        One frame per chunk, in file order. The last frame may hold fewer
        than ``chunk_size`` rows when the file ends mid-chunk.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if max_chunks is not None and max_chunks < 1:
        return

    emitted = 0
    records = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            records.append(json.loads(line))
            if len(records) == chunk_size:
                yield pd.DataFrame(records)
                emitted += 1
                if max_chunks is not None and emitted >= max_chunks:
                    return
                # Start a fresh buffer so the next chunk does not repeat rows.
                records = []

    # Emit whatever is left when the file ends before a chunk fills up;
    # without this a file shorter than chunk_size would yield nothing at all.
    if records:
        yield pd.DataFrame(records)

# Collect every chunk first and concatenate once: growing a DataFrame with
# pd.concat inside a loop re-copies all previously loaded rows on each pass.
chunks = list(read_json_gz_in_chunks(file_path))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced nothing.
df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

# Drop the extra reference to the per-chunk frames; `df` holds the data now.
del chunks
df.head()

In [None]:
# Keep only the four fields used downstream, discard rows with any missing
# value, and give the columns the names the rest of the notebook expects.
reviews_df = (
    df[['overall', 'reviewerID', 'asin', 'reviewText']]
    .dropna()
    .rename(columns={
        'overall': 'rating',
        'reviewerID': 'user-id',
        'asin': 'product-id',
        'reviewText': 'review',
    })
)

# The raw frame is no longer needed once the trimmed copy exists.
del df

In [None]:
import matplotlib.pyplot as plt

# Histogram of review length in characters, split into 40 bins.
plt.hist(
    x=reviews_df['review'].str.len(),
    bins=40,
    edgecolor='black',
)

In [None]:
import pandas as pd
from textblob import TextBlob

# Sentiment helper used when scoring each review
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every individual review with its polarity
reviews_df['sentiment'] = reviews_df['review'].map(get_sentiment)

# Collapse to one row per product: all review texts joined with a space,
# plus the mean rating and mean sentiment across that product's reviews
aggregated = (
    reviews_df
    .groupby('product-id')
    .agg(
        review=('review', ' '.join),
        rating=('rating', 'mean'),
        sentiment=('sentiment', 'mean'),
    )
    .reset_index()
)

# Preview the per-product table
print(aggregated.head())


In [None]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model once.
# `model` is read as a global by batch_embed_texts and recommend_products.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert the reviews to embeddings in batches
def batch_embed_texts(texts, batch_size=32):
    """Encode ``texts`` with the global ``model`` in fixed-size batches.

    Parameters
    ----------
    texts : sequence of str
        Texts to embed; must support ``len()`` and slicing.
    batch_size : int, default 32
        Number of texts passed to ``model.encode`` per call.

    Returns
    -------
    numpy.ndarray
        The per-text embedding vectors stacked in input order. An empty
        input produces an empty array.
    """
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        batch_embeddings = model.encode(batch, convert_to_tensor=True)
        # Always bring the tensor to host memory before converting to NumPy.
        # .cpu() returns the tensor unchanged when it already lives on the CPU,
        # and guarding it on CUDA availability left tensors on any other
        # accelerator unconverted.
        embeddings.extend(batch_embeddings.cpu().numpy())
    return np.array(embeddings)

# Embed each product's concatenated review text and store one vector per row.
aggregated['review_embedding'] = [
    vector for vector in batch_embed_texts(aggregated['review'].tolist())
]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_products(query, aggregated_df, top_n=15):
    """Rank products by query similarity weighted by review sentiment.

    Parameters
    ----------
    query : str
        Free-text search query; embedded with the global ``model``.
    aggregated_df : pandas.DataFrame
        One row per product with at least the columns ``product-id``,
        ``rating``, ``review``, ``sentiment`` and ``review_embedding``.
        The frame is not modified; scoring happens on a copy.
    top_n : int, default 15
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows ordered by ``final_score`` then ``rating``
        (both descending), with the columns ``product-id``, ``rating``,
        ``review``, ``similarity``, ``sentiment`` and ``final_score``.
    """
    # Embed the query. .cpu() returns the tensor unchanged when it is already
    # on the CPU, so calling it unconditionally also covers accelerators that
    # the previous CUDA-only check missed.
    query_embedding = model.encode(query, convert_to_tensor=True).cpu().numpy()

    # Compare the query against every product embedding in one call rather
    # than once per row. An empty frame is handled separately so it still
    # produces an empty result.
    product_embeddings = list(aggregated_df['review_embedding'])
    if product_embeddings:
        similarities = cosine_similarity([query_embedding], product_embeddings)[0]
    else:
        similarities = []

    # Score on a copy so repeated calls do not leave stale columns on the
    # caller's frame.
    scored = aggregated_df.assign(similarity=similarities)

    # Combine similarity with sentiment score.
    # NOTE(review): this is a plain product, so a negative similarity times a
    # negative sentiment yields a positive score; confirm that is intended.
    scored['final_score'] = scored['similarity'] * scored['sentiment']

    # Best final score first; rating breaks ties.
    ranked = scored.sort_values(by=['final_score', 'rating'], ascending=False)

    return ranked.head(top_n)[
        ['product-id', 'rating', 'review', 'similarity', 'sentiment', 'final_score']
    ]

# Example query
query = "nice kids books"

# Get the top recommendations (top_n is left at its default of 15)
top_recommendations = recommend_products(query, aggregated)
print(top_recommendations)


In [None]:
# Number of distinct products among the returned recommendations
len(top_recommendations['product-id'].unique())

In [None]:
# Drop the notebook's reference to the per-review frame
del reviews_df