This notebook loads, cleans, and embeds the dataframe used by the final model. The resulting dataframe is saved to a .csv file that the final program can load directly, which speeds up both the final program and the website deployment.

In [None]:
from pathlib import Path
import kagglehub
import pandas as pd
import os
import pickle
from pathlib import Path
import joblib
import re
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import openai
from datetime import datetime


In [None]:
# Load and prepare the data
# All libraries needed by this program are imported in the cell above



# Fetch the Kaggle dataset; `path` is the local directory it was downloaded to
path = kagglehub.dataset_download("datafiniti/consumer-reviews-of-amazon-products")

# The dataset ships as three CSV exports; build their paths, then load each one
csv_names = (
    "1429_1.csv",
    "Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv",
    "Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv",
)
file_path1, file_path2, file_path3 = (os.path.join(path, name) for name in csv_names)
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in (file_path1, file_path2, file_path3))

# Load the pickled mapping that is later applied to the 'categories' column.
# NOTE: unpickling can execute arbitrary code, so only load a file produced by this project.
pickle_file_path = Path.cwd().joinpath("Joblib_files", "unique_categories_dict.pkl")
if not pickle_file_path.exists():
    # Fail early with the full path so a wrong working directory is easy to spot
    raise FileNotFoundError(f"Missing pickle file: {pickle_file_path}")
with pickle_file_path.open("rb") as pickle_handle:
    meta_category_mapping = pickle.load(pickle_handle)

# Filter and prepare data
# Columns taken from every export; 'imageURLs' is additionally taken from df2 and df3
columns_df1 = ['name', 'asins', 'categories', 'reviews.doRecommend', 'reviews.numHelpful', 'reviews.rating', 'reviews.text', 'reviews.title']
columns_other = columns_df1 + ['imageURLs']
# Take an explicit copy before adding a column: assigning into a frame derived by
# indexing df1 otherwise triggers pandas' SettingWithCopyWarning.
df1_filtered = df1[columns_df1].copy()
# df1 is not read for image URLs, so every df1 row gets the placeholder image instead
df1_filtered['imageURLs'] = "https://upload.wikimedia.org/wikipedia/commons/a/ac/No_image_available.svg"
df2_filtered = df2[columns_other]
df3_filtered = df3[columns_other]

# Combine the dataframes, and map the meta-categories that we extracted via the clustering model to the 'categories' column
df_combined = pd.concat([df1_filtered, df2_filtered, df3_filtered], ignore_index=True)
# Categories without an entry in the mapping are labelled "Unknown"
df_combined['meta_category'] = df_combined['categories'].map(meta_category_mapping).fillna("Unknown")

# Print output of the combined dataframe
print("Shape of the combined dataframe:", df_combined.shape)
print(df_combined.head())


# First, clean the data with our data cleaning function. The reviews.text column holds strings but might also contain floats, so the function must handle both.
# The cleaned text is stored in a new column; given the size of the dataset, this will not increase the model runtime significantly.
def light_clean(text):
    """Return *text* stripped and with every whitespace run collapsed to one space.

    Args:
        text: The raw value of a review cell. Usually a string, but may be a
            missing value (None / NaN) or another scalar such as a number.

    Returns:
        The cleaned string. Missing values yield "" so that the literal text
        "nan" is never passed on as if it were review content; any other
        non-string value is converted with ``str()`` before cleaning.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)  # e.g. a review that was parsed as a number
    text = text.strip()
    return re.sub(r"\s+", " ", text)  # normalize whitespace

# Store the cleaned review text in its own column so the raw text stays available
df_combined["cleaned_text"] = df_combined["reviews.text"].apply(light_clean)

# Then we convert the relevant columns of the dataframe to word embeddings using a pre-trained sentence transformer model. We use a sentence transformer as these are designed
# to work well with natural, unaltered sentences and are trained on a large corpus of text data, making them suitable for generating embeddings for a wide range of text inputs.

model = SentenceTransformer('all-MiniLM-L6-v2')
# Generate embeddings for the cleaned review text.
# No device is forced here: a hard-coded 'cuda' fails on machines without a CUDA GPU,
# whereas leaving it unset lets the model run on whichever device it was loaded on.
embeddings = model.encode(
    df_combined["cleaned_text"].tolist(),
    show_progress_bar=True,
    convert_to_tensor=True,
)
# Append the embeddings to the dataframe (one list of floats per row)
df_combined["embeddings"] = embeddings.tolist()

In [None]:
# Write the combined dataframe, including the embeddings column, to a CSV file
# inside the Joblib_files folder of the current working directory
output_file_path = Path.cwd().joinpath("Joblib_files", "amazon_reviews_with_embeddings.csv")
df_combined.to_csv(path_or_buf=output_file_path, index=False)

In [None]:
# Pull variables from the .env file into the process environment
load_dotenv()

# The OpenAI API key is read from the OPENAI_TEST_KEY_KdR environment variable
api_key = os.environ.get("OPENAI_TEST_KEY_KdR")
if not api_key:
    # Covers both an unset variable and one that is set to an empty string
    raise ValueError("OpenAI API key not found. Please set the OPENAI_TEST_KEY_KdR environment variable.")

client = openai.OpenAI(api_key=api_key)

# Confirm that a key was found; the key itself is deliberately not printed
print("OpenAI API key loaded successfully.")

In [None]:
# Combine all the dataframes that contain the final model output into a single dataframe, that can be loaded into the website
files_to_load = ["Batteries_Summary.csv", "Connected_Home_Electronics.csv", "Kitchen_Storage_Summary.csv",
                 "Office_Supplies_Summary.csv", "Pet_Products_Summary.csv", "Portable_Electronics.csv"]

# Every summary file is read from, and the combined file written to, this folder
summaries_dir = Path.cwd() / "Product Summaries"

# Load each file that exists; missing files are reported and skipped
dataframes = []
for file_name in files_to_load:
    file_path = summaries_dir / file_name
    if not file_path.exists():
        print(f"File {file_name} does not exist, skipping.")
        continue
    df = pd.read_csv(file_path)
    dataframes.append(df)

# pd.concat raises an opaque "No objects to concatenate" error on an empty list,
# so fail here with a message that names the actual problem.
if not dataframes:
    raise FileNotFoundError(f"None of the summary files were found in: {summaries_dir}")

# Concatenate all dataframes into a single dataframe
Summaries_df = pd.concat(dataframes, ignore_index=True)
# Save the combined dataframe to a CSV file
output_summaries_path = summaries_dir / "Summaries Combined.csv"
Summaries_df.to_csv(output_summaries_path, index=False)



In [None]:
def extract_best_image_url(url_string):
    """
    Extract the first trusted image URL from a comma-separated list.

    Each entry is stripped and URL-decoded. An entry is accepted when one of
    the trusted domain strings occurs anywhere in the decoded URL (a substring
    match, not a hostname check).

    Args:
        url_string: Comma-separated URLs, or a missing value (None / NaN).

    Returns:
        The decoded form of the first accepted URL, or the "no image
        available" placeholder URL when the input is missing, blank, or
        contains no accepted URL.
    """
    # Imported locally so the function does not depend on `urllib` having been
    # imported elsewhere in the notebook.
    from urllib.parse import unquote

    placeholder = "https://upload.wikimedia.org/wikipedia/commons/a/ac/No_image_available.svg"
    trusted_domains = ("amazon.com", "ebayimg.com")

    if pd.isna(url_string) or not url_string.strip():
        return placeholder

    for raw_url in url_string.split(","):
        candidate = raw_url.strip()
        if not candidate:
            continue  # ignore empty entries such as those left by a trailing comma
        decoded_url = unquote(candidate)
        if any(domain in decoded_url for domain in trusted_domains):
            return decoded_url  # return first valid match

    # fallback if none matched
    return placeholder