In [1]:
import json
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
# Ensure the NLTK 'stopwords' and 'punkt' datasets are downloaded if you haven't already
# (stopwords.words() needs the former, word_tokenize() the latter).
# Uncomment the three lines below for the first run
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

# Read the scraped data from products.json.
# Decode explicitly as UTF-8 rather than with the platform's default encoding,
# so non-ASCII product text is read the same way on every machine.
with open('../scrapping/products.json', 'r', encoding='utf-8') as f:
    products = json.load(f)

# English stop words as a set, for fast membership tests while filtering tokens
stop_words = set(stopwords.words("english"))

# Function to preprocess text
def preprocess_text(text):
    """Normalize free text into a space-separated string of content words.

    The text is lowercased, every character that is neither a letter nor
    whitespace is dropped, the result is tokenized with ``word_tokenize``,
    and tokens found in the module-level ``stop_words`` set are discarded.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()

    # Keep letters and whitespace only; punctuation and digits are removed
    # outright (not replaced by a space).
    kept_chars = [ch for ch in lowered if ch.isalpha() or ch.isspace()]
    letters_only = ''.join(kept_chars)

    content_words = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        content_words.append(token)

    return ' '.join(content_words)

# Function to limit title to 3 words
def limit_title_to_three_words(title):
    """Return at most the first three whitespace-separated words of *title*.

    Words are re-joined with single spaces, so leading/trailing whitespace and
    runs of whitespace between the kept words are normalized.
    """
    # Splitting at most three times leaves any remainder in a fourth element,
    # which the slice then discards.
    leading_words = title.split(maxsplit=3)[:3]
    return ' '.join(leading_words)

# Clean each scraped product in place and collect the ones worth keeping
cleaned_products = []
for item in products:
    # Collapse a doubled dot in the price into a single dot
    if ".." in item.get("price", ""):
        item["price"] = item["price"].replace("..", ".")

    # Reduce "<number> out of 5 stars" to the text before the first space
    if " out of 5 stars" in item.get("rating", ""):
        item["rating"] = item["rating"].partition(" ")[0]

    # Drop products whose rating or price is the "N/A" placeholder
    if item.get("rating") == "N/A" or item.get("price") == "N/A":
        continue

    # Normalize the description text, then shorten the title to three words
    for field, transform in (
        ("description", preprocess_text),
        ("title", limit_title_to_three_words),
    ):
        if field in item:
            item[field] = transform(item[field])

    cleaned_products.append(item)

# Tabular view of the cleaned products
df = pd.DataFrame(cleaned_products)

# When both columns are present, build a single 'combined' text feature
# from 'category' and 'description'
if all(column in df.columns for column in ('category', 'description')):
    df['combined'] = df['category'] + " " + df['description']

# Persist the cleaned product list (not the DataFrame) as UTF-8 JSON
with open("products.json", "w", encoding="utf-8") as out_file:
    json.dump(cleaned_products, out_file, ensure_ascii=False, indent=4)

print("Data cleaning and preprocessing complete. Cleaned data saved to products.json")


Data cleaning and preprocessing complete. Cleaned data saved to products.json
