# Process product_info.cvs

In [11]:
import pandas as pd
import os

row_data_directroy = 'data'
row_data_path = 'data/product_info.csv'
processed_data_directory = 'processed_data'
processed_data_path = processed_data_directory + '/skincare.csv'

# read all products inf0
df = pd.read_csv(row_data_path)

# Filter the DataFrame for rows where 'primary_category' is 'Skincare'
skincare_df = df[df['primary_category'] == 'Skincare']
print("Skin care data size", len(skincare_df))

# remove rows where highlight are non existent
skincare_df = skincare_df[skincare_df['highlights'].notna() & (skincare_df['highlights'] != '')]
print("Skin care data size after removing empty highlights",len(skincare_df))

# remove rows where ingredients are non existent
skincare_df = skincare_df[skincare_df['ingredients'].notna() & (skincare_df['ingredients'] != '')]
print("Skin care data size after removing empty ingredients",len(skincare_df))

# remove where the secondary_category is empty
skincare_df = skincare_df[skincare_df['secondary_category'].notna() & (skincare_df['secondary_category'] != '')]
print("Skin care data size after removing empty secondary_category",len(skincare_df))

# remove 'Mini Size' from data
skincare_df = skincare_df[skincare_df['secondary_category'] != 'Mini Size']
print("Skin care data size after removing 'Mini Size' secondary_category",len(skincare_df))

# remove 'Value & Gift Sets' from data
skincare_df = skincare_df[skincare_df['secondary_category'] != 'Value & Gift Sets']
print("Skin care data size after removing 'Value & Gift Sets' secondary_category",len(skincare_df))


# clean highlight column
highlights = skincare_df['highlights']
highlights = [h.replace("[", "").replace("]", "").replace("'", "").replace("Best for ", "").replace("Good for: ", "").replace(" Skin", "").replace("/", ", ") for h in highlights]
skincare_df['highlights'] = highlights

# clean ingredients column
import re

# Function to clean each row
def clean_ingredients(row):
   # Step 1: Replace unwanted characters
   row = row.replace("[", "").replace("]", "").replace("'", "").replace(" (Vegan)*", "").replace(".", "")
   
   # Step 2: Remove text inside parentheses
   row = re.sub(r'\([^)]*\)', '', row)
   
   # Step 3: Replace " , " with a single comma (in case extra spaces after commas)
   row = row.replace(" ,", ",")
   
   # Step 4: Check for "water", "aqua", or "eau" and replace first occurrence
   if "water" in row.lower() or "aqua" in row.lower() or "eau" in row.lower():
      row_list = row.split(", ")
      # Find the first occurrence of "water", "aqua", or "eau"
      index = next((i for i, s in enumerate(row_list) if 'water' in s.lower() or "aqua" in s.lower() or "eau" in s.lower()), -1)
      if index != -1:
         # Replace the identified word with "Water"
         row_list[index] = "Water"
      row = ", ".join(row_list)
   
   return row

skincare_df['ingredients'] = skincare_df['ingredients'].apply(clean_ingredients)

# Ensure the directory exists
output_dir = os.path.dirname(processed_data_path)
if not os.path.exists(output_dir):
   os.makedirs(output_dir)
# save cleaned file
skincare_df.to_csv(processed_data_path, index=False)

Skin care data size 2420
Skin care data size after removing empty highlights 2003
Skin care data size after removing empty ingredients 1927
Skin care data size after removing empty secondary_category 1927
Skin care data size after removing 'Mini Size' secondary_category 1824
Skin care data size after removing 'Value & Gift Sets' secondary_category 1659


# Process reviews

In [23]:
product_ids = skincare_df['product_id']

path = 'data/reviews_0-250.csv'

def clean_reviews(row_data_path, processed_data_path):
   # read the data
   df = pd.read_csv(row_data_path)

   # keep only the skincare data
   df = df[df['product_id'].isin(product_ids)]

   # keep only reviews with rating 5,4 or with rating 3 and review text
   df = df[(df['rating'].isin((5, 4))) | ((df['rating'] == 3) & (df['review_text'] != ''))]
   
   # Ensure the directory exists
   output_dir = os.path.dirname(processed_data_path)
   if not os.path.exists(output_dir):
      os.makedirs(output_dir)
      
   # save cleaned file
   df.to_csv(processed_data_path, index=False)

clean_reviews('data/reviews_0-250.csv', 'processed_data/reviews_0-250.csv')
clean_reviews('data/reviews_250-500.csv', 'processed_data/reviews_250-500.csv')
clean_reviews('data/reviews_500-750.csv', 'processed_data/reviews_500-750.csv')
clean_reviews('data/reviews_750-1250.csv', 'processed_data/reviews_750-1250.csv')
clean_reviews('data/reviews_1250-end.csv', 'processed_data/reviews_1250-end.csv')


  df = pd.read_csv(row_data_path)
  df = pd.read_csv(row_data_path)
  df = pd.read_csv(row_data_path)
