# Download the data (run only once)

In [1]:
import kagglehub

# Download the latest version of the Sephora products / skincare reviews
# dataset and keep the location returned by kagglehub in `path`.
# NOTE(review): the following cells read their CSV files from the relative
# "data/" directory, not from `path` — presumably the downloaded files are
# copied there by hand; confirm.
path = kagglehub.dataset_download("nadyinky/sephora-products-and-skincare-reviews")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\Ana Marija Pavičić\.cache\kagglehub\datasets\nadyinky\sephora-products-and-skincare-reviews\versions\2


# Load and clean the dataset

In [3]:
import pandas as pd

# Read the product catalogue.
df = pd.read_csv("data/product_info.csv")

# Show which primary categories occur in the catalogue.
primary_category = set(df['primary_category'])
print(primary_category)

# Restrict the catalogue to skincare products and report how many there are.
is_skincare = df['primary_category'] == 'Skincare'
skincare_df = df[is_skincare]
print("Skin care data size", len(skincare_df))

# Columns that must be filled in, paired with the message printed once the
# rows lacking that column have been removed. The order matters only for the
# intermediate sizes that are printed.
required_columns = [
    ('highlights', "Skin care data size after removing empty highlights"),
    ('ingredients', "Skin care data size after removing empty ingredients"),
    ('secondary_category', "Skin care data size after removing empty secondary_category"),
]

for column, message in required_columns:
    # A value counts as present when it is neither NaN nor the empty string.
    has_value = skincare_df[column].notna() & (skincare_df[column] != '')
    skincare_df = skincare_df[has_value]
    print(message, len(skincare_df))

# Disabled step kept from the original notebook: also drop the rows whose
# tertiary_category is empty.
# skincare_df = skincare_df[skincare_df['tertiary_category'].notna() & (skincare_df['tertiary_category'] != '')]
# print("Skin care data size after removing empty tertiary_category",len(skincare_df))


{'Hair', 'Bath & Body', 'Skincare', 'Gifts', 'Mini Size', 'Men', 'Fragrance', 'Tools & Brushes', 'Makeup'}
Skin care data size 2420
Skin care data size after removing empty highlights 2003
Skin care data size after removing empty ingredients 1927
Skin care data size after removing empty secondary_category 1927


# Clean highlights column

In [4]:
# Literal substitutions applied, in this order, to every raw highlights
# string: the list punctuation ([, ], ') is removed, the texts "Best for ",
# "Good for: " and " Skin" are removed, and "/" is turned into ", ".
HIGHLIGHT_REPLACEMENTS = [
    ("[", ""),
    ("]", ""),
    ("'", ""),
    ("Best for ", ""),
    ("Good for: ", ""),
    (" Skin", ""),
    ("/", ", "),
]


def clean_highlights(raw):
    """Return `raw` after applying every HIGHLIGHT_REPLACEMENTS pair in order."""
    for old, new in HIGHLIGHT_REPLACEMENTS:
        raw = raw.replace(old, new)
    return raw


highlights = [clean_highlights(h) for h in skincare_df['highlights']]

# skincare_df was produced by boolean filtering of the product table, and
# assigning a column into such a selection can raise pandas'
# SettingWithCopyWarning. assign() builds a new frame instead, with the
# cleaned values placed positionally in the 'highlights' column.
skincare_df = skincare_df.assign(highlights=highlights)

# Clean ingredients

In [24]:
# Select the ingredient strings that contain "Beta-Carotene (" — the name
# followed by an opening parenthesis — ignoring case; missing values count as
# "no match".
# The pattern is a raw string so that `\(` reaches the regex engine as an
# escaped literal parenthesis. In an ordinary string literal `\(` is an
# invalid escape sequence, which Python reports with a warning.
contains_beta_carotene = skincare_df.loc[
    skincare_df['ingredients'].str.contains(r'Beta-Carotene \(', case=False, na=False),
    'ingredients',
]
print(contains_beta_carotene)

90      ['Collagen (Vegan)*, Water (Aqua, Eau), Propan...
92      ['Collagen (Vegan)*, Water (Aqua, Eau), Glycer...
97      ['GENIUS Liquid Collagen:', 'Collagen (Vegan),...
116     ['Collagen (Vegan)*, Water (Aqua, Eau), Propan...
120     ['Collagen (Vegan)*, Water (Aqua, Eau), Glycer...
6867    ['Water (Aqua/Eau), Pentaerythrityl Tetraethyl...
Name: ingredients, dtype: object


  contains_beta_carotene = skincare_df[skincare_df['ingredients'].str.contains('Beta-Carotene \(', case=False, na=False)]['ingredients']


In [None]:
ingredients = skincare_df['ingredients']

# Preview one ingredient string with the list punctuation, the quotes and the
# " (Vegan)*" marker removed.
# NOTE(review): despite its name, `first` is the row at position 1 (the second
# row) — confirm whether iloc[0] was intended.
first = ingredients.iloc[1].replace("[", "").replace("]", "").replace("'", "").replace(" (Vegan)*", "")
print(first)

seen_ingredients = []  # not used in this cell; kept in case another cell relies on it

# Collect the distinct ingredient tokens over all products.
set_ingredients = set()
for row in ingredients:
    # Remove the list punctuation and the quotes (as done for `first`) before
    # splitting, and strip surrounding whitespace from every token. Without
    # this, "Glycerin", " Glycerin" and "'Glycerin" are stored as three
    # different ingredients.
    cleaned = row.replace("[", "").replace("]", "").replace("'", "")
    list_ingredients = {token.strip() for token in cleaned.split(",")}
    list_ingredients.discard("")  # produced by trailing or doubled commas
    set_ingredients.update(list_ingredients)

# NOTE(review): splitting on every comma also cuts names that contain a comma
# themselves, e.g. "Water (Aqua, Eau)" or "1,2-Hexanediol"; a smarter
# tokenizer is needed if those must stay intact.

#print(set_ingredients)

'Collagen (Vegan)*, Water (Aqua, Eau), Propanediol, Isononyl Isononanoate, Butylene Glycol, Glycerin, Betaine, Pentylene Glycol, Dextrin Palmitate, Collagen Amino Acids (Vegan)*, Parachlorella Beijerinckii Exopolysaccharides, Chlorella Protothecoides Oil, Helichrysum Stoechas Flower Extract, Cylindrotheca Fusiformis Extract, Niacinamide, Tocopherol, Palmitoyl Tripeptide-1, Palmitoyl Tetrapeptide-7, Adenosine, Helianthus Annuus (Sunflower) Seed Oil, Leuconostoc/Radish Root Ferment Filtrate, Caprylic/Capric Triglyceride, Palmitic Acid, Disodium EDTA, Carbomer, Sodium Lactate, Amodimethicone, Homosalate, Polysorbate 20, Butyl Methoxydibenzoylmethane, 1,2-Hexanediol, Caprylyl Glycol, Octocrylene, Sodium Hydroxide, Phenoxyethanol, Fragrance (Parfum), Limonene, Beta-Carotene (CI 40800).'


# Save cleaned dataset

In [8]:
# Write the cleaned skincare product table to data/skincare.csv, leaving out
# the DataFrame index column.
skincare_df.to_csv('data/skincare.csv', index=False)

In [9]:
import pandas as pd

# Read the first review file.
# NOTE(review): the recorded output of this cell echoes the read_csv line,
# which is how a Python warning is displayed — presumably pandas reporting a
# column with mixed types; confirm and consider passing an explicit dtype.
df = pd.read_csv("data/reviews_0-250.csv")

# Start the total of loaded review rows with this file's row count.
count = len(df)

# Ids of the products that survived the skincare cleaning.
product_ids = skincare_df['product_id']
print(len(df))

# Reviews from this file that belong to one of those products.
is_skincare_review = df['product_id'].isin(product_ids)
df_review1 = df[is_skincare_review]
print(len(df_review1))

# Distinct authors in the whole file (df), not only in the skincare subset.
print("Different users, ", len(set(df['author_id'])))

  df = pd.read_csv("data/reviews_0-250.csv")


602130
545756
Different users,  383697


In [11]:
# Load the remaining review files. The df_N / df_reviewN names are kept
# exactly as before so that any other cell relying on them still works.
# NOTE(review): the recorded output echoes the read_csv lines for df_4 and
# df_5, i.e. pandas warned about them — presumably columns with mixed types.
# If the affected column is author_id, the same id may be stored both as a
# number and as text and then be counted as two authors; confirm and consider
# passing an explicit dtype to read_csv.
df_2 = pd.read_csv("data/reviews_250-500.csv")
df_3 = pd.read_csv("data/reviews_500-750.csv")
df_4 = pd.read_csv("data/reviews_750-1250.csv")
df_5 = pd.read_csv("data/reviews_1250-end.csv")

# Total number of review rows over all five files. It is recomputed from the
# frames (`df` is the reviews_0-250 frame loaded by the previous cell) instead
# of being accumulated with `count += ...`, so running this cell a second time
# no longer inflates the total.
count = len(df) + len(df_2) + len(df_3) + len(df_4) + len(df_5)

# Per file, keep only the reviews of the cleaned skincare products.
df_review2, df_review3, df_review4, df_review5 = (
    reviews[reviews['product_id'].isin(product_ids)]
    for reviews in (df_2, df_3, df_4, df_5)
)

# Stack all skincare reviews with a single concat call. The row order is the
# one the notebook always produced: file 2 first, then files 1, 3, 4 and 5.
combined_df = pd.concat(
    [df_review2, df_review1, df_review3, df_review4, df_review5],
    axis=0,
    ignore_index=True,
)

print("skincare", len(combined_df))
print("all", count)
print(len(set(combined_df['author_id'])))


  df_4 = pd.read_csv("data/reviews_750-1250.csv")
  df_5 = pd.read_csv("data/reviews_1250-end.csv")


skincare 966018
all 1094411
513998


### Users with at least 2 reviews

In [14]:
# Group by 'author_id' and filter authors who have reviewed more than one product
filtered_df = combined_df.groupby('author_id').filter(lambda x: x['product_id'].nunique() > 1)

# Display the filtered DataFrame
print(len(filtered_df))
print(len(set(filtered_df['author_id'])))

635362
185228


### Users with at least 3 reviews

In [16]:
# Narrow filtered_df to authors who reviewed more than 2 distinct products.
# The per-author count comes from a vectorised transform('nunique') rather
# than a Python lambda evaluated once per author via groupby(...).filter(...);
# the selected rows and their order are the same.
products_per_author = filtered_df.groupby('author_id')['product_id'].transform('nunique')
filtered_df = filtered_df[products_per_author > 2]

# Number of remaining reviews, then number of remaining distinct authors.
print(len(filtered_df))
print(len(set(filtered_df['author_id'])))

424595
80341


### Users with at least 4 reviews

In [17]:
# Narrow filtered_df to authors who reviewed more than 3 distinct products.
# The per-author count comes from a vectorised transform('nunique') rather
# than a Python lambda evaluated once per author via groupby(...).filter(...);
# the selected rows and their order are the same.
products_per_author = filtered_df.groupby('author_id')['product_id'].transform('nunique')
filtered_df = filtered_df[products_per_author > 3]

# Number of remaining reviews, then number of remaining distinct authors.
print(len(filtered_df))
print(len(set(filtered_df['author_id'])))

315929
44270


### Users with at least 5 reviews

In [18]:
# Narrow filtered_df to authors who reviewed more than 4 distinct products.
# The per-author count comes from a vectorised transform('nunique') rather
# than a Python lambda evaluated once per author via groupby(...).filter(...);
# the selected rows and their order are the same.
products_per_author = filtered_df.groupby('author_id')['product_id'].transform('nunique')
filtered_df = filtered_df[products_per_author > 4]

# Number of remaining reviews, then number of remaining distinct authors.
print(len(filtered_df))
print(len(set(filtered_df['author_id'])))

247953
27350


### Users with at least 6 reviews

In [19]:
# Narrow filtered_df to authors who reviewed more than 5 distinct products.
# The per-author count comes from a vectorised transform('nunique') rather
# than a Python lambda evaluated once per author via groupby(...).filter(...);
# the selected rows and their order are the same.
products_per_author = filtered_df.groupby('author_id')['product_id'].transform('nunique')
filtered_df = filtered_df[products_per_author > 5]

# Number of remaining reviews, then number of remaining distinct authors.
print(len(filtered_df))
print(len(set(filtered_df['author_id'])))

202496
18287


### Users with at least 7 reviews

In [20]:
# Narrow filtered_df to authors who reviewed more than 6 distinct products.
# The per-author count comes from a vectorised transform('nunique') rather
# than a Python lambda evaluated once per author via groupby(...).filter(...);
# the selected rows and their order are the same.
products_per_author = filtered_df.groupby('author_id')['product_id'].transform('nunique')
filtered_df = filtered_df[products_per_author > 6]

# Number of remaining reviews, then number of remaining distinct authors.
print(len(filtered_df))
print(len(set(filtered_df['author_id'])))

170851
13033
