In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [2]:
products=pd.read_csv("product_info.csv")
print("Dataset loaded successfully.")
print(f"Total products: {len(products)}")

Dataset loaded successfully.
Total products: 8494


In [3]:
print(products.columns)

Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')


In [4]:
print(products.head())

  product_id               product_name  brand_id brand_name  loves_count  \
0    P473671    Fragrance Discovery Set      6342      19-69         6320   
1    P473668    La Habana Eau de Parfum      6342      19-69         3827   
2    P473662  Rainbow Bar Eau de Parfum      6342      19-69         3253   
3    P473660       Kasbah Eau de Parfum      6342      19-69         3018   
4    P473658  Purple Haze Eau de Parfum      6342      19-69         2691   

   rating  reviews            size                      variation_type  \
0  3.6364     11.0             NaN                                 NaN   
1  4.1538     13.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   
2  4.2500     16.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   
3  4.4762     21.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   
4  3.2308     13.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   

  variation_value  ... online_only out_of_stock  sephora_exclusive  \
0             NaN  ...

In [5]:
print(products['primary_category'].unique())

['Fragrance' 'Bath & Body' 'Mini Size' 'Hair' 'Makeup' 'Skincare'
 'Tools & Brushes' 'Men' 'Gifts']


In [6]:
skincare=products[products['primary_category']=="Skincare"]

In [7]:
skincare=skincare[['product_name','brand_name', 'rating', 'reviews', 'ingredients','primary_category']]

In [8]:
print(f"Skincare products: {len(skincare)}")

Skincare products: 2420


In [9]:
ingredients_to_check = ["Hyaluronic Acid", "Niacinamide", "Retinol",
    "Glycerin", "Squalane", "Panthenol",
    "Salicylic Acid", "Glycolic Acid", "Lactic Acid",
    "Vitamin C", "Peptides", "Ceramides",
    "Fragrance", "Alcohol Denat"
]


In [10]:

for ing in ingredients_to_check:
    col_name = f"has_{ing.lower().replace(' ', '_')}"
    skincare[col_name] = skincare['ingredients'].str.contains(
        ing,
        na=False
    ).astype(int)
    count = skincare[col_name].sum()
    print(f"{ing}: {count} products")

Hyaluronic Acid: 207 products
Niacinamide: 348 products
Retinol: 97 products
Glycerin: 1791 products
Squalane: 548 products
Panthenol: 342 products
Salicylic Acid: 252 products
Glycolic Acid: 202 products
Lactic Acid: 325 products
Vitamin C: 63 products
Peptides: 1 products
Ceramides: 1 products
Fragrance: 747 products
Alcohol Denat: 189 products


In [11]:
skincare['worth_it'] = (
    (skincare['rating'] >= 4.5) &
    (skincare['reviews'] >= 50)
).astype(int)


In [12]:
print("\nTarget variable distribution:")
print(skincare['worth_it'].value_counts())



Target variable distribution:
0    1934
1     486
Name: worth_it, dtype: int64


In [13]:
ingredient_cols = [
    col for col in skincare.columns
    if col.startswith("has_")
]

X = skincare[ingredient_cols]
y = skincare['worth_it']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [15]:
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)


In [16]:
y_pred = model.predict(X_test)

In [17]:
print("\nModel Performance:")
print(classification_report(y_test, y_pred))



Model Performance:
              precision    recall  f1-score   support

           0       0.82      0.65      0.72       387
           1       0.24      0.44      0.31        97

    accuracy                           0.61       484
   macro avg       0.53      0.54      0.52       484
weighted avg       0.71      0.61      0.64       484



In [18]:
importance = pd.Series(
    model.feature_importances_,
    index=ingredient_cols
)

importance = importance.sort_values(ascending=False)

print("\nTop important ingredients:")
print(importance.head(10))



Top important ingredients:
has_squalane           0.113451
has_fragrance          0.112782
has_lactic_acid        0.100757
has_glycerin           0.095273
has_niacinamide        0.087994
has_panthenol          0.086949
has_salicylic_acid     0.079944
has_glycolic_acid      0.079639
has_hyaluronic_acid    0.079307
has_alcohol_denat      0.061775
dtype: float64


In [19]:
results = skincare.loc[X_test.index, [
    'product_name',
    'brand_name',
    'worth_it'
]].copy()

results['predicted'] = y_pred

print("\nProducts predicted as WORTH the hype:")
print(
    results[results['predicted'] == 1][
        ['product_name', 'brand_name']
    ].head(10)
)

print("\nProducts predicted as NOT worth the hype:")
print(
    results[results['predicted'] == 0][
        ['product_name', 'brand_name']
    ].head(10)
)


Products predicted as WORTH the hype:
                                           product_name  \
3407           OMG! Omega The Great Fish Oil Supplement   
7243               Blue Moon Clean-Rinse Cleansing Balm   
3586  Bye Bye Under Eye Brightening Eye Cream for Da...   
2469               Face Coach Lifting Squalane Face Oil   
2074                         The Good C Vitamin C Serum   
2153                       Alpha Beta Daily Moisturizer   
7133     Gradual Tan Classic Daily Youth Boosting Cream   
5591                   Glow Cycle Retin-ALT Power Serum   
6868                    Wrinkle Correcting Skincare Set   
3312  Equilibrium Day Fluid Sunscreen Broad Spectrum...   

                     brand_name  
3407              HUM Nutrition  
7243               Sunday Riley  
3586               IT Cosmetics  
2469                    FaceGym  
2074          Dr. Barbara Sturm  
2153  Dr. Dennis Gross Skincare  
7133                 St. Tropez  
5591               OLEHENRIKSEN  
6868 

In [20]:
correct = (results['worth_it'] == results['predicted']).sum()
total = len(results)

print(f"\nCorrect predictions: {correct}/{total}")
print(f"Accuracy: {correct/total:.2f}")





Correct predictions: 293/484
Accuracy: 0.61
