In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Load the dataset
dataset = pd.read_csv('static/allShades.csv')

def hex_to_rgb(hex_color):
    hex_color = hex_color.lstrip('#')
    if len(hex_color) == 3:
        hex_color = ''.join([char * 2 for char in hex_color])
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

def recommend_foundation(hex_color, top_n=3):
    # Convert hex color to RGB
    rgb_color = np.array(hex_to_rgb(hex_color)).reshape(1, -1)

    # Use KMeans to cluster the dataset into color groups
    kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust the number of clusters as needed
    dataset['cluster'] = kmeans.fit_predict(dataset[['red', 'green', 'blue']])

    # Find the cluster of the input color
    input_cluster = kmeans.predict(rgb_color)[0]

    # Filter dataset to include only products in the same cluster
    cluster_dataset = dataset[dataset['cluster'] == input_cluster]

    # Calculate Euclidean distance for each row in the filtered dataset
    cluster_dataset['distance'] = cluster_dataset.apply(lambda row: np.linalg.norm(np.array(row[['red', 'green', 'blue']]) - np.array(rgb_color)), axis=1)

    # Sort the filtered dataset based on distance
    sorted_cluster_dataset = cluster_dataset.sort_values(by='distance')

    # Select the top N recommendations
    recommendations = sorted_cluster_dataset.head(top_n)

    return recommendations[['brand', 'product', 'description', 'url', 'name', 'hex']]

# Add red, green, and blue columns to the dataset
dataset[['red', 'green', 'blue']] = pd.DataFrame([hex_to_rgb(hex_color) for hex_color in dataset['hex']])

# Example: Get recommendations for a predicted skin color
predicted_hex_color = '#FFE8D8'  
recommendations = recommend_foundation(predicted_hex_color)
print(recommendations)

  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_dataset['distance'] = cluster_dataset.apply(lambda row: np.linalg.norm(np.array(row[['red', 'green', 'blue']]) - np.array(rgb_color)), axis=1)


             brand                                            product  \
5573        La Mer         The Soft Fluid Long Wear Foundation SPF 20   
1401  Estée Lauder  Double Wear Stay In Place Matte Powder Foundation   
1540           MAC                  Studio Fix Powder Plus Foundation   

                                            description  \
5573  130 Warm Ivory - Very Light Skin with Neutral ...   
1401        1N2 Ecru (light w/ neutral rosy undertones)   
1540  NC10 (very fair w/ neutral golden undertone fo...   

                                                    url        name      hex  
5573  https://www.sephora.com/product/the-soft-fluid...  Warm Ivory  #FDE7D7  
1401  https://www.ulta.com/double-wear-stay-in-place...        Ecru  #FDE9D7  
1540  https://www.ulta.com/studio-fix-powder-plus-fo...         NaN  #FEEAD9  


In [1]:
import pandas as pd
import requests

def is_url_accessible(url):
    try:
        response = requests.get(url)
        return response.status_code == 200
    except requests.ConnectionError:
        return False

def remove_inaccessible_urls(dataset):
    for index, row in dataset.iterrows():
        url = row['url']
        if not is_url_accessible(url):
            print(f"Removing inaccessible URL: {url}")
            dataset.at[index, 'url'] = 'NA'  # Ganti dengan nilai yang sesuai

# Baca dataset dari file CSV
dataset = pd.read_csv('allShades.csv')

# Hapus URL yang tidak dapat diakses
remove_inaccessible_urls(dataset)

# Hapus baris yang memiliki nilai 'NA' pada kolom 'url'
dataset = dataset[dataset['url'] != 'NA']

# Simpan dataset yang sudah diperbarui ke file CSV baru
dataset.to_csv('allShades_cleaned.csv', index=False)

Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?productId=xlsImpprod18191081
Removing inaccessible URL: https://www.ulta.com/ultimate-coverage-24-hour-foundation?pro

In [2]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO

# Load the dataset
df = pd.read_csv('allShades_cleaned.csv')

# Create a new column for storing image filenames
df['image_filename'] = ''

# Directory to save images
image_dir = 'product_images'
os.makedirs(image_dir, exist_ok=True)

# Function to download and save the image
def download_and_save_image(url, filename):
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img.save(os.path.join(image_dir, filename))
        return filename
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return None

# Iterate through each row in the dataset
for index, row in df.iterrows():
    # Extract the URL from the 'imgSrc' column
    img_url = row['imgSrc']
    
    # Generate a unique filename based on the index
    filename = f"image_{index}.png"
    
    # Download and save the image
    saved_filename = download_and_save_image(img_url, filename)
    
    # If the image was successfully downloaded and saved, store the filename
    if saved_filename:
        df.at[index, 'image_filename'] = saved_filename

# Save the modified dataset
df.to_csv('allShades_with_images.csv', index=False)

Error downloading image from https://images.ulta.com/is/image/Ulta/2160686?$sm$: cannot identify image file <_io.BytesIO object at 0x0000018FFD7E7100>
Error downloading image from https://images.ulta.com/is/image/Ulta/2160687?$sm$: cannot identify image file <_io.BytesIO object at 0x0000018FF1DBE020>
Error downloading image from https://images.ulta.com/is/image/Ulta/2160688?$sm$: cannot identify image file <_io.BytesIO object at 0x0000018FF1DBDE90>
Error downloading image from https://images.ulta.com/is/image/Ulta/2160690?$sm$: cannot identify image file <_io.BytesIO object at 0x0000018FF1DBDB20>
Error downloading image from https://images.ulta.com/is/image/Ulta/2160691?$sm$: cannot identify image file <_io.BytesIO object at 0x0000018FF1DBE020>
Error downloading image from https://images.ulta.com/is/image/Ulta/2260625?$sm$: cannot identify image file <_io.BytesIO object at 0x0000018FFD320EA0>
Error downloading image from https://images.ulta.com/is/image/Ulta/2153596?$sm$: cannot identi

PermissionError: [Errno 13] Permission denied: 'allShades_with_images.csv'