In [6]:
# Install required libraries in Kaggle
!pip install pytesseract pillow spacy pandas scikit-learn transformers sentence-transformers

# Import libraries
import pytesseract
from PIL import Image
import pandas as pd
import re
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer





In [7]:


# Load datasets
banned_ingredients = pd.read_csv("/kaggle/input/cosmet-hotlist-canada-t2/cosmetic_ingredient_hotlist_t2.csv")
banned_ingredients_additional = pd.read_csv("/kaggle/input/cosmetic-hotlist-ingred-cananda-t1-csv/cosmetic_ingredient_hotlist.csv")
banned_df = pd.concat([banned_ingredients, banned_ingredients_additional], ignore_index=True)
cosmetics_data = pd.read_csv("/kaggle/input/cosmetic-git-sephora/cosmetics (1).csv")

# Pre-trained NLP model
nlp_model = SentenceTransformer('all-MiniLM-L6-v2')

# Helper functions
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

banned_df['Ingredient'] = banned_df['Ingredient'].apply(clean_text)
banned_set = set(banned_df['Ingredient'].dropna())

def extract_text_from_image(image_path):
    image = Image.open(image_path)
    return pytesseract.image_to_string(image)

def preprocess_ingredient_list(ingredients):
    ingredients = clean_text(ingredients)
    return ingredients.split(', ')

def classify_ingredients(ingredients, banned_set):
    tokens = preprocess_ingredient_list(ingredients)
    banned_tokens = [token for token in tokens if token in banned_set]
    return banned_tokens, len(banned_tokens)

def recommend_similar_products(banned_tokens, cosmetics_data, n=5):
    cosmetics_data['embedding'] = cosmetics_data['Ingredients'].apply(
        lambda x: nlp_model.encode(clean_text(x) if pd.notna(x) else "")
    )
    banned_text = ", ".join(banned_tokens)
    banned_embedding = nlp_model.encode(banned_text)
    cosmetics_data['similarity'] = cosmetics_data['embedding'].apply(
        lambda x: cosine_similarity([banned_embedding], [x])[0][0]
    )
    recommendations = cosmetics_data.sort_values('similarity', ascending=False).head(n)
    return recommendations[['Product_Name', 'Brand', 'Ingredients']]

# Load and process the image
image_path = "/kaggle/input/ingred-img/WhatsApp Image 2024-11-16 at 17.44.05 (1).jpeg"  # Replace with your image path
extracted_text = extract_text_from_image(image_path)
print("Extracted Text:")
print(extracted_text)

# Classify ingredients
banned_tokens, banned_count = classify_ingredients(extracted_text, banned_set)
print("\nBanned Ingredients:")
print(banned_tokens)
print(f"Number of Banned Ingredients: {banned_count}")



Extracted Text:
=

Exe tent=: Light Liquid Paraffin, White Soft Paraffin, 1, <r)
Extract, Cetomacrogol 1000, Cetyl Alcohol, Stearic “Acid,
'sopropyt Myristate, Fragrance, Kojic Acid, Glycerin, Glutathione,
Propylene Glycol, Alpha Arbutin, Liquorice Extract,
Niacinamide, Cyclomethicone, Phenoxyethanol, Allantoin, -

i


Banned Ingredients:
[]
Number of Banned Ingredients: 0


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import pytesseract
from PIL import Image


model = SentenceTransformer('all-MiniLM-L6-v2')


def extract_text_from_image(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    
    ingredients = []
    for line in text.split("\n"):
        if line.strip():  
            ingredients.extend([item.strip() for item in line.split(",") if item.strip()])
    return ingredients


def convert_to_vectors(text_list):
    return model.encode(text_list)


def find_similar_rows(image_vectors, image_texts, dataset_vectors, dataset):
    results = []
    for img_vector, img_text in zip(image_vectors, image_texts):
        similarities = cosine_similarity([img_vector], dataset_vectors)
        best_match_idx = np.argmax(similarities)
        matched_row = dataset.iloc[best_match_idx].copy() 
        matched_row['Ingredient'] = img_text  
        results.append(matched_row)
    return results


dataset = pd.read_csv('/kaggle/input/pro-ingred/processed_ingred.csv')
dataset['vectors'] = list(convert_to_vectors(dataset['Ingredient'].tolist()))


image_text = extract_text_from_image('/kaggle/input/ingred-img/WhatsApp Image 2024-11-16 at 17.44.05 (1).jpeg')
image_vectors = convert_to_vectors(image_text)


matches = find_similar_rows(image_vectors, image_text, np.vstack(dataset['vectors']), dataset)


print("Extracted Ingredients from Image:")
for text in image_text:
    print(f"- {text}")

print("\nMatched Results:")
for match in matches:
    print(match)


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Extracted Ingredients from Image:
- =
- Exe tent=: Light Liquid Paraffin
- White Soft Paraffin
- 1
- <r)
- Extract
- Cetomacrogol 1000
- Cetyl Alcohol
- Stearic “Acid
- 'sopropyt Myristate
- Fragrance
- Kojic Acid
- Glycerin
- Glutathione
- Propylene Glycol
- Alpha Arbutin
- Liquorice Extract
- Niacinamide
- Cyclomethicone
- Phenoxyethanol
- Allantoin
- -
- i

Matched Results:
Ingredient                                                    =
Type                                                Antioxidant
Purpose                Preserves the formulation, protects skin
Skin Type                                        All skin types
vectors       [-0.0789611, 0.04086839, -0.007252005, -0.0040...
Name: 37, dtype: object
Ingredient                     Exe tent=: Light Liquid Paraffin
Type                                                  Emollient
Purpose                              Moisturizing, skin barrier
Skin Type                                        All skin types
vectors       [-0.01