<a href="https://colab.research.google.com/github/saitejasri1/Shared-ML-project/blob/final-from-prit/merged.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the model, image and dataset paths
# used further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install paddlepaddle-gpu==2.3.0 -i https://mirror.baidu.com/pypi/simple
!pip install paddleocr
# git clone https://github.com/PaddlePaddle/PaddleOCR
!pip install PaddleOCR
!pip install langdetect
!pip install paddlepaddle

# googletrans: used to translate non-English OCR text to English (source language is auto-detected)

!pip install googletrans==4.0.0-rc1

In [None]:
!pip install --upgrade tensorflow

In [None]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.utils import *
from tensorflow.keras import *
from tensorflow.keras.optimizers import *
from tensorflow.keras import models
import os
from tensorflow.keras.preprocessing.image import *
from sklearn.metrics import *
from paddleocr import PaddleOCR
from PIL import Image
import numpy as np
from googletrans import Translator
from langdetect import detect
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Silence ppocr's log output below ERROR. A single call is enough: the level
# set last is the only one that takes effect.
logging.getLogger("ppocr").setLevel(logging.ERROR)

def process_images_and_detect_ingredients(N_ensemble, N_OCR, ensemble_image_paths, ocr_image_paths,
                                          class_names=None):
    """
    Classify images with the CNN ensemble and detect ingredients using PaddleOCR.

    Args:
    N_ensemble (int): Accepted for interface compatibility; currently unused,
        the ensemble pass runs once over ensemble_image_paths.
    N_OCR (int): Number of times to repeat the OCR pass over ocr_image_paths.
    ensemble_image_paths (list): List of paths to the images for Ensemble model.
    ocr_image_paths (list): List of paths to the images for PaddleOCR.
    class_names (sequence, optional): Labels indexed by the ensemble's predicted
        class id. When omitted, falls back to ``test.class_names``.

    Returns:
    tuple: A tuple containing:
        - str: Comma-separated predicted labels, one per ensemble image.
        - list: One inner list per OCR repetition. Each inner list holds, for
          every OCR image, a comma-separated string of the detected ingredients
          (an empty string when none were found).
    """
    # Ensemble model
    def process_images(N, image_paths):
        # N is intentionally unused (see the outer docstring).
        if not image_paths:
            return ''

        # NOTE(review): `test` is not defined in this file; presumably it is a
        # dataset object created in another notebook cell. Pass `class_names`
        # explicitly to avoid depending on it.
        labels = class_names if class_names is not None else test.class_names

        # Load pre-trained models
        vgg16_model = load_model('/content/drive/MyDrive/do-not-delete/ML-Project/VGG16.keras')
        # The original loaded InceptionV3.keras here as well, so the "ensemble"
        # contained the same model twice.
        # NOTE(review): assumes DenseNet121.keras sits next to the other
        # checkpoints - confirm the file name on Drive.
        densenet121_model = load_model('/content/drive/MyDrive/do-not-delete/ML-Project/DenseNet121.keras')
        inceptionv3_model = load_model('/content/drive/MyDrive/do-not-delete/ML-Project/InceptionV3.keras')
        ensemble_models = (vgg16_model, densenet121_model, inceptionv3_model)

        results = []
        for path in image_paths:
            raw_bytes = tf.io.read_file(path)
            # expand_animations=False guarantees a single 3-D frame, even for
            # GIF input, so the resize below always sees (H, W, 3).
            image = tf.image.decode_image(raw_bytes, channels=3, expand_animations=False)

            image = tf.image.resize(image, size=(224, 224))
            image = image / 255.0
            image = tf.expand_dims(image, axis=0)

            # Max-voting: keep, per class, the highest probability any model gave.
            per_model_probs = [model.predict(image) for model in ensemble_models]
            ensemble_pred = np.maximum.reduce(per_model_probs)

            predicted_label = int(np.argmax(ensemble_pred))
            results.append(labels[predicted_label])

        return ','.join(results)  # Join all output labels with commas and return

    # PaddleOCR model
    def detect_ingredients_from_images(N, image_paths):
        # Imported locally so the block does not depend on a file-level import.
        from langdetect.lang_detect_exception import LangDetectException

        # Setup OCR model with English language
        ocr_model = PaddleOCR(lang='en')

        # Define the list of ingredients.
        # NOTE(review): multi-word names are stored without spaces (e.g.
        # 'oliveoil'), so they only match when OCR emits them as one token.
        ingredients = {'beans', 'salt', 'butter', 'sugar', 'onion', 'water', 'eggs', 'oliveoil', 'flour', 'milk',
                       'garliccloves', 'pepper', 'brownsugar', 'garlic', 'all-purposeflour', 'bakingpowder', 'egg',
                       'saltandpepper', 'parmesancheese', 'lemonjuice', 'bakingsoda', 'vegetableoil', 'vanilla',
                       'blackpepper', 'cinnamon', 'tomatoes', 'sourcream', 'garlicpowder', 'vanillaextract', 'oil',
                       'honey', 'onions', 'creamcheese', 'garlicclove', 'celery', 'cheddarcheese', 'unsaltedbutter',
                       'soysauce'}

        # Created lazily and reused: only needed when non-English text shows up.
        translator = None

        all_detected_ingredients = []

        for _ in range(N):
            detected_ingredients_iteration = []

            for image_path in image_paths:
                # Always hand OCR a 3-channel RGB array, whatever the source
                # mode (RGBA, palette, ...), and close the file promptly.
                # NOTE(review): HEIC files presumably need a Pillow HEIF plugin
                # registered elsewhere for Image.open to succeed - confirm.
                with Image.open(image_path) as pil_image:
                    image_np = np.array(pil_image.convert('RGB'))

                # Perform OCR on the image
                result = ocr_model.ocr(image_np)

                # Extract text from OCR result. Both the result and its
                # per-image entries may be None when nothing was recognised.
                words = []
                for line in result or []:
                    for word in line or []:
                        words.append(word[1][0])
                text = ' '.join(words)

                if not text:
                    print("No text detected in image:", image_path)
                else:
                    # langdetect raises when the text has no usable features
                    # (e.g. digits only); treat that as "nothing to translate".
                    try:
                        needs_translation = detect(text) != 'en'
                    except LangDetectException:
                        needs_translation = False

                    if needs_translation:
                        if translator is None:
                            translator = Translator()
                        text = translator.translate(text, src='auto', dest='en').text

                # Convert text to lowercase and split into words
                tokens = text.lower().split()

                # Keep only the tokens that are known ingredients; sort so the
                # output does not depend on set iteration order.
                detected_ingredients = sorted(ingredients.intersection(tokens))

                detected_ingredients_iteration.append(','.join(detected_ingredients))

            all_detected_ingredients.append(detected_ingredients_iteration)

        return all_detected_ingredients

    # Execute Ensemble model
    output_Ensemble = process_images(N_ensemble, ensemble_image_paths)

    # Execute PaddleOCR model
    detected_ingredients = detect_ingredients_from_images(N_OCR, ocr_image_paths)

    return output_Ensemble, detected_ingredients

# RecipeNLG model
def load_data(filepath):
    """Load the recipe CSV and parse its 'ingredients' column into Python lists.

    Args:
    filepath (str): Path to a CSV file that has an 'ingredients' column whose
        cells are Python list literals stored as text (e.g. "['salt', 'milk']").

    Returns:
    pandas.DataFrame: The loaded data with 'ingredients' converted to lists.
        Cells that are not strings (e.g. NaN for an empty cell) become [].

    Raises:
    ValueError / SyntaxError: If a cell is not a valid Python literal.
    """
    # Local import keeps this block self-contained.
    import ast

    def _parse_ingredients(cell):
        # literal_eval only accepts literals, so arbitrary code embedded in
        # the CSV is rejected instead of executed (unlike eval).
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return []

    data = pd.read_csv(filepath)
    data['ingredients'] = data['ingredients'].apply(_parse_ingredients)
    return data

def build_tfidf_model(data):
    """Fit a TF-IDF model on the already-tokenised ingredient lists.

    Args:
    data: Object whose 'ingredients' entry is an iterable of token lists
        (one list of ingredient names per recipe).

    Returns:
    tuple: (fitted TfidfVectorizer, TF-IDF matrix with one row per recipe).
    """
    tfidf_vectorizer = TfidfVectorizer(
        # Documents are lists of ingredient names already, so the tokenizer
        # just hands them back unchanged.
        tokenizer=lambda tokens: tokens,
        # Lowercasing would be applied to the raw document, which is a list here.
        lowercase=False,
        # token_pattern is ignored when a tokenizer is given; None silences
        # scikit-learn's warning about the unused parameter.
        token_pattern=None,
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['ingredients'])
    return tfidf_vectorizer, tfidf_matrix

def find_similar_recipes(user_input, tfidf_vectorizer, tfidf_matrix, data, top_n=20):
    """Return the recipes whose ingredient lists are most similar to the query.

    Args:
    user_input (str or iterable of str): Either a comma-separated string of
        ingredient names or an iterable of ingredient names.
    tfidf_vectorizer: Fitted vectorizer that accepts a pre-tokenised document
        (a list of tokens), such as one built with an identity tokenizer.
    tfidf_matrix: TF-IDF matrix of the recipes, rows aligned with `data`.
    data (pandas.DataFrame): Recipes; must contain the 'title', 'ingredients',
        'directions' and 'link' columns.
    top_n (int): Maximum number of recipes to return (default 20).

    Returns:
    pandas.DataFrame: Up to `top_n` rows of `data`, most similar first.
    """
    # The vectorizer consumes a list of tokens. Feeding it a raw string would
    # make it iterate character by character, so split the string first.
    if isinstance(user_input, str):
        query_tokens = [token.strip() for token in user_input.split(',') if token.strip()]
    else:
        query_tokens = list(user_input)

    # Transform user input using the same tfidf vectorizer
    user_tfidf = tfidf_vectorizer.transform([query_tokens])
    # Cosine similarity between the query and every recipe (single query row)
    similarities = cosine_similarity(user_tfidf, tfidf_matrix)[0]
    # Recipe indices ordered by descending cosine similarity
    ranked_indices = similarities.argsort()[::-1][:top_n]
    return data.iloc[ranked_indices][['title', 'ingredients', 'directions', 'link']]

# Main execution flow
if __name__ == "__main__":
    # Paths to images for Ensemble model
    ensemble_image_paths = [
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/Ensemble/1.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/Ensemble/2.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/Ensemble/3.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/Ensemble/4.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/Ensemble/5.jpg",
    ]

    # Paths to images for PaddleOCR model.
    # The original literal had two consecutive commas before the last entry,
    # which is a SyntaxError.
    # NOTE(review): 2.jpg is listed four times - confirm whether 3.jpg-5.jpg
    # were intended.
    ocr_image_paths = [
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/for_OCR/1.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/for_OCR/2.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/for_OCR/2.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/for_OCR/2.jpg",
        "/content/drive/MyDrive/do-not-delete/ML-Project/Images/for_OCR/2.jpg",
    ]

    # Execute Ensemble and PaddleOCR models
    output_Ensemble, detected_ingredients = process_images_and_detect_ingredients(
        1, 1, ensemble_image_paths, ocr_image_paths)

    # Load the dataset for RecipeNLG
    filepath = '/content/drive/MyDrive/do-not-delete/ML-Project/RecipeNLG/RecipeNLG-all.csv'
    data = load_data(filepath)

    # Build the TF-IDF model and matrix
    tfidf_vectorizer, tfidf_matrix = build_tfidf_model(data)

    # Merge the Ensemble labels with the OCR ingredients into one flat list of
    # keywords. The OCR result is nested (one list per repetition, one
    # comma-separated string per image), so it cannot be str.join-ed directly.
    ensemble_keywords = output_Ensemble.split(',')
    ocr_keywords = [
        keyword
        for per_run in detected_ingredients
        for per_image in ([per_run] if isinstance(per_run, str) else per_run)
        for keyword in per_image.split(',')
    ]
    # Drop empty entries and duplicates while keeping first-seen order. A list
    # of tokens is what the identity-tokenizer TF-IDF model expects as a document.
    user_input = list(dict.fromkeys(
        keyword.strip() for keyword in ensemble_keywords + ocr_keywords if keyword.strip()))

    # Find similar recipes
    similar_recipes = find_similar_recipes(user_input, tfidf_vectorizer, tfidf_matrix, data)
    print("Recommended Recipes Based on Your Ingredients:")
    print(similar_recipes)
