# -*- coding: utf-8 -*-
"""
Final_Project1.ipynb


# Analysis of Cancer-Causing Ingredients in Food Products Through Barcode Scanning

### Author: Rakshit Kapoor

# Project Setup
This notebook analyzes cancer-causing ingredients in food products using OCR, barcode scanning, and machine learning. It integrates Raspberry Pi for portable scanning, connects to local and external databases, and employs advanced NLP for preprocessing. Designed for Google Colab, it includes robust error handling and visualization.
"""

# Execution Instructions <br>
Image Upload: Use the file upload prompt at the end to provide an image if PiCamera fails.<br>
Dataset Path: Update /content/cancer-ocr-repo/sample_image.jpg with a valid image or dataset path.<br>
Raspberry Pi: Run on a Pi with Picamera2 installed for live capture; otherwise, use the fallback image.<br>
API Key: For external APIs, replace with a valid key if required by the service. <br><br><br>
This enhanced notebook is ready for Colab execution, offering a robust, portable, and scalable solution for analyzing cancer-causing ingredients in food products.


In [None]:
!git clone https://github.com/radhakrishnan-omotec/cancer-ocr-repo.git

# Step 1: Install Required Libraries
This step installs all necessary libraries for OCR, image processing, machine learning, visualization, NLP, HTTP requests, and Raspberry Pi camera support.

!pip install pytesseract opencv-python openpyxl pyzbar scikit-learn matplotlib seaborn numpy pandas nltk requests picamera2
!sudo apt-get install tesseract-ocr zbar-tools libzbar0

In [None]:
# Step 2: Import Necessary Libraries
Imports libraries for image processing, OCR, ML, visualization, NLP, HTTP requests, and Raspberry Pi integration.

import cv2
import pytesseract
import numpy as np
import pandas as pd
import openpyxl
from pyzbar.pyzbar import decode
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import json
import nltk
from nltk.tokenize import word_tokenize
import requests
from picamera2 import Picamera2
import os
import sys
import time
from datetime import datetime

# Fetch the tokenizer models and stop-word list used by the NLP step.
# Recent NLTK releases load word_tokenize's data from 'punkt_tab' rather than
# 'punkt', so both are requested to keep old and new NLTK versions working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Step 3: Configure Pytesseract for OCR
Configures Pytesseract with the default Colab Tesseract path for text extraction.

# Prefer whichever tesseract binary is on PATH; fall back to the original
# hard-coded location when the lookup finds nothing.
import shutil

pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or '/usr/bin/tesseract'

In [None]:
# Step 4: Load Carcinogen & Risk Database
Initializes a mock database of harmful ingredients with a function to load it.

# Mock risk database: ingredient name -> risk description.
harmful_ingredients = dict([
    ("Red 40", "High Cancer Risk"),
    ("Yellow 5", "Moderate Cancer Risk"),
    ("Aspartame", "High Neurological Risk"),
    ("BHA", "High Cancer Risk"),
])


def load_database():
    """Return the module-level harmful-ingredient mapping.

    The very same dict object is handed back on every call (no copy), so
    changes a caller makes to it are visible to later callers.
    """
    return harmful_ingredients

In [None]:
# Step 5: Label Extraction via OCR
Extracts text from product images using OCR with error handling.

def extract_label_text(image_path):
    """Run OCR on the image at ``image_path`` and return the recognized text.

    The image is converted to grayscale and binarized with a fixed threshold
    of 150 before being handed to Tesseract.

    Returns:
        The OCR output, the string "No text detected" when the output is
        empty or whitespace only, or None when any step fails (the error is
        printed).
    """
    try:
        frame = cv2.imread(image_path)
        if frame is None:
            # cv2.imread signals an unreadable file by returning None.
            raise ValueError("Image not found or invalid path")
        grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        binarized = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY)[1]
        ocr_output = pytesseract.image_to_string(binarized)
        if not ocr_output.strip():
            return "No text detected"
        return ocr_output
    except Exception as err:
        print(f"OCR Error: {err}")
        return None

In [None]:
# Step 6: Barcode Scanning
Decodes barcodes from images with error handling.

def scan_barcode(image_path):
    """Decode the first barcode found in the image at ``image_path``.

    Args:
        image_path: Path of the image file to scan.

    Returns:
        The payload of the first decoded barcode as a UTF-8 string, or None
        when no barcode is found or any step fails (the error is printed).
    """
    try:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise ValueError("Image not found or invalid path")
        barcodes = decode(image)
        if not barcodes:
            return None
        # Use the first decoded result. The previous code referenced an
        # undefined name `barcode`, so every successful scan raised NameError
        # and was reported as a failure.
        return barcodes[0].data.decode('utf-8')
    except Exception as e:
        print(f"Barcode Scan Error: {e}")
        return None

In [None]:
# Step 7: Ingredient Text Preprocessing (Basic)
Splits OCR text into lines, lowercases them, and trims whitespace, returning the non-empty lines as the ingredient list.

def preprocess_text(text):
    """Split ``text`` into lowercase, whitespace-trimmed, non-empty lines.

    Returns:
        A list of cleaned lines; an empty list when nothing remains or when
        ``text`` cannot be processed (the error is printed).
    """
    try:
        cleaned = []
        for raw_line in text.lower().split("\n"):
            entry = raw_line.strip()
            if entry:
                cleaned.append(entry)
        return cleaned
    except Exception as err:
        print(f"Preprocessing Error: {err}")
        return []

In [None]:
# Step 8: Ingredient Matching
Matches preprocessed ingredients against the database.

def match_ingredients(ingredients):
    """Return the database entries that occur in ``ingredients``.

    Matching is case-insensitive and works on whole words, so it succeeds on
    the lowercased output of both preprocessing functions. Multi-word names
    such as "Red 40" are found whether they arrive as one entry ("red 40")
    or as adjacent tokens ("red", "40"). Punctuation is ignored. Because the
    entries are joined before matching, a multi-word name can also match
    across the boundary of two adjacent entries.

    Args:
        ingredients: Iterable of ingredient strings or tokens.

    Returns:
        Dict mapping the database's own spelling of each matched ingredient
        to its risk description; empty when nothing matches or on error
        (the error is printed).
    """
    import re  # local import keeps this cell self-contained

    def normalize(value):
        # Lowercase and keep only alphanumeric words, single-space separated.
        return " ".join(re.findall(r"[a-z0-9]+", str(value).lower()))

    try:
        database = load_database()
        # Pad with spaces so every phrase test below is a whole-word test.
        haystack = " " + " ".join(normalize(item) for item in ingredients) + " "
        flagged_ingredients = {}
        for name, risk in database.items():
            needle = normalize(name)
            if needle and f" {needle} " in haystack:
                flagged_ingredients[name] = risk
        return flagged_ingredients
    except Exception as e:
        print(f"Matching Error: {e}")
        return {}

In [None]:
# Step 9: Health Risk Classification Using ML
Trains a RandomForestClassifier on a mock dataset.

def train_ml_model():
    """Fit a RandomForestClassifier on a five-row mock ingredient dataset.

    Ingredient names are one-hot encoded (first category dropped), the rows
    are split 80/20 with a fixed seed, and accuracy plus a classification
    report for the held-out part are printed.

    Returns:
        The fitted classifier, or None when any step fails (the error is
        printed).
    """
    try:
        samples = pd.DataFrame({
            "Ingredient": ["Red 40", "Yellow 5", "Aspartame", "BHA", "Vitamin C"],
            "Risk Level": [2, 1, 2, 2, 0],
        })
        features = pd.get_dummies(samples["Ingredient"], drop_first=True)
        labels = samples["Risk Level"]
        split = train_test_split(features, labels, test_size=0.2, random_state=42)
        train_features, test_features, train_labels, test_labels = split

        classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        classifier.fit(train_features, train_labels)

        predictions = classifier.predict(test_features)
        print(f"Accuracy: {accuracy_score(test_labels, predictions):.2f}")
        report = classification_report(test_labels, predictions)
        print("Classification Report:\n", report)
        return classifier
    except Exception as err:
        print(f"ML Training Error: {err}")
        return None

In [None]:
# Step 10: Real-time Alert Generation
Generates alerts for flagged ingredients.

def generate_alert(flagged):
    """Print one timestamped alert line per entry of ``flagged``.

    Args:
        flagged: Mapping of ingredient name to risk description.

    Any failure (for example ``flagged`` not being a mapping) is printed
    rather than raised. Nothing is returned.
    """
    try:
        entries = flagged.items()
        for name, level in entries:
            stamp = datetime.now()
            print(f"⚠ ALERT [{stamp}]: {name} - {level}")
    except Exception as err:
        print(f"Alert Generation Error: {err}")

In [None]:
# Step 11: Integrate Raspberry Pi with PiCamera for Portable Scanning
Captures images using Raspberry Pi camera with error handling.

def capture_image_with_pi():
    """Capture a 1920x1080 still with the Raspberry Pi camera.

    The image is written to a timestamped JPEG under /tmp.

    Returns:
        The path of the captured file, or None when the camera is
        unavailable or the capture fails (the error is printed).
    """
    picam2 = None
    try:
        picam2 = Picamera2()
        config = picam2.create_still_configuration(main={"size": (1920, 1080)})
        picam2.configure(config)
        picam2.start()
        time.sleep(2)  # Camera adjustment time
        image_path = f"/tmp/captured_image_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jpg"
        picam2.capture_file(image_path)
        return image_path
    except Exception as e:
        print(f"PiCamera Error: {e}")
        return None
    finally:
        # Always release the camera. Previously a failure after start() left
        # the device running, which would block the next capture attempt.
        if picam2 is not None:
            for release in (picam2.stop, picam2.close):
                try:
                    release()
                except Exception as cleanup_error:
                    print(f"PiCamera Error: {cleanup_error}")

In [None]:
# Step 12: Connect to My Datasets on Priority and Then External UPC Database APIs
Fetches ingredient data from local or external sources.

def fetch_product_data(barcode, dataset_path="/content/cancer-ocr-repo/my_dataset.csv"):
    """Look up the ingredient text for ``barcode``.

    The local CSV dataset is consulted first; the external UPC lookup API is
    queried only when the dataset has no match.

    Args:
        barcode: Barcode value to look up. Falsy values return None.
        dataset_path: CSV file with ``barcode`` and ``ingredients`` columns.

    Returns:
        The ingredient text, an empty string when the API answers without an
        ingredients field, or None when nothing is found or an error occurs
        (the error is printed).
    """
    try:
        if not barcode:
            return None
        code = str(barcode).strip()

        if os.path.exists(dataset_path):
            # Read barcodes as text: letting pandas infer integers drops
            # leading zeros and makes comparison with a string always False.
            my_dataset = pd.read_csv(dataset_path, dtype={"barcode": str})
            known_codes = my_dataset["barcode"].astype(str).str.strip()
            my_match = my_dataset[known_codes == code]
            if not my_match.empty:
                return my_match["ingredients"].iloc[0]

        external_api_url = "https://api.upcitemdb.com/prod/trial/lookup"
        # Pass the code via params so requests URL-encodes it.
        response = requests.get(external_api_url, params={"upc": code}, timeout=10)
        if response.status_code == 200:
            # The "items" key may be missing or an empty list; fall back to
            # an empty record instead of raising IndexError.
            items = response.json().get("items") or [{}]
            return items[0].get("ingredients", "")
        return None
    except Exception as e:
        print(f"Data Fetch Error: {e}")
        return None

In [None]:
# Step 13: Advanced NLP for Ingredient Preprocessing
Tokenizes the lowercased text with NLTK's word tokenizer and removes English stop words.

# Make sure the tokenizer data and stop-word list are available. Recent NLTK
# releases load word_tokenize's data from 'punkt_tab' rather than 'punkt';
# without it tokenization raises LookupError, which the function below would
# swallow and report as an empty ingredient list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


def advanced_preprocess_text(text):
    """Tokenize ``text`` and drop English stop words.

    The text is lowercased, split with NLTK's word tokenizer, and every
    non-empty token that is not an English stop word is kept.

    Returns:
        List of remaining tokens; an empty list when nothing remains or when
        processing fails (the error is printed).
    """
    try:
        tokens = word_tokenize(text.lower())
        stop_words = set(nltk.corpus.stopwords.words('english'))
        return [
            token.strip()
            for token in tokens
            if token.strip() and token not in stop_words
        ]
    except Exception as e:
        print(f"Advanced NLP Error: {e}")
        return []

In [None]:
# Step 14: Expand and Validate the Chronic Disease Causants Database
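Adds further entries to the harmful-ingredient database. Validation against an authoritative source (e.g., FDA data) is noted as a future step and is not implemented yet.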

def expand_database():
    """Add two extra entries to the database returned by ``load_database``.

    The mapping obtained from ``load_database()`` is updated in place and
    returned. If anything fails, the error is printed and whatever
    ``load_database()`` returns is handed back instead.
    """
    try:
        additions = (
            ("Tartrazine", "High Cancer Risk"),
            ("Propyl Gallate", "Moderate Cancer Risk"),
        )
        database = load_database()
        for name, risk in additions:
            database[name] = risk
        # Validation logic (e.g., cross-check with FDA data) could be added here
        print("Database expanded with validation suggestion applied.")
        return database
    except Exception as err:
        print(f"Database Expansion Error: {err}")
        return load_database()

In [None]:
# Step 15: Robust Error Handling in the Pipeline
Implements a decorator for error management.

def handle_errors(func):
    """Decorator that turns any exception raised by ``func`` into ``None``.

    The wrapped callable forwards all arguments to ``func``. If ``func``
    raises, a timestamped message naming the function is printed and None is
    returned instead of propagating the exception.

    Args:
        func: Callable to protect.

    Returns:
        A wrapper with the same name and docstring as ``func``.
    """
    import functools  # local import keeps this cell self-contained

    # wraps() preserves __name__/__doc__, so error messages still name the
    # real function even when a function is wrapped more than once.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print(f"Error in {func.__name__} at {datetime.now()}: {e}")
            return None
    return wrapper

def visualize_risks(flagged):
    """Plot how many flagged ingredients fall into each risk category.

    This function was wrapped and called by the pipeline but never defined,
    which raised NameError as soon as this cell ran.

    Args:
        flagged: Mapping of ingredient name to risk description, as returned
            by ``match_ingredients``. When empty or None, a message is
            printed and no chart is drawn.
    """
    if not flagged:
        print("No flagged ingredients to visualize.")
        return
    risk_counts = pd.Series(list(flagged.values())).value_counts()
    plt.figure(figsize=(8, 4))
    sns.barplot(x=risk_counts.index.tolist(), y=risk_counts.values)
    plt.title("Flagged Ingredients by Risk Category")
    plt.xlabel("Risk Category")
    plt.ylabel("Number of Ingredients")
    plt.tight_layout()
    plt.show()


# Apply error handling: rebind every pipeline step to a wrapped version that
# prints the failure and returns None instead of raising.
extract_label_text = handle_errors(extract_label_text)
scan_barcode = handle_errors(scan_barcode)
fetch_product_data = handle_errors(fetch_product_data)
preprocess_text = handle_errors(preprocess_text)
match_ingredients = handle_errors(match_ingredients)
train_ml_model = handle_errors(train_ml_model)
generate_alert = handle_errors(generate_alert)
capture_image_with_pi = handle_errors(capture_image_with_pi)
advanced_preprocess_text = handle_errors(advanced_preprocess_text)
expand_database = handle_errors(expand_database)
visualize_risks = handle_errors(visualize_risks)

In [None]:
# Step 16: Full Execution Pipeline
Integrates all steps into a cohesive workflow with fallback options.

def main_pipeline(image_path=None):
    """Run the full scan: acquire image, OCR, barcode, match, alert, plot.

    Args:
        image_path: Optional path of an image to analyze. When given and the
            file exists, it is used directly. Otherwise the Raspberry Pi
            camera is tried first and the bundled sample image second.

    Progress is printed at every step. Returns None; returns early when no
    usable image can be found.
    """
    print(f"Starting Pipeline at {datetime.now()}")

    if image_path and os.path.exists(image_path):
        print(f"Using provided image: {image_path}")
    else:
        # Attempt Raspberry Pi image capture
        image_path = capture_image_with_pi()
        if not image_path:
            print("Falling back to static image...")
            image_path = "/content/cancer-ocr-repo/sample_image.jpg"  # Update with valid path
            if not os.path.exists(image_path):
                print("Static image not found. Please upload an image.")
                return

    print("Extracting label text...")
    label_text = extract_label_text(image_path)
    print(f"Extracted Text: {label_text}")

    print("Scanning barcode...")
    barcode = scan_barcode(image_path)
    print(f"Barcode Data: {barcode}")

    print("Fetching product data...")
    # Database/API ingredients take priority; OCR text is the fallback.
    ingredients_text = fetch_product_data(barcode) or label_text
    print(f"Ingredients Text: {ingredients_text}")

    print("Advanced preprocessing text...")
    ingredients = advanced_preprocess_text(ingredients_text)
    print(f"Processed Ingredients: {ingredients}")

    print("Expanding and loading database...")
    database = expand_database()
    print(f"Updated Database: {database}")

    print("Matching ingredients against database...")
    flagged = match_ingredients(ingredients)
    print(f"Flagged Ingredients: {flagged}")

    print("Generating alerts...")
    generate_alert(flagged)

    print("Training ML model...")
    train_ml_model()

    print("Visualizing risks...")
    visualize_risks(flagged)

    print(f"Pipeline completed at {datetime.now()}")


# Run the Pipeline
if __name__ == "__main__":
    main_pipeline()

# Upload an image if needed
try:
    from google.colab import files
except ImportError:
    # Not running in Colab (e.g. on the Raspberry Pi): skip the upload prompt
    # instead of crashing at import time.
    files = None
    print("google.colab is unavailable; skipping the upload prompt.")

if files is not None:
    uploaded = files.upload()
    if uploaded:
        image_path = next(iter(uploaded))
        # Pass the uploaded file explicitly. Previously the path was stored
        # in a global that main_pipeline never read, so the upload was ignored.
        main_pipeline(image_path)