# Cancer-Causing Ingredient Detection in Food Products - OCR & Barcode Scanning
This notebook implements a full pipeline for detecting harmful food ingredients using OCR, barcode scanning, and machine learning.

### Execution Instructions for Final_Project2.ipynb 🚀

In [None]:
!pip install pytesseract opencv-python openpyxl pyzbar scikit-learn matplotlib seaborn requests nltk picamera2
!sudo apt-get install zbar-tools

In [None]:
import json
import os
import time

import cv2
import matplotlib.pyplot as plt
import nltk
import numpy as np
import openpyxl
import pandas as pd
import pytesseract
import requests
import seaborn as sns
from nltk.tokenize import word_tokenize
from picamera2 import Picamera2
from pyzbar.pyzbar import decode
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load the
# 'punkt' resource, newer ones load 'punkt_tab', so request both; a resource that
# the installed NLTK version does not know is reported by nltk.download and skipped.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

In [None]:
# Tell pytesseract which Tesseract executable to run. The absolute path is hard-coded.
# NOTE(review): adjust it if Tesseract is installed elsewhere on the target machine.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

In [None]:
# Lookup table of flagged ingredients: display name -> risk label.
# The insertion order matters: train_ml_model() pairs list(harmful_ingredients.keys())
# positionally with a fixed 8-element 'Risk Level' list, so adding, removing or
# reordering entries here requires updating that list as well.
harmful_ingredients = {
    'Red 40': 'High Cancer Risk',
    'Yellow 5': 'Moderate Cancer Risk',
    'Aspartame': 'High Neurological Risk',
    'BHA': 'High Cancer Risk',
    'Sodium Nitrite': 'High Cancer Risk',
    'Trans Fats': 'Cardiovascular Risk',
    'BPA': 'Endocrine Disruptor',
    'MSG': 'Neurological Disruptor'
}

def load_database():
    """Return the ingredient-risk lookup table.

    Returns:
        The module-level ``harmful_ingredients`` dict itself (not a copy), mapping
        ingredient name to risk label.
    """
    database = harmful_ingredients
    return database

In [None]:
def extract_label_text(image_path, threshold=150):
    """Run OCR on a product-label image and return the recognised text.

    The image is converted to grayscale and binarised with a fixed global
    threshold before being passed to Tesseract.

    Args:
        image_path: Path of the image file to read.
        threshold: Grayscale cut-off given to ``cv2.threshold`` with
            ``THRESH_BINARY``; pixels above it become 255, all others 0.
            Defaults to 150, the value that was previously hard-coded.

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Raises:
        FileNotFoundError: If OpenCV cannot load ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; fail here with a message that names the path.
        raise FileNotFoundError(f'Could not read image: {image_path!r}')
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    return pytesseract.image_to_string(binary)

In [None]:
def scan_barcode(image_path):
    """Decode the first barcode found in an image.

    Args:
        image_path: Path of the image file to scan.

    Returns:
        The payload of the first barcode returned by ``pyzbar`` decoded as
        UTF-8 text, or ``None`` when no barcode is detected.

    Raises:
        FileNotFoundError: If OpenCV cannot load ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None for a missing or undecodable file; passing
        # that on to pyzbar would fail with an unrelated-looking error.
        raise FileNotFoundError(f'Could not read image: {image_path!r}')
    barcodes = decode(image)
    if not barcodes:
        return None
    # Only the first detected barcode is used; any others are ignored.
    return barcodes[0].data.decode('utf-8')

In [None]:
def preprocess_text(text):
    """Tokenise label text into lower-case alphanumeric tokens.

    Purely numeric tokens are kept on purpose: ingredient names in
    ``harmful_ingredients`` such as 'Red 40' and 'Yellow 5' contain a number,
    and dropping it would make them impossible to recognise downstream.

    Args:
        text: Raw text, e.g. OCR output. ``None`` or an empty string yields
            an empty list.

    Returns:
        A list of lower-case tokens consisting only of letters and/or digits;
        punctuation and mixed tokens (e.g. hyphenated ones) are discarded.
    """
    if not text:
        return []
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalnum()]

In [None]:
def _contains_phrase(tokens, phrase):
    """Return True if the list ``phrase`` occurs as a contiguous run inside ``tokens``."""
    width = len(phrase)
    return any(tokens[start:start + width] == phrase
               for start in range(len(tokens) - width + 1))


def match_ingredients(ingredients, database=None):
    """Find which known harmful ingredients occur in a list of tokens or names.

    Matching is case-insensitive. A database entry is flagged when either
    an element of ``ingredients`` equals the full entry name (e.g. 'Red 40'), or
    the words of the entry name appear as consecutive elements of
    ``ingredients`` (e.g. [..., 'sodium', 'nitrite', ...]).
    Names containing a number (e.g. 'Red 40') can only match if the number is
    present in ``ingredients``.

    Args:
        ingredients: Iterable of tokens or ingredient names; ``None`` is
            treated as empty.
        database: Optional mapping of ingredient name to risk label. Defaults
            to the result of ``load_database()``.

    Returns:
        A dict mapping each matched database name (spelled as in the database)
        to its risk label.
    """
    if database is None:
        database = load_database()
    # Normalise case and internal whitespace so comparisons are robust.
    tokens = [' '.join(str(item).lower().split()) for item in (ingredients or [])]
    token_set = set(tokens)
    flagged_ingredients = {}
    for name, risk in database.items():
        phrase = str(name).lower().split()
        if not phrase:
            continue
        if ' '.join(phrase) in token_set or _contains_phrase(tokens, phrase):
            flagged_ingredients[name] = risk
    return flagged_ingredients

In [None]:
def train_ml_model():
    """Fit a small random-forest classifier on the ingredient table and print its accuracy.

    The training table has one row per entry of ``harmful_ingredients``: the
    ingredient name (one-hot encoded, first category dropped) and a numeric
    risk level. Both the split and the classifier are seeded so repeated runs
    give the same result.

    NOTE(review): every ingredient occurs exactly once, so the one-hot columns
    of the held-out rows are never active during training; the printed accuracy
    is therefore not a meaningful quality estimate. The meaning of the levels
    1/2/3 is not stated in this notebook - confirm with the author.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    data = pd.DataFrame({
        'Ingredient': list(harmful_ingredients.keys()),
        # One level per harmful_ingredients entry, in the dict's insertion order.
        'Risk Level': [2, 1, 2, 2, 2, 2, 3, 3]
    })
    X = pd.get_dummies(data['Ingredient'], drop_first=True)
    y = data['Risk Level']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Seed the forest as well as the split; otherwise the fit varies between runs.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    return model

In [None]:
def main_pipeline(image_path):
    """Run the whole detection flow on one image and print the results.

    Steps: OCR the label text, scan for a barcode, optionally look the barcode
    up, tokenise the OCR text, print an alert for every flagged ingredient and
    finally train the demonstration model.

    Args:
        image_path: Path of the product image to analyse.

    Returns:
        None. All results are printed.
    """
    print('Extracting label text...')
    label_text = extract_label_text(image_path)
    print('Extracted Text:', label_text)
    barcode = scan_barcode(image_path)
    print('Barcode Data:', barcode)
    if barcode:
        # NOTE(review): fetch_upc_info is not defined in the visible part of this
        # notebook, so calling it directly raises NameError for every image
        # that contains a barcode. Resolve it at call time and skip the lookup
        # when it is absent; it is used normally once it has been defined.
        upc_lookup = globals().get('fetch_upc_info')
        if callable(upc_lookup):
            upc_data = upc_lookup(barcode)
            print('UPC Data:', upc_data)
        else:
            print('UPC lookup skipped: fetch_upc_info is not defined.')
    ingredients = preprocess_text(label_text)
    flagged = match_ingredients(ingredients)
    for ingredient, risk in flagged.items():
        print(f'⚠ ALERT: {ingredient} - {risk}')
    train_ml_model()

image_path = 'path/to/your/image.jpg'  # Update with actual image
# Only run when the file exists, so "Run All" does not crash while the
# placeholder path above has not been replaced yet.
if os.path.isfile(image_path):
    main_pipeline(image_path)
else:
    print(f'Image not found: {image_path!r} - set image_path to a real file and re-run this cell.')