<a href="https://colab.research.google.com/github/radhakrishnan-omotec/cancer-ocr-repo/blob/main/Rakshit_Kapoor_Project3_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis of Cancer-Causing Ingredients in Food Products Through Barcode Scanning

### Author: Rakshit Kapoor

# Project Setup

In [None]:
!git clone https://github.com/radhakrishnan-omotec/cancer-ocr-repo.git

Cloning into 'cancer-ocr-repo'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 30 (delta 7), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 25.42 MiB | 8.24 MiB/s, done.
Resolving deltas: 100% (7/7), done.


# Step 1: Install Required Libraries

In [None]:
# Step 1: Install Required Libraries (Run in Google Colab)
!pip install pytesseract opencv-python openpyxl pyzbar scikit-learn matplotlib seaborn



Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pyzbar
  Downloading pyzbar-0.1.9-py2.py3-none-any.whl.metadata (10 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pyzbar-0.1.9-py2.py3-none-any.whl (32 kB)
Installing collected packages: pyzbar, pytesseract
Successfully installed pytesseract-0.3.13 pyzbar-0.1.9


In [None]:
!sudo apt-get install zbar-tools

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-droid-fallback fonts-noto-mono fonts-urw-base35 ghostscript gsfonts
  imagemagick-6-common libdjvulibre-text libdjvulibre21 libfftw3-double3
  libgs9 libgs9-common libidn12 libijs-0.35 libjbig2dec0 libjxr-tools libjxr0
  liblqr-1-0 libmagickcore-6.q16-6 libmagickcore-6.q16-6-extra
  libmagickwand-6.q16-6 libv4l-0 libv4lconvert0 libwmflite-0.2-7 libzbar0
  poppler-data
Suggested packages:
  fonts-noto fonts-freefont-otf | fonts-freefont-ttf fonts-texgyre
  ghostscript-x libfftw3-bin libfftw3-dev inkscape poppler-utils
  fonts-japanese-mincho | fonts-ipafont-mincho fonts-japanese-gothic
  | fonts-ipafont-gothic fonts-arphic-ukai fonts-arphic-uming fonts-nanum
  zbarcam-gtk zbarcam-qt
The following NEW packages will be installed:
  fonts-droid-fallback fonts-noto-mono fonts-urw-base35 ghostscript gsfonts
  imagemagick-6-common libdjv

# Step 2: Import Necessary Libraries


In [None]:
# Step 2: Import Necessary Libraries
import json
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import pytesseract
import seaborn as sns
from pyzbar.pyzbar import decode
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Step 3: Configure Pytesseract for OCR

In [None]:
# Step 3: Configure Pytesseract for OCR
# Tell the pytesseract wrapper which Tesseract executable to invoke.
# NOTE(review): this assumes the binary lives at /usr/bin/tesseract; the setup
# cells visible here install the pytesseract wrapper and zbar-tools only, so
# confirm that Tesseract itself is present in the runtime.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'  # Modify if necessary


# Step 4: Load Carcinogen & Risk Database

In [None]:
# Step 4: Load Carcinogen & Risk Database (Mock Data for Example)
# Lookup table mapping an ingredient name to a free-text risk description.
# Keys keep their capitalisation (e.g. "Red 40", "BHA"), so a plain `in` / `[]`
# lookup against this dict is case-sensitive.
harmful_ingredients = {
    "Red 40": "High Cancer Risk",
    "Yellow 5": "Moderate Cancer Risk",
    "Aspartame": "High Neurological Risk",
    "BHA": "High Cancer Risk"
}

def load_database():
    """Return the ingredient-risk lookup table.

    Returns:
        The module-level ``harmful_ingredients`` dict itself (the same object,
        not a copy), mapping ingredient name to risk description.
    """
    database = harmful_ingredients
    return database

# Step 5: Label Extraction via OCR


In [None]:
# Step 5: Label Extraction via OCR
def extract_label_text(image_path):
    """Run OCR on a product-label image and return the recognised text.

    The image is loaded with OpenCV, converted to grayscale, binarised with a
    fixed threshold of 150 and then passed to Tesseract.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the
        thresholded image.

    Raises:
        FileNotFoundError: If OpenCV cannot load ``image_path`` (the file is
            missing or is not a decodable image).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising, which
        # would otherwise surface as an opaque assertion inside cvtColor.
        raise FileNotFoundError(
            f"Could not read image {image_path!r}: file is missing or not a decodable image"
        )
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Fixed global threshold: pixels brighter than 150 become white, the rest black.
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    text = pytesseract.image_to_string(thresh)
    return text

# Step 6: Barcode Scanning


In [None]:
# Step 6: Barcode Scanning
def scan_barcode(image_path):
    """Decode the first barcode found in an image.

    Args:
        image_path: Path of the image file to scan.

    Returns:
        The payload of the first barcode reported by ``pyzbar`` decoded as a
        UTF-8 string, or ``None`` when no barcode is detected.

    Raises:
        FileNotFoundError: If OpenCV cannot load ``image_path`` (the file is
            missing or is not a decodable image).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None on failure; passing that to pyzbar would fail
        # with an unrelated-looking error, so report the real cause here.
        raise FileNotFoundError(
            f"Could not read image {image_path!r}: file is missing or not a decodable image"
        )
    barcodes = decode(image)
    if not barcodes:
        return None
    # Only the first detected barcode is used; any further ones are ignored.
    return barcodes[0].data.decode('utf-8')

# Step 7: Ingredient Text Preprocessing

In [None]:
# Step 7: Ingredient Text Preprocessing
def preprocess_text(text):
    """Split OCR text into cleaned, lower-cased lines.

    Args:
        text: Raw text, with entries separated by newline characters.

    Returns:
        A list with one lower-cased, whitespace-stripped string per non-blank
        line of ``text``, in the original order.
    """
    cleaned = []
    for line in text.lower().split("\n"):
        stripped = line.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned

# Step 8: Ingredient Matching


In [None]:
# Step 8: Ingredient Matching
def match_ingredients(ingredients, database=None):
    """Look up ingredients in the risk database, ignoring case and padding.

    ``preprocess_text`` lower-cases its output while the database keys are
    capitalised (e.g. "Red 40"), so an exact lookup never matched pipeline
    input. Names are therefore also compared after stripping surrounding
    whitespace and lower-casing.

    Args:
        ingredients: Iterable of ingredient names.
        database: Optional mapping of ingredient name to risk description.
            When omitted, the mapping returned by ``load_database()`` is used.

    Returns:
        A dict mapping each matched ingredient to its risk description. An
        ingredient that is an exact key of the database is returned under that
        key; one that only matches case-insensitively is returned under the
        database's own spelling of the name.
    """
    if database is None:
        database = load_database()

    # Normalised name -> database key; the first key wins if two keys collide.
    canonical_names = {}
    for name in database:
        if isinstance(name, str):
            canonical_names.setdefault(name.strip().lower(), name)

    flagged_ingredients = {}
    for ingredient in ingredients:
        if ingredient in database:
            # Exact hit: identical to the original behaviour.
            flagged_ingredients[ingredient] = database[ingredient]
            continue
        if isinstance(ingredient, str):
            canonical = canonical_names.get(ingredient.strip().lower())
            if canonical is not None:
                flagged_ingredients[canonical] = database[canonical]
    return flagged_ingredients


# Step 9: Health Risk Classification Using ML

In [None]:
# Step 9: Health Risk Classification Using ML (Mock Model Training)
def train_ml_model():
    """Train a mock random-forest risk classifier and print its test accuracy.

    A five-row in-memory dataset is one-hot encoded by ingredient name, split
    80/20 with a fixed seed, and used to fit a ``RandomForestClassifier``. The
    forest is seeded as well, so repeated runs give the same model and the same
    printed accuracy.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Mock dataset
    data = pd.DataFrame({
        "Ingredient": ["Red 40", "Yellow 5", "Aspartame", "BHA", "Vitamin C"],
        "Risk Level": [2, 1, 2, 2, 0]
    })
    # drop_first=True removes the first dummy column, so one ingredient is
    # represented by an all-zero row.
    X = pd.get_dummies(data["Ingredient"], drop_first=True)
    y = data["Risk Level"]
    # With five rows and test_size=0.2 the test split holds a single sample.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Seed the forest too; previously only the split was reproducible.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    return model

# Step 10: Real-time Alert Generation

In [None]:
# Step 10: Real-time Alert Generation
def generate_alert(flagged):
    """Print one alert line for every flagged ingredient.

    Args:
        flagged: Mapping of ingredient name to risk description.
    """
    for entry in flagged.items():
        ingredient, risk = entry
        print(f"⚠ ALERT: {ingredient} - {risk}")

# Step 11: Data Visualization


In [None]:
# Step 11: Data Visualization
def visualize_risks(flagged):
    """Draw one bar per flagged ingredient, coloured by its risk description.

    The risk descriptions are free text, so they are shown through bar colour
    (``hue``) instead of bar height; every flagged ingredient gets a bar of
    height 1. The previous version used the row index as the height, which gave
    the first ingredient a zero-height, invisible bar.

    Args:
        flagged: Mapping of ingredient name to risk description.

    Returns:
        None. Shows the figure, or prints a short notice when ``flagged`` is
        empty and there is nothing to draw.
    """
    if not flagged:
        # Skip plotting entirely so no empty figure is opened.
        print("No flagged ingredients to visualize.")
        return
    df = pd.DataFrame(list(flagged.items()), columns=["Ingredient", "Risk Level"])
    # Constant height: presence of the ingredient, not a measured quantity.
    df["Flagged"] = 1
    plt.figure(figsize=(8, 5))
    sns.barplot(x=df["Ingredient"], y=df["Flagged"], hue=df["Risk Level"], dodge=False)
    plt.title("Detected Risk Levels")
    plt.show()


# Step 12: Full Execution Pipeline

In [None]:
# Step 12: Full Execution Pipeline
def main_pipeline(image_path):
    """Run every pipeline stage on one image, printing progress as it goes.

    Stages, in order: OCR of the label, barcode scan, text preprocessing,
    database matching, alert printing, risk visualisation and mock model
    training.

    Args:
        image_path: Path of the product image passed to the OCR and barcode
            steps.
    """
    # OCR
    print("Extracting label text...")
    ocr_text = extract_label_text(image_path)
    print("Extracted Text:", ocr_text)

    # Barcode
    print("Scanning barcode...")
    barcode_value = scan_barcode(image_path)
    print("Barcode Data:", barcode_value)

    # Clean-up of the OCR output
    print("Preprocessing text...")
    ingredient_lines = preprocess_text(ocr_text)
    print("Processed Ingredients:", ingredient_lines)

    # Database lookup
    print("Matching ingredients against database...")
    flagged_items = match_ingredients(ingredient_lines)
    print("Flagged Ingredients:", flagged_items)

    # Reporting
    print("Generating alerts...")
    generate_alert(flagged_items)

    print("Visualizing risks...")
    visualize_risks(flagged_items)

    # The trained model is not used afterwards; training is run for its printed accuracy.
    print("Training ML model...")
    train_ml_model()

# Run the Pipeline with an Image
test_image = "path/to/your/image.jpg"  # Change to the Github Image Files

# The value above is a placeholder. Only start the pipeline once it points at a
# real file, so a fresh "Run All" ends with a clear message instead of a traceback.
if os.path.isfile(test_image):
    main_pipeline(test_image)
else:
    print(f"Image not found: {test_image!r}. Set test_image to an existing image file and re-run this cell.")