In [2]:
import easyocr
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import cv2

# === 🧠 Load model ===
# Restore the trained Keras model from disk and attach optimizer/loss/metrics.
# NOTE(review): compile() is presumably only needed if the model is later
# trained or evaluated; inference via predict() should work without it — confirm.
model = load_model(r"C:\Users\sagni\Downloads\Cancer Detection\cancer_model.h5")
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# === 🧠 Load training data to recreate tokenizer and encoders ===
# The tokenizer and label encoders are re-fitted from the raw training files
# rather than loaded from disk.
# NOTE(review): this assumes re-fitting on the same files reproduces the exact
# vocabulary / label indices used when the model was trained — confirm.
variants_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_variants"
text_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_text"

variants_df = pd.read_csv(variants_path)
# The text file separates its two fields with a literal "||". The separator is
# a regex, so the pipes are escaped; a raw string keeps the backslashes intact
# without tripping Python's invalid-escape-sequence warning. The first row is
# skipped and the column names are supplied explicitly.
text_df = pd.read_csv(
    text_path,
    sep=r"\|\|",
    engine="python",
    names=["ID", "Text"],
    skiprows=1,
)
data = pd.merge(variants_df, text_df, on="ID")
# Rows with no text get a placeholder so the tokenizer always sees a string.
data["Text"] = data["Text"].fillna("unknown")

# Word-level tokenizer limited to the 20000 most frequent words.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data["Text"])

# Map gene names to integer labels.
gene_encoder = LabelEncoder()
gene_encoder.fit(data["Gene"])

# Map variation names to integer labels.
variation_encoder = LabelEncoder()
variation_encoder.fit(data["Variation"])

# === 🧠 OCR + Prediction ===
_ocr_reader = None  # shared easyocr.Reader, created on first use


def _get_ocr_reader():
    """Return the shared English easyocr.Reader, building it on first call."""
    global _ocr_reader
    if _ocr_reader is None:
        # Build the reader once and reuse it so repeated OCR calls do not
        # each pay the reader's construction cost.
        _ocr_reader = easyocr.Reader(['en'])
    return _ocr_reader


def extract_text_from_image(image_path):
    """Run English OCR on an image and return the recognized text.

    Args:
        image_path: Image to read; passed straight to ``Reader.readtext``.

    Returns:
        All recognized text fragments joined with single spaces. The result
        is an empty string when nothing is recognized.
    """
    # detail=0 asks for the recognized strings only.
    fragments = _get_ocr_reader().readtext(image_path, detail=0)
    return " ".join(fragments)

def predict_from_image(image_path, gene, variation, *, max_len=500):
    """Predict the cancer class from an image of text plus a gene/variation pair.

    The image is OCR'd with ``extract_text_from_image``, the text is converted
    to a padded token sequence with the module-level ``tokenizer``, the gene
    and variation are label-encoded, and the three inputs are passed to the
    module-level ``model``.

    Args:
        image_path: Image to OCR; forwarded to ``extract_text_from_image``.
        gene: Gene label; must be present in ``gene_encoder.classes_``.
        variation: Variation label; must be present in
            ``variation_encoder.classes_``.
        max_len: Length the token sequence is padded/truncated to. Defaults
            to 500, the value that used to be hard-coded.
            NOTE(review): presumably must equal the sequence length used at
            training time — confirm before changing.

    Returns:
        The predicted class number (argmax index + 1), or ``None`` when the
        gene or variation is not known to its encoder.
    """
    # OCR
    input_text = extract_text_from_image(image_path)
    print(f"📝 Extracted Text: {input_text}")
    if not input_text.strip():
        # An empty extraction becomes an all-padding sequence, so the text
        # input contributes nothing. Flag it instead of predicting silently.
        print("⚠️ No text could be extracted from the image; "
              "the prediction will rely on gene and variation only.")

    # Tokenize
    seq = tokenizer.texts_to_sequences([input_text])
    padded_seq = pad_sequences(seq, maxlen=max_len)

    # Gene
    if gene not in gene_encoder.classes_:
        print(f"❌ Unknown gene: {gene}")
        return None
    gene_encoded = gene_encoder.transform([gene])
    # Add a trailing axis: shape (1,) -> (1, 1).
    gene_encoded = np.expand_dims(gene_encoded, -1)

    # Variation
    if variation not in variation_encoder.classes_:
        print(f"❌ Unknown variation: {variation}")
        return None
    var_encoded = variation_encoder.transform([variation])
    var_encoded = np.expand_dims(var_encoded, -1)

    # Predict
    prediction = model.predict([padded_seq, gene_encoded, var_encoded])
    # argmax is 0-based; the reported class number is shifted up by one.
    predicted_class = np.argmax(prediction, axis=1)[0] + 1
    print(f"✅ Predicted Cancer Class: {predicted_class}")
    return predicted_class

# === Example Usage ===
if __name__ == "__main__":
    # Sample inputs: one image plus the gene/variation pair it belongs to.
    gene, variation = "EGFR", "L858R"
    image_path = r"C:\Users\sagni\Downloads\Cancer Detection\images.jpg"

    # Run a single end-to-end OCR + prediction pass.
    predict_from_image(image_path=image_path, gene=gene, variation=variation)




📝 Extracted Text: 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 816ms/step
✅ Predicted Cancer Class: 5
