In [2]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting scikit-image (from easyocr)
  Downloading scikit_image-0.25.2-cp311-cp311-win_amd64.whl.metadata (14 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-win_amd64.whl.metadata (9.2 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.4-py3-none-win_amd64.whl.metadata (5.0 kB)
Collecting tifffile>=2022.8.12 (from scikit-image->easyocr)
  Downloading tifffile-2025.6.11-py3-none-any.whl.metadata (32 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
   ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
   -------------- ------------------------- 1.0/2.9 MB 6.3 MB/s eta 0:00:01
   -------------------------------- ------- 2.4/2.9 MB 6.1 MB/s eta 0:00:01
   ---------------------------------------- 2.9/2.9 MB 6.0 MB/s eta 0:0

In [7]:
import cv2
import easyocr
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# === 📥 Load model
# Load the trained model from disk and compile it with the same optimizer,
# loss and metric that the training cell of this notebook uses.
model = load_model(r"C:\Users\sagni\Downloads\Cancer Detection\cancer_model.h5")
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# === 🔃 Recreate tokenizer and encoders from training data
# The tokenizer and label encoders are re-fitted on the training files, so the
# settings below (num_words, NaN fill value) must stay in sync with the
# training cell for the encoded inputs to mean the same thing to the model.
variants_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_variants"
text_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_text"

variants_df = pd.read_csv(variants_path)
# The separator is a regex for a literal "||". It is written as a raw string so
# the backslashes reach pandas untouched instead of relying on Python keeping
# the invalid "\|" escape (which triggers a warning on recent Python versions).
# skiprows=1 drops the first line of the file; names= supplies the columns.
text_df = pd.read_csv(text_path, sep=r"\|\|", engine="python", names=["ID", "Text"], skiprows=1)
data = pd.merge(variants_df, text_df, on="ID")
# Rows without text would otherwise be NaN and break tokenizer fitting.
data["Text"] = data["Text"].fillna("unknown")

# Only the 20000 most frequent words are kept when converting text to ids.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data["Text"])

# Map gene names to integer ids.
gene_encoder = LabelEncoder()
gene_encoder.fit(data["Gene"])

# Map variation names to integer ids.
variation_encoder = LabelEncoder()
variation_encoder.fit(data["Variation"])

# === 🔍 Prediction function
def predict_from_text(input_text, input_gene, input_variation):
    """Predict the cancer class for one text / gene / variation triple.

    Args:
        input_text: Raw text; converted to ids with the module-level
            ``tokenizer`` and padded to length 500.
        input_gene: Gene label; must be one of ``gene_encoder.classes_``.
        input_variation: Variation label; must be one of
            ``variation_encoder.classes_``.

    Returns:
        The predicted class as a 1-based number (argmax index + 1), or
        ``None`` when the gene or the variation is unknown to its encoder
        (a message is printed in that case).
    """
    text_batch = pad_sequences(tokenizer.texts_to_sequences([input_text]), maxlen=500)

    # Reject labels the encoders were never fitted on before transforming.
    if input_gene not in gene_encoder.classes_:
        print(f"❌ Unknown gene: {input_gene}")
        return None
    # One sample, one feature: shape (1, 1).
    gene_batch = gene_encoder.transform([input_gene]).reshape(-1, 1)

    if input_variation not in variation_encoder.classes_:
        print(f"❌ Unknown variation: {input_variation}")
        return None
    variation_batch = variation_encoder.transform([input_variation]).reshape(-1, 1)

    class_scores = model.predict([text_batch, gene_batch, variation_batch])
    # The model's output indices are 0-based; reported classes start at 1.
    predicted_class = class_scores.argmax(axis=1)[0] + 1
    print(f"✅ Predicted Cancer Class: {predicted_class}")
    return predicted_class

# === 🖼️ OCR and Prediction from image
def _find_gene_and_variation(fragments, known_genes, known_variations):
    """Scan OCR fragments for a known gene and a known variation.

    Each fragment is checked as a whole (so multi-word labels can still match)
    and then word by word, both as-is and with surrounding punctuation trimmed.
    A gene matches when the upper-cased candidate is a known gene; a variation
    matches only on the exact candidate. When several candidates match, the
    last one wins.

    Args:
        fragments: Iterable of strings produced by the OCR step.
        known_genes: Collection of gene labels to look for.
        known_variations: Collection of variation labels to look for.

    Returns:
        ``(gene, variation)``; either element is ``None`` if nothing matched.
    """
    gene_lookup = set(known_genes)
    variation_lookup = set(known_variations)
    gene = None
    variation = None

    for fragment in fragments:
        # An OCR fragment is a detected text region and may contain several
        # words, so the individual words are candidates as well.
        candidates = [fragment]
        for token in fragment.split():
            candidates.append(token)
            candidates.append(token.strip(".,;:()[]{}\"'"))

        for candidate in candidates:
            upper_candidate = candidate.upper()
            if upper_candidate in gene_lookup:
                gene = upper_candidate
            if candidate in variation_lookup:
                variation = candidate

    return gene, variation


def predict_from_image(image_path):
    """OCR an image, find a known gene and variation in it, and predict.

    Args:
        image_path: Path of the image handed to the EasyOCR reader.

    Returns:
        The result of ``predict_from_text`` for the extracted text, gene and
        variation, or ``None`` when no known gene or no known variation was
        found in the OCR output (a message is printed in that case).
    """
    reader = easyocr.Reader(['en'])
    # detail=0 asks for the recognized strings only (no boxes/confidences).
    result = reader.readtext(image_path, detail=0)
    text = " ".join(result)
    print(f"\n📝 Extracted Text: {text}")

    # Basic gene/variation extractor from OCR output
    gene, variation = _find_gene_and_variation(
        result, gene_encoder.classes_, variation_encoder.classes_
    )

    if gene and variation:
        return predict_from_text(text, gene, variation)
    else:
        print("❌ Could not extract gene/variation from image text.")
        return None

# === 🧪 Run Prediction
# Entry point: run OCR on the sample image below and print the predicted class
# (or the reason no prediction could be made).
if __name__ == "__main__":
    image_path = r"C:\Users\sagni\Downloads\Cancer Detection\images.jpg"  # Replace with your image path
    predict_from_image(image_path)





📝 Extracted Text: 
❌ Could not extract gene/variation from image text.


In [14]:
# 📦 Install required libraries (uncomment if running in a new environment)
# !pip install pandas scikit-learn tensorflow

import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# === 📂 Load data ===
variants_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_variants"
text_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_text"
variants_df = pd.read_csv(variants_path)
# The separator is a regex for a literal "||". It is written as a raw string so
# the backslashes reach pandas untouched instead of relying on Python keeping
# the invalid "\|" escape (which triggers a warning on recent Python versions).
# skiprows=1 drops the first line of the file; names= supplies the columns.
text_df = pd.read_csv(text_path, sep=r"\|\|", engine="python", names=["ID", "Text"], skiprows=1)

# === 🔗 Merge data ===
# Join variant rows with their text on the shared "ID" column.
data = pd.merge(variants_df, text_df, on="ID")

# === 🧹 Clean text column (fix AttributeError due to NaN) ===
data["Text"] = data["Text"].fillna("unknown")

# === 🧪 Prepare categorical features ===
# Gene and variation names become integer ids stored in new *_enc columns.
gene_encoder = LabelEncoder()
variation_encoder = LabelEncoder()
data["Gene_enc"] = gene_encoder.fit_transform(data["Gene"])
data["Variation_enc"] = variation_encoder.fit_transform(data["Variation"])

# === 🧠 Prepare text features ===
# Keep the 20000 most frequent words, then pad/truncate every sequence to
# 500 ids so the text input has a fixed width.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data["Text"])
X_text = tokenizer.texts_to_sequences(data["Text"])
X_text = pad_sequences(X_text, maxlen=500)

# === 🎯 Labels (1–9), one-hot encode
# Shift classes down by one so they index the 9 one-hot columns from 0.
y = to_categorical(data["Class"] - 1, num_classes=9)

# === ✂️ Train/test split ===
# All four arrays are split together so rows stay aligned; 20% is held out
# and random_state fixes the shuffle.
X_train_text, X_test_text, X_train_gene, X_test_gene, X_train_var, X_test_var, y_train, y_test = train_test_split(
    X_text, data["Gene_enc"], data["Variation_enc"], y, test_size=0.2, random_state=42
)

# === 🧩 Model ===
# Inputs: a 500-id text sequence plus a single integer id each for the gene
# and the variation. 500 must match the maxlen used when padding the text.
input_text = Input(shape=(500,))
input_gene = Input(shape=(1,))
input_var = Input(shape=(1,))

# Embeddings
# input_dim=20000 must match the tokenizer's num_words. The sequence length is
# already fixed by input_text, so the deprecated `input_length` argument is
# not passed to Embedding.
text_emb = Embedding(input_dim=20000, output_dim=128)(input_text)
x_text = LSTM(64)(text_emb)

# Gene/variation ids are embedded with a vocabulary sized by the number of
# classes each encoder learned, then reduced to a vector by a small LSTM.
gene_emb = Embedding(input_dim=len(gene_encoder.classes_), output_dim=8)(input_gene)
x_gene = LSTM(8)(gene_emb)

var_emb = Embedding(input_dim=len(variation_encoder.classes_), output_dim=8)(input_var)
x_var = LSTM(8)(var_emb)

# Combine
# Concatenate the three feature vectors and classify into the 9 classes.
merged = Concatenate()([x_text, x_gene, x_var])
output = Dense(64, activation='relu')(merged)
output = Dense(9, activation='softmax')(output)

model = Model(inputs=[input_text, input_gene, input_var], outputs=output)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# === 🏋️‍♂️ Train ===
# Gene/variation ids get a trailing axis so every sample has shape (1,),
# which is what the corresponding Input layers declare.
train_inputs = [
    X_train_text,
    np.expand_dims(X_train_gene, -1),
    np.expand_dims(X_train_var, -1),
]
# 10% of the training rows are set aside by Keras for per-epoch validation.
model.fit(train_inputs, y_train, epochs=5, batch_size=64, validation_split=0.1)

# === ✅ Evaluate ===
# Score the held-out test split with the same input layout as training.
test_inputs = [
    X_test_text,
    np.expand_dims(X_test_gene, -1),
    np.expand_dims(X_test_var, -1),
]
loss, acc = model.evaluate(test_inputs, y_test)
print(f"\n✅ Test Accuracy: {acc:.4f}")

# === 💾 Save model and processors ===
save_dir = r"C:\Users\sagni\Downloads\Cancer Detection"

# 1. Model
model.save(f"{save_dir}\\cancer_model.h5")
print("✅ Model saved!")

# 2-4. Tokenizer, gene encoder and variation encoder.
# Each fitted preprocessing object is pickled into its own file in save_dir,
# and a confirmation is printed once the file has been written and closed.
for pickle_name, fitted_obj, done_msg in (
    ("tokenizer.pickle", tokenizer, "✅ Tokenizer saved!"),
    ("gene_encoder.pickle", gene_encoder, "✅ Gene encoder saved!"),
    ("variation_encoder.pickle", variation_encoder, "✅ Variation encoder saved!"),
):
    with open(f"{save_dir}\\{pickle_name}", "wb") as f:
        pickle.dump(fitted_obj, f)
    print(done_msg)


Epoch 1/5




[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 236ms/step - accuracy: 0.2700 - loss: 2.0899 - val_accuracy: 0.3233 - val_loss: 1.8132
Epoch 2/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 207ms/step - accuracy: 0.3778 - loss: 1.6710 - val_accuracy: 0.4887 - val_loss: 1.4690
Epoch 3/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 209ms/step - accuracy: 0.5926 - loss: 1.1734 - val_accuracy: 0.5677 - val_loss: 1.3304
Epoch 4/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 209ms/step - accuracy: 0.6816 - loss: 0.9102 - val_accuracy: 0.5263 - val_loss: 1.3343
Epoch 5/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 211ms/step - accuracy: 0.7635 - loss: 0.7326 - val_accuracy: 0.5639 - val_loss: 1.2820
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.5678 - loss: 1.2597





✅ Test Accuracy: 0.5805
✅ Model saved!
✅ Tokenizer saved!
✅ Gene encoder saved!
✅ Variation encoder saved!
