In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# === 📥 Load model ===
# Trained Keras model restored from its .h5 file.
model = load_model(
    r"C:\Users\sagni\Downloads\Cancer Detection\cancer_model.h5"
)

# This script only runs inference; the compile step is here solely to stop
# Keras from warning about the loaded model, not to prepare it for training.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# === 🔃 Load training data to recreate tokenizer and encoders ===
# The tokenizer and label encoders used further down are rebuilt from these
# training files rather than loaded from disk.
# NOTE(review): this presumes the files and fitting settings match the ones
# used when the model was trained -- confirm against the training code.
variants_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_variants"
text_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_text"

# Variants file: read with pandas' default CSV settings. It supplies the
# "Gene" and "Variation" columns used later, plus the "ID" join key.
variants_df = pd.read_csv(variants_path)

# Text file: fields are separated by a literal "||". The separator is given
# as a regular expression (which requires engine="python") and MUST be a raw
# string: in a normal string literal "\|" is an invalid escape sequence that
# current Python versions warn about and future versions will reject.
# The first line is skipped (presumably a header) and the column names are
# supplied explicitly instead.
text_df = pd.read_csv(
    text_path,
    sep=r"\|\|",
    engine="python",
    names=["ID", "Text"],
    skiprows=1,
)

# Join variants and text on their shared ID (pd.merge defaults to an inner
# join, so rows without a partner in the other table are dropped).
data = pd.merge(variants_df, text_df, on="ID")

# Rows with missing text get a placeholder so downstream tokenization always
# receives a string.
data["Text"] = data["Text"].fillna("unknown")

# === 🔁 Recreate tokenizer and encoders ===
# Word-level tokenizer fitted on every training text; num_words=20000 limits
# the vocabulary used when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data["Text"])

# One label encoder per categorical column, mapping each distinct string to
# an integer id. fit() returns the encoder itself, so construction and
# fitting are chained.
gene_encoder = LabelEncoder().fit(data["Gene"])
variation_encoder = LabelEncoder().fit(data["Variation"])

# === 🔍 Prediction function ===
def predict_cancer_class(input_text, input_gene, input_variation):
    """Predict the cancer class for one (text, gene, variation) sample.

    Uses the module-level ``model``, ``tokenizer``, ``gene_encoder`` and
    ``variation_encoder``.

    Args:
        input_text: Free-text description. It is converted to a token-id
            sequence and padded or truncated to 500 entries.
        input_gene: Gene name; must be one of ``gene_encoder.classes_``.
        input_variation: Variation name; must be one of
            ``variation_encoder.classes_``.

    Returns:
        The predicted class label (argmax index of the model output plus
        one), or ``None`` when the gene or variation is not known to its
        encoder. In both cases a message is printed.
    """
    # Text branch: a batch of one sequence with a fixed length of 500.
    token_ids = tokenizer.texts_to_sequences([input_text])
    text_batch = pad_sequences(token_ids, maxlen=500)

    # Gene branch: bail out early on labels the encoder has never seen.
    gene_is_known = input_gene in gene_encoder.classes_
    if not gene_is_known:
        print(f"❌ Unknown gene: {input_gene}")
        return None
    # Shape (1,) -> (1, 1): one sample, one feature.
    gene_batch = gene_encoder.transform([input_gene]).reshape(-1, 1)

    # Variation branch: same guard and reshaping as for the gene.
    variation_is_known = input_variation in variation_encoder.classes_
    if not variation_is_known:
        print(f"❌ Unknown variation: {input_variation}")
        return None
    variation_batch = variation_encoder.transform([input_variation]).reshape(-1, 1)

    # Run the model on the three inputs and take the highest-scoring index
    # of the single sample in the batch.
    class_scores = model.predict([text_batch, gene_batch, variation_batch])
    best_index = np.argmax(class_scores, axis=1)[0]

    # Model indices start at 0; the original labels were 1-9.
    predicted_class = best_index + 1

    print(f"✅ Predicted Cancer Class: {predicted_class}")
    return predicted_class

# === 🧪 Example use ===
if __name__ == "__main__":
    # Demo sample: a short description plus a gene/variation pair.
    dna_text = "This mutation affects the kinase domain of the EGFR gene."
    gene, variation = "EGFR", "L858R"

    # The function prints its own result; the return value is not used here.
    predict_cancer_class(
        input_text=dna_text,
        input_gene=gene,
        input_variation=variation,
    )




1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 562ms/step
✅ Predicted Cancer Class: 2
