In [1]:
# 📦 Install required libraries (uncomment if running in a new environment)
# !pip install pandas scikit-learn tensorflow

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# === 📂 Load data ===
# Absolute paths into the local "msk-redefining-cancer-treatment" download folder.
variants_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_variants"
text_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_text"

# The variants file is parsed with pandas' default (comma) separator.
variants_df = pd.read_csv(variants_path)

# The text file separates its two fields with a literal "||".  The separator is
# interpreted as a regular expression (which requires engine="python"), so it is
# written as a raw string: in a normal string literal "\|" is an invalid escape
# sequence that recent Python versions warn about.
# skiprows=1 discards the first line of the file; `names` supplies the column
# labels instead.
text_df = pd.read_csv(
    text_path,
    sep=r"\|\|",
    engine="python",
    names=["ID", "Text"],
    skiprows=1,
)

# === 🔗 Merge data ===
# Join the two tables on their shared "ID" column (pandas' default inner join).
data = pd.merge(variants_df, text_df, on="ID")

# === 🧹 Clean text column (fix AttributeError due to NaN) ===
# Replace missing texts with the placeholder "unknown" so every entry is a string.
data["Text"] = data["Text"].fillna("unknown")

# === 🧪 Prepare categorical features ===
# One encoder per categorical column: learn the distinct values first, then
# map every row to its integer id.  The fitted encoders are kept because their
# `classes_` are needed later to size the embedding tables.
gene_encoder = LabelEncoder().fit(data["Gene"])
variation_encoder = LabelEncoder().fit(data["Variation"])
data["Gene_enc"] = gene_encoder.transform(data["Gene"])
data["Variation_enc"] = variation_encoder.transform(data["Variation"])

# === 🧠 Prepare text features ===
# Build the vocabulary (capped via num_words=20000), turn each document into a
# list of token ids, then bring every list to a fixed length of 500.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data["Text"])
token_id_lists = tokenizer.texts_to_sequences(data["Text"])
X_text = pad_sequences(token_id_lists, maxlen=500)

# === 🎯 Labels (1–9), one-hot encode
# Shift the class labels down by one so they start at 0, then expand them into
# a 9-column one-hot matrix.
zero_based_class = data["Class"] - 1
y = to_categorical(zero_based_class, num_classes=9)

# === ✂️ Train/test split ===
# All four arrays go through one call so their rows stay aligned; 20% of the
# rows are held out for testing and the seed is fixed for reproducibility.
_split_parts = train_test_split(
    X_text,
    data["Gene_enc"],
    data["Variation_enc"],
    y,
    test_size=0.2,
    random_state=42,
)

# train_test_split returns a (train, test) pair for each input, in input order.
(
    X_train_text, X_test_text,
    X_train_gene, X_test_gene,
    X_train_var, X_test_var,
    y_train, y_test,
) = _split_parts

# === 🧩 Model ===
# Three-input classifier: the tokenised text plus the encoded gene and
# variation ids, each embedded and summarised by an LSTM, then concatenated
# and classified with a softmax layer.
#
# Sizes are read from the prepared data instead of being repeated as literals,
# so the network cannot drift out of sync with the preprocessing settings.
vocab_size = tokenizer.num_words  # vocabulary cap given to the Tokenizer
seq_len = X_text.shape[1]         # padded length of every text sequence
n_classes = y.shape[1]            # number of columns in the one-hot labels

# Inputs
input_text = Input(shape=(seq_len,))
input_gene = Input(shape=(1,))
input_var = Input(shape=(1,))

# Embeddings
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input layer, and the argument is deprecated in Keras 3.
text_emb = Embedding(input_dim=vocab_size, output_dim=128)(input_text)
x_text = LSTM(64)(text_emb)

# The embedding tables are sized by the number of distinct encoded values.
# NOTE(review): each categorical input has shape (1,), so these LSTMs only ever
# see a single timestep; a Flatten/Dense would presumably be a lighter choice.
# Left unchanged here to keep the architecture as it was.
gene_emb = Embedding(input_dim=len(gene_encoder.classes_), output_dim=8)(input_gene)
x_gene = LSTM(8)(gene_emb)

var_emb = Embedding(input_dim=len(variation_encoder.classes_), output_dim=8)(input_var)
x_var = LSTM(8)(var_emb)

# Combine
merged = Concatenate()([x_text, x_gene, x_var])
output = Dense(64, activation='relu')(merged)
output = Dense(n_classes, activation='softmax')(output)

model = Model(inputs=[input_text, input_gene, input_var], outputs=output)
# categorical_crossentropy matches the one-hot encoded labels.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# === 🏋️‍♂️ Train ===
def _as_column(values):
    """Return *values* with a trailing axis added, i.e. one column per sample."""
    return np.expand_dims(values, -1)


# Inputs are listed in the same order as the model's inputs: text, gene, variation.
train_inputs = [X_train_text, _as_column(X_train_gene), _as_column(X_train_var)]
test_inputs = [X_test_text, _as_column(X_test_gene), _as_column(X_test_var)]

# validation_split=0.1 sets aside a tenth of the training data for validation.
model.fit(train_inputs, y_train, epochs=5, batch_size=64, validation_split=0.1)

# === ✅ Evaluate ===
# evaluate() returns the loss followed by the compiled metrics (accuracy only).
loss, acc = model.evaluate(test_inputs, y_test)
print(f"\n✅ Test Accuracy: {acc:.4f}")

# === 💾 Save the model ===
model.save(r"C:\Users\sagni\Downloads\Cancer Detection\cancer_model.h5")
print("✅ Model saved to: C:\\Users\\sagni\\Downloads\\Cancer Detection\\cancer_model.h5")




Epoch 1/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 15s 232ms/step - accuracy: 0.2599 - loss: 2.0558 - val_accuracy: 0.3120 - val_loss: 1.8586
Epoch 2/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 8s 204ms/step - accuracy: 0.3449 - loss: 1.7397 - val_accuracy: 0.3985 - val_loss: 1.6135
Epoch 3/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 8s 206ms/step - accuracy: 0.5287 - loss: 1.2914 - val_accuracy: 0.5000 - val_loss: 1.3483
Epoch 4/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 8s 200ms/step - accuracy: 0.6571 - loss: 0.9774 - val_accuracy: 0.5263 - val_loss: 1.2716
Epoch 5/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 8s 200ms/step - accuracy: 0.7429 - loss: 0.7515 - val_accuracy: 0.5301 - val_loss: 1.2916
21/21 ━━━━━━━━━━━━━━━━━━━━ 1s 41ms/step - accuracy: 0.5779 - loss: 1.2583





✅ Test Accuracy: 0.5714
✅ Model saved to: C:\Users\sagni\Downloads\Cancer Detection\cancer_model.h5
