In [1]:
!pip install deap tensorflow keras scikit-learn pandas numpy tqdm




In [5]:
import numpy as np
import tensorflow as tf
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from deap import base, creator, tools, algorithms
import random

# === Load and Prepare Data ===
digits = load_digits()
X = digits.images
y = digits.target
X = X.reshape((X.shape[0], X.shape[1], X.shape[2]))  # 3D shape for Conv1D
X = X.astype("float32") / 16.0
y = to_categorical(y, num_classes=10)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# === Create CNN Model ===
def create_model(num_filters, kernel_size, dropout_rate):
    model = Sequential()
    model.add(Conv1D(int(num_filters), kernel_size=(int(kernel_size),), activation='relu', input_shape=(X.shape[1], X.shape[2])))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(min(max(float(dropout_rate), 0), 1)))  # Clamp between 0 and 1
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# === Fitness Function ===
def evaluate(ind):
    num_filters, kernel_size, dropout_rate = int(ind[0]), int(ind[1]), float(ind[2])
    model = create_model(num_filters, kernel_size, dropout_rate)
    history = model.fit(X_train, y_train, epochs=2, batch_size=64, verbose=0, validation_data=(X_val, y_val))
    acc = history.history["val_accuracy"][-1]
    return (acc,)

# === Genetic Algorithm Setup ===
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Safe bounds: filters [16–64], kernel size [2–5], dropout [0.1–0.5]
toolbox.register("attr_int1", random.randint, 16, 64)    # num_filters
toolbox.register("attr_int2", random.randint, 2, 5)      # kernel_size
toolbox.register("attr_float", lambda: round(random.uniform(0.1, 0.5), 2))  # dropout_rate

toolbox.register("individual", tools.initCycle, creator.Individual, 
                 (toolbox.attr_int1, toolbox.attr_int2, toolbox.attr_float), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# === Run Genetic Algorithm ===
pop = toolbox.population(n=5)
hof = tools.HallOfFame(1)
algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=3, halloffame=hof, verbose=True)

# === Final Best Model Training ===
best_params = hof[0]
print("\n✅ Best Parameters Found:", best_params)
final_model = create_model(int(best_params[0]), int(best_params[1]), float(best_params[2]))
final_model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_val, y_val))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


gen	nevals
0  	5     
1  	3     
2  	4     
3  	2     

✅ Best Parameters Found: [43, 5.684050962694215, -0.1563653945680076]
Epoch 1/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.0878 - loss: 2.3066 - val_accuracy: 0.2806 - val_loss: 2.1719
Epoch 2/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4177 - loss: 2.1211 - val_accuracy: 0.5917 - val_loss: 2.0100
Epoch 3/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6450 - loss: 1.9669 - val_accuracy: 0.7250 - val_loss: 1.8521
Epoch 4/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7514 - loss: 1.8053 - val_accuracy: 0.7833 - val_loss: 1.6896
Epoch 5/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7892 - loss: 1.6247 - val_accuracy: 0.8139 - val_loss: 1.5229


<keras.src.callbacks.history.History at 0x1c864223a50>

In [9]:
!pip install tensorflow




In [11]:
# 📦 Install required libraries (uncomment if running in a new environment)
# !pip install pandas scikit-learn tensorflow

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# === 📂 Load data ===
variants_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_variants"
text_path = r"C:\Users\sagni\Downloads\msk-redefining-cancer-treatment\training_text"

variants_df = pd.read_csv(variants_path)
text_df = pd.read_csv(text_path, sep="\|\|", engine="python", names=["ID", "Text"], skiprows=1)

# === 🔗 Merge data ===
data = pd.merge(variants_df, text_df, on="ID")

# === 🧹 Clean text column (fix AttributeError due to NaN) ===
data["Text"] = data["Text"].fillna("unknown")

# === 🧪 Prepare categorical features ===
gene_encoder = LabelEncoder()
variation_encoder = LabelEncoder()
data["Gene_enc"] = gene_encoder.fit_transform(data["Gene"])
data["Variation_enc"] = variation_encoder.fit_transform(data["Variation"])

# === 🧠 Prepare text features ===
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(data["Text"])
X_text = tokenizer.texts_to_sequences(data["Text"])
X_text = pad_sequences(X_text, maxlen=500)

# === 🎯 Labels (1–9), one-hot encode
y = to_categorical(data["Class"] - 1, num_classes=9)

# === ✂️ Train/test split ===
X_train_text, X_test_text, X_train_gene, X_test_gene, X_train_var, X_test_var, y_train, y_test = train_test_split(
    X_text, data["Gene_enc"], data["Variation_enc"], y, test_size=0.2, random_state=42
)

# === 🧩 Model ===
# Inputs
input_text = Input(shape=(500,))
input_gene = Input(shape=(1,))
input_var = Input(shape=(1,))

# Embeddings
text_emb = Embedding(input_dim=20000, output_dim=128, input_length=500)(input_text)
x_text = LSTM(64)(text_emb)

gene_emb = Embedding(input_dim=len(gene_encoder.classes_), output_dim=8)(input_gene)
x_gene = LSTM(8)(gene_emb)

var_emb = Embedding(input_dim=len(variation_encoder.classes_), output_dim=8)(input_var)
x_var = LSTM(8)(var_emb)

# Combine
merged = Concatenate()([x_text, x_gene, x_var])
output = Dense(64, activation='relu')(merged)
output = Dense(9, activation='softmax')(output)

model = Model(inputs=[input_text, input_gene, input_var], outputs=output)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# === 🏋️‍♂️ Train ===
model.fit(
    [X_train_text, np.expand_dims(X_train_gene, -1), np.expand_dims(X_train_var, -1)],
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1
)

# === ✅ Evaluate ===
loss, acc = model.evaluate(
    [X_test_text, np.expand_dims(X_test_gene, -1), np.expand_dims(X_test_var, -1)],
    y_test
)
print(f"\n✅ Test Accuracy: {acc:.4f}")


Epoch 1/5




[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 243ms/step - accuracy: 0.2737 - loss: 2.0852 - val_accuracy: 0.3421 - val_loss: 1.8692
Epoch 2/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 216ms/step - accuracy: 0.3382 - loss: 1.7554 - val_accuracy: 0.4135 - val_loss: 1.6515
Epoch 3/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 215ms/step - accuracy: 0.5224 - loss: 1.3617 - val_accuracy: 0.5263 - val_loss: 1.3492
Epoch 4/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 214ms/step - accuracy: 0.6683 - loss: 0.9839 - val_accuracy: 0.5226 - val_loss: 1.3771
Epoch 5/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 217ms/step - accuracy: 0.7457 - loss: 0.7546 - val_accuracy: 0.5827 - val_loss: 1.2659
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.5710 - loss: 1.2411

✅ Test Accuracy: 0.5729
