<b> Notice: Run data_org.ipynb first </b>

In [28]:
# !pip -q install keras-tuner keras_nlp

In [29]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# path = '/content/drive/MyDrive/Portfolio/treat_cancer/' # colab
# path = '/content/drive/MyDrive/Colab Notebooks/treat_cancer (1)/' # antsb
path = './data_files/' # vscode

# Import libraries

In [2]:
import pandas as pd # type: ignore
import warnings
import string
import joblib # type: ignore

# NLP
import spacy # type: ignore
from spacy.lang.en.stop_words import STOP_WORDS # type: ignore
nlp = spacy.load("en_core_web_sm")

# Config notebook
warnings.filterwarnings('ignore')
pd.set_option('display.max.colwidth', 200)

##### Create Functions

In [3]:
def pre_process(text):
    """
    Remove stop words, tokenize and lemmatize.
    """

    # Process text
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.text.lower() not in STOP_WORDS and token.text not in string.punctuation:
            tokens.append(token.lemma_)

    return ' '.join(tokens)

# ETL data for train

In [4]:
data_train = pd.read_parquet(path + 'data_for_train.parquet')

In [5]:
data_train.head(3)

Unnamed: 0,gene,variation,clinical_evidence,class
0,FAM58A,Truncating_Mutations,cyclindependent kinases cdks regulate a variety of fundamental cellular processes cdk10 stands out as one of the last orphan cdks for which no activating cyclin has been identified and no kinase a...,1
1,CBL,W802*,abstract background nonsmall cell lung cancer nsclc is a heterogeneous group of disorders with a number of genetic and proteomic alterations ccbl is an e3 ubiquitin ligase and adaptor molecule im...,2
2,CBL,Q249E,abstract background nonsmall cell lung cancer nsclc is a heterogeneous group of disorders with a number of genetic and proteomic alterations ccbl is an e3 ubiquitin ligase and adaptor molecule im...,2


### Data Train

In [None]:
# data_train_tok
data_train['text_tok'] = data_train['clinical_evidence'].apply(pre_process)
# 4 hours for execute

In [None]:
data_train = data_train.drop(columns=['clinical_evidence'])
data_train = data_train.reindex(columns = ['gene', 'variation', 'text_tok', 'class'])

In [None]:
data_train.head(3)

In [None]:
# Save
data_train.to_parquet(path + "data_train_tok.parquet")

##### Load data for train lemmatized

###### Libraries

In [31]:
import numpy as np
import pandas as pd
import joblib
import warnings

import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Masking, Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras_tuner import RandomSearch
from keras.metrics import CategoricalAccuracy, AUC, Precision, Recall, F1Score

# Config notebook
warnings.filterwarnings('ignore')
pd.set_option('display.max.colwidth', 200)


In [7]:
train_tok = pd.read_parquet(path + "data_train_tok.parquet")
train_tok.shape

(3316, 4)

In [8]:
train_tok.head(3)

Unnamed: 0,gene,variation,text_tok,class
0,FAM58A,Truncating_Mutations,cyclindependent kinases cdks regulate variety fundamental cellular process cdk10 stand orphan cdks activate cyclin identify kinase activity reveal previous work show cdk10 silencing increase ets2 ...,1
1,CBL,W802*,abstract background nonsmall cell lung cancer nsclc heterogeneous group disorder number genetic proteomic alteration ccbl e3 ubiquitin ligase adaptor molecule important normal homeostasis cancer...,2
2,CBL,Q249E,abstract background nonsmall cell lung cancer nsclc heterogeneous group disorder number genetic proteomic alteration ccbl e3 ubiquitin ligase adaptor molecule important normal homeostasis cancer...,2


In [34]:
texts = train_tok[['gene', 'variation', 'text_tok']].apply(lambda x: ' '.join(x), axis=1).values
labels = (train_tok['class'] - 1).values

In [35]:
print(f"Texts: {texts.shape}")
print(f"Labels: {labels.shape}")

Texts: (3316,)
Labels: (3316,)


In [36]:
num_class = len(np.unique(labels))

In [37]:
# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
labels = to_categorical(labels, num_classes=num_class)

In [38]:
# Tokenization
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts) # create dict index
sequences = tokenizer.texts_to_sequences(texts) # transform texts in sequence numbers
word_index = tokenizer.word_index

In [39]:
# Biggest string
longest_string = max(texts, key=len)
max_len = len(longest_string.split())
print(f"Len: {max_len}")

Len: 49680


In [48]:
# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=15000, padding="post")

In [49]:
padded_sequences.shape, labels.shape

((3316, 15000), (3316, 9))

In [50]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42)


In [51]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2652, 15000), (664, 15000), (2652, 9), (664, 9))

In [52]:
max_len = X_train.shape[1]

In [53]:
# def build_model(hp):
#     # Hyperparameters
#     dropout_rate1 = hp.Float('dropout_rate_l1', min_value=0.1, max_value=0.5, step=0.1)
#     dropout_rate2 = hp.Float('dropout_rate_l2', min_value=0.1, max_value=0.5, step=0.1)
#     dropout_rate3 = hp.Float('dropout_rate_l3', min_value=0, max_value=0.5, step=0.1)
#     activation_method = hp.Choice('activation_method', values=['relu', 'leaky_relu'])
#     optimizers = hp.Choice('optimizer', values=['adam', 'adafactor','lamb'])

#     # Model
#     inputs = Input(shape=(max_len,))
#     x = Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=max_len)(inputs)
#     x = Masking(mask_value=0.0)(x)  # ignore zeros

#     # Encoder
#     x = LSTM(units=128, dropout=dropout_rate1, recurrent_dropout=dropout_rate1, return_sequences=True, name='LSTM1')(x)
#     # x = Dropout(dropout_rate1)(x)
#     # x = LSTM(units=128, dropout=dropout_rate1, recurrent_dropout=dropout_rate1, return_sequences=True, name='LSTM2')(x)
#     # # x = Dropout(dropout_rate1)(x)
#     encoded = LSTM(units=64, name='LSTM3')(x)

#     # Decoder (optional, if you want to reconstruct the input)
#     # x = RepeatVector(max_len)(encoded)
#     # x = LSTM(units=128, return_sequences=True)(x)
#     # x = Dropout(dropout_rate1)(x)
#     # x = LSTM(units=128, return_sequences=True)(x)
#     # x = Dropout(dropout_rate1)(x)
#     # decoded = TimeDistributed(Dense(len(word_index) + 1, activation='softmax'))(x)

#     # Classifier
#     x = Dropout(dropout_rate2)(encoded)
#     x = Dense(units=32, activation=activation_method, name='dense')(x)
#     x = Dropout(dropout_rate3)(x)
#     outputs = Dense(9, activation='softmax', name='softmax')(x)

#     model = Model(inputs, outputs)

#     # Compile
#     model.compile(optimizer=optimizers,
#                   loss='categorical_crossentropy',
#                   metrics=['accuracy', 'precision', 'recall', 'f1_score'])

#     return model


In [54]:
def build_model(hp):
  # Hyperparameters
  dropout_rate1 = hp.Float('dropout_rate_l1', min_value=0.1, max_value=0.5, step=0.1)
  dropout_rate2 = hp.Float('dropout_rate_l2', min_value=0.1, max_value=0.5, step=0.1)
  dropout_rate3 = hp.Float('dropout_rate_l3', min_value=0, max_value=0.5, step=0.1)
  activation_method = hp.Choice('activation_method', values=['relu', 'leaky_relu'])
  optimizers = hp.Choice('optimizer', values=['adam', 'adafactor','lamb'])

  # Model
  model = Sequential()

  model.add(Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=max_len))

  model.add(Masking(mask_value=0.0)) # ignore zeros

  model.add(LSTM(units=128, return_sequences=True))

  model.add(Dropout(dropout_rate1))

  model.add(LSTM(units=64))

  model.add(Dropout(dropout_rate2))

  model.add(Dense(units=32, activation=activation_method))

  model.add(Dropout(dropout_rate3))

  model.add(Dense(9, activation='softmax'))

  # Compile
  model.compile(optimizer= Adam(
      hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])),
                loss='categorical_crossentropy',
                metrics=['categorical_accuracy', 'precision', 'recall', 'f1_score', 'AUC'])

  return model


### Keras Tuner

In [55]:
# Config EarlyStop
earlystop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Config RandomSearch
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10, # distinct combinations hyperparameters
    max_retries_per_trial=1,
    directory= path,
    project_name='keras_tuning_LSTM'
)

Reloading Tuner from /content/drive/MyDrive/Colab Notebooks/treat_cancer (1)/keras_tuning_LSTM/tuner0.json


In [56]:
# Balance classes
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train.ravel())
class_weights = {i: class_weights[i] for i in range(len(class_weights))}

#### Fit Model

In [57]:
tuner.search(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    shuffle=True,
    verbose=1,
    callbacks=[earlystop])


Trial 10 Complete [00h 05m 22s]
val_loss: 2.075413227081299

Best val_loss So Far: 2.0252230167388916
Total elapsed time: 01h 33m 52s


In [58]:
best_models = tuner.get_best_models(num_models=1)
keras.saving.save_model(best_models[0], path + 'best_model_keras_LSTM.keras')
best_models[0]

<Sequential name=sequential, built=True>

In [83]:
 best_hp = tuner.get_best_hyperparameters()[0]
 model = tuner.hypermodel.build(best_hp)

In [81]:
# Best 3 Models
tuner.results_summary(num_trials=3)

Results summary
Results in /content/drive/MyDrive/Colab Notebooks/treat_cancer (1)/keras_tuning_LSTM
Showing 3 best trials
Objective(name="val_loss", direction="min")

Trial 04 summary
Hyperparameters:
dropout_rate_l1: 0.2
dropout_rate_l2: 0.2
dropout_rate_l3: 0.30000000000000004
activation_method: relu
optimizer: adafactor
learning_rate: 0.001
Score: 2.0252230167388916

Trial 03 summary
Hyperparameters:
dropout_rate_l1: 0.2
dropout_rate_l2: 0.30000000000000004
dropout_rate_l3: 0.30000000000000004
activation_method: relu
optimizer: adafactor
learning_rate: 0.0001
Score: 2.060429096221924

Trial 05 summary
Hyperparameters:
dropout_rate_l1: 0.2
dropout_rate_l2: 0.4
dropout_rate_l3: 0.2
activation_method: leaky_relu
optimizer: adafactor
learning_rate: 0.001
Score: 2.0676660537719727


### Data Predict

In [None]:
# Load
data_predict = pd.read_parquet(path + 'data_for_predict.parquet')

In [None]:
data_predict.head(3)

In [None]:
data_predict['text_tok'] = data_predict['clinical_evidence'].apply(pre_process)
# 5:40 hours for execute

In [None]:
data_predict = data_predict.drop(columns=['clinical_evidence'])

In [None]:
data_predict.head()

##### Load data for predict lemmatized

In [None]:
# Save
joblib.dump(data_predict, path + "data_predict_tok.pkl")

In [None]:
predict_tok = joblib.load(path + "data_predict_tok.pkl")
predict_tok.shape

In [None]:
predict_tok