In [3]:
# !pip -q install keras-tuner keras_nlp

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [13]:
# Select path: base prefix that is prepended to every data/model file path in this notebook.
# path = '/content/drive/MyDrive/Portfolio/treat_cancer/' # colab rbalbinotti
# path = '/content/drive/MyDrive/treat_cancer/' # colab
path = '' # vscode (empty prefix: the './data_files/...' paths stay relative)

# Select if use PCA: when True the padded sequences are reduced with PCA before
# being saved, and the saved files get a '_PCA' suffix.
apply_pca = True

# Select if Train Mode: the cells guarded by `if train_mode:` only run when this is True.
train_mode = False

# Import libraries

In [14]:
import numpy as np
import pandas as pd
import joblib
import warnings

import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from keras.layers import Dense, Dropout, Embedding, LSTM, Masking, Input
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, CSVLogger
from keras.optimizers import Adafactor
from keras_tuner import RandomSearch
from keras.metrics import CategoricalAccuracy, AUC, Precision, Recall, F1Score

# Config notebook
# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')
# Use the full option name. pandas resolves option names as regex patterns, so
# the former 'display.max.colwidth' only worked as an accidental partial match
# and could become ambiguous if a similarly named option is ever added.
pd.set_option('display.max_colwidth', 200)


In [15]:
# Load the tokenized training set from the data folder under `path`.
train_tok = pd.read_parquet(
    path + "./data_files/data_train_tok.parquet"
)
print(f'Train shape: {train_tok.shape}')

In [16]:
def split():
  """Pad the tokenized texts and return a stratified train/test split.

  Reads the notebook-level globals `sequences`, `max_len` and `labels`.
  The sequences are post-padded to `max_len`; 25% of the rows are held out,
  stratified on `labels`, shuffled with a fixed seed (42).

  Returns:
    Tuple `(X_train, X_test, y_train, y_test)`.
  """
  features = pad_sequences(sequences, maxlen=max_len, padding="post")

  # Hold-out configuration, kept in one place
  split_options = dict(
      test_size=0.25,
      stratify=labels,
      shuffle=True,
      random_state=42)

  train_x, test_x, train_y, test_y = train_test_split(features, labels, **split_options)

  return train_x, test_x, train_y, test_y


# One document per row: gene, variation and tokenized text joined by spaces
texts = train_tok[['gene', 'variation', 'text_tok']].apply(lambda x: ' '.join(x), axis=1).values
# Shift the class ids down by one (assumes 'class' starts at 1 — TODO confirm)
labels = (train_tok['class'] - 1).values
print(f"Texts: {texts.shape}")
print(f"Labels: {labels.shape}")

num_class = len(np.unique(labels))

# Encode labels as contiguous integers, then one-hot encode them
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
labels = to_categorical(labels, num_classes=num_class)

# Tokenization
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts) # create dict index
sequences = tokenizer.texts_to_sequences(texts) # transform texts in sequence numbers
word_index = tokenizer.word_index

# Biggest string (by character count), kept for inspection only
longest_string = max(texts, key=len)

# Padding length = longest *tokenized* sequence. The previous value was the
# whitespace word count of the text with the most characters, which is neither
# guaranteed to be the text with the most words nor equal to what the tokenizer
# emits, so longer sequences could be truncated by pad_sequences.
max_len = max(len(seq) for seq in sequences)
print(f"Len: {max_len}")

if apply_pca:
    X_train, X_test, y_train, y_test = split()

    # PCA: fitted on the training split only, then applied to the test split.
    # random_state makes the reduction reproducible, like the seeded split.
    # NOTE(review): PCA turns the integer token ids into real-valued components,
    # while the model cell feeds X_train into an Embedding layer, which looks up
    # integer indices — confirm this combination is intended.
    pca = PCA(n_components=400, random_state=42)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    file_suffix = '_PCA'

else:
    # without PCA: sequences are padded to a fixed length instead
    max_len = 18000
    X_train, X_test, y_train, y_test = split()

    file_suffix = ''

# save (same file names as before: '<name>_PCA.joblib' with PCA, '<name>.joblib' without)
for split_name, split_data in (('X_train', X_train),
                               ('X_test', X_test),
                               ('y_train', y_train),
                               ('y_test', y_test)):
    joblib.dump(split_data, path + './data_files/' + split_name + file_suffix + '.joblib')

# Create model
Run this section only when training the model (`train_mode = True`).

### Keras Tuner

In [17]:
if train_mode:

  # Training / search settings
  batch, epochs = 32, 30
  patience = int(epochs * 0.2)  # early-stopping patience: 20% of the epochs
  max_trail = 30                # passed to the tuner as max_trials

  # Tuner project name and the CSV log file derived from it
  project = 'kerasTuningLSTM_test'
  logger = 'train_log_' + project + '.csv'

  print(f"batch: {batch}\nepochs: {epochs}\npatience: {patience}\nTrail: {max_trail}\nproject name: {project}\ncsv_logger: {logger}")


In [18]:
if train_mode:

  print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')
  max_len = X_train.shape[1]
  # One softmax unit per label column, so the model follows the data instead
  # of a hard-coded class count.
  num_outputs = y_train.shape[1]

  # Model
  def build_model(hp):
    """Build and compile the stacked-LSTM classifier for one tuner trial.

    Architecture: Embedding -> LSTM(256) -> LSTM(128) -> LSTM(64) ->
    Dense(32) -> softmax, with a Dropout layer after every recurrent/dense
    stage. The first two Dropout layers share one rate.

    Args:
      hp: Keras Tuner hyperparameter container. Tuned values: the three
        dropout rates, the Dense(32) activation and the learning rate.

    Returns:
      The compiled Sequential model.
    """
    # Hyperparameters
    dropout_rate1 = hp.Float('dropout_rate_l1', min_value=0.1, max_value=0.5, step=0.1)
    dropout_rate2 = hp.Float('dropout_rate_l2', min_value=0, max_value=0.5, step=0.1)
    dropout_rate3 = hp.Float('dropout_rate_l3', min_value=0, max_value=0.5, step=0.1)
    # The former 'activation_method' choice was registered but never used by
    # any layer, so it only added a no-op dimension to the search space.

    # Model
    model = Sequential()

    # Explicit input shape; replaces the deprecated Embedding(input_length=...)
    model.add(Input(shape=(max_len,)))

    model.add(Embedding(input_dim=len(word_index) + 1, output_dim=128))

    # model.add(Masking(mask_value=0.0)) # ignore zeros

    model.add(LSTM(units=256, return_sequences=True))

    model.add(Dropout(dropout_rate1))

    model.add(LSTM(units=128, return_sequences=True))

    model.add(Dropout(dropout_rate1))

    model.add(LSTM(units=64))

    model.add(Dropout(dropout_rate2))

    model.add(Dense(units=32, activation=hp.Choice('activation_32', values=['sigmoid', 'tanh', 'gelu'])))

    model.add(Dropout(dropout_rate3))

    model.add(Dense(num_outputs, activation='softmax'))

    # Compile. The metric strings are kept verbatim: the callbacks and the
    # tuner objective monitor 'val_AUC'.
    model.compile(optimizer=Adafactor(
        learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='categorical_crossentropy',
                  metrics=['categorical_accuracy', 'precision', 'recall', 'AUC'])

    return model

  # Config callbacks
  # AUC has to be maximised. The direction is stated explicitly because the
  # default 'auto' mode infers it from the monitored name, and that inference
  # is not reliable for the capitalised 'val_AUC' across Keras versions.
  early_stop = EarlyStopping(monitor='val_AUC', mode='max', patience=patience, restore_best_weights=True)
  csv_logger = CSVLogger(path + logger, append=True) # 'train_log_LSTM.csv'

  # Config RandomSearch
  tuner = RandomSearch(
      build_model,
      objective='val_AUC',
      max_trials=max_trail, # distinct combinations hyperparameters
      max_retries_per_trial=1,
      directory=path,
      project_name=project
  )

  # Balance classes
  # y_train is one-hot encoded, so recover the integer class id of every row
  # first. Computing the weights on the raveled one-hot matrix (as before)
  # balances the values 0 and 1 rather than the classes, and produced a dict
  # that only covered class ids 0 and 1.
  y_train_ids = np.argmax(y_train, axis=1)
  present_classes = np.unique(y_train_ids)
  balanced_weights = compute_class_weight('balanced', classes=present_classes, y=y_train_ids)
  # Keras expects {class index: weight} with plain Python numbers
  class_weights = {int(cls): float(w) for cls, w in zip(present_classes, balanced_weights)}

#### Fit Model

In [19]:
if train_mode:

  # Keyword arguments handed to the tuner search
  fit_options = dict(
      epochs=epochs,
      batch_size=batch,
      validation_split=0.22,
      class_weight=class_weights,
      shuffle=True,
      verbose=1,
      callbacks=[early_stop, csv_logger])

  # Run the hyperparameter search on the training split
  history = tuner.search(X_train, y_train, **fit_options)

  # Save the top-ranked model as '<path><project>.keras'
  best_models = tuner.get_best_models(num_models=1)
  top_model = best_models[0]
  keras.saving.save_model(top_model, path + project + '.keras')