# Import Libraries

In [1]:
# Mount Google Drive at /content/drive so project files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip -q install keras keras_nlp keras_tuner

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/644.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m501.8/644.1 kB[0m [31m16.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.1/644.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m5.1/5.2 MB[0m [31m124.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.2/5.2 MB[0m [31m117.3 MB/s[0m eta [36m0:00:01[0m[2K   

In [3]:
# Select path: project root that data, log and model files are read from / written to.
# Keep exactly one `path` assignment active.
path = '/content/drive/MyDrive/Portfolio/treat_cancer/' # colab rbalbinotti
# path = '/content/drive/MyDrive/treat_cancer/' # colab
# path = '' # vscode

# Select if Train Mode: the model-building and fit cells are guarded by `if train_mode:`.
# The evaluation cells load an already saved model regardless of this flag.
train_mode = False

In [4]:
import os
# Select the Keras backend; this is set before `keras` is imported further down.
os.environ["KERAS_BACKEND"] = "tensorflow"

# Base
import warnings
import joblib

# Plot
import plotly.express as px

# Models
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from keras_nlp.models import DistilBertTokenizer, DistilBertPreprocessor, DistilBertClassifier
from keras.optimizers import Adafactor
from keras.callbacks import EarlyStopping, CSVLogger
from keras.utils import to_categorical
from keras_tuner import RandomSearch

# NLP Model: name of the pretrained preset handed to `DistilBertClassifier.from_preset`.
# model_pre = "bert_base_en_uncased"
model_pre = "distil_bert_base_en_uncased"

# Config notebook: silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [5]:
def pre_process(data, col1='gene', col2='variation', col3='clinical_evidence', target_col=None):
    """
        Combine the text columns and, optionally, shift the class labels to start at 0.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing `col1`, `col2`, `col3` and, when given, `target_col`.
        col1, col2, col3 : str
            Columns concatenated (separated by a single space) into `combined_text`.
        target_col : str or None
            Label column. When provided, `class_adjusted` = `data[target_col] - 1`
            is added so labels start at 0, as needed by keras `to_categorical`.

        Returns
        -------
        pandas.DataFrame
            A new frame with `combined_text` and, if `target_col` is given,
            `class_adjusted`. The input frame is left untouched.
    """

    combined_text = data[col1] + " " + data[col2] + " " + data[col3]

    # Build a fresh frame instead of assigning new columns into `data`,
    # so the caller's DataFrame is not modified as a side effect.
    result = combined_text.to_frame(name='combined_text')

    if target_col is not None:
        # Adjust class to start at 0 (e.g. 1-9 becomes 0-8).
        result['class_adjusted'] = data[target_col] - 1

    return result

#### Train Data

In [6]:
# Load data for train
data_train = pd.read_parquet(path + 'data_for_train.parquet')
# Report (rows, columns); the message no longer carries an extra closing parenthesis.
print(f"data_train: {data_train.shape}")

data_train: (3316, 4))


In [7]:
# Build the modelling frame: combined text plus zero-based labels derived from the 'class' column.
data_train_pre = pre_process(data_train, target_col='class')

In [8]:
# Preview the first two pre-processed rows.
data_train_pre.head(2)

Unnamed: 0,combined_text,class_adjusted
0,FAM58A Truncating_Mutations cyclindependent ki...,0
1,CBL W802* abstract background nonsmall cell l...,1


In [9]:
# Combined text as a plain Python list; this is the feature input passed to the splits below.
text = data_train_pre['combined_text'].tolist()

In [10]:
# Number of distinct adjusted classes found in the training data; used as the one-hot width
# and as the classifier's number of output classes.
num_class = len(data_train_pre['class_adjusted'].unique())
num_class

9

In [11]:
# Create one-hot encode for classification 9 classes
label = to_categorical(data_train_pre['class_adjusted'], num_classes=num_class)
label

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### Split

In [12]:
# Convert One_hot encode
class_reverted = np.argmax(label, axis=1)

fig = px.histogram(class_reverted, histnorm='probability density' ,title='Original Density Class', text_auto=".2%")
fig.update_layout(showlegend=False, yaxis_title='')
fig.update_yaxes(showticklabels=False)
fig.show()

In [13]:
# Make Split train, test
X_train, X_test, y_train, y_test = train_test_split(
    text,
    label,
    test_size=0.2,
    random_state=34,
    stratify=label
)

# save
joblib.dump(X_train, path + './data_files/X_train_DISTIL.joblib')
joblib.dump(X_test, path + './data_files/X_test_DISTIL.joblib')
joblib.dump(y_train, path + './data_files/y_train_DISTIL.joblib')
joblib.dump(y_test, path + './data_files/y_test_DISTIL.joblib')

['/content/drive/MyDrive/Portfolio/treat_cancer/./data_files/y_test_DISTIL.joblib']

In [14]:
# Sample counts of the train/test split, one per line.
print(
    f"X_train shape: {len(X_train)}",
    f"X_test shape: {len(X_test)}",
    f"y_train shape: {len(y_train)}",
    f"y_test shape: {len(y_test)}",
    sep="\n",
)

X_train shape: 2652
X_test shape: 664
y_train shape: 2652
y_test shape: 664


In [15]:
# Flatten array multidimensions after split
class_reverted = np.argmax(y_train, axis=1)

fig = px.histogram(class_reverted, histnorm='probability density' ,title='y_train Density Class', text_auto=".2%")
fig.update_layout(showlegend=False, yaxis_title='')
fig.update_yaxes(showticklabels=False)
fig.show()

In [16]:
# Create validation data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=27, stratify=y_train)

In [17]:
# Sample counts after the train/validation split, one per line.
print(
    f"X_train shape: {len(X_train)}",
    f"X_val shape: {len(X_val)}",
    f"y_train shape: {len(y_train)}",
    f"y_val shape: {len(y_val)}",
    sep="\n",
)

X_train shape: 2386
X_val shape: 266
y_train shape: 2386
y_val shape: 266


### Create Model

In [18]:
if train_mode:

  def build_model(hp):
      """
          Build and compile a DistilBERT classifier for one tuner trial.

          hp: keras_tuner hyperparameter container. Samples `dropout_rate`
              (0.1 to 0.5, step 0.1) and `learning_rate` (1e-2, 1e-3 or 1e-4).

          Returns the compiled classifier with a softmax head of `num_class` units.
      """
      # Hyperparameters
      dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
      learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

      # Model
      bert_model = DistilBertClassifier.from_preset(
          model_pre,
          num_classes=num_class,
          activation="softmax",
          dropout=dropout_rate)

      # Compile
      bert_model.compile(
          optimizer = Adafactor(learning_rate=learning_rate),
          loss='categorical_crossentropy',
          metrics=['AUC', 'categorical_accuracy', 'precision', 'recall'])

      return bert_model

  # Config Callbacks
  # AUC is a "higher is better" metric, so the direction is stated explicitly instead of
  # relying on the callback's automatic inference from the metric name.
  # NOTE(review): early stopping and the tuner objective both track the training 'AUC';
  # if validation data is supplied to the search, 'val_AUC' may be the intended target — confirm.
  early_stopping = EarlyStopping(monitor='AUC', mode='max', patience=5, restore_best_weights=True)
  csv_logger = CSVLogger(path + 'train_log_distil.csv', append=True)

  # Config RandomSearch
  tuner = RandomSearch(
      build_model,
      objective='AUC',
      max_trials=10, # distinct combinations hyperparameters
      max_retries_per_trial=2,
      directory= path,
      project_name='DistilBertClass'
  )

  # Balance classes
  # y_train is one-hot encoded, so the weights are computed on the integer class ids.
  # Raveling the one-hot matrix would only expose the values 0 and 1 and produce
  # two meaningless weights instead of one weight per class.
  y_train_ids = np.argmax(y_train, axis=1)
  class_ids = np.unique(y_train_ids)
  class_weights = compute_class_weight('balanced', classes=class_ids, y=y_train_ids)
  class_weights = {int(c): float(w) for c, w in zip(class_ids, class_weights)}

#### Fit

In [19]:
if train_mode:
  # Batch size(s) to try and the maximum number of epochs per trial.
  batch_sizes = [64]
  epoch = 12

  # Fit Model: run the hyperparameter search once per batch size and save the best model found.
  # NOTE(review): the same `tuner` object is reused on every iteration; with more than one
  # batch size, confirm that later iterations still run new trials.
  for batch_size in batch_sizes:
      print(f"Train with batch_size: {batch_size}")
      # NOTE(review): confirm that `tuner.search` returns something useful; `history`
      # is not read anywhere in the visible cells.
      history = tuner.search(
          X_train,
          y_train,
          validation_data=(X_val, y_val),
          epochs=epoch,
          batch_size=batch_size,
          # class_weight=class_weights, # explode memory in colab
          callbacks=[early_stopping, csv_logger])

      # Keep only the top-ranked model and store it under the project root,
      # with the batch size in the file name.
      best_models = tuner.get_best_models(num_models=1)
      keras.saving.save_model(best_models[0], path + f'best_model_distilbertclassifier_{batch_size}.keras')

  tuner.results_summary()

# Evaluate Model

In [22]:
# Load the saved classifier used for evaluation.
# NOTE(review): this reads best_models/DistilbertClassifier_32.keras, while the fit cell writes
# best_model_distilbertclassifier_{batch_size}.keras into the project root with batch size 64 —
# confirm this file is the intended model and how it gets to this location.
diltil_model = keras.models.load_model(path + "./best_models/DistilbertClassifier_32.keras")

In [24]:
# Predict
y_pred = diltil_model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 749ms/step


In [25]:
# Persist the raw model outputs and the predicted class ids under data_files.
# NOTE(review): the first file name is spelled 'disltil'; left unchanged here because other
# code may load the file by this exact name — confirm before renaming.
joblib.dump(y_pred, path + './data_files/y_pred_disltil.joblib')
joblib.dump(y_pred_class, path + './data_files/y_pred_class_distil.joblib')

['/content/drive/MyDrive/Portfolio/treat_cancer/./data_files/y_pred_class_distil.joblib']