<b> Notice: Run data_org.ipynb first </b>

# Import libraries

In [1]:
import pandas as pd # type: ignore
import warnings
import string
import joblib

# NLP
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# The visible cells only read token text and lemmas, so the dependency parser
# and the named-entity recognizer are switched off to cut processing time.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Config notebook
# NOTE(review): this silences every warning, deprecation notices included.
warnings.filterwarnings('ignore')
# Canonical option key: 'display.max.colwidth' only resolved because pandas
# treats the key as a regex, which can break if a similar option is added.
pd.set_option('display.max_colwidth', 200)

##### Create Functions

In [2]:
def pre_process(text, nlp_pipeline=None, stop_words=None):
    """
    Remove stop words, punctuation and whitespace, tokenize and lemmatize.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (e.g. None or NaN coming
        from a missing cell) yields an empty string.
    nlp_pipeline : callable, optional
        Callable that turns a string into an iterable of spaCy-like tokens
        (objects exposing ``text``, ``lemma_``, ``is_punct``, ``is_space``).
        Defaults to the notebook-level ``nlp`` pipeline.
    stop_words : collection of str, optional
        Lower-cased words to discard. Defaults to spaCy's ``STOP_WORDS``.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    if nlp_pipeline is None:
        nlp_pipeline = nlp
    if stop_words is None:
        stop_words = STOP_WORDS

    ascii_punctuation = set(string.punctuation)
    lemmas = []
    for token in nlp_pipeline(text):
        # Whitespace tokens and Unicode punctuation of any length.
        if token.is_space or token.is_punct:
            continue
        # ASCII symbols such as '+', '=' or '$' are not flagged by is_punct,
        # so they are filtered per character. A plain `in string.punctuation`
        # test is a substring check and misses runs such as '...' or '--'.
        if all(char in ascii_punctuation for char in token.text):
            continue
        if token.text.lower() in stop_words:
            continue
        lemmas.append(token.lemma_)

    return ' '.join(lemmas)

# ETL for the training data

In [3]:
# Read the training frame from its parquet file.
train_parquet_path = './data_files/data_for_train.parquet'
data_train = pd.read_parquet(train_parquet_path)

In [None]:
# Preview the first three rows of the loaded training frame.
data_train.head(3)

### Data Train

In [8]:
# Build the 'text_tok' column by running pre_process on every
# 'clinical_evidence' entry.
data_train['text_tok'] = data_train['clinical_evidence'].apply(pre_process)
# Author's note: this cell took about 4 hours to execute.

In [9]:
# Keep only the modelling columns, in a fixed order. Plain column selection
# replaces drop + reindex: the cell can be re-run without a KeyError, and a
# missing column fails loudly instead of being added silently as all-NaN.
data_train = data_train[['gene', 'variation', 'text_tok', 'class']].copy()

In [None]:
# Preview the frame after tokenization and column selection.
data_train.head(3)

In [None]:
# Persist the tokenized training frame; the same path is read back further down.
train_tok_path = "data_files/data_train_tok.pkl"
joblib.dump(data_train, train_tok_path)

##### Load lemmatized training data

###### Libraries

In [1]:
import numpy as np
import pandas as pd
import joblib
import warnings

import os
# Select the Keras backend. Placed ahead of the keras imports below —
# presumably the variable has to be set before keras is imported; confirm.
os.environ["KERAS_BACKEND"] = "tensorflow"

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, TextVectorization, CategoryEncoding
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Config notebook
# NOTE(review): this silences every warning, deprecation notices included.
warnings.filterwarnings('ignore')
# Canonical option key: 'display.max.colwidth' only resolved because pandas
# treats the key as a regex, which can break if a similar option is added.
pd.set_option('display.max_colwidth', 200)


2024-10-02 08:44:49.360632: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-02 08:44:49.896709: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-02 08:44:50.126007: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-02 08:44:50.517064: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-02 08:44:50.601223: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Reload the tokenized training frame written by the joblib.dump cell above.
# NOTE(review): joblib.load unpickles the file — only load trusted files.
train_tok = joblib.load("data_files/data_train_tok.pkl")
train_tok.shape

(3316, 4)

In [3]:

# Preview the first three rows of the reloaded training frame.
train_tok.head(3)

Unnamed: 0,gene,variation,text_tok,class
0,FAM58A,Truncating_Mutations,cyclindependent kinases cdks regulate variety fundamental cellular process cdk10 stand orphan cdks activate cyclin identify kinase activity reveal previous work show cdk10 silencing increase ets2 ...,1
1,CBL,W802*,abstract background nonsmall cell lung cancer nsclc heterogeneous group disorder number genetic proteomic alteration ccbl e3 ubiquitin ligase adaptor molecule important normal homeostasis cancer...,2
2,CBL,Q249E,abstract background nonsmall cell lung cancer nsclc heterogeneous group disorder number genetic proteomic alteration ccbl e3 ubiquitin ligase adaptor molecule important normal homeostasis cancer...,2


In [4]:
# Load the vocabulary passed to the TextVectorization layer further down.
# NOTE(review): vocab.pkl is not written by any visible cell — presumably it
# comes from data_org.ipynb; confirm.
vocab = joblib.load("data_files/vocab.pkl")

In [8]:
# One-Hot Encode class label
# Class ids are shifted by -1 to obtain the 0-based indices that
# CategoryEncoding expects (assumes ids start at 1, as in the preview above).
# num_tokens must exceed the largest index, so it is taken from the maximum
# class id; counting distinct classes would be too small if an id were absent.
num_classes = int(train_tok["class"].max())
one = CategoryEncoding(num_tokens=num_classes, output_mode="one_hot")
label = one((train_tok['class'] - 1).values)

In [18]:
# Display the one-hot encoded label tensor.
label

<tf.Tensor: shape=(3316, 9), dtype=float32, numpy=
array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [19]:
max_tokens = len(vocab)
max_len = 32  # Sequence length to pad the outputs to.

# Use the TextVectorization class imported from keras.layers: the bare name
# `keras` is not imported in the visible cells, so
# `keras.layers.TextVectorization` raises NameError on a fresh kernel.
# NOTE(review): confirm whether `vocab` already contains the layer's
# padding/OOV entries; if it does not, len(vocab) may be too small a cap.
vectorize_layer = TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_len,
    vocabulary=vocab)

In [4]:
# Inverse-frequency weight per class. Keys are shifted to 0-based indices so
# they line up with the one-hot labels, which are encoded from `class - 1`;
# the original 1-based keys would weight the wrong classes.
class_counts = train_tok['class'].value_counts().sort_index()
class_weights = {
    int(class_id) - 1: 1.0 / int(count)
    for class_id, count in class_counts.items()
}

### Data Predict

In [3]:
# Read the prediction frame from its parquet file.
predict_parquet_path = './data_files/data_for_predict.parquet'
data_predict = pd.read_parquet(predict_parquet_path)

In [None]:
# Preview the first three rows of the loaded prediction frame.
data_predict.head(3)

In [5]:
# Build the 'text_tok' column by running pre_process on every
# 'clinical_evidence' entry of the prediction frame.
data_predict['text_tok'] = data_predict['clinical_evidence'].apply(pre_process)
# Author's note: this cell took about 5 hours 40 minutes to execute.

In [6]:
# Drop the raw text once it has been tokenized. errors='ignore' keeps the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
data_predict = data_predict.drop(columns=['clinical_evidence'], errors='ignore')

In [None]:
# Preview the prediction frame after the raw text column was dropped.
data_predict.head()

##### Save and load lemmatized prediction data

In [None]:
# Persist the tokenized prediction frame; the same path is read back below.
predict_tok_path = "data_files/data_predict_tok.pkl"
joblib.dump(data_predict, predict_tok_path)

In [None]:
# Reload the tokenized prediction frame written by the joblib.dump cell above.
# NOTE(review): joblib.load unpickles the file — only load trusted files.
predict_tok = joblib.load("data_files/data_predict_tok.pkl")
predict_tok.shape

In [None]:
# Preview a few rows rather than rendering the whole frame, whose text column
# is long and bloats the saved notebook.
predict_tok.head(3)