# Libraries

In [42]:
# Basic
import os
import warnings
import pandas as pd # type: ignore
import numpy as np 
import joblib 


# Text-Processing
from nltk.corpus import stopwords # type: ignore
from nltk.tokenize import RegexpTokenizer, word_tokenize # type: ignore
from nltk.stem import WordNetLemmatizer # type: ignore


# Metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Visualization
import plotly.express as px # type: ignore
import plotly.graph_objs as go


# Models
from sklearn.model_selection import train_test_split, RandomizedSearchCV # type: ignore
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import GradientBoostingClassifier


# Initialize

# Create the lemmatizer once at notebook level; pre_process() reads this global
lemmatizer = WordNetLemmatizer()

# NOTE(review): the cells in the "keras" section call `keras_nlp`, so they only
# run when the block below is enabled.
# # set keras run
# os.environ["KERAS_BACKEND"] = "tensorflow"
# import keras_nlp # type: ignore
# import keras # type: ignore
# # Use mixed precision to speed up all training in this guide.
# keras.mixed_precision.set_global_policy("mixed_float16")

# Ignore every warning for the rest of the session (keeps cell output short,
# but also hides deprecation warnings)
warnings.filterwarnings('ignore')

# ETL

In [45]:
# Load the original train and test parquet files into DataFrames
data = pd.read_parquet('./data_files/org_train.parquet')
data_predict = pd.read_parquet('./data_files/org_test.parquet')

In [None]:
# Preview the first three rows of the raw training frame
data.head(3)

In [None]:
# Check duplicates: count fully duplicated rows instead of displaying the whole
# de-duplicated frame. A result of 0 means there is nothing to drop.
data.duplicated().sum()

In [None]:
# Lower-case the gene and variation columns of both frames
key_cols = ['gene', 'variation']
for frame in (data, data_predict):
    frame[key_cols] = frame[key_cols].apply(lambda series: series.str.lower())

## Function

In [None]:
def pre_process(df, col='clinical_evidence', target=True):
    """Clean the free-text column and merge it with gene/variation into ``text``.

    Every entry of ``df[col]`` is tokenized (a hyphenated compound is kept as a
    single token), English stopwords are removed, each remaining token is
    lemmatized with the notebook-level ``lemmatizer`` and the tokens are joined
    back with single spaces. The cleaned text is prefixed with the ``gene`` and
    ``variation`` values and stored in a new ``text`` column; the three source
    columns are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``gene``, ``variation`` and ``col`` columns.
        It is modified in place.
    col : str, default 'clinical_evidence'
        Name of the free-text column to clean.
    target : bool, default True
        Kept for backward compatibility; it does not change the processing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without ``gene``, ``variation`` and ``col`` and
        with the additional ``text`` column.
    """
    # create tokenizer: try hyphenated compounds first, then plain words
    tokenizer = RegexpTokenizer(r"\w+-\w+|\b\w+\b")

    # load stopwords once; a set makes the membership test cheap
    stop = set(stopwords.words('english'))

    def _clean(text):
        # Lemmatize token by token. Lemmatizing the already-joined string would
        # iterate over single characters and leave every word unchanged.
        kept = (token for token in tokenizer.tokenize(text) if token not in stop)
        return ' '.join(lemmatizer.lemmatize(token) for token in kept)

    # Column-wise assignment keeps each value on its own index label, so the
    # result is correct even when the index is not 0..n-1.
    df[col] = df[col].map(_clean)

    df['text'] = df['gene'] + ' ' + df['variation'] + ' ' + df[col]

    df.drop(columns=['gene', 'variation', col], inplace=True)

    return df

Preprocessing takes a while to run. You can skip it and load the already-processed data below instead.

In [None]:
# data_train_pre = pre_process(data, 'clinical_evidence', target=True)
# data_train_pre.head(3)

In [None]:
# save data train
# data_train_pre.to_parquet('./data_files/train_pre.parquet')

In [None]:
# data_for_predict = pre_process(data_predict, 'clinical_evidence', target=False)
# data_for_predict.head(3)

In [None]:
# save data predict
# data_for_predict.to_parquet('./data_files/data_for_predict.parquet')

In [None]:
# Free the raw and intermediate frames before modelling.
# Only these names are removed, so the imports stay loaded and the following
# cells keep working on a straight top-to-bottom run.
for _name in ('data', 'data_predict', 'data_train_pre', 'data_for_predict'):
    globals().pop(_name, None)

# Split data

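Note: if the kernel namespace was cleared (for example with `%reset -f`),
re-run the imports cell in the *Libraries* section before continuing.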

In [46]:
# Load the pre-processed training data and preview the first three rows
data = pd.read_parquet('./data_files/train_pre.parquet')
data.head(3)

Unnamed: 0,target,text
0,1,fam58a truncating_mutations cyclin-dependent k...
1,2,cbl w802* abstract background non-small cell l...
2,2,cbl q249e abstract background non-small cell l...


In [47]:
# Load the pre-processed data to predict on and preview the first three rows
predict = pd.read_parquet('./data_files/data_for_predict.parquet')
predict.head(3)

Unnamed: 0,text
0,acsl4 r570s 2 mutation resulted myeloprolifera...
1,naglu p521l abstract large tumor suppressor 1 ...
2,pah l333f vascular endothelial growth factor r...


### Distribution target values

In [5]:
# Share of each target class, expressed in percent
target = data['target'].value_counts(normalize=True).mul(100)

In [6]:
# Bar chart of the class shares; the bar labels carry the values, so the
# y axis title, tick labels and legend are hidden
fig = (
    px.bar(target, title='Target Distribution - Percent(%)', text_auto=True)
    .update_layout(showlegend=False, yaxis_title='')
    .update_yaxes(showticklabels=False)
)
fig.show()

## Train, test, Validate 

In [7]:
# Feature text and class label taken from the training frame
x, y = data['text'], data['target']

### Split data stratified

In [8]:
# Hold out 20% of the rows, keeping the class proportions of y in both parts
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=14,
)
print(f'Observations in x_train: {len(x_train)}')
print(f'Observations in x_test: {len(x_test)}')

Observations in x_train: 2652
Observations in x_test: 664


In [9]:
# Class shares (in %) of the training split after the stratified split
train_share = y_train.value_counts(normalize=True) * 100
fig = px.bar(
    train_share,
    title='Train: Target Distribution after split - Percent(%)',
    text_auto=True,
)
fig.update_layout(showlegend=False, yaxis_title='')
fig.update_yaxes(showticklabels=False)
fig.show()

In [10]:
# Class shares (in %) of the held-out split after the stratified split
test_share = y_test.value_counts(normalize=True) * 100
fig = px.bar(
    test_share,
    title='Test: Target Distribution after split - Percent(%)',
    text_auto=True,
)
fig.update_layout(showlegend=False, yaxis_title='')
fig.update_yaxes(showticklabels=False)
fig.show()

# Tokenization

### Vocabulary

In [10]:
# text_comp = pd.concat([data['text'], predict['text']])

In [11]:
# Words train
# vectorizer = TfidfVectorizer()
# vectorizer = vectorizer.fit(text_comp)


In [12]:
# Save Vocabulary
# joblib.dump(vectorizer, './data/tfidf_vectorizer.pkl')

### Transform

In [11]:
# Load the fitted TF-IDF vectorizer (vocabulary) saved by the cells above.
# joblib.load unpickles the file, so only load artefacts you trust.
# NOTE(review): the parquet files are read from ./data_files/ while the pickles
# are read from ./data/ - confirm both directories are intended.
tf_idf = joblib.load('./data/tfidf_vectorizer.pkl')

In [12]:
# Vectorize both splits with the already-fitted vectorizer (transform only)
X_train_tfidf, X_test_tfidf = (
    tf_idf.transform(split) for split in (x_train, x_test)
)

In [13]:
# Inspect the terms that make up the vectorizer's feature space
tf_idf.get_feature_names_out()

array(['00', '000', '0000', ..., 'ﬁve', 'ﬁxed', 'ﬂanks'], dtype=object)

# Models

In [16]:
# Create Classifier (fixed random_state for repeatable fits)
# NOTE(review): scikit-learn documents `validation_fraction` as only used when
# `n_iter_no_change` is set, which it is not here, so it likely has no effect - confirm.
GB_clf = GradientBoostingClassifier(validation_fraction=0.15, random_state=0)

In [17]:
# Benchmarking: baseline model with the classifier's default hyper-parameters.
# The fit and dump lines are kept commented out; the saved model is loaded instead.
# GB_clf_bench = GB_clf.fit(X_train_tfidf, y_train)
# joblib.dump(GB_clf_bench, './data/GB_clf_bench.pkl')

# load the previously saved benchmark model (joblib.load unpickles: trusted files only)
GB_clf_bench = joblib.load('./data/GB_clf_bench.pkl')

In [18]:
# Accuracy on the training split versus the held-out split from train_test_split.
# The second line is labelled "Validation" but is computed on x_test / y_test.
print(f"Accuracy score (training): {GB_clf_bench.score(X_train_tfidf, y_train):.3f}")
print(f"Accuracy score (Validation): {GB_clf_bench.score(X_test_tfidf, y_test):.3f}")

Accuracy score (training): 0.956
Accuracy score (Validation): 0.666


In [19]:
# Test Bench
bench_pred = GB_clf_bench.predict(X_test_tfidf)

In [20]:
# Confusion matrix of the benchmark model on the held-out split.
# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; px.imshow draws rows along y and columns along x, so the
# y axis is "Real Values" and the x axis is "Predict Values".
class_labels = np.union1d(y_test, bench_pred)
cm = confusion_matrix(y_test, bench_pred, labels=class_labels)

# Label the ticks with the actual class ids instead of the 0-based positions
tick_text = [str(label) for label in class_labels]

fig = px.imshow(cm, x=tick_text, y=tick_text, text_auto=True, aspect='auto',
                title='Bench Confusion Matrix',
                range_color=[0, 21], color_continuous_scale='tempo')
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(xaxis=dict(title='Predict Values'), yaxis=dict(title='Real Values'))
fig.show()

In [21]:
# Per-class precision, recall and f1 of the benchmark model on the held-out split
print("Classification Report")
print(classification_report(y_test, bench_pred))

Classification Report
              precision    recall  f1-score   support

           1       0.56      0.65      0.60       113
           2       0.67      0.43      0.52        91
           3       0.53      0.50      0.51        18
           4       0.70      0.76      0.73       137
           5       0.53      0.35      0.42        48
           6       0.88      0.55      0.67        55
           7       0.69      0.86      0.77       191
           8       0.33      0.25      0.29         4
           9       1.00      0.71      0.83         7

    accuracy                           0.67       664
   macro avg       0.66      0.56      0.59       664
weighted avg       0.67      0.67      0.66       664



## Model Gradient Boosting

In [22]:
# Randomized Search
# The search is long-running, so the fitted object is cached on disk: when the
# pickle already exists it is loaded instead of refitting. Delete the file to
# force a new search (for example after changing `parameters`).
search_path = './data/rand_search.pkl'

parameters = {
    'n_estimators': np.array([80, 100, 120, 200]), 
    'learning_rate': np.array([0.001, 0.01, 0.1, 0.12]),
    'min_samples_split': np.array([2,3,4,5]),
    'min_samples_leaf': np.array([2,3,4,5]),
    'max_depth': np.array([None, 2, 3, 4]),
    }

if os.path.exists(search_path):
    # joblib.load unpickles the file: only load artefacts written by this notebook
    rand_search = joblib.load(search_path)
else:
    rand_search = RandomizedSearchCV(estimator=GB_clf,
                                param_distributions=parameters,
                                n_jobs=3,
                                random_state=0)

    rand_search.fit(X_train_tfidf, y_train)

    joblib.dump(rand_search, search_path)


['./data/rand_search.pkl']

Fit time: 126 h 41 min on an Intel Core i5 with 8 GB of RAM.

In [14]:
# Load the saved randomized-search object (joblib.load unpickles: trusted files only)
GB_trained = joblib.load('./data/rand_search.pkl')

In [39]:
# Summarise the loaded search: best score, best estimator and best parameters
print(f"Best score: {GB_trained.best_score_}\n"
      f"Best estimator: {GB_trained.best_estimator_}\n"
      f"Best params: {GB_trained.best_params_}")

Best score: 0.6591337099811676
Best estimator: GradientBoostingClassifier(max_depth=4, min_samples_leaf=5, min_samples_split=4,
                           n_estimators=200, random_state=0,
                           validation_fraction=0.15)
Best params: {'n_estimators': 200, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_depth': 4, 'learning_rate': 0.1}


In [21]:
# Predict the held-out split with the loaded search object
GB_best_predict = GB_trained.predict(X_test_tfidf)

In [26]:
# Confusion matrix of the tuned Gradient Boosting model on the held-out split.
# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; px.imshow draws rows along y and columns along x, so the
# y axis is "Real Values" and the x axis is "Predict Values".
class_labels = np.union1d(y_test, GB_best_predict)
cm = confusion_matrix(y_test, GB_best_predict, labels=class_labels)

# Label the ticks with the actual class ids instead of the 0-based positions
tick_text = [str(label) for label in class_labels]

fig = px.imshow(cm, x=tick_text, y=tick_text, text_auto=True, aspect='auto',
                title='Gradient Boosting Confusion Matrix',
                range_color=[0, 21], color_continuous_scale='tempo')
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(xaxis=dict(title='Predict Values'), yaxis=dict(title='Real Values'))
fig.show()

In [27]:
# Per-class precision, recall and f1 of the tuned model on the held-out split
print("Classification Report - Gradient Boosting")
print(classification_report(y_test, GB_best_predict))

Classification Report - Gradient Boosting
              precision    recall  f1-score   support

           1       0.60      0.69      0.64       113
           2       0.66      0.47      0.55        91
           3       0.57      0.44      0.50        18
           4       0.74      0.75      0.74       137
           5       0.55      0.44      0.49        48
           6       0.89      0.56      0.69        55
           7       0.71      0.86      0.78       191
           8       0.50      0.25      0.33         4
           9       0.86      0.86      0.86         7

    accuracy                           0.69       664
   macro avg       0.67      0.59      0.62       664
weighted avg       0.69      0.69      0.68       664



# keras

In [None]:
# NOTE(review): `keras_nlp` is only imported in a commented-out block of the
# imports cell, so this cell needs that import enabled before it can run.
# Unbatched input.
tokenizer = keras_nlp.models.BertTokenizer.from_preset(
    "bert_base_en_uncased",
)
# NOTE(review): data.values[0] is the whole first row of `data` (every column,
# not only the text) - confirm this is the intended tokenizer input.
tokenizer(data.values[0].tolist())

# Batched input.
# tokenizer(["The quick brown fox jumped. It's cool!", "The fox slept.", "It's cool!"])

# Detokenization.
# tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

# # Custom vocabulary.
# vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
# vocab += ["The", "quick", "brown", "fox", "jumped", "."]
# tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab)
# tokenizer("The quick brown fox jumped.")

In [None]:
# Turn a hard-coded list of token ids back into text with the tokenizer above
tokenizer.detokenize([1996, 4248, 2829, 4419, 5598, 1012, 2009, 1005, 1055, 4658, 999])

In [None]:
# Same check on just the last two ids of the list used in the previous cell
tokenizer.detokenize([4658, 999])

In [None]:
# Shape of the training frame. It is loaded as `data` in this notebook;
# no `train` variable is ever defined.
data.shape

In [None]:
# Shape of the frame to predict on
predict.shape

In [None]:
# All documents from both frames as one array, used to train the vocabulary.
# The frames loaded above are `data` and `predict`, and their text column is
# `text`; neither a `train` variable nor a `corpus` column exists.
corpus = pd.concat([data['text'], predict['text']]).values

In [None]:
# Number of documents in the combined corpus
len(corpus)

In [None]:
# Train custom vocabulary from data: a word-piece vocabulary capped at 20,000
# entries, lower-cased and accent-stripped, with the listed reserved tokens.
# NOTE(review): needs `keras_nlp`, whose import is commented out in the imports cell.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    corpus,
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
vocab

In [None]:

# Build a word-piece tokenizer on the custom vocabulary, using the same
# lowercase / strip_accents settings as the vocabulary training cell and
# "[UNK]" as the out-of-vocabulary token. This rebinds the `tokenizer` name.
# NOTE(review): needs `keras_nlp`, whose import is commented out in the imports cell.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,
    strip_accents=True,
    oov_token="[UNK]",
)