# Step 6 - Pretrained Models from Hugging Face

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; remount even if already mounted.
GDRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive


In [None]:
import string

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources one after another (tokenizer data, stop words, WordNet).
for nltk_resource in ('punkt', 'stopwords', "wordnet"):
    nltk.download(nltk_resource)

import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Pipeline

In [None]:
# ! pip3 install transformers
from transformers import pipeline
# Name the checkpoint explicitly. Without `model=` the pipeline falls back to a
# library-chosen default and logs "No model was supplied"; pinning the model
# keeps the results below tied to a known checkpoint instead of that default.
classifier = pipeline(
    'sentiment-analysis',
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [None]:
# First example sentence; the pipeline returns a list with one
# {'label', 'score'} dict for it.
txt_1 = "The weather is good today."
classifier(txt_1)

[{'label': 'POSITIVE', 'score': 0.9998569488525391}]

In [None]:
# Second example sentence, scored by the same pipeline.
txt_2 = "I don't really like this soup, mom."
classifier(txt_2)

[{'label': 'NEGATIVE', 'score': 0.9986314177513123}]

## AutoTokenizer, TFAutoModelForSequenceClassification
### What happens under the hood when we use a pipeline

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classification model.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load both pieces from that checkpoint: the tokenizer turns text into ids,
# the TF model turns ids into class logits.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [None]:
# Tokenize both sentences with default settings (no padding, no tensors):
# the result holds one id list and one attention-mask list per sentence.
inputs = tokenizer([txt_1, txt_2])
inputs

{'input_ids': [[101, 1996, 4633, 2003, 2204, 2651, 1012, 102], [101, 1045, 2123, 1005, 1056, 2428, 2066, 2023, 11350, 1010, 3566, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
# Same two sentences, this time padded to a common length, truncated at
# 256 tokens and returned as TensorFlow tensors so the model can consume them.
inputs_with_padding = tokenizer(
    [txt_1, txt_2],
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="tf",
)
inputs_with_padding

{'input_ids': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[  101,  1996,  4633,  2003,  2204,  2651,  1012,   102,     0,
            0,     0,     0,     0],
       [  101,  1045,  2123,  1005,  1056,  2428,  2066,  2023, 11350,
         1010,  3566,  1012,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [None]:
# Forward pass through the classification model; the output object carries
# the raw `logits`, one row per input sentence.
outputs = model(inputs_with_padding)
outputs

TFSequenceClassifierOutput([('logits',
                             <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
                             array([[-4.240502 ,  4.611532 ],
                                    [ 3.6350493, -2.9575799]], dtype=float32)>)])

In [None]:
import tensorflow as tf

# The first element of the model output is the logits tensor; a softmax over
# the last axis converts each row into probabilities.
logits = outputs[0]
predictions = tf.nn.softmax(logits, axis=-1)
predictions

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1.4306998e-04, 9.9985695e-01],
       [9.9863142e-01, 1.3685562e-03]], dtype=float32)>

## BERT

In [None]:
# Location of the job-postings CSV on the mounted Drive.
translated_data_path = "/content/gdrive/MyDrive/practice_nlp_2022/data/translated_data.csv"

df = pd.read_csv(translated_data_path)
df.head()

Unnamed: 0,job_no,job_description,job_type,category
0,Id-12765,Zest Scientific is searching for an accomplis...,Permanent,"Pharmaceutical, Healthcare and Medical Sales"
1,Id-22925,"In the world of typical CRO's, this company is...",Permanent,Clinical Research
2,Id-1321,Asha Mistry of Umbilical Life is recruiting an...,Contract/Interim,Pharmaceutical Marketing
3,Id-9473,Sales Representative - Laser in der Medizinte...,Permanent,"Pharmaceutical, Healthcare and Medical Sales"
4,Id-14952,Field Service Engineer - Life Science\r\r\r\nC...,Permanent,Manufacturing & Operations


In [None]:
# Integer class id assigned to each job-type label (ids follow list order).
Job_type = {
    type_name: class_id
    for class_id, type_name in enumerate([
        'Permanent',
        'Contract/Interim',
        'Contract/Temp',
        'Temporary/Seasonal',
        'Any',
        'Part-Time',
    ])
}

# Encode the textual job type as its class id; a label missing from the
# mapping raises KeyError.
df['type_label'] = df['job_type'].apply(Job_type.__getitem__)
df.head()

Unnamed: 0,job_no,job_description,job_type,category,type_label
0,Id-12765,Zest Scientific is searching for an accomplis...,Permanent,"Pharmaceutical, Healthcare and Medical Sales",0
1,Id-22925,"In the world of typical CRO's, this company is...",Permanent,Clinical Research,0
2,Id-1321,Asha Mistry of Umbilical Life is recruiting an...,Contract/Interim,Pharmaceutical Marketing,1
3,Id-9473,Sales Representative - Laser in der Medizinte...,Permanent,"Pharmaceutical, Healthcare and Medical Sales",0
4,Id-14952,Field Service Engineer - Life Science\r\r\r\nC...,Permanent,Manufacturing & Operations,0


In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw job descriptions, targets the encoded job types.
descriptions = df['job_description']
type_labels = df['type_label']

# Half of the rows go to each side; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    descriptions, type_labels, test_size=0.50, random_state=42
)

In [None]:
# Look at the first training description as raw text.
X_train.iloc[0]

'Evolve Selection are currently recruiting for an exciting new opportunity with a dynamic, specialist healthcare organisation. Our client has an extensive product range within the Renal market. This exciting role will be working as an Account Manager across the South Central and South West England and Brighton area. You will be accountable for delivering on sales and profit targets, as well as providing a high level of support and service to our clients’ NHS customers.\r\r\r\nRequirements:\r\r\r\nDegree level qualification or relevant clinical background.\r\r\r\nAble to demonstrate experience working in or around the healthcare sector.\r\r\r\nStrong sales background or able to demonstrate high level of knowledge aligned to relevant therapy area (desirable).\r\r\r\nOur client is willing to consider Pharmaceutical Representatives with if they are commercially focused.\r\r\r\nA current driving licence with no more than 6 points.\r\r\r\nRole Responsibilities:\r\r\r\nPortfolio sell to maxim

In [None]:
from transformers import AutoTokenizer
# From here on `tokenizer` is the cased BERT tokenizer (it replaces the
# DistilBERT tokenizer loaded earlier in the notebook).
bert_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:
def tokenize(sequence, max_length=512):
    """Encode one text into fixed-length inputs for BERT.

    Relies on the notebook-level `tokenizer`. The text is truncated or padded
    so that the encoding is exactly `max_length` tokens long, special tokens
    included; token type ids are not requested.

    Args:
        sequence: The raw text to encode.
        max_length: Length every encoding is padded / truncated to.
            Defaults to 512, the value the rest of this notebook uses.

    Returns:
        A `(input_ids, attention_mask)` tuple of TensorFlow tensors as
        produced by the tokenizer (presumably shaped `(1, max_length)` for a
        single text - TODO confirm).
    """
    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which is also how the tokenizer is used earlier in the notebook.
    tokens = tokenizer(
        sequence,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    return tokens['input_ids'], tokens['attention_mask']

In [None]:
import numpy as np
# Size of the random training subset, capped at the number of rows available.
num_of_elements = min(5000, len(X_train))

# Buffers for token ids and attention masks (one 512-long row per sample).
# int32 matches the dtype declared for the model's `input_ids` /
# `attention_mask` inputs, instead of the float64 default of np.zeros.
Xids = np.zeros((num_of_elements, 512), dtype=np.int32)
Xmask = np.zeros((num_of_elements, 512), dtype=np.int32)

# Pick row positions from the real length of X_train rather than a hard-coded
# bound, with a fixed seed so the subset is reproducible. Sampling without
# replacement avoids duplicate rows, which could otherwise end up on both
# sides of the train/validation split made during fitting.
rng = np.random.default_rng(42)
idx = rng.choice(len(X_train), size=num_of_elements, replace=False)
small_dataset = np.array(X_train)[idx]
small_dataset_labels = np.array(Y_train)[idx]

labels = np.array(small_dataset_labels)

In [None]:
# Encode every sampled description and write its ids / mask into the
# matching row of the pre-allocated arrays.
for row, text in enumerate(small_dataset):
    ids, attention = tokenize(text)
    Xids[row, :] = ids
    Xmask[row, :] = attention

In [None]:
# Inspect the token ids stored for the first sampled description.
Xids[0]

array([  101.,  8603.,  1161., 12107.,  2050.,  1616.,  1121., 12189.,
       15197.,  4571.,  2583.,  1110., 16226.,   170.,  8787.,  2524.,
        1111.,   170.,  2657.,  4909.,  4792.,  1359.,  1107., 22572.,
       10654.,  5817.,   119.,  9612.,  1439.,   170.,  2898.,  2657.,
        6678.,  4792.,  1110.,   170., 14820.,  3767.,  1111.,   170.,
        8787.,  2524.,  1150., 16615., 24775.,  1114.,  3812.,  3438.,
        1105.,  7204.,   170.,  1264.,  1104.,  2657.,  5094.,   119.,
        1192.,  1209.,  1129.,  2784.,  1111.,  1684.,  4099.,  1114.,
        1672., 18119.,  2380.,  7550.,   117.,  4374.,  3213.,  1105.,
        7061.,  2447.,   119., 15843.,  1105., 18113.,   131.,   138.,
        2598.,  2178.,  1110.,  2320.,   117.,  7891.,  1193.,  1114.,
        1126., 10978.,  1665.,  1137.,  7735.,  4373.,   126.,  1106.,
        1275.,  1201.,  1104.,  2541.,  1107.,  2657.,  1105.,   120.,
        1137., 18119.,  4909.,  5096.,  7912.,  1854.,  1647.,  1112.,
      

In [None]:
from transformers import TFAutoModel

In [None]:
# Load the plain BERT encoder from the same 'bert-base-cased' checkpoint the
# tokenizer uses; the pre-training heads in the checkpoint are not loaded
# (see the log message below).
bert = TFAutoModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Print the encoder's layers and parameter counts.
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [None]:
import tensorflow as tf

# Symbolic inputs: 512 token ids and the matching attention mask per sample.
input_ids = tf.keras.layers.Input(shape=(512,), dtype='int32', name='input_ids')
mask = tf.keras.layers.Input(shape=(512,), dtype='int32', name='attention_mask')

# The transformer itself lives in the `bert` attribute of the loaded model,
# so we call `bert.bert` rather than `bert`. Element 1 of its output is what
# feeds the classifier head.
bert_outputs = bert.bert(input_ids, attention_mask=mask)
embeddings = bert_outputs[1]

# Classifier head: one hidden ReLU layer, then a 6-way softmax
# (one unit per entry in `Job_type`).
hidden_layer = tf.keras.layers.Dense(1024, activation='relu')
output_layer = tf.keras.layers.Dense(6, activation='softmax', name='outputs')
x = hidden_layer(embeddings)
y = output_layer(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Freeze the layer at index 2 so only the layers after it are trained.
# NOTE(review): index 2 is the BERT main layer per `model.summary()` - confirm
# if the architecture changes.
model.layers[2].trainable = False

In [None]:
# Display everything the main layer returns: `last_hidden_state` first, then
# `pooler_output` (the element at index 1 that the model above uses).
bert.bert(input_ids, attention_mask=mask) # outputs of bert

TFBaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                                 <KerasTensor: shape=(None, 512, 768) dtype=float32 (created by layer 'bert')>),
                                                ('pooler_output',
                                                 <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'bert')>)])

In [None]:
# Print the full classifier: inputs, BERT main layer and the dense head.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                         

In [None]:
# Labels are integer class ids (not one-hot), hence the *sparse* categorical
# cross-entropy; accuracy is tracked as the only metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ["accuracy"],
}
model.compile(**compile_options)

In [None]:
# Train the classifier head for one epoch.
# `validation_split` is the fraction of the data held out for validation, so
# 0.2 trains on 80% of the samples and validates on the remaining 20%
# (a value of 0.8 would train on only a fifth of the data).
history = model.fit(
    [Xids, Xmask], labels,
    validation_split = 0.2,
    batch_size = 16,
    verbose = 1,
    epochs = 1
)



KeyboardInterrupt: ignored