In [None]:
import tensorflow as tf

#With TFAutomodel, we can pull any model from huggingface.
from transformers import TFAutoModel

#To tokenize the tweets
from transformers import AutoTokenizer

#Datasets are loaded from a dataset loading script that downloads and generates the dataset.
#From HuggingFace
from datasets import load_dataset

In [None]:
# Pull the pretrained model; this notebook uses bert-base-uncased.

# BERT - the model family
# bert-base-uncased ==> params: 110M

# Base - the base version of the model.
# BERT base stacks 12 transformer encoder layers, while
# BERT large stacks 24 transformer encoder layers.

# Uncased - the model doesn't distinguish between upper and
# lower case text.

model = TFAutoModel.from_pretrained("bert-base-uncased")

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model
# ("bert-base-uncased"), so the token ids match what the model expects.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# The text has to be tokenized first, because the output of the
# tokenizer is what is fed into the BERT model.

# Example: tokenize two short texts together. padding=True is requested so
# both come back with the same length, and return_tensors='tf' asks for
# TensorFlow tensors.

exp = tokenizer(["Stat Models","Stat Thinking and Design"],
                padding=True, return_tensors='tf')
exp

In [None]:
#The 101 and 102 mark the beginning and the end of an input sequence.
#They are the ids of the special tokens [CLS] and [SEP],
#which the tokenizer inserts automatically into its output.

#The 28093 and 4275 are the token ids for "Stat" and "Models"

#Padding is done to keep the input size constant; in the example above it is
# 6, including the [CLS] and [SEP] tokens

#truncation=True truncates any text that is longer than 512 tokens,
#which is the maximum context size for BERT, i.e. the maximum number of
#tokens the model can accept.

#return_tensors='tf' converts the usual Python lists in the output to TensorFlow tensors

In [None]:
# Run the tokenized example through BERT and display what the model returns.
output = model(exp)
output

It gives 2 outputs: the last hidden state (i.e. the encoder representation) and the pooler output.

shape=(2, 6, 768), is the last hidden states shape.
The 2 is the number of input sentences we gave.
The 6 is the padded size of the tokens.
768 is the hidden size of the bert base model. BERT large - 1024.

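Hidden size is the dimensionality of the vector the encoder produces for each token (the width of the embeddings and of every encoder layer's output). It is not the size of the feed-forward layer inside the encoder, which is larger (3072 for BERT base).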

Pooler output size - (2, 768). In the pooler output you won't have the middle dimension. For cases like text classification, you don't need the hidden state for every word; we just want a single hidden state for each sentence. So each sentence has one hidden state in the pooler output.

The encoder will output a fixed size vector called **encoder representation.** This vector will have the entire summary of every word in the input sequence. 

The bidirectional attention of BERT enables it to use the context of both the previous and the following tokens while generating its output.

In [None]:
# Load the emotion dataset from the Hugging Face Hub.
# The dataset only needs to be loaded once; the identical second
# load_dataset call that used to follow in its own cell was redundant.
# If the `datasets` package is missing or outdated, install it first:
#!pip install -U datasets
emotions = load_dataset("SetFit/emotion")

In [None]:
# Display the loaded dataset object (its repr is the cell output).
emotions

In [None]:
# Peek at the first ten raw texts of the train split.
emotions['train']['text'][:10]

### Tokenize the text

In [None]:
#Tokenize function
def tokenize(batch):
    """Tokenize the 'text' entry of a batch with the module-level `tokenizer`.

    Args:
        batch: mapping whose 'text' entry holds the raw strings to tokenize
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for these texts, as TensorFlow tensors.
    """
    return tokenizer(
        batch['text'],
        # pad every text of this call to the longest one in the call
        padding=True,
        # cut texts that exceed the model's maximum input length instead of
        # handing BERT a sequence it cannot accept
        truncation=True,
        return_tensors='tf',
    )

In [None]:
# Map (apply) the tokenize function to all the values of the dataset.
#
# emotions.map processes the text of every split (train, test and validation).
#
# map() supports working with batches of examples when batched=True. The
# default batch size is 1000 and can be changed with the batch_size parameter.
# Batch processing also enables applications such as splitting long sentences
# into shorter chunks and data augmentation.
#
# tokenize pads only to the longest text of the batch it receives, so with the
# default batch size every 1000-row batch would get its own padded length.
# batch_size=None passes each split to tokenize as one single batch, so all
# rows of a split share the same length; the later conversion to a
# tf.data.Dataset needs rectangular tensors.

emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
# Token ids of the first training example after tokenization.
emotions_encoded['train']['input_ids'][0]

In [None]:
# Length of the first tokenized training example (padding included).
len(emotions_encoded['train']['input_ids'][0])

Meaning the sentence is padded to the length shown above (70 tokens in the original run).

Now we need to convert from huggingface dataset format to tensorflow datasets format

In [None]:
# Forked from https://www.kaggle.com/code/pritishmishra/fine-tune-bert-for-text-classification?scriptVersionId=116951029

# Switch the output format of 'input_ids', 'attention_mask', 'token_type_ids'
# and 'label' to TensorFlow. From here on, reading these columns from
# `emotions_encoded` yields `tf.Tensor` values.
emotions_encoded.set_format(
    type='tf',
    columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'],
)

# Number of examples per batch in the tf.data pipelines.
BATCH_SIZE = 64

def order(inp):
    '''
    Group all the inputs of BERT into a single dictionary
    and return it together with the labels.

    Every entry is looked up by its column name instead of by its
    position, so the result does not depend on the order in which the
    columns appear in `inp`.

    Args:
        inp: mapping with the keys 'input_ids', 'attention_mask',
            'token_type_ids' and 'label'.

    Returns:
        A `(features, labels)` tuple: `features` is a dict with the keys
        'input_ids', 'attention_mask' and 'token_type_ids'; `labels` is
        `inp['label']`.
    '''
    features = {
        'input_ids': inp['input_ids'],
        'attention_mask': inp['attention_mask'],
        'token_type_ids': inp['token_type_ids'],
    }
    return features, inp['label']

# converting train split of `emotions_encoded` to tensorflow format
train_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['train'][:])
# Shuffle the individual examples first and batch afterwards. Shuffling after
# batching would only reorder whole batches, leaving the content of each batch
# identical in every epoch. The buffer covers the whole split, so the
# shuffle is uniform.
train_dataset = train_dataset.shuffle(len(emotions_encoded['train'])).batch(BATCH_SIZE)
# map the `order` function to get (features, labels) pairs
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set (no shuffling needed for evaluation) ...
test_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Take one batch from train_dataset: `inp` holds the model inputs,
# `out` the labels. Print both, separated by a blank line.
inp, out = next(iter(train_dataset))
print(inp, out, sep=' \n\n ')

In [None]:
# Create a class for the classification model.
# It inherits from tf.keras.Model.

class BERTforClassification(tf.keras.Model):
    """BERT followed by a dense softmax layer for classification.

    Args:
        bert_model: the BERT model whose output is classified.
        num_classes: number of units (classes) of the final dense layer.
    """

    def __init__(self, bert_model, num_classes):
        # Run the initialisation of the parent class (tf.keras.Model) first
        super().__init__()

        # Keep a reference to the BERT model
        self.bert = bert_model

        # Dense layer with one unit per class; softmax turns its output
        # into a probability distribution over the classes
        self.fc = tf.keras.layers.Dense(units=num_classes, activation='softmax')

    def call(self, inputs):
        """Forward pass: `inputs` is the tokenized text."""
        bert_outputs = self.bert(inputs)

        # For text classification only the pooler output is needed,
        # which is the element at index 1 of the BERT output
        pooled = bert_outputs[1]

        # Feed the pooled BERT output through the dense layer
        return self.fc(pooled)

# This is the whole model

In [None]:
# Build the classifier on top of the pretrained BERT model, with 6 output classes.
classifier = BERTforClassification(bert_model=model, num_classes=6)

In [None]:
# Compile the model.
# The final dense layer already applies softmax, so the loss is used with its
# default settings (it receives probabilities, not logits).
classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Start the training: 3 passes over the training batches.
# The object returned by fit() is kept in `history`.
history = classifier.fit(x=train_dataset, epochs=3)

In [None]:
# Print the TensorFlow version this notebook ran with.
print(tf.__version__)