In [1]:
import numpy as np
import random
import pandas as pd
import time
import datetime
from sklearn.model_selection import train_test_split

In [2]:
# Load the tweet dataset from the project's Data folder
# (the file name suggests cleaning was already done upstream).
cleaning_data = pd.read_csv(
    'Data/Tweet_Processed_DataCleaning_Done.csv',
)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
cleaning_data.head(n=5)

Unnamed: 0,Tweet,Label
0,euedsonduarte lilovlog jairbolsonaro exatamen...,0.0
1,a china fecha o primeiro laboratorio do mundo ...,0.0
2,janeiro china mente sobre a de mortos nos caso...,0.0
3,nivel de poluicao na china cai drasticamente a...,0.0
4,eikebatista os que cruzam os oceanos trazem u...,0.0


In [4]:
# Print column dtypes and non-null counts to check for missing values.
cleaning_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24200 entries, 0 to 24199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Tweet   24200 non-null  object 
 1   Label   24200 non-null  float64
dtypes: float64(1), object(1)
memory usage: 378.2+ KB


#### Installing and importing TensorFlow

In [5]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tensorflow
  Downloading tensorflow-2.10.0-cp310-cp310-win_amd64.whl (455.9 MB)
     ------------------------------------- 455.9/455.9 MB 54.4 MB/s eta 0:00:00
Installing collected packages: tensorflow
Successfully installed tensorflow-2.10.0


In [7]:
!pip install tensorflow_hub

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [6]:
!pip install tensorflow_text --user

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [8]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [15]:
# Reports whether this TensorFlow build was compiled with CUDA support.
# NOTE(review): this does not by itself show that a GPU is visible at
# runtime -- confirm separately if GPU training is required.
tf.test.is_built_with_cuda()

True

### Preparing dataset

In [11]:
# Separate the tweet text (features) from the labels (targets);
# the train / validation / test split happens in the next cell.
x_data, y_data = cleaning_data['Tweet'], cleaning_data['Label']

In [12]:
# First hold out 20% of the data as the test set, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=43,
)
# Then carve 20% of the remaining training data off as the validation set,
# giving roughly a 64% / 16% / 20% train / validation / test split.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=43,
)

### BERT

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

In [16]:
def bert_tokenize(data, maxLength, tokenizer):
    """Encode an iterable of texts into fixed-length BERT inputs.

    Parameters
    ----------
    data : iterable of str
        Sentences (tweets) to encode.
    maxLength : int
        Length every encoded sequence is padded or truncated to.
    tokenizer
        Object exposing an ``encode_plus`` method that returns a mapping with
        ``'input_ids'`` and ``'attention_mask'`` (e.g. a Hugging Face
        ``BertTokenizer``).

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, attention_masks)``, with one row per item of ``data``.
    """
    input_ids = []
    attention_masks = []
    for tweet in data:
        # Use the tokenizer that was passed in rather than a notebook-level
        # global, so the function works regardless of cell execution order.
        encoded = tokenizer.encode_plus(
            tweet,                       # sentence to be tokenized
            add_special_tokens=True,     # add [CLS] at the beginning and [SEP] at the end
            max_length=maxLength,        # maximum size of the encoded sentence
            padding='max_length',        # fill shorter sentences with [PAD] up to max_length
            truncation=True,             # cut longer sentences so every row has the same length
            return_attention_mask=True,  # mask of 1s (real tokens) and 0s ([PAD] tokens)
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [18]:
# Relative path to the locally stored BERT checkpoint directory
# (used below to load both the tokenizer and the model).
modelPath = "Model/BertBaseMultUncased"

In [19]:
# Build the tokenizer from the local model directory only (no download);
# lower-casing is enabled, which the checkpoint path name suggests is an
# uncased model.
bert_tokenizer = BertTokenizer.from_pretrained(
    modelPath,
    do_lower_case=True,
    local_files_only=True,
)

In [20]:
# Every split is padded to the same fixed sequence length.
# NOTE(review): 512 is the usual upper limit for BERT-base inputs; tweets are
# likely far shorter, so a smaller value may save memory -- confirm.
MAX_LEN = 512
inputIdTrain, attentionMaskTrain = bert_tokenize(
    data=x_train, maxLength=MAX_LEN, tokenizer=bert_tokenizer
)
inputIdValid, attentionMaskValid = bert_tokenize(
    data=x_valid, maxLength=MAX_LEN, tokenizer=bert_tokenizer
)
inputIdTest, attentionMaskTest = bert_tokenize(
    data=x_test, maxLength=MAX_LEN, tokenizer=bert_tokenizer
)

In [21]:
# Load the sequence-classification BERT from the local model directory.
# NOTE(review): the load log mentions torch storage, so this appears to be the
# PyTorch model class -- confirm before passing it to Keras code.
bertModel = BertForSequenceClassification.from_pretrained(
    modelPath,
    local_files_only=True,
    # Number of output labels: 2 for binary classification
    # (can be increased for multi-class tasks).
    num_labels=2,
    # Do not return attention weights.
    output_attentions=False,
    # Do not return the hidden states.
    output_hidden_states=False,
)

  obj = cast(Storage, torch._UntypedStorage(nbytes))
Some weights of the model checkpoint at Model/BertBaseMultUncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceCla

In [None]:
def create_model(bert_model, max_len=MAX_LEN, num_classes=3,
                 dropout_rate=0.2, learning_rate=2e-5):
    """Build and compile a Keras classifier on top of a BERT encoder.

    Parameters
    ----------
    bert_model
        Callable applied to ``[input_ids, attention_masks]``; element ``[1]``
        of its output is fed to the classification head.
        NOTE(review): presumably a TensorFlow BERT encoder whose second
        output is the pooled sentence representation -- confirm.
    max_len : int
        Sequence length of both integer inputs.
    num_classes : int
        Number of softmax output units (default 3, the previously hard-coded
        value).
        NOTE(review): the notebook loads BERT with ``num_labels=2``; pass
        ``num_classes`` explicitly once the label set is confirmed.
    dropout_rate : float
        Dropout applied before the output layer.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        Compiled model taking ``[input_ids, attention_masks]``.

    Notes
    -----
    ``CategoricalCrossentropy`` / ``CategoricalAccuracy`` expect one-hot
    encoded targets of width ``num_classes``; a single scalar label column
    must be one-hot encoded before training with this model.
    """
    # Two parallel integer inputs: token ids and their attention mask.
    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32')

    # Classification head: encoder output -> dropout -> softmax.
    bert = bert_model([input_ids, attention_masks])[1]
    dropout = tf.keras.layers.Dropout(dropout_rate)(bert)
    output = tf.keras.layers.Dense(num_classes, activation="softmax")(dropout)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        # Pass metrics as a list, the form documented by Keras.
        metrics=[tf.keras.metrics.CategoricalAccuracy()],
    )

    return model