In [None]:
from tensorflow.keras.utils import plot_model
import nltk
import time
from nltk.corpus import wordnet

# Fetch the WordNet corpus that `nltk.corpus.wordnet` reads from.
wordnet_package = 'wordnet'
nltk.download(wordnet_package)

def synonym_replacement(sentence, n):
    """Replace up to ``n`` distinct words of ``sentence`` with a random synonym.

    The sentence is split on whitespace. Only purely alphabetic tokens are
    candidates, and every occurrence of a chosen token is replaced by the same
    synonym. Candidates for which ``get_synonyms`` returns nothing are skipped
    and do not count towards ``n``.

    Args:
        sentence: Input text.
        n: Maximum number of distinct words to replace. Values <= 0 leave the
            words untouched.

    Returns:
        The tokens joined back with single spaces (so runs of whitespace in
        the input are collapsed), with the selected words replaced.
    """
    # Local import: this function is defined in a cell that runs before the
    # notebook's `import random` cell, so do not rely on that hidden state.
    import random

    words = sentence.split()
    if n <= 0:
        # Previously the limit was only checked after a replacement, so n=0
        # still replaced one word.
        return ' '.join(words)

    # Sort the de-duplicated candidates so the shuffle below is reproducible
    # under random.seed(); raw set order varies between interpreter runs.
    candidates = sorted({word for word in words if word.isalpha()})
    random.shuffle(candidates)

    new_words = list(words)
    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)

def get_synonyms(word):
    """Return WordNet synonyms of ``word`` as a sorted list of lowercase strings.

    Every lemma name of every synset returned by ``wordnet.synsets(word)`` is
    normalised: underscores and hyphens become spaces, the text is lowercased,
    and any character that is neither a letter nor a space is dropped.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct synonyms. The word itself (compared
        case-insensitively) and empty strings are excluded.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            # Keep spaces as well as letters: filtering on isalpha() alone
            # glued multi-word lemmas together ("ice_cream" -> "icecream").
            candidate = "".join(ch for ch in candidate if ch.isalpha() or ch == " ")
            candidate = " ".join(candidate.split())
            # Skip lemmas that normalise to nothing, and the word itself in
            # any casing (normalised lemmas are always lowercase).
            if candidate and candidate != target:
                synonyms.add(candidate)
    # Sorted so that random.choice over the result is reproducible under a seed.
    return sorted(synonyms)


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Install NLTK with the explicit %pip magic instead of relying on IPython automagic.
# NOTE(review): this cell sits after the cell that already imports nltk; on a fresh kernel it has to run first.
%pip install nltk




In [None]:
import csv
# Read the sentence text (second column) of every data row in the training TSV.
train_tsv_path = '/content/train_en.tsv'

with open(train_tsv_path, 'r', encoding='utf8') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader)  # skip the header row
    train_sentences = [row[1] for row in reader]

print(train_sentences[:5])


['Hurray, saving us $$$ in so many ways @potus @realDonaldTrump #LockThemUp #BuildTheWall #EndDACA #BoycottNFL #BoycottNike', "Why would young fighting age men be the vast majority of the ones escaping a war &amp; not those who cannot fight like women, children, and the elderly?It's because the majority of the refugees are not actually refugees they are economic migrants trying to get into Europe.... https://t.co/Ks0SHbtYqn", '@KamalaHarris Illegals Dump their Kids at the border like Road Kill and Refuse to Unite! They Hope they get Amnesty, Free Education and Welfare Illegal #FamilesBelongTogether in their Country not on the Taxpayer Dime Its a SCAM #NoDACA #NoAmnesty #SendThe', "NY Times: 'Nearly All White' States Pose 'an Array of Problems' for Immigrants https://t.co/ACZKLhdMV9 https://t.co/CJAlSXCzR6", 'Orban in Brussels: European leaders are ignoring the will of the people, they do not want migrants https://t.co/NeYFyqvYlX']


In [None]:
import random
# Each sentence is swapped for a synonym-replaced variant with probability
# `augmentation_factor`; otherwise it is kept as-is. The output list therefore
# has exactly one entry per input sentence, in the same order.
augmentation_factor = 0.1
n_synonyms = 2
augmented_sentences = []

for sentence in train_sentences:
    should_augment = random.uniform(0, 1) < augmentation_factor
    augmented_sentences.append(
        synonym_replacement(sentence, n_synonyms) if should_augment else sentence
    )


In [None]:
import csv

# Read the integer class label of every data row in the training TSV.
train_labels = []

with open('/content/train_en.tsv', 'r', encoding='utf8') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader)  # skip the header row
    for row in reader:
        # Column 0 holds the example id (the old output printed 201, 202, ...),
        # not the class. The label is in column 2, the same column
        # `get_data_bert` reads, and is what the 2-class model expects.
        label = int(row[2])
        train_labels.append(label)
print(train_labels[:5])


[201, 202, 203, 204, 205]


In [None]:
# Pair every sentence with its label so the two can be shuffled together.
sentence_label_pairs = zip(augmented_sentences, train_labels)
augmented_data = list(sentence_label_pairs)


In [None]:
# Shuffle the (sentence, label) pairs in place, then split them back into two
# parallel tuples.
random.shuffle(augmented_data)
shuffled_columns = zip(*augmented_data)
augmented_sentences, train_labels = shuffled_columns


In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the checkpoint used throughout this notebook.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

def encode_sentences_bert(sentences, tokenizer, max_length=128):
    """Tokenize ``sentences`` with ``tokenizer`` into fixed-length TensorFlow tensors.

    Args:
        sentences: The text input handed to the tokenizer as its first argument.
        tokenizer: A callable tokenizer; it is called once with the options below.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the given options.
    """
    tokenizer_options = {
        'padding': 'max_length',   # pad every sequence up to max_length
        'truncation': True,        # cut sequences longer than max_length
        'max_length': max_length,
        'return_tensors': "tf",
    }
    encodings = tokenizer(sentences, **tokenizer_options)
    return encodings

# Tokenize the augmented sentences, padding/truncating each to 327 tokens.
augmented_encodings = encode_sentences_bert(
    augmented_sentences,
    tokenizer,
    max_length=327,
)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# NOTE(review): this repeats the call at the end of the previous cell with the same arguments.
augmented_encodings = encode_sentences_bert(
    sentences=augmented_sentences,
    tokenizer=tokenizer,
    max_length=327,
)


In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel;
# `!pip` runs whichever pip is first on the shell PATH.
%pip install tensorflow





In [None]:
import tensorflow as tf


# Build a tf.data pipeline of (features dict, label) pairs from the augmented encodings.
augmented_features = dict(augmented_encodings)
augmented_label_tensor = tf.convert_to_tensor(train_labels)
augmented_dataset = tf.data.Dataset.from_tensor_slices(
    (augmented_features, augmented_label_tensor)
)


In [None]:
# Use the %pip magic so the packages are installed into the environment of the running kernel;
# `!pip` runs whichever pip is first on the shell PATH.
%pip install transformers tensorflow





In [None]:
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf


In [None]:
# Pretrained BERT encoder with a 2-way classification head on top.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The loss is computed on raw logits (from_logits=True) against integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)



In [None]:
import csv

def get_data_bert(train_file, test_file):
    """Load sentences and integer labels from a train and a test TSV file.

    Each file is tab-separated with one header row. The sentence is read from
    column 1 and the label from column 2 of every data row.

    Args:
        train_file: Path to the training TSV.
        test_file: Path to the evaluation TSV.

    Returns:
        ``(train_sentences, train_labels, test_sentences, test_labels)``, each
        a list; the label lists contain ints.

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
        ValueError: If a label cannot be parsed as an int.
    """
    def read_tsv(file_path):
        """Read one TSV file into parallel lists of sentences and labels."""
        sentences = []
        labels = []
        # newline='' is what the csv module documents for files passed to csv.reader.
        with open(file_path, 'r', encoding='utf8', newline='') as file:
            reader = csv.reader(file, delimiter='\t')
            # Skip the header; the default keeps an empty file from raising StopIteration.
            next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline); indexing them used to raise IndexError.
                    continue
                sentences.append(row[1])
                labels.append(int(row[2]))
        return sentences, labels

    train_sentences, train_labels = read_tsv(train_file)
    test_sentences, test_labels = read_tsv(test_file)
    return train_sentences, train_labels, test_sentences, test_labels





In [None]:
# Load the training split and the dev split (used here as validation data).
# Note: this rebinds `train_sentences` and `train_labels`.
train_file_path, test_file_path = '/content/train_en.tsv', '/content/dev_en.tsv'
loaded_splits = get_data_bert(train_file_path, test_file_path)
train_sentences, train_labels, val_sentences, val_labels = loaded_splits


In [None]:
from transformers import BertTokenizer

# Load the tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

def encode_sentences_bert(sentences, tokenizer, max_length=128):
    """Run ``tokenizer`` over ``sentences`` with max-length padding and truncation.

    NOTE(review): a function with this name and signature is also defined in an
    earlier cell of this notebook; this definition shadows it.

    Args:
        sentences: The text input passed to the tokenizer as its first argument.
        tokenizer: A callable tokenizer.
        max_length: Target length for padding and truncation.

    Returns:
        The tokenizer's return value, requested as TensorFlow tensors
        (``return_tensors="tf"``).
    """
    call_kwargs = dict(
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    return tokenizer(sentences, **call_kwargs)

# Tokenize both splits identically: pad/truncate every sentence to 128 tokens.
train_encodings, val_encodings = (
    encode_sentences_bert(split_sentences, tokenizer, max_length=128)
    for split_sentences in (train_sentences, val_sentences)
)


In [None]:
import tensorflow as tf

# Wrap each split as a tf.data.Dataset of (features dict, label) elements.
train_features = dict(train_encodings)
val_features = dict(val_encodings)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))


In [None]:
from transformers import TFBertForSequenceClassification

# Start from a fresh pretrained BERT with a 2-way classification head.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# The loss is computed on raw logits against integer class labels.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Save a diagram of the model to disk.
plot_model(model, to_file="/content/bert.png", show_shapes=True, show_dtype=False, show_layer_names=False, show_trainable=True, show_layer_activations=True)

# Use a shuffle buffer as large as the dataset. With the previous fixed buffer
# of 1000, elements were only mixed within a sliding window, so a training file
# ordered by label or source would stay partly ordered in each epoch.
shuffle_buffer_size = train_dataset.cardinality()

# Time the training run in seconds.
start_time = time.time()
history = model.fit(
    train_dataset.shuffle(shuffle_buffer_size).batch(32),
    epochs=3,
    validation_data=val_dataset.batch(32)
)
end_time = time.time()

total_time = end_time - start_time

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Persist the fine-tuned weights to disk.
weights_path = '/content/bert_finetuned_weights.h5'
model.save_weights(weights_path)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Predict logits for the validation set in batches of 32 and pick the arg-max class.
val_batches = val_dataset.batch(32)
y_pred = np.argmax(model.predict(val_batches).logits, axis=1)

# Collect the ground-truth labels by walking the unbatched validation dataset in order.
y_true = np.concatenate(
    [label.numpy().reshape(-1) for _, label in val_dataset],
    axis=0,
)

# Macro averaging weights every class equally, regardless of its support.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')

print(
    f'Accuracy: {accuracy}',
    f'F1 Score: {f1}',
    f'Precision: {precision}',
    f'time: {total_time}',
    sep='\n',
)

Accuracy: 0.779
F1 Score: 0.7638866185679136
Precision: 0.7903492647058823
