<h1 style="color:DarkBlue;">Model training</h1>

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, ElectraTokenizer, ElectraForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import accelerate
import torch

In [2]:
# Load the dataset; later cells read its 'Sentence' and 'Sentiment' columns.
data_path = '../data/data.csv'
df = pd.read_csv(data_path)

In [3]:
# Split the data into training (60%), validation (20%) and test (20%) sets.
# Both splits are stratified on 'Sentiment' so each subset keeps the class
# proportions of the full dataset, and both use a fixed random_state so the
# partition (and therefore every metric reported below) is identical on each
# Restart & Run All.
SPLIT_SEED = 42

train_df, temp_df = train_test_split(
    df, test_size=0.4, stratify=df['Sentiment'], random_state=SPLIT_SEED
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['Sentiment'], random_state=SPLIT_SEED
)

Electra will be used. It differs from other masked language models in that it trains a discriminator (an idea similar to GANs) and requires less computation than models such as BERT.
Also, we will fine-tune BERT further below to compare both models.

In [5]:
ELECTRA_CHECKPOINT = 'google/electra-small-discriminator'
MAX_LENGTH = 70  # 70 as max length is reasonable (see the EDA)
LABEL2ID = {'negative': 0, 'neutral': 1, 'positive': 2}

tokenizer = ElectraTokenizer.from_pretrained(ELECTRA_CHECKPOINT)

# Tokenize every split with the same truncation / padding settings.
train_encodings = tokenizer(list(train_df['Sentence']), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(list(val_df['Sentence']), truncation=True, padding=True, max_length=MAX_LENGTH)
test_encodings = tokenizer(list(test_df['Sentence']), truncation=True, padding=True, max_length=MAX_LENGTH)

# Convert the sentiments to numeric values, i.e. positive = 2, neutral = 1 and negative = 0.
# `map` is used instead of `replace`: it does not rely on the implicit object -> int
# downcasting that pandas deprecates, and any sentiment missing from LABEL2ID becomes
# NaN, which makes the `astype('int64')` cast raise instead of silently leaving a
# string label in the array.
train_labels = train_df['Sentiment'].map(LABEL2ID).astype('int64').to_numpy()
val_labels = val_df['Sentiment'].map(LABEL2ID).astype('int64').to_numpy()
test_labels = test_df['Sentiment'].map(LABEL2ID).astype('int64').to_numpy()

# One output unit per sentiment class.
model_Electra = ElectraForSequenceClassification.from_pretrained(ELECTRA_CHECKPOINT, num_labels=len(LABEL2ID))

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Let's prepare the dataset (similar to the code from the deep learning course)

class SentimentDataset:
    """Map-style dataset pairing tokenizer encodings with sentiment labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name to a sequence holding one entry per example.
        Only ``.items()`` and integer indexing of the values are used.
    labels : sequence
        One label per example, in the same order as the encodings.

    Raises
    ------
    ValueError
        If any encoding field does not hold exactly ``len(labels)`` entries.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs: otherwise the mismatch only shows up
        # later as an IndexError in __getitem__ or as silently ignored examples.
        expected = len(labels)
        for key, val in encodings.items():
            if len(val) != expected:
                raise ValueError(
                    f"Encoding field '{key}' has {len(val)} entries but there are {expected} labels."
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its label under the key 'labels'."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

In [21]:
# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = SentimentDataset(encodings=val_encodings, labels=val_labels)
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels)

In [8]:
# Training configuration for the Electra fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",        # a checkpoint is saved here after every epoch
    logging_dir='./logs',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",   # evaluate on the validation split after every epoch
    save_strategy="epoch",
)

trainer = Trainer(
    args=training_args,
    model=model_Electra,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

trainer.train()

# Validation metrics of the model as it stands after the final epoch.
results = trainer.evaluate()
print(results)



Epoch,Training Loss,Validation Loss
1,No log,0.745488
2,0.812700,0.564994
3,0.548300,0.555679
4,0.394300,0.603226
5,0.308100,0.623028


{'eval_loss': 0.6230275630950928, 'eval_runtime': 7.4254, 'eval_samples_per_second': 157.298, 'eval_steps_per_second': 19.662, 'epoch': 5.0}


In [14]:
# Select the checkpoint with the lowest validation loss from the trainer's own
# evaluation history instead of hard-coding a step number read off the table
# above: a hard-coded step silently points at the wrong checkpoint as soon as
# the data, the batch size or the best epoch changes.
eval_history = [entry for entry in trainer.state.log_history if 'eval_loss' in entry]
best_eval = min(eval_history, key=lambda entry: entry['eval_loss'])

# A checkpoint is saved after every epoch in '<output_dir>/checkpoint-<global step>'.
best_model_path = f"{trainer.args.output_dir}/checkpoint-{best_eval['step']}"
print(f"Best checkpoint: {best_model_path} (epoch {best_eval['epoch']}, eval_loss {best_eval['eval_loss']})")

best_model = ElectraForSequenceClassification.from_pretrained(best_model_path)


In [15]:
# Inference-only Trainer around the selected checkpoint: only the model is
# passed, no training arguments or datasets.
trainer_best_model = Trainer(model=best_model)


In [17]:
# Score the selected Electra checkpoint on the training split.
predictions_train = trainer_best_model.predict(train_dataset)
train_logits = predictions_train.predictions
pred_labels_train = train_logits.argmax(axis=1)  # predicted class = highest logit

accuracy_best_model, f1_best_model = (
    accuracy_score(train_labels, pred_labels_train),
    f1_score(train_labels, pred_labels_train, average='weighted'),
)

print(
    f"Test Accuracy Best Model (train): {accuracy_best_model}",
    f"Test F1-Score Best Model (train): {f1_best_model}",
    sep="\n",
)

Test Accuracy Best Model (train): 0.837660485021398
Test F1-Score Best Model (train): 0.83558115943701


In [19]:
# Score the selected Electra checkpoint on the validation split.
predictions_validation = trainer_best_model.predict(val_dataset)
validation_logits = predictions_validation.predictions
pred_labels_validation = validation_logits.argmax(axis=1)  # predicted class = highest logit

accuracy_best_model, f1_best_model = (
    accuracy_score(val_labels, pred_labels_validation),
    f1_score(val_labels, pred_labels_validation, average='weighted'),
)

print(
    f"Test Accuracy Best Model (validation): {accuracy_best_model}",
    f"Test F1-Score Best Model (validation): {f1_best_model}",
    sep="\n",
)

Test Accuracy Best Model (validation): 0.8501712328767124
Test F1-Score Best Model (validation): 0.8485860184041975


Let's look at the results on the test set:

In [16]:
# Score the selected Electra checkpoint on the held-out test split.
predictions_test = trainer_best_model.predict(test_dataset)
test_logits = predictions_test.predictions
pred_labels_test = test_logits.argmax(axis=1)  # predicted class = highest logit

accuracy_best_model, f1_best_model = (
    accuracy_score(test_labels, pred_labels_test),
    f1_score(test_labels, pred_labels_test, average='weighted'),
)

print(
    f"Test Accuracy Best Model: {accuracy_best_model}",
    f"Test F1-Score Best Model: {f1_best_model}",
    sep="\n",
)

Test Accuracy Best Model: 0.8366124893071001
Test F1-Score Best Model: 0.8357335566459931


The results on the train, validation and test sets look similar. Therefore, we can say that the model is not overfitting.

In [22]:
# Now, let's try the BERT model:

In [23]:
MAX_LENGTH = 70  # 70 as max length is reasonable (see the EDA)
LABEL2ID = {'negative': 0, 'neutral': 1, 'positive': 2}

# From here on `tokenizer` refers to the BERT tokenizer, so the splits are
# re-encoded with BERT's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(list(train_df['Sentence']), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(list(val_df['Sentence']), truncation=True, padding=True, max_length=MAX_LENGTH)
test_encodings = tokenizer(list(test_df['Sentence']), truncation=True, padding=True, max_length=MAX_LENGTH)

# Convert the sentiments to numeric values, i.e. positive = 2, neutral = 1 and negative = 0.
# `map` is used instead of `replace`: it does not rely on the implicit object -> int
# downcasting that pandas deprecates, and any sentiment missing from LABEL2ID becomes
# NaN, which makes the `astype('int64')` cast raise instead of silently leaving a
# string label in the array.
train_labels = train_df['Sentiment'].map(LABEL2ID).astype('int64').to_numpy()
val_labels = val_df['Sentiment'].map(LABEL2ID).astype('int64').to_numpy()
test_labels = test_df['Sentiment'].map(LABEL2ID).astype('int64').to_numpy()

# Rebuild the datasets around the BERT encodings.
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

In [24]:
# Pretrained BERT encoder with a 3-way classification head (one unit per sentiment).
model_Bert = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Training arguments for the BERT run. They mirror the Electra run, except that
# checkpoints and logs go to BERT-specific directories; logging to the same
# './logs' directory as the Electra run would put both runs' logs in one place.
training_args = TrainingArguments(
    output_dir = "./results_Bert",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    evaluation_strategy = "epoch",
    logging_dir = './logs_Bert',
    save_strategy="epoch",
)

trainer = Trainer(
    model=model_Bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Validation metrics of the model as it stands after the final epoch,
# printed like in the Electra run so `results` is not computed for nothing.
results = trainer.evaluate()

print(results)


Epoch,Training Loss,Validation Loss
1,No log,0.542421
2,0.626100,0.596719
3,0.368400,0.757685
4,0.241400,0.809314
5,0.179600,0.886464


In [26]:
# Select the BERT checkpoint with the lowest validation loss from the trainer's
# own evaluation history instead of hard-coding a step number read off the table
# above: a hard-coded step silently points at the wrong checkpoint as soon as
# the data, the batch size or the best epoch changes.
eval_history = [entry for entry in trainer.state.log_history if 'eval_loss' in entry]
best_eval = min(eval_history, key=lambda entry: entry['eval_loss'])

# A checkpoint is saved after every epoch in '<output_dir>/checkpoint-<global step>'.
best_model_path_bert = f"{trainer.args.output_dir}/checkpoint-{best_eval['step']}"
print(f"Best checkpoint: {best_model_path_bert} (epoch {best_eval['epoch']}, eval_loss {best_eval['eval_loss']})")

best_model_bert = BertForSequenceClassification.from_pretrained(best_model_path_bert)

# Inference-only Trainer around the selected checkpoint.
trainer_best_model_bert = Trainer(
    model=best_model_bert,
)


In [27]:
# Score the selected BERT checkpoint on the training split.
predictions_train = trainer_best_model_bert.predict(train_dataset)
train_logits = predictions_train.predictions
pred_labels_train = train_logits.argmax(axis=1)  # predicted class = highest logit

accuracy_best_model, f1_best_model = (
    accuracy_score(train_labels, pred_labels_train),
    f1_score(train_labels, pred_labels_train, average='weighted'),
)

print(
    f"Test Accuracy Best Model (train): {accuracy_best_model}",
    f"Test F1-Score Best Model (train): {f1_best_model}",
    sep="\n",
)

Test Accuracy Best Model (train): 0.8325249643366619
Test F1-Score Best Model (train): 0.8032225821691561


In [30]:
# Score the selected BERT checkpoint on the validation split.
predictions_validation = trainer_best_model_bert.predict(val_dataset)
validation_logits = predictions_validation.predictions
pred_labels_validation = validation_logits.argmax(axis=1)  # predicted class = highest logit

accuracy_best_model, f1_best_model = (
    accuracy_score(val_labels, pred_labels_validation),
    f1_score(val_labels, pred_labels_validation, average='weighted'),
)

print(
    f"Test Accuracy Best Model (validation): {accuracy_best_model}",
    f"Test F1-Score Best Model (validation): {f1_best_model}",
    sep="\n",
)

Test Accuracy Best Model (validation): 0.788527397260274
Test F1-Score Best Model (validation): 0.7540641896785676


In [31]:
# Score the selected BERT checkpoint on the held-out test split.
predictions_test = trainer_best_model_bert.predict(test_dataset)
test_logits = predictions_test.predictions
pred_labels_test = test_logits.argmax(axis=1)  # predicted class = highest logit

accuracy_best_model, f1_best_model = (
    accuracy_score(test_labels, pred_labels_test),
    f1_score(test_labels, pred_labels_test, average='weighted'),
)

print(
    f"Test Accuracy Best Model: {accuracy_best_model}",
    f"Test F1-Score Best Model: {f1_best_model}",
    sep="\n",
)

Test Accuracy Best Model: 0.786142001710864
Test F1-Score Best Model: 0.7519835307691051


The results of the Electra model are better than those of the BERT model.