In [None]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install numpy
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingfac

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# Mount Google Drive at /content/drive so files stored in Drive are reachable
# from the notebook's filesystem. The google.colab module is only available
# when running inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
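# Once Drive is mounted, dataset files stored there can be read from Colab via
# a path of the form /content/drive/My Drive/<path/to/dataset>.
# NOTE: the loading cell below reads its CSV files from relative paths, i.e.
# from the current working directory rather than from Drive.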

In [None]:
# Load labeled dataset (path is relative to the current working directory)
df_labeled = pd.read_csv('labeled_data.csv')
# Load unlabeled dataset (path is relative to the current working directory)
df_unlabeled = pd.read_csv('unlabeled_data.csv')

# Fail fast on schema problems: later cells read df['text'] and df['label'],
# and a missing column would otherwise only surface as a KeyError mid-training.
assert {'text', 'label'} <= set(df_labeled.columns), \
    "labeled_data.csv must contain 'text' and 'label' columns"
assert 'text' in df_unlabeled.columns, \
    "unlabeled_data.csv must contain a 'text' column"

In [None]:
# Initialize the BERT tokenizer for the 'bert-base-uncased' checkpoint with
# lower-casing enabled. `tokenizer` is the global used by the training,
# prediction and evaluation code below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Initialize the BERT model with a 2-class sequence-classification head.
# `model` is the global that the functions below train and evaluate in place.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
# Load the tokenizer and the bare encoder (no classification head) of the
# "aubmindlab/bert-base-arabertv02" checkpoint.
# NOTE(review): `bert_tokenizer` and `bert_model` are not referenced by any
# other cell visible in this notebook -- training and prediction use
# `tokenizer` / `model` from 'bert-base-uncased'. Confirm which checkpoint is
# actually intended for the dataset.
from transformers import AutoTokenizer
from transformers import BertModel

bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
bert_model = BertModel.from_pretrained("aubmindlab/bert-base-arabertv02", return_dict=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Number of passes over the training data performed by each train_model() call
num_epochs = 5
# Batch size used when predicting on the unlabeled data; the same value is also
# the maximum number of pseudo-labeled examples selected per self-training iteration
self_training_batch_size = 32
# Confidence a prediction must exceed to be accepted as a pseudo-label
self_training_confidence_threshold = 0.95
# Number of predict / select / retrain rounds performed by self_train()
num_iterations = 3

In [None]:
# Define a function to train the BERT model
def train_model(df_train):
    """Fine-tune the global ``model`` on the rows of ``df_train``.

    Args:
        df_train: DataFrame with a ``text`` column (the input strings) and a
            ``label`` column (the targets passed to the model as ``labels``).

    Returns:
        None. The global ``model`` is updated in place; a fresh AdamW
        optimizer is created on every call.
    """
    # Tokenize the training data (padded, truncated to at most 128 tokens)
    inputs = tokenizer(df_train['text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
    labels = torch.tensor(df_train['label'].tolist())

    # Create a dataloader for the training data.
    # shuffle=True: without it every epoch sees the rows in file order, and
    # during self-training the pseudo-labeled rows would always come last.
    dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

    # Set the optimizer and learning rate
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # Train the model; training mode only needs to be set once since nothing
    # inside the loop switches the model to eval mode.
    model.train()
    for _ in range(num_epochs):
        for input_ids, attention_mask, batch_labels in dataloader:
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

########################################
# Initial supervised pass: fine-tune the model on the labeled data only.
# This runs before self_train() is defined or called, so no pseudo-labels are
# involved yet.
train_model(df_labeled)

In [None]:
# Define a function to generate predictions for the unlabeled data
def generate_predictions(df_unlabeled):
    """Score every row of ``df_unlabeled`` with the global ``model``.

    The softmax probability the model assigns to class index 1 is written into
    a ``label`` column of ``df_unlabeled``. The frame is modified in place and
    nothing is returned.
    """
    # Encode all texts at once (padded, truncated to at most 128 tokens)
    texts = df_unlabeled['text'].tolist()
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Batch the encoded inputs for inference
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(encoded['input_ids'], encoded['attention_mask']),
        batch_size=self_training_batch_size,
    )

    # Run the model without gradient tracking and collect P(class 1) per row
    model.eval()
    positive_probs = []
    with torch.no_grad():
        for input_ids, attention_mask in loader:
            batch_logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            batch_probs = torch.softmax(batch_logits, dim=1)
            positive_probs.extend(batch_probs[:, 1].cpu().numpy())

    # Store the probabilities on the frame; they act as soft pseudo-labels
    df_unlabeled['label'] = np.array(positive_probs)

In [None]:
# Define a function to select self-labeled examples
def select_self_labeled_examples(df_unlabeled):
    """Pick the most confidently predicted rows and give them hard pseudo-labels.

    Expects ``df_unlabeled['label']`` to hold the predicted probability of
    class 1, as written by ``generate_predictions``. A row's confidence is the
    probability of its predicted class (``p`` for class 1, ``1 - p`` for
    class 0), so confident predictions of either class can be selected rather
    than only class 1.

    Args:
        df_unlabeled: DataFrame with a ``label`` column of class-1 probabilities.

    Returns:
        A new DataFrame with at most ``self_training_batch_size`` rows whose
        confidence exceeds ``self_training_confidence_threshold``, ordered by
        decreasing confidence. The original index is preserved and ``label``
        holds the predicted class as an integer (0 or 1), ready to be used as
        a training target. ``df_unlabeled`` itself is not modified.
    """
    prob_positive = df_unlabeled['label'].to_numpy(dtype=float)

    # Confidence in whichever class the model actually predicts
    confidence = np.maximum(prob_positive, 1.0 - prob_positive)

    # Positions of rows that clear the threshold (NaN compares False and is skipped)
    accepted = np.flatnonzero(confidence > self_training_confidence_threshold)

    # Most confident first; keep only the top self_training_batch_size rows
    ranked = accepted[np.argsort(-confidence[accepted], kind='stable')]
    top_positions = ranked[:self_training_batch_size]

    # Positional indexing keeps this correct even if the index has duplicates
    df_self_labeled = df_unlabeled.iloc[top_positions].copy()
    df_self_labeled['label'] = (prob_positive[top_positions] >= 0.5).astype(int)

    return df_self_labeled

In [None]:
# Define a function to perform self-training
def self_train():
    """Run up to ``num_iterations`` rounds of self-training on the global ``model``.

    Each round scores the remaining unlabeled pool, moves the selected
    examples into the training set as hard 0/1 pseudo-labels, retrains the
    model, and records the F1 score on the current training set.

    The function works on copies, so the global ``df_labeled`` and
    ``df_unlabeled`` frames are left untouched and the cell can be re-run.

    Returns:
        tuple: ``(model, f1_history)`` where ``f1_history`` is a list with one
        F1 score per completed iteration.
    """
    # Local working copies. Assigning to the name ``df_labeled`` inside this
    # function would make it a local variable and raise UnboundLocalError on
    # its first read, so separate local names are used.
    labeled = df_labeled.copy()
    # A fresh unique index lets selected rows be dropped from the pool by index.
    unlabeled_pool = df_unlabeled.reset_index(drop=True)
    f1_history = []

    for iteration in range(num_iterations):
        print(f'Starting self-training iteration {iteration+1}...')

        if unlabeled_pool.empty:
            print('No unlabeled examples left; stopping self-training early.')
            break

        # Generate predictions for the remaining unlabeled data
        generate_predictions(unlabeled_pool)

        # Select self-labeled examples
        df_self_labeled = select_self_labeled_examples(unlabeled_pool)

        # Remove the selected rows from the pool so they are not added again
        # in the next iteration.
        unlabeled_pool = unlabeled_pool.drop(index=df_self_labeled.index)

        # Turn the scores into hard class ids. Probabilities in the label
        # column would break both the classification loss and f1_score;
        # values that are already 0/1 pass through unchanged.
        pseudo_labeled = df_self_labeled[['text', 'label']].copy()
        pseudo_labeled['label'] = (pseudo_labeled['label'] >= 0.5).astype(int)
        print(f'Adding {len(pseudo_labeled)} pseudo-labeled examples.')

        # Add the self-labeled examples to the labeled data
        if not pseudo_labeled.empty:
            labeled = pd.concat([labeled, pseudo_labeled], ignore_index=True)

        # Retrain the model using the updated labeled data
        train_model(labeled)

        # Evaluate the model on the labeled data.
        # NOTE: this is the training set (including pseudo-labels), so the
        # score is an optimistic estimate rather than a held-out metric.
        inputs = tokenizer(labeled['text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
        dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'])
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
        model.eval()
        predictions = []
        with torch.no_grad():
            for input_ids, attention_mask in dataloader:
                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
                predictions.extend(logits.argmax(dim=1).cpu().numpy())

        # Calculate the F1 score on the labeled data
        f1 = f1_score(labeled['label'], predictions)
        print(f'F1 score after self-training iteration {iteration+1}: {f1}')

        # Keep track of the F1 history
        f1_history.append(f1)

    # Return the final trained model and the F1 history
    return model, f1_history


In [None]:
# Perform self-training and track the F1 score history.
# self_train() returns the global `model` object together with the F1
# history, so `trained_model` refers to the same model that was trained above.
trained_model, f1_history = self_train()

In [None]:
# Print the final F1 score, i.e. the last entry of the F1 history returned by
# self_train().
print(f'Final F1 score after self-training: {f1_history[-1]}')

In [None]:
# Save the trained model's weights (state_dict only, not the full model object)
# to 'self_trained_model.pt', a path relative to the current working directory.
# NOTE(review): if the file is meant to persist in the mounted Google Drive,
# the path presumably needs to point under /content/drive -- confirm.
torch.save(trained_model.state_dict(), 'self_trained_model.pt')

In [None]:
# Plot the F1 score history
import matplotlib.pyplot as plt

# Iterations are reported 1-based in the training log, so the x-axis is 1-based
# too, with one integer tick per iteration instead of fractional list indices.
iterations = list(range(1, len(f1_history) + 1))

fig, ax = plt.subplots()
# Markers keep individual points visible even when the history is very short.
ax.plot(iterations, f1_history, marker='o')
ax.set_xticks(iterations)
ax.set_xlabel('Self-Training Iteration')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score vs Self-Training Iteration')
plt.show()