In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
import requests
from bs4 import BeautifulSoup
#!pip install datasets
from datasets import load_metric

Defining the Persian Wikipedia Dataset

In [21]:
class PersianWikipediaDataset(Dataset):
    def __init__(self, num_articles):
        self.articles = self.load_articles(num_articles)

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        return self.articles[idx]

    def load_articles(self, num_articles):
        articles = []
        error_count = 0
        max_errors = 10

        while len(articles) < num_articles:
            try:
                article = self.fetch_random_article()
                articles.append(article)
                error_count = 0
            except Exception as e:
                error_count += 1
                if error_count >= max_errors:
                    print(f"Reached maximum consecutive errors ({max_errors}). Stopping article loading.")
                    break
                print(f"Error loading article: {str(e)}")

        return articles

    def fetch_random_article(self):
        url = 'https://fa.wikipedia.org/w/api.php?action=query&format=json&list=random&rnnamespace=0&rnlimit=1'
        response = requests.get(url)
        data = response.json()
        random_article_title = data['query']['random'][0]['title']

        article_url = f'https://fa.wikipedia.org/wiki/{random_article_title}'
        article_response = requests.get(article_url)
        article_content = article_response.text

        # Clean the article content
        soup = BeautifulSoup(article_content, 'html.parser')
        article_content = soup.get_text()

        return article_content


Initializing the tokenizer and defining the text generation model

In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pre-training the model

In [23]:
# Define the text generation model
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(TextGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded, hidden)
        output = self.fc(output)
        return output, hidden


In [24]:
def pretrain_model(model, dataset, tokenizer, num_epochs=3):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True))
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=0.001)
    scaler = GradScaler()

    # Add learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(data_loader)*num_epochs)

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            targets = batch['input_ids'].to(device)

            optimizer.zero_grad()
            hidden = (torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device),
                      torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device))

            inputs = inputs[:, :-1]
            targets = targets[:, 1:].reshape(-1)

            with autocast():
                output, hidden = model(inputs, hidden)
                output = output.view(-1, vocab_size)
                loss = criterion(output, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Update the learning rate
            scheduler.step()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader)}')

    torch.save(model.state_dict(), 'pretrained_text_generator.pth')

Fine-tuning the model

In [25]:
def finetune_model(model, dataset, tokenizer, num_epochs=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    data_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True))
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=0.001)
    scaler = GradScaler()

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            targets = batch['input_ids'].to(device)

            optimizer.zero_grad()
            hidden = (torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device),
                      torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device))

            inputs = inputs[:, :-1]
            targets = targets[:, 1:].reshape(-1)

            with autocast():
                output, hidden = model(inputs, hidden)
                output = output.view(-1, vocab_size)
                loss = criterion(output, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader)}')

    torch.save(model.state_dict(), 'finetuned_text_generator.pth')

Evaluating the model

In [32]:
def evaluate_model(model, dataset, tokenizer, device):
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    data_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True))

    total_loss = 0
    total_tokens = 0
    rouge_metric = load_metric("rouge")
    bleu_metric = load_metric("bleu")
    # Add METEOR and CIDEr metrics
    meteor_metric = load_metric("meteor")
    #cider_metric = load_metric("cider")

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            targets = batch['input_ids'].to(device)

            hidden = (torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device),
                      torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device))

            inputs = inputs[:, :-1]
            targets = targets[:, 1:].reshape(-1)

            with autocast():
                output, hidden = model(inputs, hidden)
                output = output.view(-1, vocab_size)
                loss = criterion(output, targets)

            total_loss += loss.item() * targets.size(0)
            total_tokens += targets.size(0)

            generated_texts = tokenizer.batch_decode(torch.argmax(output, dim=-1), skip_special_tokens=True)
            reference_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)

            rouge_metric.add_batch(predictions=generated_texts, references=reference_texts)
            bleu_metric.add_batch(predictions=[text.split() for text in generated_texts], references=[[text.split()] for text in reference_texts])
            # Update METEOR and CIDEr metrics
            meteor_metric.add_batch(predictions=generated_texts, references=reference_texts)
            #cider_metric.add_batch(predictions=[text.split() for text in generated_texts], references=[[text.split()] for text in reference_texts])

    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    rouge_results = rouge_metric.compute()
    bleu_results = bleu_metric.compute()
    # Compute METEOR and CIDEr results
    meteor_results = meteor_metric.compute()
    #cider_results = cider_metric.compute()

    return perplexity, rouge_results, bleu_results, meteor_results#, cider_results

Running the model

In [27]:
# Parameters
embedding_dim = 64
hidden_dim = 128
num_layers = 1
vocab_size = tokenizer.vocab_size

# Initialize the model
model = TextGenerator(vocab_size, embedding_dim, hidden_dim, num_layers)

# Create the dataset
num_articles = 200
dataset = PersianWikipediaDataset(num_articles=num_articles)

# Pre-train the model on a larger corpus (using Persian Wikipedia dataset as a placeholder)
#pretrain_model(model, dataset, tokenizer, num_epochs=3)

# Fine-tune the model on the Persian Wikipedia dataset
#finetune_model(model, dataset, tokenizer, num_epochs=10)

# Evaluate the model
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#perplexity, rouge_results, bleu_results, meteor_results, cider_results = evaluate_model(model, dataset, tokenizer, device)

#print(f'Perplexity: {perplexity}')
#print(f'ROUGE Results: {rouge_results}')
#print(f'BLEU Results: {bleu_results}')
#print(f'METEOR Results: {meteor_results}')
#print(f'CIDEr Results: {cider_results}')

In [28]:
# Pre-train the model on a larger corpus (using Persian Wikipedia dataset as a placeholder)
pretrain_model(model, dataset, tokenizer, num_epochs=3)

# Fine-tune the model on the Persian Wikipedia dataset
finetune_model(model, dataset, tokenizer, num_epochs=10)



Epoch 1/3, Loss: 11.530370559692383
Epoch 2/3, Loss: 10.26057933807373
Epoch 3/3, Loss: 8.429941368103027
Epoch 1/10, Loss: 6.623826608657837
Epoch 2/10, Loss: 5.595525150299072
Epoch 3/10, Loss: 4.884409608840943
Epoch 4/10, Loss: 4.129149875640869
Epoch 5/10, Loss: 3.5287521648406983
Epoch 6/10, Loss: 3.1546111249923707
Epoch 7/10, Loss: 2.921341004371643
Epoch 8/10, Loss: 2.7563712930679323
Epoch 9/10, Loss: 2.6267275047302245
Epoch 10/10, Loss: 2.518162579536438


In [33]:
# Evaluate the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
perplexity, rouge_results, bleu_results, meteor_results = evaluate_model(model, dataset, tokenizer, device)

print(f'Perplexity: {perplexity}')
print(f'ROUGE Results: {rouge_results}')
print(f'BLEU Results: {bleu_results}')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Perplexity: 11.513700485229492
ROUGE Results: {'rouge1': AggregateScore(low=Score(precision=0.05432768323082657, recall=0.05399677511415525, fmeasure=0.05400964306540386), mid=Score(precision=0.055660046438666747, recall=0.055321378094616847, fmeasure=0.05533075535032485), high=Score(precision=0.056986855255800904, recall=0.05662715050632124, fmeasure=0.05665212539146009)), 'rouge2': AggregateScore(low=Score(precision=0.03656772871819963, recall=0.03648397749510766, fmeasure=0.036498931833007196), mid=Score(precision=0.03767408675799089, recall=0.03759540117416833, fmeasure=0.03761301369863016), high=Score(precision=0.03879327707110243, recall=0.03870201402478802, fmeasure=0.03872027071102418)), 'rougeL': AggregateScore(low=Score(precision=0.05422386776628455, recall=0.053941575614263944, fmeasure=0.05392712986757753), mid=Score(precision=0.05568203103159068, recall=0.0553311317677756, fmeasure=0.05534343610835784), high=Score(precision=0.0570960616438356, recall=0.05669790462988845, f

In [30]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=307000e5b67bdbe5afc6331a0db22e53d703843429dff784b3c1c586466ad214
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [34]:
print(f'Perplexity: {perplexity}')
print(f'ROUGE Results: {rouge_results}')
print(f'BLEU Results: {bleu_results}')
print(f'METEOR Results: {meteor_results}')

Perplexity: 11.513700485229492
ROUGE Results: {'rouge1': AggregateScore(low=Score(precision=0.05432768323082657, recall=0.05399677511415525, fmeasure=0.05400964306540386), mid=Score(precision=0.055660046438666747, recall=0.055321378094616847, fmeasure=0.05533075535032485), high=Score(precision=0.056986855255800904, recall=0.05662715050632124, fmeasure=0.05665212539146009)), 'rouge2': AggregateScore(low=Score(precision=0.03656772871819963, recall=0.03648397749510766, fmeasure=0.036498931833007196), mid=Score(precision=0.03767408675799089, recall=0.03759540117416833, fmeasure=0.03761301369863016), high=Score(precision=0.03879327707110243, recall=0.03870201402478802, fmeasure=0.03872027071102418)), 'rougeL': AggregateScore(low=Score(precision=0.05422386776628455, recall=0.053941575614263944, fmeasure=0.05392712986757753), mid=Score(precision=0.05568203103159068, recall=0.0553311317677756, fmeasure=0.05534343610835784), high=Score(precision=0.0570960616438356, recall=0.05669790462988845, f