In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from torch.cuda.amp import GradScaler, autocast
import requests
from datasets import load_metric

# Define the Persian Wikipedia Dataset
class PersianWikipediaDataset(Dataset):
    def __init__(self, num_articles):
        self.articles = self.load_articles(num_articles)

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        return self.articles[idx]

    def load_articles(self, num_articles):
        articles = []
        error_count = 0
        max_errors = 10

        while len(articles) < num_articles:
            try:
                article = self.fetch_random_article()
                articles.append(article)
                error_count = 0
            except Exception as e:
                error_count += 1
                if error_count >= max_errors:
                    print(f"Reached maximum consecutive errors ({max_errors}). Stopping article loading.")
                    break
                print(f"Error loading article: {str(e)}")

        return articles

    def fetch_random_article(self):
        url = 'https://fa.wikipedia.org/w/api.php?action=query&format=json&list=random&rnnamespace=0&rnlimit=1'
        response = requests.get(url)
        data = response.json()
        random_article_title = data['query']['random'][0]['title']

        article_url = f'https://fa.wikipedia.org/wiki/{random_article_title}'
        article_response = requests.get(article_url)
        article_content = article_response.text

        return article_content


In [7]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [8]:
# Define the text generation model
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(TextGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded, hidden)
        output = self.fc(output)
        return output, hidden


In [9]:
# Pre-training the model
def pretrain_model(model, dataset, tokenizer, num_epochs=3):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scaler = GradScaler()

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            targets = batch['input_ids'].to(device)

            optimizer.zero_grad()
            hidden = (torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device),
                      torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device))

            inputs = inputs[:, :-1]
            targets = targets[:, 1:].reshape(-1)

            with autocast():
                output, hidden = model(inputs, hidden)
                output = output.view(-1, vocab_size)
                loss = criterion(output, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader)}')

    torch.save(model.state_dict(), 'pretrained_text_generator.pth')


In [10]:
# Fine-tuning the model
def finetune_model(model, dataset, tokenizer, num_epochs=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    data_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scaler = GradScaler()

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            targets = batch['input_ids'].to(device)

            optimizer.zero_grad()
            hidden = (torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device),
                      torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device))

            inputs = inputs[:, :-1]
            targets = targets[:, 1:].reshape(-1)

            with autocast():
                output, hidden = model(inputs, hidden)
                output = output.view(-1, vocab_size)
                loss = criterion(output, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader)}')

    torch.save(model.state_dict(), 'finetuned_text_generator.pth')


In [11]:
# Evaluation
def evaluate_model(model, dataset, tokenizer, device):
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    data_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True))

    total_loss = 0
    total_tokens = 0
    rouge_metric = load_metric("rouge")
    bleu_metric = load_metric("bleu")

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            targets = batch['input_ids'].to(device)

            hidden = (torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device),
                      torch.zeros(num_layers, inputs.size(0), hidden_dim).to(device))

            inputs = inputs[:, :-1]
            targets = targets[:, 1:].reshape(-1)

            with autocast():
                output, hidden = model(inputs, hidden)
                output = output.view(-1, vocab_size)
                loss = criterion(output, targets)

            total_loss += loss.item() * targets.size(0)
            total_tokens += targets.size(0)

            generated_texts = tokenizer.batch_decode(torch.argmax(output, dim=-1), skip_special_tokens=True)
            reference_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)

            rouge_metric.add_batch(predictions=generated_texts, references=reference_texts)
            bleu_metric.add_batch(predictions=[text.split() for text in generated_texts], references=[[text.split()] for text in reference_texts])

    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    rouge_results = rouge_metric.compute()
    bleu_results = bleu_metric.compute()

    return perplexity.item(), rouge_results, bleu_results


In [12]:
# Parameters
embedding_dim = 64
hidden_dim = 128
num_layers = 1
vocab_size = tokenizer.vocab_size

# Initialize the model
model = TextGenerator(vocab_size, embedding_dim, hidden_dim, num_layers)

# Create the dataset
num_articles = 200
dataset = PersianWikipediaDataset(num_articles=num_articles)


In [13]:
# Pre-train the model on a larger corpus (using Persian Wikipedia dataset as a placeholder)
pretrain_model(model, dataset, tokenizer, num_epochs=3)

# Fine-tune the model on the Persian Wikipedia dataset
finetune_model(model, dataset, tokenizer, num_epochs=10)


Epoch 1/3, Loss: 10.079336013793945
Epoch 2/3, Loss: 4.549957284927368
Epoch 3/3, Loss: 3.722768840789795
Epoch 1/10, Loss: 3.1819964122772215
Epoch 2/10, Loss: 1.8373880434036254
Epoch 3/10, Loss: 0.948320871591568
Epoch 4/10, Loss: 0.5139753979444504
Epoch 5/10, Loss: 0.2987236219644547
Epoch 6/10, Loss: 0.1965003004670143
Epoch 7/10, Loss: 0.1485966657102108
Epoch 8/10, Loss: 0.12447078317403794
Epoch 9/10, Loss: 0.11009812757372855
Epoch 10/10, Loss: 0.10073553785681724


In [14]:
# Evaluate the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
perplexity, rouge_results, bleu_results = evaluate_model(model, dataset, tokenizer, device)

print(f'Perplexity: {perplexity}')
print(f'ROUGE Results: {rouge_results}')
print(f'BLEU Results: {bleu_results}')


  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Perplexity: 1.1000173091888428
ROUGE Results: {'rouge1': AggregateScore(low=Score(precision=0.535825482247693, recall=0.5358965264187866, fmeasure=0.5358526581865616), mid=Score(precision=0.5388809290839621, recall=0.5389448793215915, fmeasure=0.5389009567297236), high=Score(precision=0.5418721461187211, recall=0.5419635518590998, fmeasure=0.5419012122200474)), 'rouge2': AggregateScore(low=Score(precision=0.4476725782778865, recall=0.4476953277886497, fmeasure=0.4476782656555773), mid=Score(precision=0.45069879321591655, recall=0.4507240704500978, fmeasure=0.45070572407045006), high=Score(precision=0.45374437377690796, recall=0.45375758317025444, fmeasure=0.45374767612524464)), 'rougeL': AggregateScore(low=Score(precision=0.5356721065138386, recall=0.5357489399869539, fmeasure=0.5356967329854316), mid=Score(precision=0.5387449911471442, recall=0.5388193085453361, fmeasure=0.5387697325505545), high=Score(precision=0.5416973196812972, recall=0.5417745433789957, fmeasure=0.541718199608610