# Required Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter

## Dataset Preparation

## We use Wikipedia data from Hugging Face to train our model

In [None]:
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset

# The "wikipedia" dataset is built by a loader script downloaded from the Hub.
# Opting in explicitly avoids the warning `datasets` otherwise prints, and that
# warning states the flag becomes mandatory in the next major release.
dataset = load_dataset("wikipedia", "20220301.simple", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235M [00:00<?, ?B/s]

In [None]:
# Slice the rows first and then pick the column: `dataset['train']['text']`
# would load the text of every article only to keep the first three.
text_list = dataset['train'][:3]['text']

In [None]:
# Number of article texts held in text_list.
len(text_list)

['April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn common years, April starts on the same day of the week as October of the previous year, and 

## Write the data to a text file for preprocessing

In [None]:
with open('output.txt', 'w') as file:
    # Iterate over the list and write each string to the file
    for line in text_list:
        file.write(line + '\n')

In [None]:
# Dataset Preparation
with open('output.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Whitespace tokenisation: every run of non-space characters is one "word",
# so punctuation stays attached to the word it touches.
words = text.split()
word_counts = Counter(words)

# Vocabulary in order of first appearance, with lookup tables in both directions.
vocab = list(word_counts)
vocab_size = len(vocab)
word_to_int = dict(zip(vocab, range(vocab_size)))
int_to_word = dict(enumerate(vocab))

# Each sample is a window of SEQUENCE_LENGTH + 1 consecutive words, one window
# per starting position (consecutive windows overlap by all but one word).
SEQUENCE_LENGTH = 64
samples = [
    words[start:start + SEQUENCE_LENGTH + 1]
    for start in range(len(words) - SEQUENCE_LENGTH)
]

print(word_to_int)

{'April': 0, 'is': 1, 'the': 2, 'fourth': 3, 'month': 4, 'of': 5, 'year': 6, 'in': 7, 'Julian': 8, 'and': 9, 'Gregorian': 10, 'calendars,': 11, 'comes': 12, 'between': 13, 'March': 14, 'May.': 15, 'It': 16, 'one': 17, 'four': 18, 'months': 19, 'to': 20, 'have': 21, '30': 22, 'days.': 23, 'always': 24, 'begins': 25, 'on': 26, 'same': 27, 'day': 28, 'week': 29, 'as': 30, 'July,': 31, 'additionally,': 32, 'January': 33, 'leap': 34, 'years.': 35, 'ends': 36, 'December.': 37, "April's": 38, 'flowers': 39, 'are': 40, 'Sweet': 41, 'Pea': 42, 'Daisy.': 43, 'Its': 44, 'birthstone': 45, 'diamond.': 46, 'The': 47, 'meaning': 48, 'diamond': 49, 'innocence.': 50, 'Month': 51, 'May,': 52, 'making': 53, 'it': 54, 'year.': 55, 'also': 56, 'first': 57, 'out': 58, 'that': 59, 'days,': 60, 'June,': 61, 'September': 62, 'November': 63, 'later': 64, 'July': 65, 'every': 66, 'December': 67, 'year,': 68, 'each': 69, "other's": 70, 'last': 71, 'days': 72, 'exactly': 73, '35': 74, 'weeks': 75, '(245': 76, 'day

In [None]:
# Show only the first window: printing `samples` in full writes every overlapping
# window (one per word position) into the notebook output.
print(samples[:1])

[['April', 'is', 'the', 'fourth', 'month', 'of', 'the', 'year', 'in', 'the', 'Julian', 'and', 'Gregorian', 'calendars,', 'and', 'comes', 'between', 'March', 'and', 'May.', 'It', 'is', 'one', 'of', 'four', 'months', 'to', 'have', '30', 'days.', 'April', 'always', 'begins', 'on', 'the', 'same', 'day', 'of', 'week', 'as', 'July,', 'and', 'additionally,', 'January', 'in', 'leap', 'years.', 'April', 'always', 'ends', 'on', 'the', 'same', 'day', 'of', 'the', 'week', 'as', 'December.', "April's", 'flowers', 'are', 'the', 'Sweet', 'Pea'], ['is', 'the', 'fourth', 'month', 'of', 'the', 'year', 'in', 'the', 'Julian', 'and', 'Gregorian', 'calendars,', 'and', 'comes', 'between', 'March', 'and', 'May.', 'It', 'is', 'one', 'of', 'four', 'months', 'to', 'have', '30', 'days.', 'April', 'always', 'begins', 'on', 'the', 'same', 'day', 'of', 'week', 'as', 'July,', 'and', 'additionally,', 'January', 'in', 'leap', 'years.', 'April', 'always', 'ends', 'on', 'the', 'same', 'day', 'of', 'the', 'week', 'as', 'D

In [None]:
class TextDataset(Dataset):
    """Dataset that turns word windows into (input, target) id tensors for next-word prediction."""

    def __init__(self, samples, word_to_int):
        """
        :param samples: Indexable collection of word lists; each entry yields one pair.
        :param word_to_int: Mapping from word to integer id used to encode every word.
        """
        self.samples = samples
        self.word_to_int = word_to_int

    def __len__(self):
        """Number of word windows available."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Encode window `idx` and return it as two LongTensors shifted by one word:
        the input drops the last word, the target drops the first.
        Raises KeyError if a word is missing from `word_to_int`.
        """
        token_ids = [self.word_to_int[token] for token in self.samples[idx]]
        input_seq = torch.LongTensor(token_ids[:-1])
        target_seq = torch.LongTensor(token_ids[1:])
        return input_seq, target_seq

In [None]:
BATCH_SIZE = 32

# NOTE(review): this rebinds `dataset`, the name an earlier cell uses for the
# Hugging Face dataset; re-running that earlier slicing cell after this one will fail.
dataset = TextDataset(samples, word_to_int)

# Batches are drawn in a fresh random order on every pass over the data.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Peek at the second sample: an (input ids, target ids) pair.
print(dataset[1])

(tensor([ 1,  2,  3,  4,  5,  2,  6,  7,  2,  8,  9, 10, 11,  9, 12, 13, 14,  9,
        15, 16,  1, 17,  5, 18, 19, 20, 21, 22, 23,  0, 24, 25, 26,  2, 27, 28,
         5, 29, 30, 31,  9, 32, 33,  7, 34, 35,  0, 24, 36, 26,  2, 27, 28,  5,
         2, 29, 30, 37, 38, 39, 40,  2, 41, 42]), tensor([ 2,  3,  4,  5,  2,  6,  7,  2,  8,  9, 10, 11,  9, 12, 13, 14,  9, 15,
        16,  1, 17,  5, 18, 19, 20, 21, 22, 23,  0, 24, 25, 26,  2, 27, 28,  5,
        29, 30, 31,  9, 32, 33,  7, 34, 35,  0, 24, 36, 26,  2, 27, 28,  5,  2,
        29, 30, 37, 38, 39, 40,  2, 41, 42,  9]))


## Transformer Model

In [None]:
def generate_square_subsequent_mask(sz):
    """
    Build the (sz, sz) additive causal attention mask.

    Entry [i, j] is float('-inf') when j > i (a later position, masked) and
    float(0.0) otherwise (the position itself or an earlier one, unmasked).
    """
    # Start from a matrix filled with -inf and keep only the part strictly above
    # the main diagonal; torch.triu sets every other entry to 0.0.
    blocked = torch.ones(sz, sz) * float('-inf')
    return torch.triu(blocked, diagonal=1)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position signal to batch-first embeddings, then applies dropout."""

    def __init__(self, max_len, d_model, dropout=0.1):
        """
        :param max_len: Longest sequence length for which encodings are precomputed.
        :param d_model: Embedding dimension (even or odd).
        :param dropout: Dropout value (default=0.1)
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns, so the
        # cosine half must be trimmed to fit; for an even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Leading singleton dimension lets the table broadcast over the batch.
        pe = pe.unsqueeze(0)
        # Buffer, not parameter: follows the module across devices but is never trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Inputs of forward function
        :param x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [batch size, sequence length, embed dim]
            output: [batch size, sequence length, embed dim]
        The sequence length must not exceed max_len.
        """
        # Take the encodings for the first x.size(1) positions and add them to every batch item.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
class TextGen(nn.Module):
    """Word-level language model: embedding + positional encoding + causally masked Transformer decoder."""

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads):
        """
        :param vocab_size: Number of distinct word ids (embedding rows and output logits).
        :param embed_dim: Width of the embeddings and of the decoder layers.
        :param num_layers: Number of stacked decoder layers.
        :param num_heads: Attention heads per decoder layer.
        """
        super().__init__()
        self.pos_encoder = PositionalEncoding(max_len=SEQUENCE_LENGTH, d_model=embed_dim)
        self.emb = nn.Embedding(vocab_size, embed_dim)
        # NOTE(review): this layer is kept as an attribute, so it is registered in the
        # module tree next to `decoder`; presumably nn.TransformerDecoder works on its
        # own copies - confirm before removing, as that would change the state-dict keys.
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer=self.decoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(0.2)

    # Positional encoding is required. Else the model does not learn.
    def forward(self, x):
        """
        Map a batch-first tensor of word ids to per-position logits over the vocabulary.

        The decoder receives its own input as `memory`, and the same causal mask is
        applied to both the self-attention and the memory-attention paths.
        """
        # Causal mask sized to the sequence dimension of this batch.
        seq_len = x.size(1)
        causal_mask = generate_square_subsequent_mask(seq_len).to(x.device)

        hidden = self.pos_encoder(self.emb(x))
        hidden = self.decoder(hidden, memory=hidden, tgt_mask=causal_mask, memory_mask=causal_mask)
        return self.linear(self.dropout(hidden))

## Training

In [None]:
# Number of full passes over the dataloader made by train().
epochs = 100
# Learning rate handed to the Adam optimizer.
learning_rate = 0.001

In [None]:
# Prefer the GPU when one is visible; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network, then move its parameters and buffers to the chosen device.
model = TextGen(vocab_size=vocab_size, embed_dim=100, num_layers=2, num_heads=2)
model = model.to(device)

# Cross-entropy over the vocabulary, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
print(model)
# Total parameters and trainable parameters, gathered in a single pass.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_sizes)
total_trainable_params = sum(count for count, trainable in param_sizes if trainable)
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.\n")

TextGen(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (emb): Embedding(1860, 100)
  (decoder_layer): TransformerDecoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
    )
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
    )
    (linear1): Linear(in_features=100, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=100, bias=True)
    (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (norm3): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
    (dropout3): Dropout(p=0.1, inplace=False)
  )
  (decoder): Transform

In [None]:
# Training
def train(model, epochs, dataloader, criterion, opt=None):
    """
    Train `model` for `epochs` passes over `dataloader`, printing the mean loss per epoch.

    :param model: Module mapping a batch of input ids to logits whose last dimension
        is the number of classes.
    :param epochs: Number of passes over `dataloader`.
    :param dataloader: Sized iterable of (input_seq, target_seq) tensor pairs.
    :param criterion: Loss taking (flattened logits, flattened targets).
    :param opt: Optimizer to step. Defaults to the notebook-level `optimizer`, so
        existing four-argument calls behave as before.
    """
    if opt is None:
        # Backward-compatible fallback to the optimizer defined in an earlier cell.
        opt = optimizer

    # Send batches to wherever the model's parameters live instead of relying on
    # a global `device`; a parameter-free model leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for input_seq, target_seq in dataloader:
            if target_device is not None:
                input_seq, target_seq = input_seq.to(target_device), target_seq.to(target_device)
            outputs = model(input_seq)
            # Flatten every position into one row; the class count is read from the
            # logits themselves rather than from a global `vocab_size`.
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), target_seq.reshape(-1))

            opt.zero_grad()
            loss.backward()
            opt.step()
            # .item() yields a plain Python float, keeping the running sum off the graph.
            running_loss += loss.item()
        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch} loss: {epoch_loss:.3f}")

# Run the training loop; one loss line is printed per epoch.
train(model, epochs, dataloader, criterion)

Epoch 0 loss: 4.125


## Inference

In [None]:
def return_int_vector(text):
    """
    Encode the tail of `text` as a batch of one sequence of word ids.

    The text is split on whitespace and only the last SEQUENCE_LENGTH words are
    kept. Returns a LongTensor with a leading batch dimension of 1.
    Raises KeyError if any kept word is not a key of `word_to_int`.
    """
    recent_words = text.split()[-SEQUENCE_LENGTH:]
    token_ids = [word_to_int[token] for token in recent_words]
    return torch.LongTensor(token_ids).unsqueeze(0)

In [None]:
def sample_next(predictions):
    """
    Greedy sampling: return the id of the most probable next token as a Python int.

    Only the last position along dimension 1 of `predictions` is considered. The
    argmax runs over the flattened probabilities, so the result is a vocabulary
    index only when the batch holds a single sequence.
    """
    last_step_scores = predictions[:, -1, :]
    probabilities = F.softmax(last_step_scores, dim=-1).cpu()
    return int(torch.argmax(probabilities).cpu())

def text_generator(sentence, generate_length):
    """
    Greedily extend `sentence` by `generate_length` words using the notebook-level
    `model`, then print the result followed by a blank line.

    :param sentence: Prompt text; every word must be present in `word_to_int`.
    :param generate_length: Number of words to append.
    """
    model.eval()
    sample = sentence
    for _ in range(generate_length):
        # return_int_vector keeps only the last SEQUENCE_LENGTH words, so the context
        # never outgrows the model. The former guard on len(int_vector) measured the
        # batch dimension of a (1, n) tensor - always 1 - and never fired; it is removed.
        input_tensor = return_int_vector(sample).to(device)
        with torch.no_grad():
            predictions = model(input_tensor)
        next_token = sample_next(predictions)
        sample += ' ' + int_to_word[next_token]
    print(sample)
    print('\n')

In [None]:
# Prompts handed to the generator; each word is looked up in word_to_int,
# so a word outside the training vocabulary raises KeyError.
sentences = [
    "The United States purchases"
]

In [None]:
# Number of words the generator appends to each prompt.
generate_length = 100

In [None]:
# Echo each prompt, then print its generated continuation.
for sentence in sentences:
    print(f"PROMPT: {sentence}")
    text_generator(sentence, generate_length)

PROMPT: The United States purchases
The United States purchases (buys) the Louisiana territory from France. April 30, 1945 - Adolf Hitler commits suicide on the same day that the Soviet Army raises the Red Flag on Berlin's Reichstag. April 30, 1952 - The Diary of Anne Frank is published in English. April 30, 1975 - The Vietnam War ends, as North Vietnamese forces take Saigon. April 30, 1980 - Queen Juliana of the Netherlands abdicates the throne, and her daughter becomes Queen Beatrix of the Netherlands. Beatrix later also abdicates, on this day in 2013, in favor of her son, King Willem-Alexander of the Netherlands. Trivia In Western Christianity,




## Save the Model Locally

In [None]:
# Save only the state dict; the TextGen class is needed again to reload it.
torch.save(model.state_dict(), 'text_gen_model.pth')


# Load the Saved Model for Direct Use

In [None]:
class TextGen(nn.Module):
    """
    Word-level language model: embedding + positional encoding + causally masked
    Transformer decoder.

    This redefines the TextGen class used for training. The attribute names and
    their order must stay the same, since the saved state dict is loaded into it.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads):
        """
        :param vocab_size: Number of distinct word ids (embedding rows and output logits).
        :param embed_dim: Width of the embeddings and of the decoder layers.
        :param num_layers: Number of stacked decoder layers.
        :param num_heads: Attention heads per decoder layer.
        """
        super().__init__()
        self.pos_encoder = PositionalEncoding(max_len=SEQUENCE_LENGTH, d_model=embed_dim)
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer=self.decoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """
        Map a batch-first tensor of word ids to per-position logits over the vocabulary.

        The decoder receives its own input as `memory`, and the same causal mask is
        applied to both attention paths.
        """
        seq_len = x.size(1)
        causal_mask = generate_square_subsequent_mask(seq_len).to(x.device)
        hidden = self.pos_encoder(self.emb(x))
        hidden = self.decoder(hidden, memory=hidden, tgt_mask=causal_mask, memory_mask=causal_mask)
        return self.linear(self.dropout(hidden))

# Create an instance of the model (same hyperparameters as the trained one,
# otherwise the saved tensors will not fit)
loaded_model = TextGen(
    vocab_size=vocab_size,
    embed_dim=100,
    num_layers=2,
    num_heads=2,
).to(device)

# Load the saved model's state dict. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only machine.
loaded_model.load_state_dict(torch.load('text_gen_model.pth', map_location=device))

# Set the model to evaluation mode
loaded_model.eval()

# Inference function with loaded model
def text_generator_with_loaded_model(sentence, generate_length):
    """
    Greedily extend `sentence` by `generate_length` words using `loaded_model`,
    then print the result followed by a blank line.

    :param sentence: Prompt text; every word must be present in `word_to_int`.
    :param generate_length: Number of words to append.
    """
    sample = sentence
    for _ in range(generate_length):
        # return_int_vector keeps only the last SEQUENCE_LENGTH words, so the context
        # never outgrows the model. The former guard on len(int_vector) measured the
        # batch dimension of a (1, n) tensor - always 1 - and never fired; it is removed.
        input_tensor = return_int_vector(sample).to(device)
        with torch.no_grad():
            predictions = loaded_model(input_tensor)
        next_token = sample_next(predictions)
        sample += ' ' + int_to_word[next_token]
    print(sample)
    print('\n')

# Example sentences for inference (each word must be a key of word_to_int)
sentences = [
    "The United States purchases"
]

# Number of words appended to each prompt
generate_length = 100

# Perform inference with the loaded model: echo the prompt, then print its continuation
for sentence in sentences:
    print(f"PROMPT: {sentence}")
    text_generator_with_loaded_model(sentence, generate_length)

PROMPT: The United States purchases
The United States purchases (buys) the Louisiana territory from France. April 30, 1945 - Adolf Hitler commits suicide on the same day that the Soviet Army raises the Red Flag on Berlin's Reichstag. April 30, 1952 - The Diary of Anne Frank is published in English. April 30, 1975 - The Vietnam War ends, as North Vietnamese forces take Saigon. April 30, 1980 - Queen Juliana of the Netherlands abdicates the throne, and her daughter becomes Queen Beatrix of the Netherlands. Beatrix later also abdicates, on this day in 2013, in favor of her son, King Willem-Alexander of the Netherlands. Trivia In Western Christianity,




In [None]:
sentences = [
    "Soviet Army"
]

for sentence in sentences:
    # Interpolate the loop variable; quoting it printed the literal word "sentence".
    print(f"PROMPT: {sentence}")
    text_generator_with_loaded_model(sentence, generate_length)

PROMPT: sentence
Soviet Army raises the Red Flag on Berlin's Reichstag. April 30, 1952 - The Diary of Anne Frank is published in English. April 30, 1975 - The Vietnam War ends, as North Vietnamese forces take Saigon. April 30, 1980 - Queen Juliana of the Netherlands abdicates the throne, and her daughter becomes Queen Beatrix of the Netherlands. Beatrix later also abdicates, on this day in 2013, in favor of her son, King Willem-Alexander of the Netherlands. Trivia In Western Christianity, there is a bigger likelihood of Easter falling in April than in March. The months around April (March and May) both start


