In [40]:
# Load the text data.
with open('english_train.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [41]:
lines_of_text = 20000

with open('english_train.txt', 'r', encoding='utf-8') as file:
    lines = [next(file) for _ in range(lines_of_text)]

# Join lines into a single string
text = ''.join(lines)
print('Sample Text:', text[:200])

Sample Text: I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there 


In [42]:
len(text)

4521185

In [43]:
import torchtext

In [44]:
# Basic pre-processing..
import re

# Convert to lowercase
text = text.lower()

# Remove special characters and digits
text = re.sub(r'[^a-zA-Z\s]', '', text)

# Replace multiple spaces with a single space
text = re.sub(r'\s+', ' ', text).strip()

In [45]:
# Importing the torchtext module
import torch
from torchtext.data.utils import get_tokenizer

# Making a tokenizer for tokenization of all the words in a text.
english_tokenizer = get_tokenizer('basic_english') # English word tokens..

In [46]:
# Fitting the tokenizer to the text for making the tokens.
tokens = english_tokenizer(text)

In [47]:
len(tokens)

831137

In [48]:
# Defining the max padding
MAX_PADDING = 100

# Making padding function
def pad_token(tokens):
  if(len(tokens)) >= MAX_PADDING:
    return tokens[:MAX_PADDING]
  else:
    return tokens + [pad_token] * (MAX_PADDING - len(tokens))

In [49]:
# Implementing the vocabulary.

from torchtext.vocab import build_vocab_from_iterator # Vocab module

# Defining the vocabulary size.
VOCAB_SIZE = 100_000

# Some Special Conditions.
unk_token = "<unk>"
pad_token = "<pad>"

# Vocabulary
vocab = build_vocab_from_iterator([tokens], max_tokens=VOCAB_SIZE,
                                  specials=[unk_token, pad_token])

In [50]:
len(vocab)

16419

In [51]:
# There is some basic implementation for handling the unknown tokens. By making the default index
vocab.set_default_index(vocab[unk_token])

In [52]:
vocab['hi']

1811

In [53]:
# Viewing the token's indices..
sample_view = vocab.lookup_indices(tokens)

In [54]:
sample_view[:10]

[11, 55, 7, 23, 50, 239, 57, 160, 53, 11487]

In [55]:
import torch
import numpy

# Convert all tokens to indices using the vocabulary
encoded_text = [vocab[token] for token in tokens]

# Set the sequence length
sequence_length = 100

# Create input-output sequences
input_sequences = []
output_tokens = []

for i in range(sequence_length, len(encoded_text)):
    input_sequences.append(encoded_text[i-sequence_length:i])
    output_tokens.append(encoded_text[i])

# Convert to tensors
X = torch.tensor(input_sequences)
y = torch.tensor(output_tokens)

print('Input Shape:', X.shape)   # (num_sequences, sequence_length)
print('Output Shape:', y.shape)  # (num_sequences,)

Input Shape: torch.Size([831037, 100])
Output Shape: torch.Size([831037])


In [56]:
from torch.utils.data import TensorDataset, DataLoader

# Create TensorDataset and DataLoader for the entire dataset
full_dataset = TensorDataset(X, y)
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

print('Number of Batches:', len(full_loader))


Number of Batches: 51940


In [57]:
import torch
import torch.nn as nn

class TextGenerationLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=256, num_layers=2): # Specifying the dimensions of embedding, hidden layers.
        super(TextGenerationLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])  # Predict the next token in the linear layer.
        return out, hidden

    def init_hidden(self, batch_size, device):
        # Initialize hidden states (h0, c0) with zeros
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return (h0, c0)

In [58]:
# Let's Instantiate the model..

# Instantiating the model
vocab_size = len(vocab)  # Vocabulary size from torchtext
model = TextGenerationLSTM(vocab_size)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(model)

TextGenerationLSTM(
  (embedding): Embedding(16419, 100)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=16419, bias=True)
)


In [59]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Using the cross-entropy loss.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # Using the Adam Optimization method/

In [None]:
# Training configuration
num_epochs = 3

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0

    for inputs, targets in full_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Initialize hidden states
        hidden = model.init_hidden(inputs.size(0), device)

        # Forward pass
        outputs, hidden = model(inputs, hidden)

        # Calculate loss
        loss = criterion(outputs, targets)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(full_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

In [None]:
import torch.nn.functional as F

# Set the model to evaluation mode
model.eval()

def generate_text(model, start_text, vocab, tokenizer, max_length=100, temperature=1.0):

    # Tokenize the start text
    tokens = tokenizer(start_text)
    input_seq = [vocab[token] for token in tokens]
    input_seq = torch.tensor(input_seq).unsqueeze(0).to(device)  # (1, seq_len)

    # Initialize hidden states
    hidden = model.init_hidden(input_seq.size(0), device)

    # Collect generated tokens
    generated_tokens = tokens.copy()

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            # Forward pass
            output, hidden = model(input_seq, hidden)

            # Apply temperature to logits
            output = output / temperature
            probs = F.softmax(output, dim=-1).squeeze()

            # Sample the next token
            next_token_id = torch.multinomial(probs, num_samples=1).item()
            next_token = vocab.lookup_token(next_token_id)

            # Stop if end of sequence token is generated
            if next_token == '<eos>':
                break

            # Add the token to the generated sequence
            generated_tokens.append(next_token)

            # Update input sequence
            input_seq = torch.tensor([next_token_id]).unsqueeze(0).to(device)

    # Join tokens to form the final text
    generated_text = ' '.join(generated_tokens)
    return generated_text

In [None]:
# Define the seed text to start generating
seed_text = "What is your name ? "

# Generate text
generated_text = generate_text(model, seed_text, vocab, english_tokenizer, max_length=100, temperature=1.0)

# Display the generated text
print("\nGenerated Text:\n", generated_text)


Generated Text:
 what is your name ? and if they pay attention in conclusion the people kept during the community stable until this people stay comfortable up and that just government dont be great saying all the persons will not be good but that will be a good thing to have positive things is everyone point they have to learn those who they would like to never be active if youre bad attitude esteem and skill they mean everything the best they dont like it etc also believe that person always always always all wont make out of their body and tell fun they accomplish more and


In [None]:
torch.save(model, 'genai-englang.pth')

NameError: name 'torch' is not defined