<a href="https://colab.research.google.com/github/rajan083/TextGen/blob/master/TextGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np

class BertGenerativeModel:
    """Greedy left-to-right text generator driven by a BERT masked language model.

    The model is queried one token at a time: a ``[MASK]`` placeholder is
    placed at the end of the running sequence (just before the closing
    ``[SEP]``), the most likely token for that slot is taken, and the
    process repeats.
    """

    def __init__(self, model_name='bert-large-uncased'):
        """Load the tokenizer and masked-LM weights for ``model_name``."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)
        # Inference only; switch the module to evaluation mode.
        self.model.eval()

    def generate_text(self, prompt, max_length=50):
        """Extend ``prompt`` greedily by up to ``max_length`` tokens.

        Args:
            prompt: Text to continue.
            max_length: Maximum number of tokens to append.

        Returns:
            The prompt plus the generated continuation, decoded with special
            tokens ([CLS]/[SEP]/[MASK]) stripped.
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        mask_id = self.tokenizer.mask_token_id
        sep_id = self.tokenizer.sep_token_id

        # encode() normally yields "[CLS] ... [SEP]". Drop the closing [SEP] so
        # new tokens are inserted before it instead of being appended after it.
        if input_ids.size(1) > 0 and input_ids[0, -1].item() == sep_id:
            body = input_ids[:, :-1]
        else:
            body = input_ids

        # A masked LM is trained to fill [MASK] slots, so the next token is
        # read from an explicit [MASK] placed right before the closing [SEP].
        tail = torch.tensor(
            [[mask_id, sep_id]], dtype=input_ids.dtype, device=input_ids.device
        )

        for _ in range(max_length):
            model_input = torch.cat([body, tail], dim=1)
            with torch.no_grad():
                logits = self.model(model_input)[0]

            # Position -2 is the [MASK] slot; position -1 is the closing [SEP].
            predicted_index = torch.argmax(logits[0, -2, :]).item()

            if predicted_index == sep_id:
                break

            next_token = torch.tensor(
                [[predicted_index]], dtype=body.dtype, device=body.device
            )
            body = torch.cat([body, next_token], dim=1)

        return self.tokenizer.decode(body[0], skip_special_tokens=True)

    def generate_question(self, topic):
        """Build an instruction-style prompt for ``topic`` and continue it."""
        prompt = f"Generate a question about {topic}: "
        return self.generate_text(prompt)

if __name__ == "__main__":
    # Demo: generate one question per sample topic and print it.
    generator = BertGenerativeModel()

    topics = [
        "artificial intelligence",
        "climate change",
        "space exploration",
    ]

    for topic in topics:
        question = generator.generate_question(topic)
        # Two lines per topic, followed by a blank separator line.
        print(f"Topic: {topic}", f"Generated Question: {question}\n", sep="\n")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Topic: artificial intelligence
Generated Question: [CLS] generate a question about artificial intelligence : [SEP] problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem problem

Topic: climate change
Generated Question: [CLS] generate a question about climate change : [SEP] research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research research 

In [None]:
!pip install torch



In [None]:
#Importing libraries and dataset
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import string

In [None]:
class TextDataset(Dataset):
    """Character-level next-character dataset built from one text string.

    The text is lower-cased and stripped of ASCII punctuation, then every
    remaining character is mapped to an integer id. Sample ``i`` is the pair
    ``(encoded[i:i+L], encoded[i+1:i+L+1])`` with ``L = sequence_length``,
    i.e. the target is the input shifted by one character.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, text, sequence_length=50):
        """Normalise ``text`` and build the vocabulary and encoded array.

        Args:
            text: Raw training text.
            sequence_length: Number of characters per input/target window.
        """
        text = text.lower().translate(str.maketrans("", "", string.punctuation))
        self.sequence_length = sequence_length

        # Vocabulary: sorted so ids are deterministic for a given text.
        chars = sorted(set(text))
        self.char2idx = {ch: idx for idx, ch in enumerate(chars)}
        self.idx2char = {idx: ch for ch, idx in self.char2idx.items()}

        # Explicit int64 keeps the dtype stable even when the text is empty
        # (an empty list would otherwise become a float64 array).
        self.encoded_text = np.array(
            [self.char2idx[ch] for ch in text], dtype=np.int64
        )

    def __len__(self):
        """Return the number of full windows; 0 when the text is too short."""
        # Clamp: a negative value would make len() raise ValueError.
        return max(0, len(self.encoded_text) - self.sequence_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` LongTensor pair for window ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Raising IndexError lets plain iteration over the dataset terminate
        # and prevents silently returning truncated windows.
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} out of range for {size} samples")

        stop = idx + self.sequence_length
        return (
            torch.tensor(self.encoded_text[idx:stop], dtype=torch.long),
            torch.tensor(self.encoded_text[idx + 1:stop + 1], dtype=torch.long),
        )

In [None]:
# Read the whole CSV as one raw string (header row and delimiters included);
# TextDataset lower-cases it and strips punctuation.
# An explicit encoding avoids depending on the platform's default locale.
with open('/content/chatgpt-reddit-comments.csv', 'r', encoding='utf-8') as file:
    text = file.read()

dataset = TextDataset(text)
# Shuffled mini-batches of 64 (input, target) windows.
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
#Architecture

import torch
import torch.nn as nn
import torch.optim as optim

class LSTMModel(nn.Module):
    """Sequence model: token embedding -> stacked LSTM -> linear projection.

    ``forward`` returns one row of vocabulary scores per (batch, time) position,
    flattened to ``(batch * seq_len, vocab_size)``, together with the updated
    LSTM state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        """Create the embedding, LSTM and output layers."""
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Run ``x`` (token ids, batch first) through the network.

        Returns:
            ``(scores, hidden)`` where ``scores`` has one row per flattened
            (batch, time) position and ``hidden`` is the new LSTM state.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        n_batch, n_steps = lstm_out.size(0), lstm_out.size(1)
        # Collapse batch and time so the linear layer scores every position.
        flattened = lstm_out.reshape(n_batch * n_steps, self.hidden_dim)
        return self.fc(flattened), hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h0, c0)`` pair matching the model's dtype/device."""
        reference = next(self.parameters()).data
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        h0 = reference.new_zeros(state_shape)
        c0 = reference.new_zeros(state_shape)
        return (h0, c0)

# Model hyperparameters; the vocabulary size comes from the dataset's
# character table.
vocab_size = len(dataset.char2idx)
embedding_dim, hidden_dim, n_layers = 128, 256, 2

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = LSTMModel(vocab_size, embedding_dim, hidden_dim, n_layers)
model = model.to(device)

In [None]:
#Training the model

n_epochs = 2
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model.train()

for epoch in range(n_epochs):
    for inputs, targets in dataloader:
        # The last batch may be smaller, so size the state per batch.
        batch_size = inputs.size(0)
        # Fresh zero state for every batch; no state is carried across batches.
        hidden = tuple(state.data for state in model.init_hidden(batch_size))

        inputs = inputs.to(device)
        targets = targets.to(device)

        model.zero_grad()
        output, hidden = model(inputs, hidden)

        # Targets are flattened to line up with the model's flattened scores.
        loss = criterion(output, targets.view(-1))
        loss.backward()
        optimizer.step()

    # Reports the loss of the final batch of the epoch.
    print(f'Epoch: {epoch+1}/{n_epochs}, Loss: {loss.item()}')

In [None]:

#Generating text with Prompt

def generate_text(model, start_str, char2idx, idx2char, predict_len=100):
    """Greedily extend ``start_str`` by ``predict_len`` characters.

    The whole prompt is fed through the model first so the recurrent state
    reflects every prompt character; afterwards one character is generated
    per step by taking the highest-scoring vocabulary entry.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and a call
            ``model(ids, hidden) -> (scores, hidden)`` whose ``scores`` has
            one row per input position.
        start_str: Non-empty prompt; every character must be in ``char2idx``.
        char2idx: Mapping from character to integer id.
        idx2char: Mapping from integer id to character.
        predict_len: Number of characters to generate.

    Returns:
        ``start_str`` followed by the generated characters.

    Raises:
        ValueError: If ``start_str`` is empty.
        KeyError: If ``start_str`` contains a character absent from
            ``char2idx``.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")

    model.eval()
    chars = [char2idx[ch] for ch in start_str]

    # Use the model's own device rather than a notebook-level global.
    model_device = next(model.parameters()).device
    # Generation runs on a single sequence, so the state has batch size 1.
    hidden = model.init_hidden(1)

    step_input = torch.tensor([chars], dtype=torch.long, device=model_device)
    with torch.no_grad():
        for _ in range(predict_len):
            output, hidden = model(step_input, hidden)
            # The last row holds the scores for the next character; argmax of
            # the raw scores equals argmax of their softmax.
            char_idx = torch.argmax(output[-1]).item()
            chars.append(char_idx)
            step_input = torch.tensor(
                [[char_idx]], dtype=torch.long, device=model_device
            )

    return start_str + ''.join(idx2char[idx] for idx in chars[len(start_str):])


# Ask for a seed string, then print it extended by 200 generated characters.
start_str = input('Enter text to generate from: ')
generated_text = generate_text(
    model=model,
    start_str=start_str,
    char2idx=dataset.char2idx,
    idx2char=dataset.idx2char,
    predict_len=200,
)
print(generated_text)

In [None]:
#Saving and loading the model
# Persist only the weights, then reload them from the same file.
torch.save(model.state_dict(), 'lstm_model.pth')
# map_location keeps loading working when the checkpoint was written on a
# different device than the one currently in use.
model.load_state_dict(torch.load('lstm_model.pth', map_location=device))
model.eval()