<a href="https://colab.research.google.com/github/rajnishkumar1906/Deep-Learning/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Shakespeare Text Generation using LSTM

This project explores character-level text generation using an LSTM neural network trained on Shakespeare’s plays. The goal is to build a model that learns the style and structure of Shakespearean language and generates new text that mimics his writing. We use a structured version of the Shakespeare dataset that contains metadata such as the play name and the speaker, alongside the actual lines of dialogue. The focus is on generating authentic-looking Shakespearean text using deep learning techniques.

# Connecting Google Drive to Colab

In [1]:
# Mount Google Drive at /content/drive so files stored in Drive can be read
# from this Colab runtime. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing the dataset

In [2]:
import pandas as pd

# Read the Shakespeare CSV from the mounted Drive folder and preview the first rows.
csv_path = "/content/drive/MyDrive/DATASETS/Shakespeare_data.csv"
data = pd.read_csv(csv_path)
data.head(10)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,"


In [3]:
# Show the (rows, columns) shape, then the count of missing values per column.
print(data.shape)
data.isna().sum()

(111396, 6)


Unnamed: 0,0
Dataline,0
Play,0
PlayerLinenumber,3
ActSceneLine,6243
Player,7
PlayerLine,0


In [4]:
# Number of distinct values in each column.
data.nunique()

Unnamed: 0,0
Dataline,111396
Play,36
PlayerLinenumber,405
ActSceneLine,16122
Player,934
PlayerLine,107580


# Data Preprocessing

In [5]:
# Step 1: keep only the dialogue column and drop any missing lines
text_data = data['PlayerLine'].dropna()

# Step 2: lowercase everything so upper/lower case variants share one vocabulary entry
text_data = text_data.str.lower()

# Step 3: join all lines into one long string, separated by single spaces
text = ' '.join(text_data.tolist())

print(f"Total characters in combined text: {len(text)}")

# Preview only the beginning of the corpus. Echoing the full string (several
# million characters) as the cell output would bloat the saved notebook.
text[:500]

Total characters in combined text: 4366287




# Encoding the data

In [6]:
# Character vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables in both directions (integer index <-> character)
idx2char = dict(enumerate(chars))
char2idx = {ch: i for i, ch in idx2char.items()}

# Encode the whole corpus as a list of integer indices
encoded_text = [char2idx[ch] for ch in text]

print(f"Vocabulary size: {vocab_size}")
print(f"Sample mapping: {chars[:10]}")

Vocabulary size: 50
Sample mapping: ['\t', ' ', '!', '$', "'", '(', ')', ',', '-', '.']


In [7]:
# Sanity check: show the integer codes of the first 20 characters of the corpus.
print(f"First 20 encoded chars: {encoded_text[:20]}")

First 20 encoded chars: [24, 26, 43, 1, 32, 1, 42, 26, 28, 37, 28, 1, 32, 9, 1, 35, 38, 37, 27, 38]


# Create a Dataset class for given data

In [8]:
import torch
from torch.utils.data import Dataset,DataLoader

class ShakespeareDataset(Dataset):
    """Sliding-window dataset for next-character prediction.

    Sample ``idx`` is the pair ``(x, y)`` where ``x`` holds the ``seq_length``
    encoded characters starting at position ``idx`` and ``y`` is the single
    encoded character that follows them.

    Args:
        data: Sequence of encoded characters (integers).
        seq_length: Number of characters in each input window.
    """

    def __init__(self, data, seq_length):
        # Convert once up front. Building a new tensor from a Python list slice
        # on every __getitem__ call is needlessly slow for a large corpus.
        self.data = torch.as_tensor(data, dtype=torch.long)
        self.seq_length = seq_length          # How many characters to feed at once

    def __len__(self):
        # Clamp at zero: a corpus shorter than seq_length has no complete
        # window, and a negative __len__ would make len() raise ValueError.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return ``(x, y)``: a 1-D long tensor window and a 0-D long tensor target."""
        window_end = idx + self.seq_length
        # Clone so the returned tensors do not alias the dataset's storage.
        x = self.data[idx:window_end].clone()
        y = self.data[window_end].clone()
        return x, y


# Define Model

In [9]:
import torch
import torch.nn as nn

class Model(nn.Module):
    """Character-level model: embedding -> stacked LSTM -> linear layer.

    ``forward`` returns logits for the last time step only, with shape
    ``(batch, vocab_size)``, together with the LSTM's final state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        # Stored so callers can size an initial hidden state for this model.
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        # nn.LSTM treats a missing state (None) as an all-zero initial state.
        sequence_out, hidden = self.lstm(embedded, hidden)
        # Only the final time step is projected to vocabulary logits.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step), hidden

# Defining the general parameters of the model

In [10]:
# Hyperparameters shared by the dataset, model and optimizer cells.
vocab_size = len(char2idx)       # number of unique characters
embedding_dim = 128              # size of each character embedding vector
hidden_dim = 256                 # size of the LSTM hidden state
num_layers = 2                   # number of stacked LSTM layers
seq_length = 100                 # characters per input window
batch_size = 64                  # samples per batch
learning_rate = 0.002            # learning rate for the optimizer

# Using the dataset and dataloader

In [11]:
# Wrap the encoded corpus in sliding windows of `seq_length` characters and
# serve them in shuffled batches of `batch_size`.
dataset = ShakespeareDataset(encoded_text, seq_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model

In [12]:
# Initialize the model from the shared hyperparameters rather than repeating
# their literal values, so the network always matches the actual vocabulary
# size and the sizes used elsewhere in the notebook.
model = Model(vocab_size, embedding_dim, hidden_dim, num_layers)
model

Model(
  (embedding): Embedding(50, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=50, bias=True)
)

# Define the loss function and optimizer

In [13]:
# Loss function: cross-entropy between the model's output logits and the
# index of the true next character.
criterion = nn.CrossEntropyLoss()

In [14]:
# Adam optimizer over all model parameters, using the configured learning rate.
# The bare `optimizer` expression echoes its settings as the cell output.
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.002
    maximize: False
    weight_decay: 0
)

# Training the model on the dataset

In [None]:
num_epochs = 20

# Choose the device and move the model once, instead of on every batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(num_epochs):
    # Ensure training mode even if an evaluation cell was run earlier.
    model.train()
    total_loss = 0

    for batch_idx, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()

        # No explicit hidden state is passed: the model's forward() defaults to
        # hidden=None, which the LSTM treats as zeros. Nothing is carried over
        # between batches, so the returned state is discarded. This also avoids
        # overwriting the global `batch_size` hyperparameter inside the loop.
        output, _ = model(x)

        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}], Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    print("-----------------------------------------------------")
    print(f"Epoch {epoch+1} complete. Avg Loss: {avg_loss:.4f}")


Epoch [1/20], Batch [0], Loss: 3.9249


# Evaluation Mode

In [None]:
# Switch the model to evaluation mode; the returned module is echoed as the cell output.
model.eval()

In [None]:
# Average cross-entropy loss over one full pass of `dataloader`.
# NOTE(review): this is the same loader the training cell iterates over, so the
# figure reported here is a training-set loss, not a held-out evaluation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

total_loss = 0
with torch.no_grad():
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)

        output, _ = model(x)
        loss = criterion(output, y)
        total_loss += loss.item()

avg_loss = total_loss / len(dataloader)
print(f"Evaluation Loss: {avg_loss:.4f}")

# Generate prediction

In [None]:
def predict_next_char(model, input_text, char2idx, idx2char, device='cpu'):
    """Predict the single most likely character to follow ``input_text``.

    Args:
        model: Network exposing ``num_layers`` and ``hidden_dim`` attributes and
            returning ``(logits, hidden)`` when called as ``model(x, hidden)``.
        input_text: Non-empty seed string; every character must be a key of
            ``char2idx``.
        char2idx: Mapping from character to integer index.
        idx2char: Mapping from integer index back to character.
        device: Device on which to run the model and tensors.

    Returns:
        The character whose logit is highest (greedy argmax).

    Raises:
        ValueError: If ``input_text`` is empty.
        KeyError: If ``input_text`` contains a character missing from ``char2idx``.
    """
    if not input_text:
        raise ValueError("input_text must contain at least one character")

    model.eval()
    model.to(device)

    # Convert input characters to a (1, seq_len) tensor of indices
    input_indices = [char2idx[c] for c in input_text]
    input_tensor = torch.tensor([input_indices], dtype=torch.long, device=device)

    # Zero initial (h0, c0) state for a batch of one
    hidden = (
        torch.zeros(model.num_layers, 1, model.hidden_dim, device=device),
        torch.zeros(model.num_layers, 1, model.hidden_dim, device=device),
    )

    with torch.no_grad():
        output, hidden = model(input_tensor, hidden)

        # Flatten to (steps, vocab_size) and keep the final row. The model may
        # already return only the last step, shape (1, vocab_size). Indexing
        # [-1] after squeeze(0) on that shape would select one scalar logit,
        # and argmax of a scalar is always 0.
        last_logits = output.reshape(-1, output.size(-1))[-1]

        # Greedy choice: the index with the highest logit
        predicted_idx = torch.argmax(last_logits).item()

    return idx2char[predicted_idx]


In [None]:
# The corpus was lowercased during preprocessing, so the seed is lowercased
# too; otherwise its characters would be missing from char2idx.
seed = "Enter KING HENRY, LORD JOH".lower()
next_char = predict_next_char(
    model,
    seed,
    char2idx,
    idx2char,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
print(f"Input: {seed} → Next char: '{next_char}'")
