In [70]:
import torch
import torch.nn as nn
from typing import Any
import pandas as pd
import tokenizers
from torch.utils.data import DataLoader, Dataset
import math

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [71]:
# HYPERPARAMS
EMBED_DIM = 256
HIDDEN_SIZE = 512
RNN_LAYERS = 2
ATTENTION_HEADS = 4
BATCH_SIZE = 128
DATA_MAX_LEN = 512
EPOCHS = 30
DROPOUT = 0.2
WEIGHT_DECAY = 0.1
TEMPERATURE = 0.7
GRAD_CLIP = 1.0
LEARNING_RATE = 0.0005

In [72]:
class PositionalEncoding(nn.Module):
    """Add positional information to embeddings."""
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class Decoder(nn.Module):
    def __init__(self, embed_dim, hidden_size, vocab_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_size, num_layers=RNN_LAYERS, batch_first=True)
        self.pos_encoding = PositionalEncoding(embed_dim, max_len=DATA_MAX_LEN)
        self.attention = nn.MultiheadAttention(hidden_size, dropout=DROPOUT, num_heads=ATTENTION_HEADS, batch_first=True)
        self.projection = nn.Linear(hidden_size, vocab_size)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(DROPOUT)
    def forward(self, x):
        out = self.embedding(x)
        out = self.pos_encoding(out)
        rnn_out, _ = self.rnn(out)
        out, _ = self.attention(rnn_out, rnn_out, rnn_out)
        out = self.norm1(rnn_out + self.dropout(out))
        out = self.projection(out)
        return out


class EmailDataset(Dataset):
    """Dataset for email sequences."""
    def __init__(self, texts, tokenizer, max_length=DATA_MAX_LEN):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode(text).ids
        if len(encoding) < self.max_length + 1:
            encoding += [0] * (self.max_length + 1 - len(encoding))
        else:
            encoding = encoding[:self.max_length + 1]
        return torch.tensor(encoding[:-1]).long(), torch.tensor(encoding[1:]).long()

In [73]:
def retrieve_data(path,  batch_size=BATCH_SIZE):
    data = pd.read_csv(path).dropna().to_numpy()
    data = ['Subject: ' + x[0] + '\n\n' + x[1] for x in data]

    dataset = EmailDataset(data, tokenizer)

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size, shuffle=True, num_workers=4)
    return (train_loader, test_loader)

In [74]:
def train_model(tokenizer, train_loader, epochs=EPOCHS, learning_rate=LEARNING_RATE):
    model =  Decoder(embed_dim=EMBED_DIM, hidden_size=HIDDEN_SIZE, vocab_size=tokenizer.get_vocab_size())
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('[PAD]'))
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=WEIGHT_DECAY)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(
                outputs.view(-1, tokenizer.get_vocab_size()),
                labels.view(-1)
            )

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")
    return model

In [75]:
def test_model(model, test_loader, tokenizer):
    model.eval() # Set the model to evaluation mode
    model = model.to(device)
    total_loss = 0
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('[PAD]'))

    with torch.no_grad(): # Disable gradient calculations during evaluation
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(
                outputs.view(-1, tokenizer.get_vocab_size()),
                labels.view(-1)
            )
            total_loss += loss.item()

    avg_loss = total_loss / len(test_loader)
    print(f"Test Loss: {avg_loss}")
    return avg_loss

In [76]:
tokenizer = tokenizers.Tokenizer.from_pretrained("bert-base-uncased")
train_loader, test_loader = retrieve_data('gmail_data.csv')

model = train_model(tokenizer, train_loader)
test_model(model, test_loader, tokenizer)

Epoch 0, Loss: 8.352147531509399
Epoch 1, Loss: 6.196687412261963
Epoch 2, Loss: 5.512816286087036
Epoch 3, Loss: 5.082492828369141
Epoch 4, Loss: 4.733634185791016
Epoch 5, Loss: 4.453773069381714
Epoch 6, Loss: 4.181771469116211
Epoch 7, Loss: 4.011816120147705
Epoch 8, Loss: 3.885533857345581
Epoch 9, Loss: 3.7292094945907595
Epoch 10, Loss: 3.527302598953247
Epoch 11, Loss: 3.449075222015381
Epoch 12, Loss: 3.330120301246643
Epoch 13, Loss: 3.1487065076828005
Epoch 14, Loss: 3.0914467096328737
Epoch 15, Loss: 2.9285510063171385
Epoch 16, Loss: 2.772164750099182
Epoch 17, Loss: 2.687672734260559
Epoch 18, Loss: 2.6541017293930054
Epoch 19, Loss: 2.5792191505432127
Epoch 20, Loss: 2.479838514328003
Epoch 21, Loss: 2.4143373727798463
Epoch 22, Loss: 2.3157206535339356
Epoch 23, Loss: 2.2793318748474123
Epoch 24, Loss: 2.250495719909668
Epoch 25, Loss: 2.1116225957870483
Epoch 26, Loss: 2.095490598678589
Epoch 27, Loss: 2.021770405769348
Epoch 28, Loss: 1.9334482550621033
Epoch 29, Los

2.8739431699117026

In [77]:
def generate_text(model, tokenizer, start_text, max_length=50, device='cpu'):
    model.eval()
    model = model.to(device)
    input_ids = tokenizer.encode(start_text).ids
    input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)

    generated_ids = input_ids.copy()

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_tensor)
            next_token_logits = outputs[0, -1, :] / TEMPERATURE
            next_token_probs = torch.softmax(next_token_logits, dim=-1)
            next_token_id = torch.multinomial(next_token_probs, num_samples=1).item()

            generated_ids.append(next_token_id)

            # If it's a special token like [SEP] or [END], stop generation
            if next_token_id == tokenizer.token_to_id('[SEP]') or next_token_id == tokenizer.token_to_id('[END]'):
                break

            input_tensor = torch.tensor(generated_ids).unsqueeze(0).to(device)

    return tokenizer.decode(generated_ids)

In [79]:
torch.save(model.state_dict(), 'model_weights.pt')

In [80]:
model.eval()
scripted_model = torch.jit.script(model)
scripted_model.save('model_scripted.pt')

In [78]:
START_TEXT = "Hello my name is kaustubh"
result = generate_text(model, tokenizer, START_TEXT, device=device)
print(result)

hello my name is kaustubh hello mr. donange, i have attached my resume below. i have all attached my resume below along. i would it be possible to get a scheduled zoom meeting. i ' m super interested in interning at wayfair. i just applied
