In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def load_data():
    dataset = '/Users/sahil/My Data/Assignment'
    corpus = []
    try:
        for filename in os.listdir(dataset):
            if filename.endswith('.txt'):
                with open(os.path.join(dataset, filename), 'r', encoding='utf-8') as file:
                    corpus.append(file.read())
        print(f"Loaded {len(corpus)} documents.")
    except FileNotFoundError:
        print("The specified directory was not found. Please check the path and try again.")
    return corpus

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation.replace(' ', '')))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words or token == ' ']
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) if token != ' ' else token for token in tokens]
    return tokens

class TextDataset(Dataset):
    def __init__(self, corpus, sequence_length):
        self.corpus = corpus
        self.sequence_length = sequence_length
        self.vocab = sorted(set(token for doc in corpus for token in doc))
        self.token2idx = {token: idx for idx, token in enumerate(self.vocab)}
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
        self.data = self.prepare_data()

    def prepare_data(self):
        data = []
        for doc in self.corpus:
            indices = [self.token2idx[token] for token in doc]
            for i in range(0, len(indices) - self.sequence_length, self.sequence_length):
                chunk = indices[i:i + self.sequence_length]
                target = indices[i + 1:i + self.sequence_length + 1]
                if len(chunk) == self.sequence_length and len(target) == self.sequence_length:
                    data.append((chunk, target))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0]), torch.tensor(self.data[idx][1])

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.d_model = d_model
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src):
        src = self.embedding(src) * np.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        return self.fc_out(output)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, tgt in tqdm(train_loader, desc="Training"):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output.view(-1, output.size(-1)), tgt.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc="Evaluating"):
            src, tgt = src.to(device), tgt.to(device)
            output = model(src)
            loss = criterion(output.view(-1, output.size(-1)), tgt.view(-1))
            total_loss += loss.item()
    return total_loss / len(val_loader)

import torch

def nucleus_sample(predictions, p=0.9):
    sorted_probs, sorted_indices = torch.sort(predictions, descending=True)
    cumulative_probs = torch.cumsum(torch.softmax(sorted_probs, dim=-1), dim=-1)
    sorted_indices_to_remove = cumulative_probs > p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    indices_to_remove = torch.zeros_like(predictions, dtype=torch.bool).scatter_(
        dim=-1, index=sorted_indices, src=sorted_indices_to_remove
    )
    predictions[indices_to_remove] = float('-inf')
    
    return torch.multinomial(torch.softmax(predictions, dim=-1), num_samples=1).item()

def generate_text(model, dataset, start_tokens, num_generate=50, temperature=1.0, top_p=0.9, device='cpu'):
    model.eval()
    input_eval = torch.tensor([[dataset.token2idx[token] for token in start_tokens]], dtype=torch.long).to(device)
    generated_tokens = start_tokens.copy()

    with torch.no_grad():
        for _ in range(num_generate):
            output = model(input_eval)
            predictions = output[0, -1, :] / temperature
            predicted_id = nucleus_sample(predictions, p=top_p)

            predicted_token = dataset.idx2token[predicted_id]
            generated_tokens.append(predicted_token)

            input_eval = torch.cat([input_eval[:, 1:], torch.tensor([[predicted_id]], device=device)], dim=1)

    return ' '.join(generated_tokens)
    
def generate_from_prompt(model, dataset, device):
    while True:
        prompt = input("Enter a prompt (or 'quit' to exit): ")
        if prompt.lower() == 'quit':
            break
        
        start_tokens = preprocess_text(prompt)[:dataset.sequence_length]
        temperature = float(input("Enter temperature (0.1-1.0, higher for more randomness): "))
        num_tokens = int(input("Enter number of tokens to generate: "))
        
        generated_text = generate_text(model, dataset, start_tokens, num_generate=num_tokens, temperature=temperature, device=device)
        print("\nGenerated text:\n", generated_text)
        print("\n" + "="*50 + "\n")

if __name__ == "__main__":
    # Load and preprocess data
    corpus = load_data()
    processed_corpus = [preprocess_text(text) for text in corpus]
    
    # Create dataset
    sequence_length = 50  # You can adjust this value
    dataset = TextDataset(processed_corpus, sequence_length)
    
    # Split dataset
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    
    # Create data loaders
    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    # Initialize model
    vocab_size = len(dataset.token2idx)
    d_model = 512
    nhead = 8
    num_encoder_layers = 6
    dim_feedforward = 2048
    dropout = 0.1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, dropout)
    model.to(device)
    
    # Training loop
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 10
    
    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    print("Training completed!")
    
    # Text generation
    print("\nEntering text generation mode...")
    generate_from_prompt(model, dataset, device)

[nltk_data] Downloading package punkt to /Users/sahil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sahil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sahil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loaded 1 documents.


Training: 100%|███████████████████████████████████| 2/2 [00:01<00:00,  1.80it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 21.55it/s]


Epoch 1/10, Train Loss: 6.7452, Val Loss: 6.5485


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.24it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 18.02it/s]


Epoch 2/10, Train Loss: 6.3171, Val Loss: 6.4051


Training: 100%|███████████████████████████████████| 2/2 [00:01<00:00,  1.93it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 22.24it/s]


Epoch 3/10, Train Loss: 6.0324, Val Loss: 6.4844


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.28it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 20.31it/s]


Epoch 4/10, Train Loss: 5.9560, Val Loss: 6.6235


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.50it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 22.22it/s]


Epoch 5/10, Train Loss: 6.0139, Val Loss: 6.7400


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.32it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 22.16it/s]


Epoch 6/10, Train Loss: 5.9990, Val Loss: 6.7987


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.58it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 22.82it/s]


Epoch 7/10, Train Loss: 5.9289, Val Loss: 6.8539


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.50it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 24.17it/s]


Epoch 8/10, Train Loss: 5.9455, Val Loss: 6.9255


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.50it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 21.13it/s]


Epoch 9/10, Train Loss: 5.9754, Val Loss: 6.9821


Training: 100%|███████████████████████████████████| 2/2 [00:00<00:00,  2.52it/s]
Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 21.53it/s]


Epoch 10/10, Train Loss: 5.9406, Val Loss: 7.0219
Training completed!

Entering text generation mode...


Enter a prompt (or 'quit' to exit):  this is legal
Enter temperature (0.1-1.0, higher for more randomness):  100
Enter number of tokens to generate:  100



Generated text:
 legal feature generate body containing similar concern mode changed provided limiting foundation expected one article generally licensors perpetuity httpswwwgnuorglicenses hypothetical object acknowledges licensed general actual regenerate safest area licensing generate offer unpacking line alleging menu particular cure remove exclusively local following notice enables information subdividing authorization entered prohibit occurs used programming likewise provisionally product personal implementation public prove propagation subject implement simultaneously prohibit copyrightlike unless transaction imposed imposed us single kind server correction original counterclaim medium regenerate appropriate operation copyright year visible differ accessible inability owned manner intimate assures accord must generalpurpose script added reviewing occurring feature saying conditioned termination correction




Enter a prompt (or 'quit' to exit):  this is legal 
Enter temperature (0.1-1.0, higher for more randomness):  1
Enter number of tokens to generate:  100



Generated text:
 legal arrangement cease work section work conveying revised work advised term work permanently version patent patent normal apply copy may copyright code copy explicitly circumstance third license explicitly code server copyright continue may copy c license part technological give party object price propagate agreement transaction covered apply work component relying order license material nothing licensing sale modified work used includes available must additional including copy given component liability apply violation permitted foundation purpose server permanently entity work warranty commitment convey convey license appropriate sale entire use freedom available working network shall added used reinstated propagation license component source patent gnu copyright




Enter a prompt (or 'quit' to exit):  this is legal
Enter temperature (0.1-1.0, higher for more randomness):  0.1
Enter number of tokens to generate:  100



Generated text:
 legal license license license work work license license work license work work work work work work license work work license work work work work license work work work license work license work work work license work work license work work work license work license work work work work work work work work work work work work work work work work license license work work work license license work work work work license work work work work work work work work work work license work work work work work license license work work work work license work work work work work license




Enter a prompt (or 'quit' to exit):  this is legal
Enter temperature (0.1-1.0, higher for more randomness):  1000
Enter number of tokens to generate:  100



Generated text:
 legal obligated 3 criterion sale acquired offered remain offer sold reviewing programto inclusion prove exclusion distribute knowledge agreement occurring away limit indemnification publicly operated arrange away effect although enable 0 impose copyrightlike could sign relicensing paper incompatible modifying given copyrightlike item price display notify damage long published obligate incorporating notwithstanding support run relevant unpacking acceptance thus inability show portion legal version script system foundation affirmed earlier plus exclusively adopted disclaimer family prominent electronic practice importing semiconductor pertinent consequence obligated procuring corresponding asset writing widely might ready add next downstream crossclaim dwelling author useful enable control relicensing systematic approximates applied render merchantability




Enter a prompt (or 'quit' to exit):  this is legal
Enter temperature (0.1-1.0, higher for more randomness):  10000
Enter number of tokens to generate:  100



Generated text:
 legal liable rendered terminates develop advised way obligation implied consequence 11 constantly threatened modified right interpretation show responsible return saying neither operating liable paper affirmed ability enforcing copying programto interpretation using feature copyrighted determining worldwide invalidate installed protect propagating stated place choose keep display may distributed anything obligated performing manner incorporation covered published copy lesser interface expects prohibits satisfy first actual library conveying perpetuity nonexercise andor programming december abuse charge inability others transferred mere force invalidate full infringed medium danger qualify welcome development keep could part litigation link prohibits fee copying threatened covenant downstream local class greatest like consumer program particular




# Quantitative Evaluation

In [17]:
from torch.utils.data import Dataset, DataLoader

class YourTestDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Return input and target for the idx-th item
        return self.data[idx]['input'], self.data[idx]['target']

# Prepare your test data
test_data = [
    {'input': torch.tensor([1, 2, 3]), 'target': torch.tensor([2, 3, 4])},
    # Add more test samples here
]

# Create the test dataset
test_dataset = YourTestDataset(test_data)

In [19]:
# Create the test_loader
batch_size = 32  # Adjust as needed
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [23]:
import torch
import math

def calculate_perplexity(model, test_loader, device):
    model.eval()
    total_loss = 0
    total_tokens = 0
    
    with torch.no_grad():
        for batch in test_loader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)
            
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(outputs.view(-1, outputs.size(-1)), targets.view(-1), reduction='sum')
            
            total_loss += loss.item()
            total_tokens += targets.numel()
    
    perplexity = math.exp(total_loss / total_tokens)
    return perplexity

    model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, dropout)
    model.to(device)

# Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
perplexity = calculate_perplexity(model, test_loader, device)
print(f"Perplexity on test set: {perplexity:.2f}")

Perplexity on test set: 4153.37


# BLEU Score Calculation
For tasks involving text generation, you can use the BLEU score:

In [26]:
from nltk.translate.bleu_score import corpus_bleu

def calculate_bleu(references, hypotheses):
    return corpus_bleu([[ref.split()] for ref in references], [hyp.split() for hyp in hypotheses])

# Usage
references = ["this is a test", "another test sentence"]
hypotheses = ["this is test", "another sentence for testing"]
bleu_score = calculate_bleu(references, hypotheses)
print(f"BLEU score: {bleu_score:.4f}")

BLEU score: 0.0000


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# ROUGE Score Calculation
For summarization tasks, ROUGE score is commonly used:

In [37]:
from rouge_score import rouge_scorer

def calculate_rouge(references, hypotheses):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {}
    for ref, hyp in zip(references, hypotheses):
        score = scorer.score(ref, hyp)
        for key, value in score.items():
            if key not in scores:
                scores[key] = {'precision': [], 'recall': [], 'fmeasure': []}
            scores[key]['precision'].append(value.precision)
            scores[key]['recall'].append(value.recall)
            scores[key]['fmeasure'].append(value.fmeasure)
    
    # Calculate average scores
    avg_scores = {}
    for key, value in scores.items():
        avg_scores[key] = {
            'precision': sum(value['precision']) / len(value['precision']),
            'recall': sum(value['recall']) / len(value['recall']),
            'fmeasure': sum(value['fmeasure']) / len(value['fmeasure'])
        }
    return avg_scores

# Usage
references = ["This is the reference summary.", "Another reference summary."]
hypotheses = ["This is the generated summary.", "Another generated summary."]
rouge_scores = calculate_rouge(references, hypotheses)
print("ROUGE scores:", rouge_scores)

ROUGE scores: {'rouge1': {'precision': 0.7333333333333334, 'recall': 0.7333333333333334, 'fmeasure': 0.7333333333333334}, 'rouge2': {'precision': 0.25, 'recall': 0.25, 'fmeasure': 0.25}, 'rougeL': {'precision': 0.7333333333333334, 'recall': 0.7333333333333334, 'fmeasure': 0.7333333333333334}}
