In [11]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {torch.cuda.get_device_name(0)}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
torch.cuda.empty_cache()
test_tensor = torch.rand(1000, 1000).cuda()
print("Test tensor on GPU successful")

CUDA available: True
Device: NVIDIA GeForce RTX 3050 Laptop GPU
PyTorch version: 2.0.1+cu118
CUDA version: 11.8
Test tensor on GPU successful


In [12]:
import requests

books = {
    "Pride and Prejudice": "https://www.gutenberg.org/files/1342/1342-0.txt",
    "Moby Dick": "https://www.gutenberg.org/files/2701/2701-0.txt"
}

corpus = ""
for title, url in books.items():
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        corpus += response.text + "\n"
    except Exception as e:
        print(f"Failed to download {title}: {e}")

with open("literature_corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)

print("Data saved to literature_corpus.txt")
print(f"Corpus length: {len(corpus)} characters")

Data saved to literature_corpus.txt
Corpus length: 1984344 characters


In [13]:
from datasets import Dataset

try:
    with open("literature_corpus.txt", "r", encoding="utf-8") as f:
        text = f.read()
except FileNotFoundError:
    print("Error: literature_corpus.txt not found. Run Cell 2 first.")
    exit()

def chunk_text(text, chunk_size=512):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

text_chunks = chunk_text(text)
dataset = Dataset.from_dict({"text": text_chunks})

print(f"Number of chunks: {len(text_chunks)}")
print("Sample chunk:", text_chunks[0][:100])

Number of chunks: 3876
Sample chunk: *** START OF THE PROJECT GUTENBERG EBOOK 1342 ***

                            [Illustration:



   


In [14]:
from transformers import AutoTokenizer

try:
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # Using distilgpt2 tokenizer for simplicity
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Failed to load tokenizer: {e}")
    exit()

def tokenize_function(examples):
    try:
        return tokenizer(
            examples["text"],
            max_length=512,
            padding="max_length",
            truncation=True
        )
    except Exception as e:
        print(f"Error in tokenize_function: {e}")
        raise

if 'dataset' not in globals():
    print("Error: 'dataset' not defined. Run Cell 3 first.")
    exit()

print("Dataset info:", dataset)

try:
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    print("Tokenization completed successfully")
except Exception as e:
    print(f"Tokenization failed: {e}")
    raise

tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

print("Tokenized dataset ready:", tokenized_dataset)
print("Sample input IDs:", tokenized_dataset[0]["input_ids"][:10].tolist())



Tokenizer loaded successfully
Dataset info: Dataset({
    features: ['text'],
    num_rows: 3876
})


Map: 100%|██████████| 3876/3876 [00:01<00:00, 3271.16 examples/s]

Tokenization completed successfully
Tokenized dataset ready: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 3876
})
Sample input IDs: [8162, 33303, 3963, 3336, 21965, 23680, 402, 3843, 1677, 13246]





In [1]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {torch.cuda.get_device_name(0)}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
torch.cuda.empty_cache()
test_tensor = torch.rand(1000, 1000).cuda()
print("Test tensor on GPU successful")

CUDA available: True
Device: NVIDIA GeForce RTX 3050 Laptop GPU
PyTorch version: 2.0.1+cu118
CUDA version: 11.8
Test tensor on GPU successful



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\sahaa\anaconda3\envs\llm_env\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\sahaa\anaconda3\envs\llm_env\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\sahaa\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\sahaa\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 1075, in launch_instance

In [2]:
import requests

books = {
    "Pride and Prejudice": "https://www.gutenberg.org/files/1342/1342-0.txt",
    "Moby Dick": "https://www.gutenberg.org/files/2701/2701-0.txt"
}

corpus = ""
for title, url in books.items():
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        corpus += response.text + "\n"
    except Exception as e:
        print(f"Failed to download {title}: {e}")

with open("literature_corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)

print("Data saved to literature_corpus.txt")
print(f"Corpus length: {len(corpus)} characters")

Data saved to literature_corpus.txt
Corpus length: 1984344 characters


In [3]:
from datasets import Dataset

try:
    with open("literature_corpus.txt", "r", encoding="utf-8") as f:
        text = f.read()
except FileNotFoundError:
    print("Error: literature_corpus.txt not found. Run Cell 2 first.")
    exit()

def chunk_text(text, chunk_size=512):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

text_chunks = chunk_text(text)
dataset = Dataset.from_dict({"text": text_chunks})

print(f"Number of chunks: {len(text_chunks)}")
print("Sample chunk:", text_chunks[0][:100])

  from .autonotebook import tqdm as notebook_tqdm


Number of chunks: 3876
Sample chunk: *** START OF THE PROJECT GUTENBERG EBOOK 1342 ***

                            [Illustration:



   


In [4]:
from transformers import AutoTokenizer

try:
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # Using distilgpt2 tokenizer for simplicity
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Failed to load tokenizer: {e}")
    exit()

def tokenize_function(examples):
    try:
        return tokenizer(
            examples["text"],
            max_length=512,
            padding="max_length",
            truncation=True
        )
    except Exception as e:
        print(f"Error in tokenize_function: {e}")
        raise

if 'dataset' not in globals():
    print("Error: 'dataset' not defined. Run Cell 3 first.")
    exit()

print("Dataset info:", dataset)

try:
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    print("Tokenization completed successfully")
except Exception as e:
    print(f"Tokenization failed: {e}")
    raise

tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

print("Tokenized dataset ready:", tokenized_dataset)
print("Sample input IDs:", tokenized_dataset[0]["input_ids"][:10].tolist())



Tokenizer loaded successfully
Dataset info: Dataset({
    features: ['text'],
    num_rows: 3876
})


Map: 100%|██████████| 3876/3876 [00:01<00:00, 3631.76 examples/s]

Tokenization completed successfully
Tokenized dataset ready: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 3876
})
Sample input IDs: [8162, 33303, 3963, 3336, 21965, 23680, 402, 3843, 1677, 13246]





In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Define LSTM model
class LSTMSummarizer(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out)
        return out

# Verify tokenized_dataset
if 'tokenized_dataset' not in globals():
    print("Error: 'tokenized_dataset' not defined. Run Cell 4 first.")
    exit()

# Initialize model
device = "cuda" if torch.cuda.is_available() else "cpu"
vocab_size = tokenizer.vocab_size
model = LSTMSummarizer(vocab_size).to(device)
print(f"Model initialized on {device}")

# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()
dataloader = DataLoader(tokenized_dataset, batch_size=4, shuffle=True)

# Training loop
try:
    print("Starting training...")
    for epoch in range(1):  # 1 epoch for speed
        for batch in dataloader:
            inputs = batch["input_ids"].to(device)
            targets = inputs[:, 1:].contiguous()  # Shifted input as target
            inputs = inputs[:, :-1]  # Remove last token for input
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        print(f"Epoch {epoch+1} complete, Loss: {loss.item():.4f}")
except Exception as e:
    print(f"Training failed: {e}")
    torch.cuda.empty_cache()
    exit()

# Save the model
torch.save(model.state_dict(), "lstm_summarizer.pth")
print("LSTM training complete and saved to lstm_summarizer.pth")
torch.cuda.empty_cache()

Model initialized on cuda
Starting training...
Epoch 1 complete, Loss: 1.7979
LSTM training complete and saved to lstm_summarizer.pth


In [7]:
import torch
import torch.nn as nn

# Define LSTM model (must match Cell 5)
class LSTMSummarizer(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        out = self.fc(lstm_out)
        return out, hidden

# Load tokenizer and model
try:
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    tokenizer.pad_token = tokenizer.eos_token
    vocab_size = tokenizer.vocab_size
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = LSTMSummarizer(vocab_size).to(device)
    model.load_state_dict(torch.load("lstm_summarizer.pth"))
    model.eval()
    print("Model loaded successfully on", device)
except Exception as e:
    print(f"Failed to load model/tokenizer: {e}")
    exit()

# Test generation
prompt = "Mr. Darcy, a wealthy but aloof gentleman, initially clashes with"
inputs = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

try:
    with torch.no_grad():
        hidden = None
        generated_ids = inputs
        for _ in range(50):  # Generate 50 tokens
            outputs, hidden = model(generated_ids, hidden)
            next_token = torch.argmax(outputs[:, -1, :], dim=-1).unsqueeze(0)
            generated_ids = torch.cat((generated_ids, next_token), dim=1)
        response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("Generated response:", response)
except Exception as e:
    print(f"Generation failed: {e}")

torch.cuda.empty_cache()

Model loaded successfully on cuda
Generated response: Mr. Darcy, a wealthy but aloof gentleman, initially clashes with


















































