In [None]:
!pip install datasets
!pip install transformers
!pip install torch
!pip install nltk
!pip install rouge_score
!pip install evaluate
!pip install sentencepiece
!pip install torch transformers datasets evaluate nltk pandas numpy matplotlib seaborn plotly scikit-learn tqdm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import nltk

# Fetch the NLTK corpora/tokenizer models used later in the notebook.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Kept as the final expression so the cell still displays the download status.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    AdamW
)
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import re
import gc  # For garbage collection
from tqdm import tqdm

class DataProcessor:
    """Load, clean, summarise and plot the Quora question-answer dataset.

    Attributes:
        max_samples: Upper bound on the number of rows kept by ``load_data``.
            ``None`` disables sub-sampling and keeps every row.
        lemmatizer: ``WordNetLemmatizer`` used by ``preprocess_text``.
        stop_words: Set of English stop words removed by ``preprocess_text``.
    """

    # NLTK resources this class relies on. 'punkt_tab' is included because the
    # notebook's setup cell downloads it for word_tokenize; listing it here
    # keeps the class usable without depending on that cell having run.
    _NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')

    def __init__(self, max_samples=10000):  # Limit samples to prevent memory issues
        """Store the sample limit and prepare the NLTK helpers.

        Args:
            max_samples: Maximum number of question-answer pairs to keep, or
                ``None`` to keep the full dataset.
        """
        self.max_samples = max_samples

        # Initialize NLTK components
        for resource in self._NLTK_RESOURCES:
            nltk.download(resource, quiet=True)

        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def load_data(self):
        """Load a subset of the Quora dataset.

        Returns:
            A DataFrame built from the dataset's ``train`` split, randomly
            down-sampled (fixed seed 42) to ``max_samples`` rows when a limit
            is set and the split is larger than it.
        """
        print("Loading dataset...")
        dataset = load_dataset("toughdata/quora-question-answer-dataset")
        df = pd.DataFrame(dataset['train'])

        # Take a subset of the data; a limit of None means "keep everything".
        if self.max_samples is not None and len(df) > self.max_samples:
            df = df.sample(n=self.max_samples, random_state=42)

        print(f"Loaded {len(df)} question-answer pairs")
        return df

    def preprocess_text(self, text):
        """Lower-case, strip punctuation, drop stop words and lemmatize.

        Args:
            text: Raw text. Anything that is not a ``str`` yields ``""``.

        Returns:
            The remaining lemmatized tokens joined by single spaces.
        """
        if not isinstance(text, str):
            return ""

        # Basic cleaning: keep only word characters and whitespace.
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        # Simple tokenization and lemmatization
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens
                  if token not in self.stop_words]

        return ' '.join(tokens)

    def analyze_data(self, df):
        """Compute simple size statistics.

        Args:
            df: DataFrame with string columns ``question`` and ``answer``.

        Returns:
            A dict with ``total_pairs`` (row count) and the mean character
            length of each column (``avg_question_length``,
            ``avg_answer_length``).
        """
        return {
            'total_pairs': len(df),
            'avg_question_length': df['question'].str.len().mean(),
            'avg_answer_length': df['answer'].str.len().mean()
        }

    def create_visualizations(self, df):
        """Save a histogram of question character lengths to ``question_dist.png``.

        Args:
            df: DataFrame with a string column ``question``.
        """
        # Explicit figure/axes so the plot never touches another active figure.
        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            sns.histplot(data=df['question'].str.len(), bins=30, ax=ax)
            ax.set_title('Question Length Distribution')
            ax.set_xlabel('Length')
            fig.savefig('question_dist.png')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

class QADataset(Dataset):
    """Torch dataset that tokenizes question/answer pairs for seq2seq training.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the
    question and ``labels`` for the answer, every tensor having length
    ``max_length``.
    """

    # Label value that the loss skips; used to blank out padding positions so
    # the model is not trained to predict pad tokens.
    IGNORE_INDEX = -100

    def __init__(self, questions, answers, tokenizer, max_length=128):  # Reduced max_length
        """
        Args:
            questions: Indexable sequence of source texts.
            answers: Indexable sequence of target texts, aligned with
                ``questions``.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors when called with
                ``return_tensors='pt'``.
            max_length: Length every sequence is truncated/padded to.
        """
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.questions)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed-length, batch-of-one tensor encoding."""
        return self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` as a dict of 1-D tensors."""
        source = self._encode(str(self.questions[idx]))
        target = self._encode(str(self.answers[idx]))

        # squeeze(0) removes only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also drop the sequence axis if max_length == 1.
        labels = target['input_ids'].squeeze(0)

        # Replace padding in the labels so it does not contribute to the loss.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels
        }

class QASystem:
    """Fine-tune a T5 checkpoint on question/answer pairs and generate answers.

    Attributes:
        device: CUDA device when available, otherwise CPU.
        tokenizer: Tokenizer loaded for ``model_name``.
        model: ``T5ForConditionalGeneration`` moved to ``device``.
    """

    def __init__(self, model_name='t5-small'):  # Using smaller model
        """Load the tokenizer and model for ``model_name`` onto the chosen device."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        # Initialize tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)

    def _make_loader(self, df, batch_size, shuffle):
        """Wrap the ``question``/``answer`` columns of ``df`` in a DataLoader."""
        dataset = QADataset(
            df['question'].tolist(),
            df['answer'].tolist(),
            self.tokenizer
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    def _batch_loss(self, batch):
        """Run one forward pass on ``batch`` and return the loss tensor."""
        outputs = self.model(
            input_ids=batch['input_ids'].to(self.device),
            attention_mask=batch['attention_mask'].to(self.device),
            labels=batch['labels'].to(self.device)
        )
        return outputs.loss

    def _evaluate(self, loader):
        """Return the mean loss over ``loader`` without updating the weights."""
        self.model.eval()
        running = 0.0
        with torch.no_grad():
            for batch in loader:
                running += self._batch_loss(batch).item()
        return running / len(loader)

    def train(self, train_df, val_df=None, epochs=3, batch_size=8):
        """Fine-tune the model.

        Args:
            train_df: DataFrame with ``question`` and ``answer`` columns.
            val_df: Optional DataFrame with the same columns. When given and
                non-empty, its mean loss is printed after every epoch.
            epochs: Number of passes over ``train_df``.
            batch_size: Number of examples per optimisation step.

        Raises:
            ValueError: If ``train_df`` has no rows.
        """
        if len(train_df) == 0:
            raise ValueError("train_df is empty; nothing to train on")

        train_loader = self._make_loader(train_df, batch_size, shuffle=True)
        val_loader = None
        if val_df is not None and len(val_df) > 0:
            val_loader = self._make_loader(val_df, batch_size, shuffle=False)

        # torch's AdamW replaces the deprecated transformers.AdamW. eps and
        # weight_decay are set explicitly to the values the transformers
        # implementation used by default, so the update rule is unchanged.
        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
        )

        for epoch in range(epochs):
            # Re-enter training mode each epoch (validation switches to eval).
            self.model.train()
            total_loss = 0.0

            # Using tqdm for progress tracking
            progress = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')
            for batch_idx, batch in enumerate(progress):
                # Clear any stored gradients
                optimizer.zero_grad()

                loss = self._batch_loss(batch)
                loss.backward()
                optimizer.step()

                # Read the scalar once; .item() forces a device sync.
                batch_loss = loss.item()
                total_loss += batch_loss

                # Print progress every 50 batches
                if (batch_idx + 1) % 50 == 0:
                    print(f'Batch {batch_idx+1}, Loss: {batch_loss:.4f}')

            avg_loss = total_loss / len(train_loader)
            print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

            if val_loader is not None:
                val_loss = self._evaluate(val_loader)
                print(f'Epoch {epoch+1}, Validation Loss: {val_loss:.4f}')

            # Release memory once per epoch; doing it after every batch only
            # slows training down.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def predict(self, question, max_length=128, num_beams=2):
        """Generate an answer for ``question``.

        Args:
            question: Input text; truncated to 128 tokens.
            max_length: Maximum length of the generated sequence.
            num_beams: Beam width used for generation.

        Returns:
            The decoded answer string with special tokens removed.
        """
        self.model.eval()

        # Tokenize input
        inputs = self.tokenizer(
            question,
            return_tensors="pt",
            max_length=128,
            truncation=True
        ).to(self.device)

        # Generate answer
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )

        # Decode and return answer
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Clean up
        del outputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return answer

def main():
    """Run the full pipeline: load, analyse, plot, split, train and sample answers.

    After training, answers are printed for two fixed questions and for a few
    questions from the held-out split, the latter next to their reference
    answers.
    """
    # Seed PyTorch so DataLoader shuffling is repeatable between runs.
    torch.manual_seed(42)

    # Initialize processor with limited samples
    processor = DataProcessor(max_samples=5000)  # Start with small dataset

    # Load and process data
    print("Loading data...")
    df = processor.load_data()

    # Basic analysis
    print("\nAnalyzing data...")
    analysis = processor.analyze_data(df)
    print("Data Analysis Results:", analysis)

    # Create visualizations
    print("\nCreating visualizations...")
    processor.create_visualizations(df)

    # Split data
    print("\nSplitting data...")
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Initialize QA system
    print("\nInitializing QA system...")
    qa_system = QASystem()

    # Train model
    print("\nStarting training...")
    qa_system.train(train_df, epochs=2)  # Reduced epochs

    # Test the model
    print("\nTesting the model...")
    test_questions = [
        "What is machine learning?",
        "How does Python work?"
    ]

    for question in test_questions:
        answer = qa_system.predict(question)
        print(f"\nQ: {question}")
        print(f"A: {answer}")

    # Spot-check a few held-out pairs so the test split is actually used.
    # Reference answers are truncated to keep the output readable.
    held_out = test_df.head(3)
    for question, reference in zip(held_out['question'], held_out['answer']):
        answer = qa_system.predict(str(question))
        print(f"\nQ: {question}")
        print(f"A: {answer}")
        print(f"Reference: {str(reference)[:200]}")

if __name__ == "__main__":
    main()

Loading data...
Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/485 [00:00<?, ?B/s]

Quora-QuAD.jsonl:   0%|          | 0.00/60.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56402 [00:00<?, ? examples/s]

Loaded 5000 question-answer pairs

Analyzing data...
Data Analysis Results: {'total_pairs': 5000, 'avg_question_length': 77.2656, 'avg_answer_length': 1001.335}

Creating visualizations...

Splitting data...

Initializing QA system...
Using device: cpu


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]




Starting training...


Epoch 1/2:   0%|          | 0/500 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1/2:  10%|█         | 50/500 [07:44<1:05:28,  8.73s/it]

Batch 50, Loss: 4.0347


Epoch 1/2:  20%|██        | 100/500 [15:22<1:01:34,  9.24s/it]

Batch 100, Loss: 3.5068


Epoch 1/2:  30%|███       | 150/500 [22:54<52:47,  9.05s/it]

Batch 150, Loss: 2.9133


Epoch 1/2:  40%|████      | 200/500 [30:30<44:57,  8.99s/it]

Batch 200, Loss: 4.2608


Epoch 1/2:  50%|█████     | 250/500 [38:00<38:25,  9.22s/it]

Batch 250, Loss: 3.4549


Epoch 1/2:  60%|██████    | 300/500 [45:23<29:27,  8.84s/it]

Batch 300, Loss: 3.2071


Epoch 1/2:  70%|███████   | 350/500 [52:45<22:08,  8.86s/it]

Batch 350, Loss: 3.1182


Epoch 1/2:  80%|████████  | 400/500 [1:00:05<14:17,  8.57s/it]

Batch 400, Loss: 3.9071


Epoch 1/2:  90%|█████████ | 450/500 [1:07:29<07:32,  9.05s/it]

Batch 450, Loss: 3.6657


Epoch 1/2: 100%|██████████| 500/500 [1:14:51<00:00,  8.98s/it]

Batch 500, Loss: 4.5257
Epoch 1, Average Loss: 3.6237



Epoch 2/2:  10%|█         | 50/500 [07:22<1:06:20,  8.85s/it]

Batch 50, Loss: 2.0826


Epoch 2/2:  20%|██        | 100/500 [14:45<58:13,  8.73s/it]

Batch 100, Loss: 1.4639


Epoch 2/2:  30%|███       | 150/500 [22:09<50:36,  8.68s/it]

Batch 150, Loss: 2.0166


Epoch 2/2:  40%|████      | 200/500 [29:34<44:15,  8.85s/it]

Batch 200, Loss: 2.4596


Epoch 2/2:  50%|█████     | 250/500 [36:59<37:13,  8.94s/it]

Batch 250, Loss: 2.9477


Epoch 2/2:  60%|██████    | 300/500 [44:22<29:44,  8.92s/it]

Batch 300, Loss: 2.7728


Epoch 2/2:  70%|███████   | 350/500 [51:46<22:00,  8.81s/it]

Batch 350, Loss: 2.8183


Epoch 2/2:  73%|███████▎  | 363/500 [53:40<19:47,  8.67s/it]