In [1]:
import os
import torch
import json
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
import wandb
import time
from datetime import timedelta

2025-05-04 15:27:46.885591: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746372467.150378      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746372467.229406      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
!pip install re
import re

[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for re[0m[31m
[0m

In [2]:
# Set environment variable to avoid tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")

Using device: cuda
GPU count: 2
GPU 0: Tesla T4
Memory: 15.83 GB
GPU 1: Tesla T4
Memory: 15.83 GB


In [4]:
# Constants
MAX_LENGTH = 512  # Reduced from 1024 to fit more examples in memory
BATCH_SIZE = 4    # Reduced batch size to prevent OOM errors
GRADIENT_ACCUMULATION_STEPS = 8  # Increase effective batch size
EPOCHS = 3
LEARNING_RATE = 2e-5
WARMUP_STEPS = 100
SAVE_STEPS = 500
EVAL_STEPS = 500
BASE_MODEL = "distilgpt2"  # Smaller model to fit in memory
OUTPUT_DIR = "./instruction_model_alpaca"
GITHUB_REPO = "siddhamapple/instruction_tuned_model"

In [5]:
# Downloading and preparing the Alpaca dataset
def download_alpaca_dataset():
    """Download the full Alpaca dataset with 52K examples"""
    print("Downloading Alpaca dataset...")
    
    # Option 1: Direct download
    !wget -q https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json
    
    # Option 2: If direct download fails, clone the repo
    if not os.path.exists("alpaca_data.json"):
        !git clone https://github.com/tatsu-lab/stanford_alpaca.git
        !cp stanford_alpaca/alpaca_data.json ./
    
    # Load the dataset
    with open("alpaca_data.json", "r") as f:
        data = json.load(f)
    
    print(f"Loaded {len(data)} examples from Alpaca dataset")
    return data

In [6]:
# Format instruction data using a better prompt template
def format_instruction(example):
    """Format instruction data with a clearer prompt template"""
    instruction = example["instruction"]
    input_text = example.get("input", "")
    output = example["output"]
    
    # Better prompt template with clear section markers
    if input_text:
        formatted_text = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}"""
    else:
        formatted_text = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
{output}"""
    
    return formatted_text

In [7]:
# Custom dataset for instruction tuning
class InstructionDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples[idx]
        formatted_text = format_instruction(example)
        
        # Tokenize with padding
        encoding = self.tokenizer(
            formatted_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        
        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()
        
        # Create labels (same as input_ids for causal language modeling)
        labels = input_ids.clone()
        
        # Replace padding tokens in labels with -100 so they're ignored in loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [8]:
# Custom collator function to handle batching
def custom_collate_fn(batch, pad_token_id=50256, ignore_index=-100):
    """Custom collate function for batching instruction examples"""
    batch_max_length = max(len(item["input_ids"]) for item in batch)
    
    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    
    for item in batch:
        input_ids = item["input_ids"]
        attention_mask = item["attention_mask"]
        labels = item["labels"]
        
        # Pad sequences to the maximum length in this batch
        padding_length = batch_max_length - len(input_ids)
        
        if padding_length > 0:
            # Pad input_ids and attention_mask
            input_ids = torch.cat([input_ids, torch.ones(padding_length, dtype=torch.long) * pad_token_id])
            attention_mask = torch.cat([attention_mask, torch.zeros(padding_length, dtype=torch.long)])
            
            # Pad labels with ignore_index
            labels = torch.cat([labels, torch.ones(padding_length, dtype=torch.long) * ignore_index])
        
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)
    
    return {
        "input_ids": torch.stack(input_ids_list),
        "attention_mask": torch.stack(attention_mask_list),
        "labels": torch.stack(labels_list)
    }

In [9]:
# Main training function
def train_on_alpaca():
    """Train the instruction model on the full Alpaca dataset"""
    start_time = time.time()
    
    # Download and prepare dataset
    data = download_alpaca_dataset()
    
    # Initialize tokenizer and model
    print(f"Loading base model: {BASE_MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
    
    # Ensure the tokenizer has a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    
    # Split data into train and validation sets
    train_data, val_data = train_test_split(data, test_size=0.05, random_state=42)
    print(f"Training on {len(train_data)} examples, validating on {len(val_data)} examples")
    
    # Create datasets
    train_dataset = InstructionDataset(train_data, tokenizer, max_length=MAX_LENGTH)
    val_dataset = InstructionDataset(val_data, tokenizer, max_length=MAX_LENGTH)
    
    # Create data loaders with custom collation
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: custom_collate_fn(batch, pad_token_id=tokenizer.pad_token_id)
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=lambda batch: custom_collate_fn(batch, pad_token_id=tokenizer.pad_token_id)
    )
    
    # Set up training arguments
    training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=SAVE_STEPS,
    
    save_total_limit=2,
    
    fp16=True,  
    report_to="none",
   
)
    
    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=lambda batch: custom_collate_fn(batch, pad_token_id=tokenizer.pad_token_id)
    )
    
    # Train the model
    print("Starting training...")
    trainer.train()
    
    # Save the final model
    print("Saving final model...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    
    # Calculate training time
    training_time = time.time() - start_time
    print(f"Training completed in {timedelta(seconds=int(training_time))}")
    
    return model, tokenizer

In [10]:
# Push model to GitHub
def push_to_github():
    """Push the trained model to GitHub"""
    print("Pushing model to GitHub...")
    
    # Configure Git
    !git config --global user.email "siddhamjainn@gmail.com"
    !git config --global user.name "siddhamapple"
    
    # Clone repository if it doesn't exist
    if not os.path.exists(GITHUB_REPO.split('/')[-1]):
        !git clone https://github.com/{GITHUB_REPO}.git
    
    # Copy model files to the repository
    repo_dir = GITHUB_REPO.split('/')[-1]
    !mkdir -p {repo_dir}/models_alpaca
    !cp -r {OUTPUT_DIR}/* {repo_dir}/models_alpaca/
    
    # Add, commit, and push
    !cd {repo_dir} && git add .
    !cd {repo_dir} && git commit -m "Add Alpaca-trained instruction model"
    
    # Use token-based authentication for pushing
    # Note: Replace YOUR_GITHUB_TOKEN with your actual token in a real environment
    !cd {repo_dir} && git push https://siddhamapple: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@github.com/{GITHUB_REPO}.git main
    
    # For security, it's better to use environment variables for tokens
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        !cd {repo_dir} && git push https://siddhamapple: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@github.com/{GITHUB_REPO}.git main
    else:
        print("GitHub token not found. Please push manually or set the GITHUB_TOKEN environment variable.")
    
    print("Model pushed to GitHub!")

In [12]:
# Enhanced evaluation function with TF-IDF fallback
def evaluate_model_with_fallback(model, tokenizer, test_examples):
    """Evaluate the model with enhanced post-processing"""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from nltk.corpus import stopwords
    
    # Download stopwords if needed
    try:
        stopwords.words('english')
    except:
        nltk.download('stopwords')
    # TF-IDF fallback for keyword extraction
    def extract_keywords_tfidf(text, num_keywords=5):
        """Extract keywords using TF-IDF when model output is unreliable"""
        # Clean text
        text = re.sub(r'[^\w\s]', '', text.lower())
        
        # Initialize TF-IDF vectorizer
        stop_words = set(stopwords.words('english'))
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=100,
            ngram_range=(1, 2)  # Consider both unigrams and bigrams
        )
        
        # Fit and transform the text
        try:
            tfidf_matrix = vectorizer.fit_transform([text])
            feature_names = vectorizer.get_feature_names_out()
            
            # Get top keywords based on TF-IDF scores
            tfidf_scores = zip(feature_names, tfidf_matrix.toarray()[0])
            sorted_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
            
            # Return top N keywords
            return [keyword for keyword, score in sorted_keywords[:num_keywords]]
        except:
            # Fallback to simple frequency-based extraction if TF-IDF fails
            words = text.split()
            word_freq = {}
            for word in words:
                if word not in stop_words and len(word) > 3:
                    word_freq[word] = word_freq.get(word, 0) + 1
            
            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            return [word for word, _ in sorted_words[:num_keywords]]
    
    # Check if model output is valid
    def is_valid_keyword_response(response, expected_count):
        """Check if the response looks like a proper keyword list"""
        # Look for patterns like "Keywords: x, y, z" or a list of words
        if "keywords:" in response.lower():
            keyword_text = response.lower().split("keywords:")[1].strip()
            keywords = [k.strip() for k in re.split(r',|\n', keyword_text) if k.strip()]
            
            # Check if we have a reasonable number of keywords
            if len(keywords) >= max(1, expected_count * 0.5):
                return True
        
        # Check for repetitive patterns (a sign of model hallucination)
        words = response.split()
        if len(words) > 10:
            # Check for excessive repetition
            word_set = set(words)
            if len(word_set) < len(words) * 0.4:  # High repetition
                return False
        
        return False
    
    # Process instruction with enhanced post-processing
    def process_instruction_enhanced(text, instruction, num_items=5):
        """Process instruction with fallback mechanisms"""
        # Generate response
        prompt = f"""
Below is a blog post. {instruction}

Blog: {text}

Response:
"""
        
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Generate with parameters to avoid repetition
        output = model.generate(
            **inputs,
            max_length=len(inputs["input_ids"][0]) + 512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.5,  # Increased repetition penalty
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )
        
        response = tokenizer.decode(output[0], skip_special_tokens=True)
        response_text = response.split("Response:")[-1].strip()
        
        # For keyword extraction tasks, validate and potentially use the fallback
        if "keywords" in instruction.lower():
            if not is_valid_keyword_response(response_text, num_items):
                # Use TF-IDF fallback
                keywords = extract_keywords_tfidf(text, num_items)
                return "Keywords: " + ", ".join(keywords)
        
        # Post-process to remove repetitions
        response_text = re.sub(r'(.{30,}?)\1+', r'\1', response_text)
        
        return response_text
    
    # Evaluate on test examples
    results = []
    for example in test_examples[:10]:  # Evaluate on a subset for demonstration
        instruction = example["instruction"]
        input_text = example.get("input", "")
        expected_output = example["output"]
        
        # Extract number from instruction if present
        num_items = 3  # Default
        match = re.search(r'(\d+)', instruction)
        if match:
            num_items = int(match.group(1))
        
        # Process with enhanced handling
        if input_text:
            full_text = f"{instruction}\n{input_text}"
        else:
            full_text = instruction
            
        model_output = process_instruction_enhanced(full_text, instruction, num_items)
        
        results.append({
            "instruction": instruction,
            "input": input_text,
            "expected": expected_output,
            "generated": model_output
        })
    
    return results


In [None]:
# Main execution
if __name__ == "__main__":
    # Train the model
    model, tokenizer = train_on_alpaca()
    
    # Push to GitHub
    push_to_github()
    
    # Optional: Evaluate the model
    # Load the Alpaca dataset for evaluation
    with open("alpaca_data.json", "r") as f:
        eval_data = json.load(f)
    
    # Use a small subset for evaluation
    eval_results = evaluate_model_with_fallback(model, tokenizer, eval_data[:100])
    
    # Print some evaluation results
    print("\nEvaluation Results:")
    for i, result in enumerate(eval_results[:3]):
        print(f"\nExample {i+1}:")
        print(f"Instruction: {result['instruction']}")
        if result['input']:
            print(f"Input: {result['input']}")
        print(f"Expected: {result['expected']}")
        print(f"Generated: {result['generated']}")

Using device: cuda
GPU count: 2
GPU 0: Tesla T4
Memory: 15.83 GB
GPU 1: Tesla T4
Memory: 15.83 GB
Downloading Alpaca dataset...
Loaded 52002 examples from Alpaca dataset
Loading base model: distilgpt2
Training on 49401 examples, validating on 2601 examples


  trainer = Trainer(


Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.3576
200,0.9671
300,0.9675
400,0.924
500,0.9381
600,0.9204
700,0.8994
800,0.8963
900,0.8988
1000,0.9092


Saving final model...
Training completed in 1:51:19
Pushing model to GitHub...
Cloning into 'instruction_tuned_model'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 27 (delta 5), reused 27 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (27/27), 1.14 MiB | 8.41 MiB/s, done.
Resolving deltas: 100% (5/5), done.
Filtering content: 100% (2/2), 566.43 MiB | 127.73 MiB/s, done.
[main b1fca40] Add Alpaca-trained instruction model
 37 files changed, 901608 insertions(+)
 create mode 100644 models_alpaca/checkpoint-2000/config.json
 create mode 100644 models_alpaca/checkpoint-2000/generation_config.json
 create mode 100644 models_alpaca/checkpoint-2000/merges.txt
 create mode 100644 models_alpaca/checkpoint-2000/model.safetensors
 create mode 100644 models_alpaca/checkpoint-2000/optimizer.pt
 create mode 100644 models_alpaca/checkpoint-2000/rng_state.pth
 create mode 100

NameError: name 're' is not defined