# NeuroForge Advanced Training Pipeline (Continuous Learning)
This notebook provides a comprehensive pipeline for training the NeuroForge model with advanced datasets and capabilities, designed for continuous learning and GitHub synchronization.

## 1. Setup Environment and Install Dependencies

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers datasets accelerate sentencepiece numpy opencv-python scikit-learn pandas
!pip install bitsandbytes peft
!pip install ipywidgets # For interactive elements
!git config --global user.email "your_email@example.com" # REPLACE WITH YOUR GITHUB EMAIL
!git config --global user.name "Your GitHub Username" # REPLACE WITH YOUR GITHUB USERNAME
# For pushing to GitHub, you will need to provide your GitHub Personal Access Token (PAT) when prompted.
# Ensure your PAT has "repo" scope.

## 2. Clone Repository and Load Model/Tokenizer

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_url = "https://github.com/shlok71/chra-nf-xl.git"
repo_dir = "/content/chra-nf-xl"
model_save_path = os.path.join(repo_dir, "neuroforge_trained_model_advanced")

if not os.path.exists(repo_dir):
    !git clone {repo_url} {repo_dir}
%cd {repo_dir}

# Check if a previously trained model exists, otherwise load base GPT2
if os.path.exists(model_save_path):
    print("Loading existing model and tokenizer...")
    model = AutoModelForCausalLM.from_pretrained(model_save_path)
    tokenizer = AutoTokenizer.from_pretrained(model_save_path)
else:
    print("No existing model found. Loading base GPT2 model...")
    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model and tokenizer loaded.")

## 3. Data Preparation for Continuous Training

In [None]:
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
import torch

class CombinedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset wrapping a batch of tokenizer encodings.

    ``encodings`` must expose an ``input_ids`` attribute (used for the length)
    and an ``items()`` method yielding ``(field_name, per_sample_values)`` pairs.
    """

    def __init__(self, encodings):
        # Keep a reference to the encodings; samples are materialised lazily.
        self.encodings = encodings

    def __len__(self):
        # One sample per row of input ids.
        input_ids = self.encodings.input_ids
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one sample: every encoded field, sliced at ``idx``, as a tensor.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

def get_training_data(new_texts=None, max_length=512):
    """Collect all available training texts and tokenize them into a dataset.

    Sources, in order: ``pasted_content.txt`` from the repository checkout (if
    present), the first 5000 TinyStories training examples (best effort), and
    any ``new_texts`` supplied by the caller.

    Args:
        new_texts: Optional list of extra strings (e.g. captured interactions).
            A single string is treated as a one-element list.
        max_length: Token length every sample is truncated/padded to.

    Returns:
        A ``CombinedTextDataset`` over the tokenized texts, or ``None`` when no
        non-blank text could be gathered.
    """
    all_texts = []

    # Add content from pasted_content.txt (if available in repo)
    pasted_content_path = os.path.join(repo_dir, "pasted_content.txt")
    if os.path.exists(pasted_content_path):
        # Explicit encoding so the result does not depend on the platform default.
        with open(pasted_content_path, "r", encoding="utf-8") as f:
            all_texts.append(f.read())
        print("pasted_content.txt added to training data.")

    # Load a small subset of TinyStories
    try:
        tinystories_dataset = load_dataset("roneneldan/TinyStories", split="train[:5000]", cache_dir="/content/cache")
        all_texts.extend(tinystories_dataset["text"])
        print("TinyStories subset loaded.")
    except Exception as e:
        # Best effort: training can still proceed with the other sources.
        print(f"Could not load TinyStories: {e}")

    # Add new texts from interaction
    if new_texts:
        # A bare string would otherwise be extended character by character.
        if isinstance(new_texts, str):
            new_texts = [new_texts]
        all_texts.extend(new_texts)
        print(f"Added {len(new_texts)} new texts from interaction.")

    # Blank entries would become samples made only of padding, which carry no
    # training signal, so drop them before tokenizing.
    all_texts = [text for text in all_texts if text and text.strip()]

    if not all_texts:
        print("No text data available for training.")
        return None

    tokenized_texts = tokenizer(all_texts, truncation=True, padding="max_length", max_length=max_length)
    return CombinedTextDataset(tokenized_texts)

# Batch collator for causal language modelling (mlm=False disables masked-LM).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## 4. Continuous Training Loop with GitHub Sync

In [None]:
from transformers import Trainer, TrainingArguments
import time

def _run_git(args, cwd, secrets=()):
    """Run ``git <args>`` in ``cwd``, print its output with ``secrets`` masked, return the exit code."""
    import subprocess

    try:
        completed = subprocess.run(["git", *args], cwd=cwd, capture_output=True, text=True)
    except OSError as exc:
        # e.g. git is not installed or the working directory does not exist.
        print(f"Could not run git: {exc}")
        return 1

    combined = f"{completed.stdout}{completed.stderr}".strip()
    # git may echo the remote URL in error messages; never print credentials.
    for secret in secrets:
        if secret:
            combined = combined.replace(secret, "***")
    if combined:
        print(combined)
    return completed.returncode


def _github_credentials():
    """Return ``(username, token)`` from the environment or a prompt, or ``None`` if unavailable."""
    import getpass
    import os

    username = os.environ.get("GITHUB_USERNAME")
    token = os.environ.get("GITHUB_TOKEN")
    try:
        if not username:
            username = input("GitHub username: ").strip()
        if not token:
            token = getpass.getpass("GitHub Personal Access Token: ").strip()
    except (EOFError, NotImplementedError):
        # Prompting is not possible in every context (presumably e.g. when
        # called from a widget callback); the caller then skips the push.
        return None
    if not (username and token):
        return None
    return username, token


def train_and_sync(new_texts=None, num_epochs=1, save_interval_steps=100):
    """Fine-tune the global ``model`` once, save it, and push the repository to GitHub.

    Args:
        new_texts: Optional extra training texts forwarded to ``get_training_data``.
        num_epochs: Number of training epochs for this run.
        save_interval_steps: Checkpoint frequency, in optimizer steps.

    Returns:
        None. Training is skipped when no data is available; the push is
        skipped (with a message) when no GitHub credentials can be obtained.

    Credentials are read from the ``GITHUB_USERNAME`` / ``GITHUB_TOKEN``
    environment variables, falling back to an interactive prompt, so that no
    token has to be written into the notebook.
    """
    from urllib.parse import quote

    train_dataset = get_training_data(new_texts)
    if train_dataset is None:
        print("Skipping training due to no data.")
        return

    training_args = TrainingArguments(
        output_dir="./checkpoints",
        overwrite_output_dir=True,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        save_steps=save_interval_steps,
        save_total_limit=2,
        logging_dir="./training_logs",
        logging_steps=10,
        report_to="none",
        fp16=torch.cuda.is_available(),
        push_to_hub=False, # We will handle Git push manually for more control
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("Starting training...")
    trainer.train()
    print("Training complete.")

    # Save the trained model and tokenizer
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Git operations to sync with GitHub
    # NOTE(review): `git add .` also stages ./checkpoints when the working
    # directory is the repository, and model weight files may exceed GitHub's
    # per-file size limit -- confirm whether Git LFS or a .gitignore is needed.
    print("Syncing with GitHub...")
    _run_git(["add", "."], cwd=repo_dir)
    if _run_git(["commit", "-m", "Update trained model and logs (continuous training)"], cwd=repo_dir) != 0:
        print("No changes to commit")

    credentials = _github_credentials()
    if credentials is None:
        print("GitHub credentials not available; skipping push.")
        return

    username, token = credentials
    # Percent-encode so special characters cannot break the URL.
    quoted_user = quote(username, safe="")
    quoted_token = quote(token, safe="")
    push_url = f"https://{quoted_user}:{quoted_token}@github.com/shlok71/chra-nf-xl.git"
    push_status = _run_git(
        ["push", push_url, "HEAD:neuroforge-training-and-inference"],
        cwd=repo_dir,
        secrets=(token, quoted_token),
    )
    if push_status == 0:
        print("GitHub sync complete.")
    else:
        print("GitHub push failed; see the git output above.")

# Example of continuous training (can be run in a loop).
# Uncomment to retrain and sync repeatedly; the loop has no exit condition,
# so it runs until the kernel is interrupted.
# while True:
#     train_and_sync(num_epochs=1) # Train for one epoch, then sync
#     time.sleep(3600) # Wait for an hour before next training cycle

# Initial training run, using the default arguments of train_and_sync
train_and_sync()

## 5. Interaction-Based Learning (Conceptual Outline)

In [None]:
from IPython.display import display, HTML
import ipywidgets as widgets

# This section outlines how interaction-based learning would work.
# In a real application, user inputs and model outputs would be captured
# and periodically used to fine-tune the model.

# Log of past exchanges; on_button_click appends one string per exchange
# (the user input followed by the model response).
interaction_history = []

def generate_response(prompt):
    """Generate a continuation of ``prompt`` with the global model.

    Args:
        prompt: Text to feed to the model.

    Returns:
        The decoded first output sequence (special tokens removed), as
        produced by ``model.generate`` with up to 50 new tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Training leaves the model in train mode; switch to eval so dropout does
    # not perturb generation.
    model.eval()
    outputs = model.generate(
        inputs.input_ids,
        # Pass the mask and pad id explicitly instead of letting generate() guess them.
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=50,
        num_return_sequences=1,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def on_button_click(b):
    """Handle a click on the Send button.

    Reads the text box, generates a model response, shows the exchange in the
    output area, records it in ``interaction_history``, and retrains on the
    last 10 exchanges every time the history length reaches a multiple of 10.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    import html

    user_input = text_input.value
    if not user_input:
        return

    model_response = generate_response(user_input)
    # Escape both texts: they are arbitrary strings and must not be rendered as markup.
    safe_input = html.escape(user_input)
    safe_response = html.escape(model_response)
    output_area.append_display_data(HTML(f"<b>You:</b> {safe_input}<br><b>NeuroForge:</b> {safe_response}<br>"))
    interaction_history.append(user_input + " " + model_response)
    text_input.value = ""

    # Periodically retrain with new interactions (e.g., every 10 interactions)
    if len(interaction_history) % 10 == 0:
        print("\nRetraining with recent interactions...")
        train_and_sync(new_texts=interaction_history[-10:]) # Train on last 10 interactions
        # Clear interaction history after training if desired
        # interaction_history.clear()

# Build the three widgets of the chat UI: an output pane, a send button and a text box.
output_area = widgets.Output()
send_button = widgets.Button(description="Send")
text_input = widgets.Text(
    description="Your Input:",
    layout=widgets.Layout(width="80%"),
)

# Route button clicks to the interaction handler.
send_button.on_click(on_button_click)

# Render the controls in order: input box, button, then the conversation output.
display(text_input, send_button, output_area)