<a href="https://colab.research.google.com/github/spamhamneggs/FinalProjectCOMP6885/blob/main/brautigan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install dependencies and setup environment
%%capture

import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Use specific versions optimized for the Qwen model
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # Consolidate LLM dependencies
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

# Consolidate all NLP package installations
!pip install -U spacy==3.7.5 spacy_syllables bertopic umap-learn hdbscan numpy pandas syllapy

# Download the specific English language model required by spaCy
!python -m spacy download en_core_web_sm

# IMPORTANT: After this cell executes, the kernel must be restarted
# to load the newly installed modules into memory.
print("Installation complete. PLEASE RESTART THE KERNEL NOW.")

In [None]:
# Cell 2: Drive Mounting and Global Variables

from google.colab import drive
drive.mount('/content/drive')

# Create a symbolic link for easy path access (as defined in your original code)
!ln -s /content/drive/MyDrive/haiku_dataset /content/haiku_dataset

# --- Global Configuration Variables ---
MODEL_NAME = "unsloth/Qwen3-4B-Instruct-2507"
# FIX: Use the symbolic link path for consistent access
DATASET_PATH = "/content/haiku_dataset/haiku_dataset_merged.csv"
OUTPUT_DIR = "/content/drive/MyDrive/haiku_suggester_unsloth"

# Training parameters
# Path to persist BERTopic model to speed repeated runs
TOPIC_MODEL_PATH = OUTPUT_DIR + '/bertopic_model.joblib'
SAVE_TOPIC_MODEL = True
NUM_EPOCHS = 1
BATCH_SIZE = 2
GA_STEPS = 4
LEARNING_RATE = 2e-4
LORA_R = 32
LORA_ALPHA = 32

# Variables to be defined globally after training/loading
model = None
tokenizer = None
LLM_READY = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ln: failed to create symbolic link '/content/haiku_dataset/haiku_dataset': File exists


In [None]:
# Cell 3: BERTopic and HaikuGrammarly Class Definition

import pandas as pd
import spacy
from spacy_syllables import SpacySyllables
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import warnings
import numpy as np
from typing import List, Dict, Tuple

import joblib
# Suppress warnings
# (this silences every warning category for the rest of the session)
warnings.filterwarnings("ignore")

# --- CORE NLP SETUP ---
# Load the small English spaCy pipeline downloaded by the install cell.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): this instance is not referenced again in this file; the
# component that actually runs is the one created by add_pipe below — confirm
# whether this line is still needed.
syllables = SpacySyllables(nlp)
# Register the "syllables" component right after the tagger; the code below
# reads its output through `token._.syllables_count`.
nlp.add_pipe("syllables", after="tagger")

class HaikuGrammarly:
    """Score a haiku's structure and style and produce improvement suggestions.

    Creating an instance loads a cached BERTopic model, or fits a new one on the
    haiku dataset, so construction can be slow. The module-level spaCy pipeline
    `nlp` (with the "syllables" component) is used for syllable counts and
    part-of-speech tags.
    """

    def __init__(self, target_syllables: List[int] = None, haiku_dataset_path: str = DATASET_PATH):
        """Set the syllable targets and prepare the topic model.

        Args:
            target_syllables: Expected syllable count per line. Defaults to
                [5, 7, 5] when omitted.
            haiku_dataset_path: CSV with `line1`, `line2`, `line3` columns used
                to fit BERTopic when no cached model can be loaded.
        """
        # A None sentinel replaces the former mutable list default; copying keeps
        # the instance independent of the caller's list.
        self.target_syllables = [5, 7, 5] if target_syllables is None else list(target_syllables)
        self.haiku_dataset_path = haiku_dataset_path
        self.topic_model = self._initialize_bertopic()
        # Snapshot of the global flag at construction time; instances created
        # before the LLM is loaded keep using the placeholder suggestions.
        self.llm_ready = globals().get('LLM_READY', False)

    def _initialize_bertopic(self):
        """Return a BERTopic model, loaded from cache or freshly fitted.

        Returns None (and prints the reason) when the model can neither be
        loaded nor fitted, so the rest of the checker keeps working.
        """
        # Imported here because this cell does not import `os` itself and would
        # otherwise rely on another cell having done so.
        import os

        print("Initializing and fitting BERTopic model on Haiku dataset...")
        cache_path = globals().get('TOPIC_MODEL_PATH')
        try:
            # Try to load a cached BERTopic model to avoid re-fitting every run
            try:
                if cache_path and os.path.exists(cache_path):
                    print(f"Loading cached BERTopic model from {cache_path}...")
                    # NOTE: joblib files are pickle-based; only load caches that
                    # this notebook wrote itself.
                    topic_model = joblib.load(cache_path)
                    print("✅ Loaded cached BERTopic model.")
                    return topic_model
            except Exception as e_load:
                print(f"Could not load cached BERTopic model: {e_load}. Will fit a new model.")

            df = pd.read_csv(self.haiku_dataset_path)
            documents = (df['line1'].fillna('') + ' ' + df['line2'].fillna('') + ' ' + df['line3'].fillna('')).tolist()
            # Very short documents are dropped before fitting
            documents = [doc.strip() for doc in documents if len(doc.split()) >= 3]
            vectorizer_model = CountVectorizer(stop_words="english")
            topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics="auto", min_topic_size=50, verbose=False)
            topic_model.fit(documents)
            print("✅ BERTopic model initialization complete.")

            # Persist the model for future runs (best-effort)
            try:
                if cache_path and globals().get('SAVE_TOPIC_MODEL', False):
                    parent_dir = os.path.dirname(cache_path)
                    if parent_dir:
                        os.makedirs(parent_dir, exist_ok=True)
                    joblib.dump(topic_model, cache_path)
                    print(f"✅ Saved BERTopic model to {cache_path}")
            except Exception as e_save:
                print(f"Could not save BERTopic model: {e_save}")
            return topic_model
        except Exception as e:
            print(f"❌ Error initializing BERTopic: {e}. Skipping BERTopic analysis.")
            return None

    def _get_line_syllables(self, line: str) -> int:
        """Return the summed `syllables_count` of all tokens in `line`.

        Tokens for which the syllable component reports None add nothing.
        """
        return sum(token._.syllables_count or 0 for token in nlp(line))

    def check_structure(self, lines: List[str]) -> Tuple[Dict, float]:
        """Compare each line's syllable count with its target.

        Args:
            lines: Haiku lines, at most one per entry of `target_syllables`.

        Returns:
            A `(feedback, structural_score)` tuple. `feedback` holds per-line
            details; the score is 1.0 for a perfect match and reaches 0.0 once
            the total deviation is half of the total syllable target.

        Raises:
            ValueError: If more lines are given than there are targets.
        """
        if len(lines) > len(self.target_syllables):
            raise ValueError(
                f"Got {len(lines)} lines but only {len(self.target_syllables)} syllable targets."
            )

        feedback = {"structural_ok": True, "line_details": []}
        total_syllable_errors = 0

        for line_num, (line, target) in enumerate(zip(lines, self.target_syllables), start=1):
            count = self._get_line_syllables(line)
            is_ok = (count == target)

            feedback["line_details"].append({
                "line_num": line_num, "text": line, "count": count, "target": target, "ok": is_ok, "error": target - count
            })

            if not is_ok:
                feedback["structural_ok"] = False
                total_syllable_errors += abs(target - count)

        # Guard the division for degenerate targets that sum to zero
        max_error = sum(self.target_syllables) * 0.5
        if max_error > 0:
            structural_score = max(0.0, 1.0 - (total_syllable_errors / max_error))
        else:
            structural_score = 1.0 if total_syllable_errors == 0 else 0.0
        return feedback, structural_score

    def get_semantic_coherence(self, haiku_text: str) -> float:
        """Map the haiku's BERTopic assignment to a score.

        Returns 0.5 when no topic model is available or the transform fails,
        0.2 for the outlier topic (-1) or an unusable probability value, and
        otherwise the topic probability rescaled linearly from [0, 1] to
        [0.3, 1.0].
        """
        if self.topic_model is None:
            return 0.5
        try:
            topics, probabilities = self.topic_model.transform([haiku_text])
            topic = topics[0]
            if topic == -1:
                return 0.2
            first = probabilities[0]
            if np.isscalar(first):
                coherence_score = first
            elif isinstance(first, (list, np.ndarray)) and topic < len(first):
                coherence_score = first[topic]
            else:
                return 0.2
            # Plain float keeps the report JSON-friendly
            return float(np.interp(coherence_score, [0.0, 1.0], [0.3, 1.0]))
        except Exception:
            # Deliberate best-effort: a failed transform yields the neutral score
            return 0.5

    def check_quality_metrics(self, lines: List[str], haiku_text: str) -> Dict:
        """Compute the style metrics used in the weighted score.

        `Concision_Score` starts at 1.0 and loses 0.03 per listed article or
        preposition and 0.05 per weak intensifier, clamped to [0, 1].
        `Imagery_Score` (0.4) and `Sentiment_Balance` (0.75) are fixed values
        in this implementation. `lines` is accepted but not used.
        """
        doc = nlp(haiku_text)
        metrics = {"Semantic_Coherence": self.get_semantic_coherence(haiku_text), "Imagery_Score": 0.0, "Concision_Score": 1.0, "Grammar_Feedback": []}
        filler_words = {"a", "an", "the", "of", "in", "on"}
        weak_intensifiers = {"very", "really", "so"}
        for token in doc:
            word = token.text.lower()
            if token.pos_ in ("DET", "ADP") and word in filler_words:
                metrics["Concision_Score"] -= 0.03
            if token.pos_ == "ADV" and word in weak_intensifiers:
                metrics["Concision_Score"] -= 0.05
                metrics["Grammar_Feedback"].append(f"Token: '{token.text}'. Consider removing weak intensifiers for better impact.")

        metrics["Imagery_Score"] = 0.4
        metrics["Concision_Score"] = max(0.0, min(1.0, metrics["Concision_Score"]))
        metrics["Sentiment_Balance"] = 0.75
        return metrics

    def generate_report(self, haiku_text: str) -> Dict:
        """Build the full analysis report for one haiku.

        Blank lines are ignored. If the number of remaining lines differs from
        the number of syllable targets, a dict with only an "error" key is
        returned. Otherwise the report contains the input, the weighted score
        (structure 40%, coherence 25%, imagery 20%, concision 10%, sentiment
        5%), the structural analysis, the quality metrics and a suggestion.
        """
        lines = [line.strip() for line in haiku_text.strip().split('\n') if line.strip()]
        expected = len(self.target_syllables)
        if len(lines) != expected:
            if expected == 3:
                return {"error": "Haiku must have exactly three lines."}
            return {"error": f"Haiku must have exactly {expected} lines."}

        structural_feedback, structural_score = self.check_structure(lines)
        quality_metrics = self.check_quality_metrics(lines, haiku_text)

        weighted_score = (
            structural_score * 0.40
            + quality_metrics["Semantic_Coherence"] * 0.25
            + quality_metrics["Imagery_Score"] * 0.20
            + quality_metrics["Concision_Score"] * 0.10
            + quality_metrics["Sentiment_Balance"] * 0.05
        )

        return {
            "input_haiku": haiku_text,
            "weighted_quality_score": round(weighted_score, 4),
            "structural_analysis": structural_feedback,
            "quality_metrics": quality_metrics,
            "llm_suggestion": self._generate_llm_suggestion(haiku_text, structural_feedback, quality_metrics),
        }

    def _placeholder_suggestion(self, structural_feedback: Dict, quality_metrics: Dict) -> str:
        """Compose the rule-based suggestion text used when no LLM output is available."""
        prefix = "LLM Suggestion (via Qwen3-4B Placeholder): "
        suggestion = prefix
        if not structural_feedback["structural_ok"]:
            # Only the first offending line is reported
            line_errors = [d for d in structural_feedback["line_details"] if not d["ok"]]
            suggestion += f"Structure Error: Line {line_errors[0]['line_num']} is off by {line_errors[0]['error']} syllable(s). "
        if quality_metrics["Imagery_Score"] < 0.5:
            suggestion += "Low Imagery Score. Use more vivid nouns. "
        if quality_metrics["Semantic_Coherence"] < 0.5:
            suggestion += "Low Semantic Coherence. Ensure all three lines contribute to a single, unified image. "
        if suggestion == prefix:
            suggestion += "The Haiku is structurally sound and stylistically fair. Great job!"
        return suggestion

    def _generate_llm_suggestion(self, haiku_text: str, structural_feedback: Dict, quality_metrics: Dict) -> str:
        """Return the LLM's JSON suggestions, or the rule-based placeholder text.

        When the LLM is marked ready, `get_json_suggestions` is called and its
        result is returned as indented JSON. If that call fails, the failure
        note is returned followed by the placeholder suggestion, so the
        announced fallback really happens.
        """
        # Imported here because this cell does not import `json` itself.
        import json

        if self.llm_ready:
            try:
                return json.dumps(get_json_suggestions(haiku_text), indent=2)
            except Exception as e:
                failure_note = f"Qwen/Unsloth inference failed: {e}. Falling back to placeholder."
                return failure_note + " " + self._placeholder_suggestion(structural_feedback, quality_metrics)
        return self._placeholder_suggestion(structural_feedback, quality_metrics)

In [None]:
# @title
# Cell 4: Data Preparation, Metric-driven Example Creation, Fine-Tuning Setup, and Training

import re
import json
import pandas as pd
from typing import Dict
import unsloth
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
import torch
import os
from unsloth import FastLanguageModel
import numpy as np

# NOTE: at module (cell) level every assignment already targets the global
# namespace, so this statement has no effect; the functions below declare
# `global` themselves where they need it.
global model, tokenizer, LLM_READY

# --- Heuristic Syllable Counter (used as backup) ---
def count_syllables_heuristic(line: str) -> int:
    """Estimate the number of syllables in a line of English text.

    Each word is scored separately as its number of vowel groups (runs of
    a/e/i/o/u/y), minus one for a trailing silent "e" (but not for "-le"
    endings such as "apple"), with a minimum of one syllable per word.
    Applying the silent-"e" rule per word avoids the over-count that results
    from checking only the final letter of the whole line.

    Args:
        line: Text to analyse; punctuation and digits are ignored.

    Returns:
        The estimated syllable count, or 0 when the line contains no words.
    """
    if not line or not line.strip():
        return 0
    # Drop apostrophes first so contractions stay one word ("don't" -> "dont")
    cleaned = re.sub(r"['\u2019]", "", line)
    words = re.sub(r"[^a-zA-Z\s]", " ", cleaned).lower().split()
    total = 0
    for word in words:
        groups = len(re.findall(r"[aeiouy]+", word))
        if word.endswith("e") and not word.endswith("le"):
            groups -= 1
        total += max(1, groups)
    return total

# Minimal fallback suggestion generator
def generate_suggestions_for_haiku(haiku_text: str) -> Dict:
    """Produce syllable-count suggestions for a haiku without using the LLM.

    The text is split on newlines, or on ' / ' when it is a single line, then
    padded with empty lines or truncated to exactly three lines. Each line is
    measured with `count_syllables_heuristic` and compared with 5-7-5.

    Returns:
        {"suggestions": [...]} with one entry per line whose estimate differs
        from its target.
    """
    rows = [piece for piece in (raw.strip() for raw in haiku_text.strip().splitlines()) if piece]
    if len(rows) == 1 and ' / ' in rows[0]:
        rows = [part.strip() for part in rows[0].split(' / ')]
    # Pad to three lines, then keep only the first three
    rows = (rows + ["", "", ""])[:3]

    found = []
    for line_no, (row, goal) in enumerate(zip(rows, (5, 7, 5)), start=1):
        approx = count_syllables_heuristic(row)
        if approx == goal:
            continue
        found.append({
            "type": "syllable",
            "line": line_no,
            "message": f"Line {line_no} has approx {approx}; revise to {goal}.",
        })
    return {"suggestions": found}

# Define the get_json_suggestions inference function (needed globally)
def get_json_suggestions(haiku: str, max_new_tokens=256):
    """Ask the fine-tuned model for JSON feedback on a haiku.

    Falls back to `generate_suggestions_for_haiku` when the global model or
    tokenizer is not loaded, or when no JSON object can be parsed from the
    model's output.

    Args:
        haiku: The haiku text to analyse.
        max_new_tokens: Generation budget passed to `model.generate`.

    Returns:
        The parsed JSON object (a dict), or the heuristic suggestions dict.
    """
    global model, tokenizer
    if model is None or tokenizer is None:
        # If LLM not ready, fallback to heuristic generator
        return generate_suggestions_for_haiku(haiku)

    inference_instruction = (
        "You are an assistant that provides improvement suggestions for a user-provided haiku. "
        "Do NOT write or invent haiku lines. Only analyze the provided haiku and return JSON only, following this schema: "
        "{ \"input_haiku\": <str>, \"weighted_quality_score\": <float>, \"structural_analysis\": <obj>, \"quality_metrics\": <obj>, \"llm_suggestion\": <obj_or_str> }"
    )
    prompt = f"### Instruction:\n{inference_instruction}\n\n### Input:\n{haiku}\n\n### Output:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Decode only the newly generated tokens. The prompt itself contains braces
    # (the schema) and the haiku could contain the "### Output:" marker, so
    # searching the full decoded text is unreliable.
    prompt_len = inputs["input_ids"].shape[1]
    gen = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    # Parse the first JSON object and ignore whatever the model emits after it;
    # json.loads would reject the whole string on any trailing text.
    start = gen.find("{")
    if start == -1:
        return generate_suggestions_for_haiku(haiku)
    try:
        parsed, _ = json.JSONDecoder().raw_decode(gen, start)
    except ValueError:  # includes json.JSONDecodeError
        return generate_suggestions_for_haiku(haiku)
    return parsed

# --- OPTIMIZATION CHECK: Check if the model has already been fine-tuned and saved ---
# A saved model is recognised by its config file rather than by OUTPUT_DIR being
# non-empty: the BERTopic cache and the regression head are written to the same
# directory, so it can contain files even though no model was ever saved.
if os.path.isdir(OUTPUT_DIR) and any(
    os.path.isfile(os.path.join(OUTPUT_DIR, marker))
    for marker in ("adapter_config.json", "config.json")
):
    print(f"✅ FOUND SAVED MODEL: Loading model from {OUTPUT_DIR}. Skipping training...")

    # --- Load the previously trained model directly ---
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=OUTPUT_DIR,  # Load from the saved directory
        max_seq_length=2048,
        load_in_4bit=True,
    )
    LLM_READY = True

else:
    print("⚠️ SAVED MODEL NOT FOUND. Proceeding with initial download and fine-tuning...")

    # --- Data Loading and Metric-driven Formatting ---
    df = pd.read_csv(DATASET_PATH)
    if all(c in df.columns for c in ["line1", "line2", "line3"]):
        # Build one newline-separated haiku per row. Missing lines become empty
        # strings first; astype(str) alone would turn them into the text "nan".
        df["haiku"] = df[["line1", "line2", "line3"]].fillna("").astype(str).agg("\n".join, axis=1)
    elif "haiku" not in df.columns:
        # Neither the three line columns nor a single 'haiku' column is present
        raise ValueError("Dataset must contain either (line1,line2,line3) or haiku column")
    df = df[["haiku"]].dropna()
    # Rows without any text carry no training signal
    df = df[df["haiku"].astype(str).str.strip() != ""].reset_index(drop=True)

    # One shared checker computes the metric report (the training target) for
    # every haiku in the dataset.
    checker = HaikuGrammarly()

    instruction = (
        "You are an assistant that provides improvement suggestions for a user-provided haiku. "
        "Do NOT write or invent haiku lines. Only analyze the provided haiku and return JSON only, following this schema that includes a weighted_quality_score."
    )

    def make_example(haiku: str):
        """Turn one haiku into an SFT record.

        The record's output is the JSON-encoded metric report, and
        `quality_score` carries the report's numeric score (None when the
        report has none or could not be computed).
        """
        score = None
        try:
            payload = checker.generate_report(haiku)
        except Exception:
            # Report computation failed: fall back to the lightweight labels
            payload = {"input_haiku": haiku, "weighted_quality_score": None, "llm_suggestion": generate_suggestions_for_haiku(haiku)}
        else:
            if isinstance(payload, dict):
                score = payload.get("weighted_quality_score")
        return {
            "instruction": instruction,
            "input": haiku,
            "output": json.dumps(payload),
            "quality_score": score,
        }

    examples = list(map(make_example, df["haiku"].tolist()))
    hf_ds = Dataset.from_pandas(pd.DataFrame(examples))

    # Create a validation split so we can track metric improvements separately
    # (10% held out, fixed seed so the split is reproducible across runs)
    try:
        hf_split = hf_ds.train_test_split(test_size=0.1, seed=42)
        train_ds = hf_split['train']
        val_ds = hf_split['test']
    except Exception:
        # fallback: use entire dataset as train if HF version doesn't support split
        # (val_ds = None makes the later validation steps skip themselves)
        train_ds = hf_ds
        val_ds = None

    # --- Model Loading and LoRA Setup (Download occurs here on first run) ---
    # Load the base model in 4-bit with a 2048-token maximum sequence length.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,  # Downloads Qwen on first run
        max_seq_length=2048,
        load_in_4bit=True,
    )

    # Wrap the model with LoRA adapters on the attention and MLP projection
    # layers; rank and alpha come from the global configuration cell.
    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_R,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj", ],
        lora_alpha=LORA_ALPHA,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
    )

    # NOTE(review): the chat template is attached to the tokenizer here, but the
    # training and inference prompts in this file are plain
    # "### Instruction / ### Input / ### Output" text — confirm it is needed.
    tokenizer = get_chat_template(tokenizer, chat_template="qwen3-instruct")

    def formatting_prompts_func(examples):
        """Render a batch of records into the training text format.

        Each text is the instruction/input/output prompt used at inference time,
        followed by the tokenizer's EOS token. Without the EOS marker the model
        is never shown where the JSON answer ends, so generation tends to run
        until the token limit and the output no longer parses as JSON.
        """
        eos_token = tokenizer.eos_token or ""
        texts = [
            f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Output:\n{out}{eos_token}"
            for instr, inp, out in zip(examples["instruction"], examples["input"], examples["output"])
        ]
        return {"text": texts}

    train_ds = train_ds.map(formatting_prompts_func, batched=True)

    # --- SFT Trainer and Training ---
    # Supervised fine-tuning on the rendered "text" column of train_ds. No
    # evaluation dataset is passed; val_ds is only used by the later steps.
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        args=SFTConfig(
            dataset_text_field="text",
            per_device_train_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GA_STEPS,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            optim="adamw_8bit",
            report_to="none",  # no external experiment tracker
        ),
    )

    trainer.train()

    # Attempt to train a small regression head that predicts the numeric quality score directly.
    # This step is best-effort: any failure is reported and the notebook continues.
    try:
        device = getattr(model, 'device', torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
        # Determine a way to get embeddings from the base model; try common HF attribute paths
        base_model = None
        for attr_name in ('model', 'base_model', 'transformer'):
            if hasattr(model, attr_name):
                base_model = getattr(model, attr_name)
                break

        if base_model is None:
            print("Regression head training skipped: cannot access underlying transformer model to extract embeddings.")
        else:
            print("Training regression head to predict numeric quality_score (this will be relatively fast).")
            base_model.eval()

            def get_embeddings(texts, batch_size=8):
                """Return one float32 mean-pooled embedding per text, on the CPU.

                Padding positions are excluded from the mean via the attention
                mask, and the result is cast to float32 so it matches the
                float32 regression head even when the model runs in half
                precision.
                """
                embeddings = []
                for i in range(0, len(texts), batch_size):
                    batch = texts[i:i+batch_size]
                    enc = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
                    with torch.no_grad():
                        try:
                            out = base_model(**enc, output_hidden_states=True)
                        except Exception as e_f:
                            raise RuntimeError(f"Could not run base_model forward: {e_f}") from e_f

                        # locate last hidden state: common names
                        if getattr(out, 'last_hidden_state', None) is not None:
                            last = out.last_hidden_state
                        elif isinstance(out, (tuple, list)) and len(out) > 0:
                            # some models return (last_hidden_state, ...)
                            last = out[0]
                        elif getattr(out, 'hidden_states', None) is not None:
                            last = out.hidden_states[-1]
                        else:
                            raise RuntimeError('Unable to find last_hidden_state from base_model output')

                        last = last.float()
                        attention_mask = enc.get('attention_mask')
                        if attention_mask is not None:
                            # mean over real tokens only
                            weights = attention_mask.unsqueeze(-1).float()
                            pooled = (last * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
                        else:
                            pooled = last.mean(dim=1)
                        embeddings.append(pooled.cpu())
                return torch.cat(embeddings, dim=0)

            def _texts_and_targets(split):
                """Collect (texts, float targets) for records that have a numeric quality_score."""
                texts, targets = [], []
                if split is None:
                    return texts, targets
                for ex in split:
                    score = ex.get('quality_score')
                    # `score == score` is False only for NaN, which must not reach the loss
                    if score is not None and score == score:
                        texts.append(ex.get('input') or ex.get('haiku') or ex.get('input_haiku'))
                        targets.append(float(score))
                return texts, targets

            # Build regression datasets (only include examples with numeric quality_score)
            train_texts, train_targets = _texts_and_targets(train_ds)
            val_texts, val_targets = _texts_and_targets(val_ds)

            if len(train_texts) == 0:
                print("No numeric quality_score found in training set; skipping regression head training.")
            else:
                # A single embedding is enough to learn the embedding width
                emb_dim = get_embeddings(train_texts[:1]).shape[1]

                # Define regression head
                reg_head = torch.nn.Linear(emb_dim, 1).to(device)
                optimizer = torch.optim.Adam(reg_head.parameters(), lr=1e-4)
                loss_fn = torch.nn.MSELoss()

                # The base model is not updated in this step, so the validation
                # embeddings are the same for every epoch: compute them once.
                val_emb = None
                val_targets_np = np.array(val_targets)
                if len(val_texts) > 0:
                    val_emb = get_embeddings(val_texts).to(device)

                EPOCHS = max(1, NUM_EPOCHS)
                BATCH = 16
                for ep in range(1, EPOCHS + 1):
                    reg_head.train()
                    # shuffle
                    idxs = np.random.permutation(len(train_texts))
                    epoch_losses = []
                    for i in range(0, len(idxs), BATCH):
                        batch_idx = idxs[i:i+BATCH]
                        batch_texts = [train_texts[j] for j in batch_idx]
                        batch_targets = torch.tensor([train_targets[j] for j in batch_idx], dtype=torch.float32).to(device)
                        emb = get_embeddings(batch_texts).to(device)
                        preds = reg_head(emb).squeeze(-1)
                        loss = loss_fn(preds, batch_targets)
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                        epoch_losses.append(loss.item())
                    # validation
                    reg_head.eval()
                    val_mae = None
                    val_mse = None
                    if val_emb is not None:
                        with torch.no_grad():
                            val_preds = reg_head(val_emb).squeeze(-1).cpu().numpy()
                        val_mae = float(np.mean(np.abs(val_preds - val_targets_np)))
                        val_mse = float(np.mean((val_preds - val_targets_np)**2))
                    avg_loss = float(np.mean(epoch_losses)) if epoch_losses else 0.0
                    print(f"Regression epoch {ep}/{EPOCHS} | train_loss={avg_loss:.6f} | val_MAE={val_mae} | val_MSE={val_mse}")

                # Save regression head weights for reuse
                try:
                    os.makedirs(OUTPUT_DIR, exist_ok=True)
                    torch.save({'state_dict': reg_head.state_dict()}, os.path.join(OUTPUT_DIR, 'regression_head.pt'))
                    print(f"Saved regression head to {os.path.join(OUTPUT_DIR, 'regression_head.pt')}")
                except Exception as e_save:
                    print(f"Could not save regression head: {e_save}")
    except Exception as e_reg:
        print(f"Regression head training skipped due to error: {e_reg}")

    # Save the trained model
    # Model and tokenizer are written to OUTPUT_DIR, the directory the
    # saved-model check at the top of this cell inspects on later runs.
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"\n✅ Saved fine-tuned model to {OUTPUT_DIR}")
    # Mark the LLM as available; HaikuGrammarly reads this flag when an
    # instance is created.
    LLM_READY = True

    # --- Post-training evaluation on the held-out validation split (if present) ---
    # Compares the score the model writes into its JSON answer with the
    # metric-derived target and reports MAE/MSE. Failures are only reported.
    try:
        if val_ds is not None:
            # at most 200 validation examples, to keep the evaluation quick
            n_eval = min(200, len(val_ds))
            if hasattr(val_ds, 'select'):
                sample = val_ds.select(range(n_eval))
            else:
                sample = val_ds[:n_eval]
            actuals = []
            preds = []

            def _parse_pred_score(pred):
                """Pull `weighted_quality_score` out of a dict or a raw string; None if absent."""
                if isinstance(pred, dict):
                    return pred.get('weighted_quality_score')
                if not isinstance(pred, str):
                    return None
                import re as _re
                m = _re.search(r"weighted_quality_score\"?\s*[:=]\s*([0-9]+\.?[0-9]*)", pred)
                return float(m.group(1)) if m else None

            for item in sample:
                haiku = item.get('input') or item.get('haiku') or item.get('input_haiku')
                target = item.get('quality_score')
                try:
                    pred = get_json_suggestions(haiku)
                except Exception:
                    pred = generate_suggestions_for_haiku(haiku)
                p_score = _parse_pred_score(pred)
                # keep only pairs where both the target and the prediction exist
                if target is None or p_score is None:
                    continue
                actuals.append(float(target))
                preds.append(float(p_score))

            if actuals:
                actuals = np.array(actuals)
                preds = np.array(preds)
                mae = np.mean(np.abs(actuals - preds))
                mse = np.mean((actuals - preds)**2)
                print(f"Validation eval (n={len(actuals)}): MAE={mae:.4f}, MSE={mse:.4f}")
            else:
                print("No paired predicted/actual quality_score values available in validation sample.")
        else:
            print("No validation split available; skipping evaluation.")
    except Exception as e:
        print(f"Validation evaluation failed: {e}")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
✅ FOUND SAVED MODEL: Loading model from /content/drive/MyDrive/haiku_suggester_unsloth. Skipping training...
==((====))==  Unsloth 2025.10.9: Fast Qwen3 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Unsloth 2025.10.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [None]:
# Cell 6: Inference Test
# Runs the full report on one sample haiku and prints it as JSON.
# NOTE(review): `json` is not imported in this cell; it relies on the training
# cell having been run first — confirm the intended run order.

# Haiku for testing
haiku_1 = """
The old pond is still,
A frog jumps into the water,
A deep splash is heard.
"""

# Initialize the checker (it uses the DATASET_PATH defined in Cell 2)
# Creating the instance loads or fits the BERTopic model, so this can be slow.
checker = HaikuGrammarly()

# Analyze Haiku 1
report_1 = checker.generate_report(haiku_1)

print("\n--- Haiku Analysis Report ---")
print(json.dumps(report_1, indent=4))

Initializing and fitting BERTopic model on Haiku dataset...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ BERTopic model initialization complete.
✅ Saved BERTopic model to /content/drive/MyDrive/haiku_suggester_unsloth/bertopic_model.joblib

--- Haiku Analysis Report ---
{
    "input_haiku": "\nThe old pond is still,\nA frog jumps into the water,\nA deep splash is heard.\n",
    "weighted_quality_score": 0.9084,
    "structural_analysis": {
        "structural_ok": false,
        "line_details": [
            {
                "line_num": 1,
                "text": "The old pond is still,",
                "count": 5,
                "target": 5,
                "ok": true,
                "error": 0
            },
            {
                "line_num": 2,
                "text": "A frog jumps into the water,",
                "count": 8,
                "target": 7,
                "ok": false,
                "error": -1
            },
            {
                "line_num": 3,
                "text": "A deep splash is heard.",
                "count": 5,
                "target":

# How to run this notebook in Colab

1. Run the first install cell (Cell 1). After it finishes, restart the Colab kernel so newly installed packages (spaCy, spacy-syllables, unsloth, etc.) are available.
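2. Re-run cells from top to bottom in order. The `HaikuGrammarly` cell only defines the class; BERTopic is loaded or fitted each time `HaikuGrammarly()` is instantiated (in the training and inference cells), which may take time on the first run. On later runs a cached BERTopic model is loaded from `OUTPUT_DIR` if present.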
3. The training cell performs supervised fine-tuning (SFT) then optionally trains a small regression head to predict the numeric `weighted_quality_score`. The regression head training is best-effort and will be skipped if your hardware or the model wrapper does not expose hidden states.
4. After training the notebook saves: fine-tuned model and tokenizer to `OUTPUT_DIR`, and (if created) `regression_head.pt` and `bertopic_model.joblib`. Make sure `OUTPUT_DIR` points to a writable location in your Drive.
5. If you run into out-of-memory (OOM) errors: reduce `BATCH_SIZE`, keep `NUM_EPOCHS` at 1, or switch to a Colab runtime whose GPU has more memory (for example via Colab Pro).

Notes:
- This notebook now centers the pipeline around the computed `weighted_quality_score`. Examples include the numeric target `quality_score` and training includes a dedicated regression step that logs validation MAE/MSE per epoch.
- For stronger metric-driven optimization (directly maximizing the metric) consider creating a reward model and using RLHF or a custom loss that combines SFT and metric regression — this is more advanced and not included here to keep the notebook runnable in Colab.
