<a href="https://colab.research.google.com/github/spamhamneggs/FinalProjectCOMP6885/blob/main/brautingan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install dependencies and setup environment
%%capture

# --- 1. LLM SETUP (From Group_6_NLP_Project_[Alternate].ipynb) ---
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Use specific versions optimized for the Qwen model
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # Consolidate LLM dependencies
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

# --- 2. CORE NLP SETUP: Fix for 'No module named spacy_syllables' ---

# Consolidate all NLP package installations
!pip install spacy==3.7.2 spacy-syllables==1.0.4 bertopic umap-learn hdbscan numpy pandas

# Download the specific English language model required by spaCy
!python -m spacy download en_core_web_sm

# IMPORTANT: After this cell executes, the kernel must be restarted
# to load the newly installed modules into memory.
print("Installation complete. PLEASE RESTART THE KERNEL NOW.")

In [None]:
# Cell 2: Drive Mounting and Global Variables

from google.colab import drive
drive.mount('/content/drive')

# Create a symbolic link for easy path access (as defined in your original code)
!ln -s /content/drive/MyDrive/haiku_dataset /content/haiku_dataset

# --- Global Configuration Variables ---
MODEL_NAME = "unsloth/Qwen3-4B-Instruct-2507"
# FIX: Use the symbolic link path for consistent access
DATASET_PATH = "/content/haiku_dataset/haiku_dataset_merged.csv"
OUTPUT_DIR = "/content/drive/MyDrive/haiku_suggester_unsloth"

# Training parameters
NUM_EPOCHS = 1
BATCH_SIZE = 2
GA_STEPS = 4
LEARNING_RATE = 2e-4
LORA_R = 32
LORA_ALPHA = 32

# Variables to be defined globally after training/loading
model = None
tokenizer = None
LLM_READY = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ln: failed to create symbolic link '/content/haiku_dataset/haiku_dataset': File exists


In [None]:
# Cell 3: Data Preparation, Fine-Tuning Setup, and Training (Optimized)

import re, json, pandas as pd
from typing import Dict
import unsloth
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
import torch
import os
from unsloth import FastLanguageModel

# NOTE: a `global` statement at module (cell) scope has no effect -- names
# assigned at the top level of a cell are already module-level globals.
global model, tokenizer, LLM_READY

# --- Heuristic Syllable Counter (used for creating mock training labels) ---
def count_syllables_heuristic(line: str) -> int:
    # [Insert your count_syllables_heuristic function code here]
    if not line or line.strip() == "": return 0
    s = re.sub(r"[^a-zA-Z\s]", " ", line)
    s = re.sub(r"\s+", " ", s.strip())
    groups = re.findall(r"[aeiouyAEIOUY]+", s)
    n = len(groups)
    if len(s.split()) > 0 and s.endswith("e") and not s.endswith("le"): n = max(1, n - 1)
    return max(1, n)

# --- Mock Suggestion Generator (for creating training examples) ---
def generate_suggestions_for_haiku(haiku_text: str) -> Dict:
    """Build heuristic syllable suggestions for a haiku against the 5-7-5 pattern.

    The haiku may be given as separate lines or as a single line whose parts
    are separated by ' / '. It is padded with empty lines or truncated so that
    exactly three lines are checked.

    Args:
        haiku_text: The haiku to analyse.

    Returns:
        A dict ``{"suggestions": [...]}`` holding one "syllable" entry for each
        line whose estimated syllable count differs from its target.
    """
    stripped_rows = (row.strip() for row in haiku_text.strip().splitlines())
    rows = [row for row in stripped_rows if row]
    # A one-line haiku may use ' / ' as its line separator.
    if len(rows) == 1 and ' / ' in rows[0]:
        rows = [part.strip() for part in rows[0].split(' / ')]
    # Pad with empty lines, then keep exactly the first three.
    rows = (rows + [""] * 3)[:3]

    expected = (5, 7, 5)
    issues = []
    for line_no, (row, goal) in enumerate(zip(rows, expected), start=1):
        approx = count_syllables_heuristic(row)
        if approx == goal:
            continue
        issues.append({
            "type": "syllable",
            "line": line_no,
            "message": f"Line {line_no} has approx {approx}; revise to {goal}.",
        })
    return {"suggestions": issues}

# Define the get_json_suggestions inference function (needed globally)
# This function is now defined regardless of whether the model was trained or loaded.
def get_json_suggestions(haiku: str, max_new_tokens=256):
    """Ask the loaded LLM for improvement suggestions and return them as a dict.

    Uses the module-level ``model`` and ``tokenizer``. The prompt follows the
    "### Instruction / ### Input / ### Output" layout, and decoding is greedy
    (``do_sample=False``).

    Args:
        haiku: The user-provided haiku text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first JSON object found in the model output, when it is a dict with
        a list under "suggestions". Otherwise the result of
        ``generate_suggestions_for_haiku(haiku)`` is returned as a fallback.
    """
    global model, tokenizer
    inference_instruction = (
        "You are an assistant that provides improvement suggestions for a user-provided haiku. "
        "Do NOT write or invent haiku lines. Only analyze the provided haiku and return JSON only, following this schema: "
        "{ "
            "\"suggestions\": [ "
                "{ "
                    "\"type\": \"syllable\" | \"wording\" | \"readability\" | \"imagery\" | \"conciseness\" | \"sound\" | \"tone\" | \"style\" | \"grammar\", "
                    "\"line\": <int, optional>, "
                    "\"message\": <string>, "
                    "\"replacement\": <string or null> "
                "} "
            "] "
        "} "
        "Consider not only syllable structure, but also readability, vivid imagery, word economy, rhythm/sound, tone consistency, style polish, and grammar or punctuation issues."
    )
    prompt = f"### Instruction:\n{inference_instruction}\n\n### Input:\n{haiku}\n\n### Output:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text contains the prompt too; keep only what follows the
    # last output marker.
    if '### Output:' in text:
        gen = text.split('### Output:')[-1].strip()
    else:
        gen = text.strip()
    try:
        # Parse the first JSON object and ignore any text generated after it.
        # `str.index` and the decoder both raise ValueError (JSONDecodeError is
        # a subclass), so only parse failures trigger the fallback -- unlike a
        # bare `except:`, which would also swallow KeyboardInterrupt.
        start = gen.index("{")
        parsed, _ = json.JSONDecoder().raw_decode(gen[start:])
    except ValueError:
        # Fallback to the heuristic counter if LLM output is garbage
        return generate_suggestions_for_haiku(haiku)
    # Valid JSON that does not follow the requested schema is treated as garbage too.
    if not isinstance(parsed, dict) or not isinstance(parsed.get("suggestions"), list):
        return generate_suggestions_for_haiku(haiku)
    return parsed


# --- OPTIMIZATION CHECK: Check if the model has already been fine-tuned and saved ---
# Reuse a previously saved model when OUTPUT_DIR is a non-empty directory;
# otherwise build the training set, fine-tune with LoRA and save the result.
if os.path.isdir(OUTPUT_DIR) and len(os.listdir(OUTPUT_DIR)) > 0:
    print(f"✅ FOUND SAVED MODEL: Loading model from {OUTPUT_DIR}. Skipping training...")

    # --- Load the previously trained model directly ---
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = OUTPUT_DIR, # Load from the saved directory
        max_seq_length = 2048,
        load_in_4bit = True,
    )
    LLM_READY = True

else:
    print("⚠️ SAVED MODEL NOT FOUND. Proceeding with initial download and fine-tuning...")

    # --- Data Loading and Formatting ---
    df = pd.read_csv(DATASET_PATH)
    # Drop incomplete haiku BEFORE casting to str: after `astype(str)` a missing
    # line becomes the literal text "nan" and `dropna()` no longer removes it.
    df = df.dropna(subset=["line1", "line2", "line3"]).reset_index(drop=True)
    df["haiku"] = df[["line1", "line2", "line3"]].astype(str).agg("\n".join, axis=1)
    df = df[["haiku"]]

    # Full instruction text, identical to the one used at inference time in
    # `get_json_suggestions`, so that training and inference prompts match.
    # This only affects future training runs; a model already saved in
    # OUTPUT_DIR is loaded as-is by the branch above.
    instruction = (
        "You are an assistant that provides improvement suggestions for a user-provided haiku. "
        "Do NOT write or invent haiku lines. Only analyze the provided haiku and return JSON only, following this schema: "
        "{ "
            "\"suggestions\": [ "
                "{ "
                    "\"type\": \"syllable\" | \"wording\" | \"readability\" | \"imagery\" | \"conciseness\" | \"sound\" | \"tone\" | \"style\" | \"grammar\", "
                    "\"line\": <int, optional>, "
                    "\"message\": <string>, "
                    "\"replacement\": <string or null> "
                "} "
            "] "
        "} "
        "Consider not only syllable structure, but also readability, vivid imagery, word economy, rhythm/sound, tone consistency, style polish, and grammar or punctuation issues."
    )

    def make_example(haiku: str):
        """Pair a haiku with its heuristic (mock) JSON suggestions as one training record."""
        response = json.dumps(generate_suggestions_for_haiku(haiku))
        return {"instruction": instruction, "input": haiku, "output": response}

    examples = [make_example(x) for x in df["haiku"].tolist()]
    hf_ds = Dataset.from_pandas(pd.DataFrame(examples))

    # --- Model Loading and LoRA Setup (Download occurs here on first run) ---
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = MODEL_NAME, # Downloads Qwen on first run
        max_seq_length = 2048,
        load_in_4bit = True,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r = LORA_R,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
        lora_alpha = LORA_ALPHA,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
    )

    tokenizer = get_chat_template(tokenizer, chat_template = "qwen3-instruct")

    def formatting_prompts_func(examples):
        """Render each record into the Instruction/Input/Output text layout used for training."""
        texts = []
        for instr, inp, out in zip(examples["instruction"], examples["input"], examples["output"]):
            text = f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Output:\n{out}"
            texts.append(text)
        return {"text": texts}

    hf_ds = hf_ds.map(formatting_prompts_func, batched=True)

    # --- SFT Trainer and Training ---
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = hf_ds,
        args = SFTConfig(
            dataset_text_field = "text",
            per_device_train_batch_size = BATCH_SIZE,
            gradient_accumulation_steps = GA_STEPS,
            num_train_epochs = NUM_EPOCHS,
            learning_rate = LEARNING_RATE,
            optim = "adamw_8bit",
            report_to = "none",
        ),
    )

    trainer.train()

    # Save the trained model
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"\n✅ Saved fine-tuned model to {OUTPUT_DIR}")
    LLM_READY = True

# --- End of Optimization Check ---

✅ FOUND SAVED MODEL: Loading model from /content/drive/MyDrive/haiku_suggester_unsloth. Skipping training...
==((====))==  Unsloth 2025.10.9: Fast Qwen3 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Reinstall spacy and spacy-syllables and download the language model
# Cell 1 also installs spaCy and spacy-syllables; this cell re-installs them
# with spacy-syllables constrained to the 3.x range.
# NOTE(review): the output recorded for this cell shows the model download
# failing with an HTTP 404 -- confirm the download succeeds in the current
# environment before relying on `spacy.load("en_core_web_sm")`.
!pip install spacy==3.7.2 "spacy-syllables>=3.0.0,<4.0.0"
!python -m spacy download en_core_web_sm

Collecting https://github.com/explosion/spacy-models/releases/download/-en_core_web_sm/-en_core_web_sm.tar.gz
[31m  ERROR: HTTP error 404 while getting https://github.com/explosion/spacy-models/releases/download/-en_core_web_sm/-en_core_web_sm.tar.gz[0m[31m
[0m[31mERROR: Could not install requirement https://github.com/explosion/spacy-models/releases/download/-en_core_web_sm/-en_core_web_sm.tar.gz because of HTTP error 404 Client Error: Not Found for url: https://github.com/explosion/spacy-models/releases/download/-en_core_web_sm/-en_core_web_sm.tar.gz for URL https://github.com/explosion/spacy-models/releases/download/-en_core_web_sm/-en_core_web_sm.tar.gz[0m[31m
[0m

In [None]:
# Cell 4: BERTopic and HaikuGrammarly Class Definition

import spacy
from spacy_syllables import SpacySyllables
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import warnings
import numpy as np
from typing import List, Dict, Tuple # Import List, Dict, and Tuple

# Suppress warnings
# This silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# --- CORE NLP SETUP ---
# Load the small English spaCy pipeline; it is shared by all HaikuGrammarly
# instances through the module-level `nlp` name.
nlp = spacy.load("en_core_web_sm")
syllables = SpacySyllables(nlp)
# Insert the "syllables" component right after the tagger. HaikuGrammarly later
# reads `token._.syllables_count`, presumably populated by this component.
nlp.add_pipe("syllables", after="tagger")

# --- CUSTOM LEXICON (Kigo List) ---
# Lower-case seasonal / nature words used for the imagery check.
# One entry ("cherry blossom") is a two-word phrase.
KIGO_LIST = [
    "spring", "summer", "fall", "autumn", "winter",
    "snow", "rain", "wind", "moon", "flower", "stream",
    "frog", "cicada", "cherry blossom", "mist", "sun", "leaf"
]

class HaikuGrammarly:
    """Haiku checker combining syllable structure, topic coherence and style heuristics.

    A report produced by `generate_report` contains a per-line syllable
    analysis, a set of quality metrics, a weighted overall score and a textual
    suggestion that comes either from the fine-tuned LLM or from a rule-based
    placeholder.

    Relies on module-level names defined in other cells: `nlp`, `KIGO_LIST`,
    `DATASET_PATH`, `pd`, `json`, and optionally `LLM_READY` and
    `get_json_suggestions`.
    """

    def __init__(self, target_syllables: List[int] = [5, 7, 5], haiku_dataset_path: str = DATASET_PATH):
        """Set up the checker and fit the BERTopic model.

        Args:
            target_syllables: Target syllable count for each line.
            haiku_dataset_path: CSV with `line1`, `line2`, `line3` columns used
                to fit the topic model.
        """
        # Copy the list so the shared default argument (or the caller's list)
        # is never aliased by, or mutated through, this instance.
        self.target_syllables = list(target_syllables)
        self.kigo_list = set([word.lower() for word in KIGO_LIST])
        self.haiku_dataset_path = haiku_dataset_path
        self.topic_model = self._initialize_bertopic()
        # Snapshot of the global flag at construction time.
        self.llm_ready = globals().get('LLM_READY', False) # Check global status

    def _initialize_bertopic(self):
        """Initializes and pre-fits the BERTopic model on the Haiku dataset.

        Returns:
            The fitted BERTopic model, or None when loading or fitting fails
            (the error is printed and topic analysis is skipped).
        """
        print("Initializing and fitting BERTopic model on Haiku dataset...")
        try:
            df = pd.read_csv(self.haiku_dataset_path)
            documents = (df['line1'].fillna('') + ' ' + df['line2'].fillna('') + ' ' + df['line3'].fillna('')).tolist()
            # Keep only documents with at least three whitespace-separated words.
            documents = [doc.strip() for doc in documents if len(doc.split()) >= 3]
            vectorizer_model = CountVectorizer(stop_words="english")
            topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics="auto", min_topic_size=50, verbose=False)
            topic_model.fit(documents)
            print("✅ BERTopic model initialization complete.")
            return topic_model
        except Exception as e:
            # Deliberate best-effort: the checker still works without BERTopic.
            print(f"❌ Error initializing BERTopic: {e}. Skipping BERTopic analysis.")
            return None

    def _get_line_syllables(self, line: str) -> int:
        """Return the sum of `token._.syllables_count` over the line's tokens.

        Tokens whose count is None are skipped.
        """
        doc = nlp(line)
        total_syllables = 0
        for token in doc:
            syllables_count = token._.syllables_count
            if syllables_count is not None:
                total_syllables += syllables_count
        return total_syllables

    def check_structure(self, lines: List[str]) -> Tuple[Dict, float]:
        """Compare each line's syllable count with its target.

        Args:
            lines: The haiku lines, in order. Must not be longer than
                `self.target_syllables`.

        Returns:
            A `(feedback, structural_score)` tuple. `feedback` holds
            "structural_ok" and one "line_details" entry per line, where
            "error" is `target - count` (negative means too many syllables).
            The score is `1 - total_abs_error / (0.5 * sum(targets))`, floored
            at 0.0.
        """
        feedback = {"structural_ok": True, "line_details": []}
        total_syllable_errors = 0

        for i, line in enumerate(lines):
            target = self.target_syllables[i]
            count = self._get_line_syllables(line)
            is_ok = (count == target)

            feedback["line_details"].append({
                "line_num": i + 1, "text": line, "count": count, "target": target, "ok": is_ok, "error": target - count
            })

            if not is_ok:
                feedback["structural_ok"] = False
                total_syllable_errors += abs(target - count)

        structural_score = max(0.0, 1.0 - (total_syllable_errors / (sum(self.target_syllables) * 0.5)))
        return feedback, structural_score

    def get_semantic_coherence(self, haiku_text: str) -> float:
        """Score how confidently the haiku is assigned to a BERTopic topic.

        Returns:
            0.5 when no topic model is available; 0.2 when the haiku is an
            outlier (topic -1) or the probabilities cannot be interpreted;
            otherwise the topic probability mapped linearly from [0, 1] to
            [0.3, 1.0].
        """
        if self.topic_model is None: return 0.5
        topics, probabilities = self.topic_model.transform([haiku_text])
        topic = topics[0]
        if topic == -1: return 0.2
        # Guard against a missing or empty probability result.
        if probabilities is None or len(probabilities) == 0: return 0.2
        # Check if probabilities[0] is a scalar or an array/list and if topic is a valid index
        if np.isscalar(probabilities[0]):
            # If scalar, it means there was only one probability returned
            coherence_score = probabilities[0]
        elif isinstance(probabilities[0], (list, np.ndarray)) and topic < len(probabilities[0]):
            # Otherwise, index with the topic if it's a valid index
            coherence_score = probabilities[0][topic]
        else:
            # Fallback if the structure is unexpected
            return 0.2

        # Plain float keeps the report JSON-serialisable regardless of numpy dtype.
        return float(np.interp(coherence_score, [0.0, 1.0], [0.3, 1.0]))

    def check_quality_metrics(self, lines: List[str], haiku_text: str) -> Dict:
        """Compute the qualitative metrics for a haiku.

        Args:
            lines: The haiku lines (currently unused; kept for the interface).
            haiku_text: The full haiku text.

        Returns:
            A dict with "Semantic_Coherence", "Imagery_Score" (0.9 when a kigo
            is present, else 0.4), "Concision_Score" (1.0 minus small penalties
            for common function words and weak intensifiers, clamped to
            [0, 1]), "Grammar_Feedback" (list of messages) and
            "Sentiment_Balance" (a fixed 0.75).
        """
        doc = nlp(haiku_text)
        metrics = {"Semantic_Coherence": self.get_semantic_coherence(haiku_text), "Imagery_Score": 0.0, "Concision_Score": 1.0, "Grammar_Feedback": []}

        # Multi-word kigo (e.g. "cherry blossom") can never equal a single
        # token, so they are searched for in the whitespace-normalised text.
        normalized_text = " ".join(haiku_text.lower().split())
        found_kigo = any(kigo in normalized_text for kigo in self.kigo_list if " " in kigo)

        for token in doc:
            # Match the surface form or the lemma so inflected forms
            # ("flowers") are recognised as well.
            if token.text.lower() in self.kigo_list or token.lemma_.lower() in self.kigo_list: found_kigo = True
            if token.pos_ in ["DET", "ADP"] and token.text.lower() in ["a", "an", "the", "of", "in", "on"]: metrics["Concision_Score"] -= 0.03
            if token.pos_ == "ADV" and token.text.lower() in ["very", "really", "so"]:
                metrics["Concision_Score"] -= 0.05
                metrics["Grammar_Feedback"].append(f"Token: '{token.text}'. Consider removing weak intensifiers for better impact.")

        metrics["Imagery_Score"] = 0.9 if found_kigo else 0.4
        metrics["Concision_Score"] = max(0.0, min(1.0, metrics["Concision_Score"]))
        # Placeholder constant: no sentiment analysis is performed here.
        metrics["Sentiment_Balance"] = 0.75
        return metrics

    def generate_report(self, haiku_text: str) -> Dict:
        """Analyse a three-line haiku and assemble the full report.

        Returns:
            `{"error": ...}` when the text does not contain exactly three
            non-empty lines; otherwise a dict with the input, the weighted
            score (structure 0.40, coherence 0.25, imagery 0.20, concision
            0.10, sentiment 0.05; rounded to 4 decimals), the structural
            analysis, the quality metrics and the LLM/placeholder suggestion.
        """
        lines = [line.strip() for line in haiku_text.strip().split('\n') if line.strip()]
        if len(lines) != 3: return {"error": "Haiku must have exactly three lines."}

        structural_feedback, structural_score = self.check_structure(lines)
        quality_metrics = self.check_quality_metrics(lines, haiku_text)

        weighted_score = (structural_score * 0.40 + quality_metrics["Semantic_Coherence"] * 0.25 + quality_metrics["Imagery_Score"] * 0.20 + quality_metrics["Concision_Score"] * 0.10 + quality_metrics["Sentiment_Balance"] * 0.05)

        report = {
            "input_haiku": haiku_text,
            "weighted_quality_score": round(weighted_score, 4),
            "structural_analysis": structural_feedback,
            "quality_metrics": quality_metrics,
            "llm_suggestion": self._generate_llm_suggestion(haiku_text, structural_feedback, quality_metrics)
        }
        return report

    def _placeholder_suggestion(self, structural_feedback: Dict, quality_metrics: Dict) -> str:
        """Build the rule-based suggestion text used when the LLM is unavailable."""
        # --- SMART PLACEHOLDER LOGIC ---
        suggestion = "LLM Suggestion (via Qwen3-4B Placeholder): "
        if not structural_feedback["structural_ok"]:
            # Only the first offending line is reported.
            line_errors = [d for d in structural_feedback["line_details"] if not d["ok"]]
            suggestion += f"Structure Error: Line {line_errors[0]['line_num']} is off by {line_errors[0]['error']} syllable(s). "
        if quality_metrics["Imagery_Score"] < 0.5:
            suggestion += "Low Imagery Score. Use a kigo (seasonal word) or more vivid nouns. "
        if quality_metrics["Semantic_Coherence"] < 0.5:
            suggestion += "Low Semantic Coherence. Ensure all three lines contribute to a single, unified image. "
        if suggestion == "LLM Suggestion (via Qwen3-4B Placeholder): ":
            suggestion += "The Haiku is structurally sound and stylistically fair. Great job!"
        return suggestion

    def _generate_llm_suggestion(self, haiku_text: str, structural_feedback: Dict, quality_metrics: Dict) -> str:
        """
        [III. Qwen/Unsloth LLM Assistance] - Calls the inference function or placeholder.

        When the LLM is ready, returns the JSON suggestions from
        `get_json_suggestions` as an indented string. If inference raises, the
        failure message is returned followed by the rule-based placeholder, so
        the announced fallback actually happens. When the LLM is not ready,
        only the placeholder text is returned.
        """
        if self.llm_ready:
            # Call the function defined in Cell 3's setup
            try:
                # get_json_suggestions is assumed to be defined globally after training
                json_suggestions = get_json_suggestions(haiku_text)
                return json.dumps(json_suggestions, indent=2)
            except Exception as e:
                failure_note = f"Qwen/Unsloth inference failed: {e}. Falling back to placeholder."
                return failure_note + " " + self._placeholder_suggestion(structural_feedback, quality_metrics)
        return self._placeholder_suggestion(structural_feedback, quality_metrics)

In [None]:
# Cell 5: Inference Test
# End-to-end smoke test: build the checker, analyse one haiku and print the
# resulting report as JSON. `json` is imported in Cell 3.

# Haiku for testing
haiku_1 = """
The old pond is still,
A frog jumps into the water,
A deep splash is heard.
"""

# Initialize the checker (it uses the DATASET_PATH defined in Cell 2)
# Construction fits the BERTopic model on the dataset, so this line can be slow.
checker = HaikuGrammarly()

# Analyze Haiku 1
report_1 = checker.generate_report(haiku_1)

print("\n--- Haiku Analysis Report ---")
print(json.dumps(report_1, indent=4))

Initializing and fitting BERTopic model on Haiku dataset...
✅ BERTopic model initialization complete.

--- Haiku Analysis Report ---
{
    "input_haiku": "\nThe old pond is still,\nA frog jumps into the water,\nA deep splash is heard.\n",
    "weighted_quality_score": 0.9084,
    "structural_analysis": {
        "structural_ok": false,
        "line_details": [
            {
                "line_num": 1,
                "text": "The old pond is still,",
                "count": 5,
                "target": 5,
                "ok": true,
                "error": 0
            },
            {
                "line_num": 2,
                "text": "A frog jumps into the water,",
                "count": 8,
                "target": 7,
                "ok": false,
                "error": -1
            },
            {
                "line_num": 3,
                "text": "A deep splash is heard.",
                "count": 5,
                "target": 5,
                "ok": true,
   