<a href="https://colab.research.google.com/github/steffilewi/steffilewi.github.io/blob/MS1/MS1_retry_with_T4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bitsandbytes --prefer-binary --upgrade

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [2]:
# =============================================================================
# STEP 2: Fresh Installation After Runtime Restart
# =============================================================================

import subprocess
import sys

def fresh_install():
    """Fresh installation with minimal conflicts"""

    print("🚀 Starting fresh installation...")

    # Install only essential packages that commonly cause conflicts
    essential_packages = [
        # PyTorch with CUDA (this will handle numpy correctly)
        'torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118',

        # Core ML packages
        'transformers==4.35.0',
        'datasets==2.14.0',
        'accelerate==0.24.0',
        'peft==0.6.0',

        # Quantization
        #'bitsandbytes',

        # Google API
        'google-api-python-client',
        'google-auth-httplib2',
        'google-auth-oauthlib',
    ]

    for package in essential_packages:
        try:
            print(f"📦 Installing {package.split('==')[0]}...")
            if '--index-url' in package:
                cmd = [sys.executable, "-m", "pip", "install", "--upgrade"] + package.split()
            else:
                cmd = [sys.executable, "-m", "pip", "install", "--upgrade", package]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
            if result.returncode == 0:
                print(f"✅ {package.split('==')[0]} installed successfully")
            else:
                print(f"⚠️  {package.split('==')[0]} installation had warnings")
        except Exception as e:
            print(f"❌ Failed to install {package}: {e}")
            continue

    print("✅ Installation completed!")

# Run installation
fresh_install()

# Test imports immediately
print("\n🔍 Testing imports...")
try:
    import torch
    print(f"✅ PyTorch: {torch.__version__}")

    import numpy as np
    print(f"✅ NumPy: {np.__version__}")

    import pandas as pd
    print(f"✅ Pandas: {pd.__version__}")

    from transformers import AutoTokenizer
    print("✅ Transformers: Available")

    import bitsandbytes as bnb
    print("✅ BitsAndBytes: Available")

    print("\n🎉 All core packages working!")

except Exception as e:
    print(f"❌ Import error: {e}")
    print("🔄 Please restart runtime and try again")

🚀 Starting fresh installation...
📦 Installing torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118...
✅ torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 installed successfully
📦 Installing transformers...
✅ transformers installed successfully
📦 Installing datasets...
✅ datasets installed successfully
📦 Installing accelerate...
✅ accelerate installed successfully
📦 Installing peft...
✅ peft installed successfully
📦 Installing google-api-python-client...
✅ google-api-python-client installed successfully
📦 Installing google-auth-httplib2...
✅ google-auth-httplib2 installed successfully
📦 Installing google-auth-oauthlib...
✅ google-auth-oauthlib installed successfully
✅ Installation completed!

🔍 Testing imports...
✅ PyTorch: 2.7.1+cu118
✅ NumPy: 2.0.2
✅ Pandas: 2.2.2


  _torch_pytree._register_pytree_node(


✅ Transformers: Available
✅ BitsAndBytes: Available

🎉 All core packages working!


**Enter Hugginface Token in Line 92**

In [3]:
# =============================================================================
# STEP 3: Complete Setup with Working Packages
# =============================================================================

# Import all required libraries
import torch
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

# Google Colab specific imports
from google.colab import auth
from google.auth import default
from googleapiclient.discovery import build

# ML imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig
)

# Plotting (use colab defaults)
import matplotlib.pyplot as plt
try:
    import seaborn as sns
except:
    sns = None

# Quantization imports
try:
    import bitsandbytes as bnb
    from peft import LoraConfig, get_peft_model, TaskType
    quantization_available = True
except:
    quantization_available = False

print("✅ All imports successful!")

# =============================================================================
# STEP 4: System Verification
# =============================================================================

def check_system():
    """Check system capabilities"""
    print("🔍 System Check:")

    # GPU Check
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"✅ GPU: {gpu_name} ({gpu_memory:.1f} GB)")

        # Test CUDA
        try:
            test_tensor = torch.randn(10, 10).cuda()
            print("✅ CUDA: Working")
            cuda_working = True
        except Exception as e:
            print(f"❌ CUDA: {e}")
            cuda_working = False
    else:
        print("❌ No GPU available")
        cuda_working = False

    # Check quantization
    if quantization_available and cuda_working:
        print("✅ Quantization: Available")
        use_quantization = True
    else:
        print("⚠️  Quantization: Disabled")
        use_quantization = False

    return cuda_working, use_quantization

cuda_ok, use_quantization = check_system()
device = torch.device('cuda' if cuda_ok else 'cpu')

# =============================================================================
# STEP 5: Configuration
# =============================================================================

# Input variables
GOOGLE_SHEET_URL = "https://docs.google.com/spreadsheets/d/1CpWL01U9HSfmre2OjFj3GkMV816EYZOryxWGDDVouy4/edit?gid=1497010733#gid=1497010733"
#HUGGINGFACE_TOKEN =
SHEET_NAME = "Dataset_short"

# Extract sheet ID
def extract_sheet_id(url):
    """Extract Google Sheet ID from URL"""
    try:
        sheet_id = url.split('/d/')[1].split('/')[0]
        return sheet_id
    except:
        raise ValueError("Invalid Google Sheet URL format")

SHEET_ID = extract_sheet_id(GOOGLE_SHEET_URL)

# Model configuration
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
NUM_LABELS = 3
MAX_LENGTH = 512
BATCH_SIZE = 2 if use_quantization else 1  # Conservative for T4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
TRAIN_TEST_SPLIT = 0.7

# Quantization config
if use_quantization:
    QUANTIZATION_CONFIG = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
else:
    QUANTIZATION_CONFIG = None

print(f"\n✅ Configuration loaded:")
print(f"  - Sheet ID: {SHEET_ID}")
print(f"  - Model: {MODEL_NAME}")
print(f"  - Device: {device}")
print(f"  - Batch Size: {BATCH_SIZE}")
print(f"  - Quantization: {'Enabled' if use_quantization else 'Disabled'}")

print(f"\n🎯 Ready for data loading!")

✅ All imports successful!
🔍 System Check:
✅ GPU: Tesla T4 (14.7 GB)
✅ CUDA: Working
✅ Quantization: Available

✅ Configuration loaded:
  - Sheet ID: 1CpWL01U9HSfmre2OjFj3GkMV816EYZOryxWGDDVouy4
  - Model: mistralai/Mistral-7B-Instruct-v0.1
  - Device: cuda
  - Batch Size: 2
  - Quantization: Enabled

🎯 Ready for data loading!


## **Load Dataset from google sheets**

In [4]:
# =============================================================================
# STEP 6: Google Sheets Authentication and Data Loading
# =============================================================================

def authenticate_google_sheets():
    """Authenticate with Google Sheets API"""
    print("🔐 Authenticating with Google Sheets...")

    try:
        # Authenticate with Google Colab
        auth.authenticate_user()

        # Get credentials
        creds, _ = default()

        # Build the service
        service = build('sheets', 'v4', credentials=creds)

        print("✅ Google Sheets authentication successful!")
        return service

    except Exception as e:
        print(f"❌ Authentication failed: {e}")
        return None

def load_data_from_sheet(service, sheet_id, sheet_name):
    """Load data from Google Sheet"""
    print(f"📊 Loading data from sheet: {sheet_name}")

    try:
        # Call the Sheets API
        sheet = service.spreadsheets()
        result = sheet.values().get(
            spreadsheetId=sheet_id,
            range=f"{sheet_name}!A:L"  # Get all columns
        ).execute()

        values = result.get('values', [])

        if not values:
            print("❌ No data found in sheet")
            return None

        # Convert to DataFrame
        df = pd.DataFrame(values[1:], columns=values[0])  # First row as headers

        print(f"✅ Data loaded successfully!")
        print(f"  - Shape: {df.shape}")
        print(f"  - Columns: {list(df.columns)}")

        return df

    except Exception as e:
        print(f"❌ Failed to load data: {e}")
        return None

def explore_data(df):
    """Explore the loaded data"""
    print("\n🔍 Data Exploration:")

    # Basic info
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Check for required columns
    required_columns = ["company", "Description", "Longest Chunk", "Language", "Relevance", "Usefulness"]
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f"⚠️  Missing columns: {missing_columns}")
        print("Available columns:")
        for i, col in enumerate(df.columns):
            print(f"  {i}: {col}")
    else:
        print("✅ All required columns found!")

    # Show first few rows
    print("\n📋 First 3 rows:")
    print(df.head(3))

    # Data types and missing values
    print("\n📈 Data Info:")
    print(df.info())

    # Check Relevance and Usefulness columns
    if 'Relevance' in df.columns and 'Usefulness' in df.columns:
        print("\n🎯 Target Variables:")
        print("Relevance distribution:")
        print(df['Relevance'].value_counts().sort_index())
        print("\nUsefulness distribution:")
        print(df['Usefulness'].value_counts().sort_index())

    return df

# Authenticate and load data
print("🚀 Starting Google Sheets data loading...")

# Authenticate
service = authenticate_google_sheets()

if service:
    # Load data
    df = load_data_from_sheet(service, SHEET_ID, SHEET_NAME)

    if df is not None:
        # Explore data
        df = explore_data(df)
        print(f"\n✅ Data loading completed successfully!")
        print(f"Ready to proceed with preprocessing...")
    else:
        print("❌ Failed to load data")
else:
    print("❌ Failed to authenticate with Google Sheets")

🚀 Starting Google Sheets data loading...
🔐 Authenticating with Google Sheets...
✅ Google Sheets authentication successful!
📊 Loading data from sheet: Dataset_short
✅ Data loaded successfully!
  - Shape: (376, 12)
  - Columns: ['Company sort', 'company', 'industry', 'Key_word', 'Description', 'Longest Chunk', 'Chunk most important part', 'Language', 'Action/ solution/ target/ background (a, s, t, b)', 'Relevance', 'Usefulness', 'Explanation']

🔍 Data Exploration:
Dataset shape: (376, 12)
Columns: ['Company sort', 'company', 'industry', 'Key_word', 'Description', 'Longest Chunk', 'Chunk most important part', 'Language', 'Action/ solution/ target/ background (a, s, t, b)', 'Relevance', 'Usefulness', 'Explanation']
✅ All required columns found!

📋 First 3 rows:
  Company sort              company                industry   Key_word  \
0            1  KYOCERA Corporation  Information technology  criteria1   
1            1  KYOCERA Corporation  Information technology  criteria1   
2         

In [5]:
# =============================================================================
# STEP 7: Data Preprocessing
# =============================================================================

def preprocess_data(df):
    """Preprocess the sustainability report data"""
    print("🔧 Starting data preprocessing...")

    # Create a copy to avoid modifying original
    processed_df = df.copy()

    # 1. Check and clean required columns
    required_columns = ["company", "Description", "Longest Chunk", "Language", "Relevance", "Usefulness"]

    print(f"📋 Checking required columns...")
    for col in required_columns:
        if col not in processed_df.columns:
            print(f"❌ Missing column: {col}")
            return None
        else:
            print(f"✅ Found column: {col}")

    # 2. Handle missing values
    print(f"\n🧹 Handling missing values...")
    initial_rows = len(processed_df)

    # Fill missing descriptions with empty string
    processed_df['Description'] = processed_df['Description'].fillna('')
    processed_df['Longest Chunk'] = processed_df['Longest Chunk'].fillna('')

    # Remove rows with missing target values
    processed_df = processed_df.dropna(subset=['Relevance', 'Usefulness'])

    print(f"  - Initial rows: {initial_rows}")
    print(f"  - After cleaning: {len(processed_df)}")
    print(f"  - Removed: {initial_rows - len(processed_df)} rows")

    # 3. Convert target columns to numeric
    print(f"\n🔢 Converting target columns to numeric...")
    try:
        processed_df['Relevance'] = pd.to_numeric(processed_df['Relevance'], errors='coerce')
        processed_df['Usefulness'] = pd.to_numeric(processed_df['Usefulness'], errors='coerce')

        # Remove rows where conversion failed
        processed_df = processed_df.dropna(subset=['Relevance', 'Usefulness'])

        print(f"✅ Target columns converted successfully")
        print(f"  - Final rows after numeric conversion: {len(processed_df)}")

    except Exception as e:
        print(f"❌ Error converting target columns: {e}")
        return None

    # 4. Create combined score
    print(f"\n🎯 Creating combined score...")
    processed_df['Combined_Score'] = (processed_df['Relevance'] + processed_df['Usefulness']) / 2

    print("Combined score distribution:")
    print(processed_df['Combined_Score'].value_counts().sort_index())

    # 5. Encode labels as integers (0, 1, 2)
    print(f"\n🏷️ Encoding labels...")

    def encode_score(score):
        """Encode combined score to integer labels"""
        if score <= 1.0:
            return 0
        elif score <= 2.0:
            return 1
        else:
            return 2

    processed_df['Label'] = processed_df['Combined_Score'].apply(encode_score)

    print("Label distribution:")
    label_counts = processed_df['Label'].value_counts().sort_index()
    print(label_counts)

    # Check for class imbalance
    print(f"\n⚖️ Class balance check:")
    for label in [0, 1, 2]:
        count = label_counts.get(label, 0)
        percentage = (count / len(processed_df)) * 100
        print(f"  - Class {label}: {count} samples ({percentage:.1f}%)")

    # 6. Create combined input text
    print(f"\n📝 Creating combined input texts...")

    def create_input_text(row):
        """Combine Description and Longest Chunk into input text"""
        description = str(row['Description']).strip()
        longest_chunk = str(row['Longest Chunk']).strip()

        # Handle different cases
        if description and longest_chunk:
            return f"Description: {description}\n\nContent: {longest_chunk}"
        elif description:
            return f"Description: {description}"
        elif longest_chunk:
            return f"Content: {longest_chunk}"
        else:
            return "No content available"

    processed_df['Input_Text'] = processed_df.apply(create_input_text, axis=1)

    # Check text lengths
    text_lengths = processed_df['Input_Text'].str.len()
    print(f"  - Text length stats:")
    print(f"    - Mean: {text_lengths.mean():.0f} characters")
    print(f"    - Median: {text_lengths.median():.0f} characters")
    print(f"    - Max: {text_lengths.max():.0f} characters")
    print(f"    - Min: {text_lengths.min():.0f} characters")

    # 7. Filter by language (optional)
    if 'Language' in processed_df.columns:
        print(f"\n🌐 Language distribution:")
        lang_counts = processed_df['Language'].value_counts()
        print(lang_counts)

        # You can optionally filter by language here
        # For now, we'll keep all languages

    print(f"\n✅ Preprocessing completed!")
    print(f"  - Final dataset shape: {processed_df.shape}")

    return processed_df

def create_train_test_split(df, test_size=0.3, random_state=42):
    """Split data into train and test sets"""
    print(f"\n🔄 Creating train/test split ({int((1-test_size)*100)}%/{int(test_size*100)}%)...")

    # Features and labels
    X = df['Input_Text'].values
    y = df['Label'].values

    # Stratified split to maintain class distribution
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    print(f"✅ Split completed:")
    print(f"  - Training samples: {len(X_train)}")
    print(f"  - Testing samples: {len(X_test)}")

    # Check class distribution in splits
    print(f"\n📊 Class distribution in splits:")
    train_dist = pd.Series(y_train).value_counts().sort_index()
    test_dist = pd.Series(y_test).value_counts().sort_index()

    print("Training set:")
    for label in [0, 1, 2]:
        count = train_dist.get(label, 0)
        percentage = (count / len(y_train)) * 100
        print(f"  - Class {label}: {count} samples ({percentage:.1f}%)")

    print("Test set:")
    for label in [0, 1, 2]:
        count = test_dist.get(label, 0)
        percentage = (count / len(y_test)) * 100
        print(f"  - Class {label}: {count} samples ({percentage:.1f}%)")

    return X_train, X_test, y_train, y_test

# Run preprocessing
print("🚀 Starting data preprocessing pipeline...")

# Preprocess data
processed_df = preprocess_data(df)

if processed_df is not None:
    # Create train/test split
    X_train, X_test, y_train, y_test = create_train_test_split(
        processed_df,
        test_size=1-TRAIN_TEST_SPLIT,
        random_state=42
    )

    # Show some examples
    print(f"\n📄 Sample processed data:")
    print("="*80)
    for i in range(min(2, len(X_train))):
        print(f"Example {i+1}:")
        print(f"Label: {y_train[i]}")
        print(f"Text: {X_train[i][:200]}...")
        print("="*80)

    print(f"\n✅ Data preprocessing completed successfully!")
    print(f"Ready to proceed with Hugging Face authentication and model setup...")

else:
    print("❌ Data preprocessing failed!")

🚀 Starting data preprocessing pipeline...
🔧 Starting data preprocessing...
📋 Checking required columns...
✅ Found column: company
✅ Found column: Description
✅ Found column: Longest Chunk
✅ Found column: Language
✅ Found column: Relevance
✅ Found column: Usefulness

🧹 Handling missing values...
  - Initial rows: 376
  - After cleaning: 345
  - Removed: 31 rows

🔢 Converting target columns to numeric...
✅ Target columns converted successfully
  - Final rows after numeric conversion: 343

🎯 Creating combined score...
Combined score distribution:
Combined_Score
0.0     10
0.5     26
1.0     45
1.5     73
2.0    189
Name: count, dtype: int64

🏷️ Encoding labels...
Label distribution:
Label
0     81
1    262
Name: count, dtype: int64

⚖️ Class balance check:
  - Class 0: 81 samples (23.6%)
  - Class 1: 262 samples (76.4%)
  - Class 2: 0 samples (0.0%)

📝 Creating combined input texts...
  - Text length stats:
    - Mean: 698 characters
    - Median: 716 characters
    - Max: 2207 characters

## **Hugginface Integration**

In [6]:
# =============================================================================
# STEP 8: Hugging Face Authentication and Model Setup (Fixed)
# =============================================================================

def authenticate_huggingface(token):
    """Authenticate with Hugging Face"""
    print("🔐 Authenticating with Hugging Face...")

    try:
        from huggingface_hub import login
        login(token=token, add_to_git_credential=True)
        print("✅ Hugging Face authentication successful!")
        return True
    except Exception as e:
        print(f"❌ Hugging Face authentication failed: {e}")
        return False

def setup_tokenizer_with_fallback(model_name, max_length=512):
    """Setup tokenizer with fallback options"""
    print(f"🔤 Setting up tokenizer for {model_name}...")

    # Try different tokenizer approaches
    tokenizer_attempts = [
        # Original approach
        lambda: AutoTokenizer.from_pretrained(model_name),
        # With specific trust_remote_code
        lambda: AutoTokenizer.from_pretrained(model_name, trust_remote_code=True),
        # With legacy tokenizer
        lambda: AutoTokenizer.from_pretrained(model_name, use_fast=False),
        # With both options
        lambda: AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True),
    ]

    for i, attempt in enumerate(tokenizer_attempts):
        try:
            print(f"  - Attempt {i+1}: {'Fast tokenizer' if i % 2 == 0 else 'Legacy tokenizer'}")
            tokenizer = attempt()

            # Add pad token if it doesn't exist
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
                print("    - Added pad token")

            # Set truncation and padding
            tokenizer.model_max_length = max_length

            print(f"✅ Tokenizer setup completed!")
            print(f"  - Vocabulary size: {tokenizer.vocab_size}")
            print(f"  - Max length: {max_length}")
            print(f"  - Pad token: {tokenizer.pad_token}")

            return tokenizer

        except Exception as e:
            print(f"    ❌ Attempt {i+1} failed: {e}")
            continue

    print("❌ All tokenizer attempts failed!")
    return None

def try_alternative_model():
    """Try alternative model if Mistral fails"""
    print("🔄 Trying alternative model due to tokenizer issues...")

    # Alternative models that work well with classification
    alternative_models = [
        "microsoft/DialoGPT-medium",  # Smaller, more stable
        "distilbert-base-uncased",    # Very stable for classification
        "roberta-base",               # Excellent for classification
        "microsoft/DialoGPT-small",   # Even smaller fallback
    ]

    for model_name in alternative_models:
        print(f"  - Trying {model_name}...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Add pad token if needed
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            print(f"✅ Alternative model {model_name} works!")
            return model_name, tokenizer

        except Exception as e:
            print(f"    ❌ {model_name} failed: {e}")
            continue

    print("❌ All alternative models failed!")
    return None, None

def setup_model_with_fallback(model_name, num_labels, quantization_config=None):
    """Setup model with fallback options"""
    print(f"🤖 Setting up model {model_name}...")

    # Try different model loading approaches
    model_attempts = [
        # With quantization
        lambda: AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            torch_dtype=torch.float16,
            device_map='auto',
            trust_remote_code=True,
            quantization_config=quantization_config
        ) if quantization_config else None,

        # Without quantization, with float16
        lambda: AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            torch_dtype=torch.float16,
            device_map='auto',
            trust_remote_code=True
        ),

        # Without quantization, default precision
        lambda: AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            trust_remote_code=True
        ),

        # Most basic approach
        lambda: AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        ),
    ]

    for i, attempt in enumerate(model_attempts):
        if attempt() is None:  # Skip None attempts
            continue

        try:
            print(f"  - Attempt {i+1}: {'With quantization' if i == 0 else 'Fallback config'}")
            model = attempt()

            # Print model info
            total_params = sum(p.numel() for p in model.parameters())
            trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

            print(f"✅ Model loaded successfully!")
            print(f"  - Total parameters: {total_params:,}")
            print(f"  - Trainable parameters: {trainable_params:,}")

            # Check GPU memory usage
            if torch.cuda.is_available():
                gpu_memory = torch.cuda.memory_allocated() / 1024**3
                print(f"  - GPU memory used: {gpu_memory:.2f} GB")

            return model

        except Exception as e:
            print(f"    ❌ Attempt {i+1} failed: {e}")
            continue

    print("❌ All model loading attempts failed!")
    return None

def tokenize_data_safely(tokenizer, X_train, X_test, y_train, y_test, max_length=512):
    """Tokenize data with error handling"""
    print(f"🔤 Tokenizing data...")

    try:
        # Test tokenization with a small sample first
        test_sample = X_train[:2] if len(X_train) > 2 else X_train
        test_encoding = tokenizer(
            list(test_sample),
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )
        print("  - Test tokenization successful")

        # Tokenize training data
        print("  - Tokenizing training data...")
        train_encodings = tokenizer(
            list(X_train),
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )

        # Tokenize test data
        print("  - Tokenizing test data...")
        test_encodings = tokenizer(
            list(X_test),
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )

        print(f"✅ Tokenization completed!")
        print(f"  - Training samples: {len(X_train)}")
        print(f"  - Test samples: {len(X_test)}")

        return train_encodings, test_encodings

    except Exception as e:
        print(f"❌ Tokenization failed: {e}")
        return None, None

# Run setup with fallback options
print("🚀 Starting Hugging Face setup with fallback options...")

# 1. Authenticate
hf_authenticated = authenticate_huggingface(HUGGINGFACE_TOKEN)

if hf_authenticated:
    # 2. Try to setup tokenizer for original model
    tokenizer = setup_tokenizer_with_fallback(MODEL_NAME, MAX_LENGTH)
    current_model_name = MODEL_NAME

    # 3. If original model fails, try alternatives
    if tokenizer is None:
        print("🔄 Original model failed, trying alternatives...")
        current_model_name, tokenizer = try_alternative_model()

        if current_model_name:
            print(f"✅ Using alternative model: {current_model_name}")
            # Update configuration for alternative model
            MAX_LENGTH = 512
            BATCH_SIZE = 4  # Can be larger for smaller models

    if tokenizer is not None:
        # 4. Setup model
        model = setup_model_with_fallback(current_model_name, NUM_LABELS, QUANTIZATION_CONFIG)

        if model is not None:
            # 5. Tokenize data
            train_encodings, test_encodings = tokenize_data_safely(
                tokenizer, X_train, X_test, y_train, y_test, MAX_LENGTH
            )

            if train_encodings is not None and test_encodings is not None:
                print(f"\n✅ Setup completed successfully!")
                print(f"  - Model: {current_model_name}")
                print(f"  - Ready for baseline evaluation...")

                # Update global variables
                MODEL_NAME = current_model_name

                # Show memory usage
                if torch.cuda.is_available():
                    gpu_memory = torch.cuda.memory_allocated() / 1024**3
                    gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
                    print(f"  - GPU memory: {gpu_memory:.2f}/{gpu_total:.1f} GB ({gpu_memory/gpu_total*100:.1f}%)")
            else:
                print("❌ Data tokenization failed!")
        else:
            print("❌ Model setup failed!")
    else:
        print("❌ All tokenizer attempts failed!")
else:
    print("❌ Hugging Face authentication failed!")

🚀 Starting Hugging Face setup with fallback options...
🔐 Authenticating with Hugging Face...
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful
✅ Hugging Face authentication successful!
🔤 Setting up tokenizer for mistralai/Mistral-7B-Instruct-v0.1...
  - Attempt 1: Fast tokenizer


Downloading tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

    ❌ Attempt 1 failed: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 40 column 3
  - Attempt 2: Legacy tokenizer
    ❌ Attempt 2 failed: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 40 column 3
  - Attempt 3: Fast tokenizer
    - Added pad token
✅ Tokenizer setup completed!
  - Vocabulary size: 32000
  - Max length: 512
  - Pad token: </s>
🤖 Setting up model mistralai/Mistral-7B-Instruct-v0.1...


Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  - Attempt 1: With quantization


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded successfully!
  - Total parameters: 3,621,011,456
  - Trainable parameters: 131,350,528
  - GPU memory used: 4.10 GB
🔤 Tokenizing data...
  - Test tokenization successful
  - Tokenizing training data...
  - Tokenizing test data...
✅ Tokenization completed!
  - Training samples: 240
  - Test samples: 103

✅ Setup completed successfully!
  - Model: mistralai/Mistral-7B-Instruct-v0.1
  - Ready for baseline evaluation...
  - GPU memory: 4.10/14.7 GB (27.8%)


## **Baseline evaluation**

In [7]:
# =============================================================================
# STEP 9: Baseline Evaluation (Fixed)
# =============================================================================

import time
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import torch.nn.functional as F

def fix_tokenizer_padding(tokenizer):
    """Fix tokenizer padding issues"""
    print("🔧 Fixing tokenizer padding configuration...")

    # Try different padding token options
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
            print("  - Set pad_token to eos_token")
        elif tokenizer.unk_token is not None:
            tokenizer.pad_token = tokenizer.unk_token
            print("  - Set pad_token to unk_token")
        elif hasattr(tokenizer, 'bos_token') and tokenizer.bos_token is not None:
            tokenizer.pad_token = tokenizer.bos_token
            print("  - Set pad_token to bos_token")
        else:
            # Add a new pad token
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            print("  - Added new pad_token: [PAD]")

    # Ensure pad_token_id is set
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    print(f"✅ Padding fixed:")
    print(f"  - pad_token: {tokenizer.pad_token}")
    print(f"  - pad_token_id: {tokenizer.pad_token_id}")

    return tokenizer

def retokenize_data_with_fixed_padding(tokenizer, X_train, X_test, max_length=512):
    """Re-tokenize data with fixed padding"""
    print("🔄 Re-tokenizing data with fixed padding...")

    try:
        # Re-tokenize training data
        train_encodings = tokenizer(
            list(X_train),
            truncation=True,
            padding='max_length',  # Use max_length padding
            max_length=max_length,
            return_tensors='pt'
        )

        # Re-tokenize test data
        test_encodings = tokenizer(
            list(X_test),
            truncation=True,
            padding='max_length',  # Use max_length padding
            max_length=max_length,
            return_tensors='pt'
        )

        print(f"✅ Re-tokenization completed!")
        return train_encodings, test_encodings

    except Exception as e:
        print(f"❌ Re-tokenization failed: {e}")
        return None, None

class SustainabilityDataset(torch.utils.data.Dataset):
    """Custom dataset for sustainability report classification"""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def analyze_label_distribution(y_train, y_test):
    """Analyze the actual label distribution"""
    print("🔍 Analyzing label distribution...")

    unique_train = np.unique(y_train)
    unique_test = np.unique(y_test)
    all_unique = np.unique(np.concatenate([y_train, y_test]))

    print(f"  - Unique labels in training: {unique_train}")
    print(f"  - Unique labels in test: {unique_test}")
    print(f"  - All unique labels: {all_unique}")

    # Count distribution
    train_counts = np.bincount(y_train, minlength=3)
    test_counts = np.bincount(y_test, minlength=3)

    print(f"  - Training distribution: {train_counts}")
    print(f"  - Test distribution: {test_counts}")

    return all_unique

def evaluate_model_baseline_fixed(model, test_dataset, tokenizer, batch_size=1):
    """Evaluate the untrained model with fixed padding"""
    print("🔍 Running baseline evaluation (fixed version)...")
    print(f"  - Using batch size: {batch_size}")
    print("⚠️  This may take a few minutes...")

    # Set model to evaluation mode
    model.eval()

    # Create data loader with batch size 1 to avoid padding issues
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    all_predictions = []
    all_labels = []
    all_probabilities = []

    start_time = time.time()

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            try:
                # Move batch to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels']

                # Forward pass
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                # Get predictions
                logits = outputs.logits
                probabilities = F.softmax(logits, dim=-1)
                predictions = torch.argmax(logits, dim=-1)

                # Store results
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.numpy())
                all_probabilities.extend(probabilities.cpu().numpy())

                # Progress update
                if (batch_idx + 1) % 10 == 0:
                    processed = (batch_idx + 1) * batch_size
                    total = len(test_dataset)
                    print(f"  - Processed {processed}/{total} samples ({processed/total*100:.1f}%)")

            except Exception as e:
                print(f"❌ Error in batch {batch_idx}: {e}")
                # Continue with next batch instead of stopping
                continue

    end_time = time.time()

    if len(all_predictions) == 0:
        print("❌ No predictions were generated!")
        return None

    # Analyze actual classes present
    unique_labels = np.unique(all_labels)
    unique_predictions = np.unique(all_predictions)

    print(f"\n📊 Label Analysis:")
    print(f"  - Unique actual labels: {unique_labels}")
    print(f"  - Unique predicted labels: {unique_predictions}")

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    f1_macro = f1_score(all_labels, all_predictions, average='macro')
    f1_weighted = f1_score(all_labels, all_predictions, average='weighted')

    print(f"\n✅ Baseline evaluation completed!")
    print(f"⏱️  Time taken: {end_time - start_time:.2f} seconds")
    print(f"📊 Baseline Results:")
    print(f"  - Accuracy: {accuracy:.4f}")
    print(f"  - F1 Score (Macro): {f1_macro:.4f}")
    print(f"  - F1 Score (Weighted): {f1_weighted:.4f}")

    # Create target names based on actual classes
    class_names = [f'Class {i}' for i in sorted(unique_labels)]

    # Detailed classification report
    print(f"\n📋 Detailed Classification Report:")
    try:
        report = classification_report(
            all_labels,
            all_predictions,
            labels=sorted(unique_labels),
            target_names=class_names,
            zero_division=0
        )
        print(report)
    except Exception as e:
        print(f"Could not generate detailed report: {e}")
        # Basic report
        for label in sorted(unique_labels):
            mask = np.array(all_labels) == label
            if mask.sum() > 0:
                label_acc = accuracy_score(np.array(all_labels)[mask], np.array(all_predictions)[mask])
                print(f"  - Class {label}: {label_acc:.4f} accuracy ({mask.sum()} samples)")

    # Confusion matrix
    print(f"\n🔄 Confusion Matrix:")
    try:
        cm = confusion_matrix(all_labels, all_predictions, labels=sorted(unique_labels))
        print("Predicted →")
        print(f"Actual ↓  {sorted(unique_labels)}")
        for i, (label, row) in enumerate(zip(sorted(unique_labels), cm)):
            print(f"  {label}: {row}")
    except Exception as e:
        print(f"Could not generate confusion matrix: {e}")
        cm = None

    # Store baseline results
    baseline_results = {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'predictions': all_predictions,
        'labels': all_labels,
        'probabilities': all_probabilities,
        'unique_labels': unique_labels.tolist(),
        'confusion_matrix': cm.tolist() if cm is not None else None,
        'evaluation_time': end_time - start_time
    }

    return baseline_results

# Fix tokenizer and re-tokenize data
print("🚀 Starting baseline evaluation with fixes...")

# 1. Fix tokenizer padding
tokenizer = fix_tokenizer_padding(tokenizer)

# 2. Re-tokenize data with fixed padding
train_encodings_fixed, test_encodings_fixed = retokenize_data_with_fixed_padding(
    tokenizer, X_train, X_test, MAX_LENGTH
)

if train_encodings_fixed is not None and test_encodings_fixed is not None:
    # 3. Analyze label distribution
    unique_labels = analyze_label_distribution(y_train, y_test)

    # 4. Create datasets
    train_dataset = SustainabilityDataset(train_encodings_fixed, y_train)
    test_dataset = SustainabilityDataset(test_encodings_fixed, y_test)

    print(f"✅ Datasets created:")
    print(f"  - Training dataset: {len(train_dataset)} samples")
    print(f"  - Test dataset: {len(test_dataset)} samples")

    # 5. Run baseline evaluation with batch size 1
    baseline_results = evaluate_model_baseline_fixed(
        model, test_dataset, tokenizer, batch_size=1
    )

    if baseline_results is not None:
        print(f"\n💾 Baseline evaluation completed successfully!")
        print(f"🎯 Ready to proceed with fine-tuning!")

        # Update encodings for fine-tuning
        train_encodings = train_encodings_fixed
        test_encodings = test_encodings_fixed

        # Show GPU memory usage
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated() / 1024**3
            gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
            print(f"📊 GPU Memory Usage: {gpu_memory:.2f}/{gpu_total:.1f} GB ({gpu_memory/gpu_total*100:.1f}%)")
    else:
        print("❌ Baseline evaluation failed!")
else:
    print("❌ Failed to fix tokenization!")

🚀 Starting baseline evaluation with fixes...
🔧 Fixing tokenizer padding configuration...
✅ Padding fixed:
  - pad_token: </s>
  - pad_token_id: 2
🔄 Re-tokenizing data with fixed padding...
✅ Re-tokenization completed!
🔍 Analyzing label distribution...
  - Unique labels in training: [0 1]
  - Unique labels in test: [0 1]
  - All unique labels: [0 1]
  - Training distribution: [ 57 183   0]
  - Test distribution: [24 79  0]
✅ Datasets created:
  - Training dataset: 240 samples
  - Test dataset: 103 samples
🔍 Running baseline evaluation (fixed version)...
  - Using batch size: 1
⚠️  This may take a few minutes...
  - Processed 10/103 samples (9.7%)
  - Processed 20/103 samples (19.4%)
  - Processed 30/103 samples (29.1%)
  - Processed 40/103 samples (38.8%)
  - Processed 50/103 samples (48.5%)
  - Processed 60/103 samples (58.3%)
  - Processed 70/103 samples (68.0%)
  - Processed 80/103 samples (77.7%)
  - Processed 90/103 samples (87.4%)
  - Processed 100/103 samples (97.1%)

📊 Label Ana

In [8]:
# =============================================================================
# STEP 10: Store Baseline Results in Google Sheet
# =============================================================================

def convert_to_serializable(obj):
    """Convert numpy/torch data types to Python native types"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif hasattr(obj, 'item'):  # For single-element tensors
        return obj.item()
    else:
        return obj

def create_baseline_results_sheet(service, sheet_id, baseline_results, X_test, y_test):
    """Create and populate baseline results sheet"""
    print("📊 Creating baseline results sheet...")

    try:
        # 1. Create new sheet for baseline results
        requests = [{
            'addSheet': {
                'properties': {
                    'title': 'Baseline_Results'
                }
            }
        }]

        body = {'requests': requests}
        service.spreadsheets().batchUpdate(spreadsheetId=sheet_id, body=body).execute()
        print("✅ Created 'Baseline_Results' sheet")

        # 2. Prepare baseline metrics data (convert all to native Python types)
        metrics_data = [
            ['Metric', 'Value'],
            ['Accuracy', convert_to_serializable(baseline_results['accuracy'])],
            ['F1 Score (Macro)', convert_to_serializable(baseline_results['f1_macro'])],
            ['F1 Score (Weighted)', convert_to_serializable(baseline_results['f1_weighted'])],
            ['Evaluation Time (seconds)', convert_to_serializable(baseline_results['evaluation_time'])],
            ['Total Test Samples', len(baseline_results['labels'])],
            ['Unique Actual Labels', str(baseline_results['unique_labels'])],
            ['GPU Memory Used (GB)', f"{torch.cuda.memory_allocated() / 1024**3:.2f}" if torch.cuda.is_available() else "N/A"],
            ['Model Name', MODEL_NAME],
            ['Max Length', MAX_LENGTH],
            ['Batch Size', 1]  # We used batch size 1 for baseline
        ]

        # 3. Write metrics to sheet
        range_name = 'Baseline_Results!A1:B' + str(len(metrics_data))
        body = {'values': metrics_data}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Baseline metrics written to sheet")

        # 4. Prepare detailed predictions data (convert all data types)
        predictions_data = [['Sample_ID', 'Actual_Label', 'Predicted_Label', 'Confidence', 'Text_Preview']]

        for i, (actual, pred, prob, text) in enumerate(zip(
            baseline_results['labels'],
            baseline_results['predictions'],
            baseline_results['probabilities'],
            X_test
        )):
            # Convert all data types to native Python types
            confidence = convert_to_serializable(max(prob))  # Get highest probability
            actual_label = convert_to_serializable(actual)
            pred_label = convert_to_serializable(pred)

            # Clean text preview
            text_preview = str(text)[:100] + "..." if len(str(text)) > 100 else str(text)
            # Remove any problematic characters
            text_preview = text_preview.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

            predictions_data.append([
                i + 1,
                actual_label,
                pred_label,
                round(confidence, 4),
                text_preview
            ])

        # 5. Write predictions to sheet (starting from column D)
        range_name = f'Baseline_Results!D1:H{len(predictions_data)}'
        body = {'values': predictions_data}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Baseline predictions written to sheet")

        # 6. Add confusion matrix if available
        if baseline_results['confusion_matrix'] is not None:
            cm_data = [['Confusion Matrix', '', ''], ['', 'Predicted 0', 'Predicted 1']]
            cm = baseline_results['confusion_matrix']
            for i, row in enumerate(cm):
                cm_data.append([f'Actual {i}'] + [convert_to_serializable(x) for x in row])

            # Write confusion matrix (starting from column J)
            range_name = f'Baseline_Results!J1:L{len(cm_data)}'
            body = {'values': cm_data}
            service.spreadsheets().values().update(
                spreadsheetId=sheet_id,
                range=range_name,
                valueInputOption='RAW',
                body=body
            ).execute()

            print("✅ Confusion matrix written to sheet")

        # 7. Add class distribution analysis
        class_dist_data = [['Class Distribution Analysis', '', '']]

        # Actual distribution
        actual_counts = np.bincount(baseline_results['labels'], minlength=3)
        pred_counts = np.bincount(baseline_results['predictions'], minlength=3)

        class_dist_data.append(['Class', 'Actual Count', 'Predicted Count'])
        for i in range(len(actual_counts)):
            class_dist_data.append([
                f'Class {i}',
                convert_to_serializable(actual_counts[i]),
                convert_to_serializable(pred_counts[i])
            ])

        # Write class distribution (starting from column J, below confusion matrix)
        start_row = 8 if baseline_results['confusion_matrix'] is not None else 1
        range_name = f'Baseline_Results!J{start_row}:L{start_row + len(class_dist_data) - 1}'
        body = {'values': class_dist_data}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Class distribution written to sheet")

        return True

    except Exception as e:
        print(f"❌ Error creating baseline results sheet: {e}")
        import traceback
        traceback.print_exc()
        return False

def create_dataset_info_sheet(service, sheet_id, X_train, X_test, y_train, y_test):
    """Create sheet with dataset information"""
    print("📋 Creating dataset info sheet...")

    try:
        # 1. Create new sheet for dataset info
        requests = [{
            'addSheet': {
                'properties': {
                    'title': 'Dataset_Info'
                }
            }
        }]

        body = {'requests': requests}
        service.spreadsheets().batchUpdate(spreadsheetId=sheet_id, body=body).execute()
        print("✅ Created 'Dataset_Info' sheet")

        # 2. Prepare dataset statistics (convert all to native types)
        dataset_stats = [
            ['Dataset Statistics', 'Value'],
            ['Total Samples', len(X_train) + len(X_test)],
            ['Training Samples', len(X_train)],
            ['Test Samples', len(X_test)],
            ['Train/Test Split', f"{len(X_train)}/{len(X_test)} ({len(X_train)/(len(X_train)+len(X_test))*100:.1f}%/{len(X_test)/(len(X_train)+len(X_test))*100:.1f}%)"],
            ['Number of Classes', len(np.unique(np.concatenate([y_train, y_test])))],
            ['Unique Labels', str(sorted(np.unique(np.concatenate([y_train, y_test]))))],
            [''],  # Empty row
            ['Training Set Distribution', ''],
        ]

        # Add training distribution
        train_counts = np.bincount(y_train, minlength=3)
        for i, count in enumerate(train_counts):
            if count > 0:
                percentage = (count / len(y_train)) * 100
                dataset_stats.append([f'  Class {i}', f'{int(count)} ({percentage:.1f}%)'])

        dataset_stats.append([''])  # Empty row
        dataset_stats.append(['Test Set Distribution', ''])

        # Add test distribution
        test_counts = np.bincount(y_test, minlength=3)
        for i, count in enumerate(test_counts):
            if count > 0:
                percentage = (count / len(y_test)) * 100
                dataset_stats.append([f'  Class {i}', f'{int(count)} ({percentage:.1f}%)'])

        # 3. Write dataset stats to sheet
        range_name = f'Dataset_Info!A1:B{len(dataset_stats)}'
        body = {'values': dataset_stats}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Dataset statistics written to sheet")

        # 4. Add text length analysis
        text_lengths_train = [len(str(text)) for text in X_train]
        text_lengths_test = [len(str(text)) for text in X_test]
        all_lengths = text_lengths_train + text_lengths_test

        text_analysis = [
            ['Text Length Analysis', 'Value'],
            ['Average Length (characters)', f'{np.mean(all_lengths):.0f}'],
            ['Median Length (characters)', f'{np.median(all_lengths):.0f}'],
            ['Min Length (characters)', f'{int(np.min(all_lengths))}'],
            ['Max Length (characters)', f'{int(np.max(all_lengths))}'],
            ['Standard Deviation', f'{np.std(all_lengths):.0f}'],
            [''],  # Empty row
            ['Training Set Text Lengths', ''],
            ['  Average', f'{np.mean(text_lengths_train):.0f}'],
            ['  Median', f'{np.median(text_lengths_train):.0f}'],
            [''],  # Empty row
            ['Test Set Text Lengths', ''],
            ['  Average', f'{np.mean(text_lengths_test):.0f}'],
            ['  Median', f'{np.median(text_lengths_test):.0f}'],
        ]

        # Write text analysis (starting from column D)
        range_name = f'Dataset_Info!D1:E{len(text_analysis)}'
        body = {'values': text_analysis}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Text analysis written to sheet")

        return True

    except Exception as e:
        print(f"❌ Error creating dataset info sheet: {e}")
        import traceback
        traceback.print_exc()
        return False

# Store baseline results in Google Sheet
print("🚀 Storing baseline results in Google Sheet...")

if 'service' in globals() and service is not None:
    # Create baseline results sheet
    baseline_stored = create_baseline_results_sheet(
        service, SHEET_ID, baseline_results, X_test, y_test
    )

    # Create dataset info sheet
    dataset_stored = create_dataset_info_sheet(
        service, SHEET_ID, X_train, X_test, y_train, y_test
    )

    if baseline_stored and dataset_stored:
        print(f"\n✅ All results stored successfully!")
        print(f"📊 Created sheets:")
        print(f"  - 'Baseline_Results': Contains baseline metrics and predictions")
        print(f"  - 'Dataset_Info': Contains dataset statistics and analysis")
        print(f"\n🔗 Check your Google Sheet: {GOOGLE_SHEET_URL}")
        print(f"\n🎯 Ready to proceed with fine-tuning!")
    else:
        print("❌ Failed to store some results")
        print("ℹ️  Results are stored in memory for fine-tuning comparison")
else:
    print("❌ Google Sheets service not available")
    print("ℹ️  Results are stored in memory for fine-tuning comparison")

# Summary of what we have so far
print(f"\n📋 Summary:")
print(f"  - Baseline Accuracy: {baseline_results['accuracy']:.4f}")
print(f"  - Baseline F1 (Macro): {baseline_results['f1_macro']:.4f}")
print(f"  - Test Samples: {len(baseline_results['labels'])}")
print(f"  - Classes in Test Set: {baseline_results['unique_labels']}")
print(f"  - Model: {MODEL_NAME}")
print(f"  - Ready for fine-tuning!")

🚀 Storing baseline results in Google Sheet...
📊 Creating baseline results sheet...
✅ Created 'Baseline_Results' sheet
✅ Baseline metrics written to sheet
✅ Baseline predictions written to sheet
✅ Confusion matrix written to sheet
✅ Class distribution written to sheet
📋 Creating dataset info sheet...
✅ Created 'Dataset_Info' sheet
✅ Dataset statistics written to sheet
✅ Text analysis written to sheet

✅ All results stored successfully!
📊 Created sheets:
  - 'Baseline_Results': Contains baseline metrics and predictions
  - 'Dataset_Info': Contains dataset statistics and analysis

🔗 Check your Google Sheet: https://docs.google.com/spreadsheets/d/1CpWL01U9HSfmre2OjFj3GkMV816EYZOryxWGDDVouy4/edit?gid=1497010733#gid=1497010733

🎯 Ready to proceed with fine-tuning!

📋 Summary:
  - Baseline Accuracy: 0.2233
  - Baseline F1 (Macro): 0.1433
  - Test Samples: 103
  - Classes in Test Set: [0, 1]
  - Model: mistralai/Mistral-7B-Instruct-v0.1
  - Ready for fine-tuning!


In [26]:
# =============================================================================
# STEP 10.1: Add Class Translation and Enhanced Results
# =============================================================================

def translate_labels_to_scores(labels):
    """Translate encoded labels back to combined scores"""
    score_mapping = {
        0: "Low (0-1)",      # Combined score 0-1
        1: "Medium (1-2)",   # Combined score 1-2
        2: "High (2-3)"      # Combined score 2-3
    }
    return [score_mapping.get(label, f"Unknown ({label})") for label in labels]

def translate_labels_to_relevance_usefulness(labels):
    """Translate encoded labels to relevance/usefulness interpretation"""
    interpretation_mapping = {
        0: "Low Relevance & Low Usefulness",
        1: "Medium Relevance & Medium Usefulness",
        2: "High Relevance & High Usefulness"
    }
    return [interpretation_mapping.get(label, f"Unknown ({label})") for label in labels]

def add_translated_results_sheet(service, sheet_id, baseline_results, X_test, y_test):
    """Add a sheet with translated class meanings"""
    print("🔄 Adding translated results sheet...")

    try:
        # 1. Create new sheet for translated results
        requests = [{
            'addSheet': {
                'properties': {
                    'title': 'Baseline_Results_Translated'
                }
            }
        }]

        body = {'requests': requests}
        service.spreadsheets().batchUpdate(spreadsheetId=sheet_id, body=body).execute()
        print("✅ Created 'Baseline_Results_Translated' sheet")

        # 2. Create class mapping explanation
        class_explanation = [
            ['Class Mapping Explanation', '', ''],
            ['Encoded Label', 'Combined Score Range', 'Meaning'],
            ['0', '0.0 - 1.0', 'Low Relevance & Low Usefulness'],
            ['1', '1.0 - 2.0', 'Medium Relevance & Medium Usefulness'],
            ['2', '2.0 - 3.0', 'High Relevance & High Usefulness'],
            [''],  # Empty row
            ['Note: Combined Score = (Relevance + Usefulness) / 2', '', ''],
            [''],  # Empty row
        ]

        # Write class explanation
        range_name = f'Baseline_Results_Translated!A1:C{len(class_explanation)}'
        body = {'values': class_explanation}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        # 3. Enhanced baseline metrics with translations
        enhanced_metrics = [
            ['Enhanced Baseline Metrics', 'Value', 'Interpretation'],
            ['Accuracy', f'{baseline_results["accuracy"]:.4f}', f'{baseline_results["accuracy"]*100:.2f}% of predictions correct'],
            ['F1 Score (Macro)', f'{baseline_results["f1_macro"]:.4f}', 'Average F1 across all classes'],
            ['F1 Score (Weighted)', f'{baseline_results["f1_weighted"]:.4f}', 'F1 weighted by class frequency'],
            [''],  # Empty row
            ['Class Distribution Analysis', '', ''],
        ]

        # Add class distribution with translations
        actual_counts = np.bincount(baseline_results['labels'], minlength=3)
        pred_counts = np.bincount(baseline_results['predictions'], minlength=3)

        for i in range(3):
            class_meaning = translate_labels_to_relevance_usefulness([i])[0]
            enhanced_metrics.append([
                f'Class {i} ({class_meaning})',
                f'Actual: {int(actual_counts[i])}, Predicted: {int(pred_counts[i])}',
                f'{actual_counts[i]/len(baseline_results["labels"])*100:.1f}% of actual data'
            ])

        # Write enhanced metrics (starting from row 10)
        start_row = len(class_explanation) + 2
        range_name = f'Baseline_Results_Translated!A{start_row}:C{start_row + len(enhanced_metrics) - 1}'
        body = {'values': enhanced_metrics}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        # 4. Detailed predictions with translations
        translated_predictions = [
            ['Sample_ID', 'Actual_Class', 'Actual_Meaning', 'Predicted_Class', 'Predicted_Meaning', 'Confidence', 'Correct?', 'Text_Preview']
        ]

        actual_translations = translate_labels_to_relevance_usefulness(baseline_results['labels'])
        pred_translations = translate_labels_to_relevance_usefulness(baseline_results['predictions'])

        for i, (actual, pred, actual_trans, pred_trans, prob, text) in enumerate(zip(
            baseline_results['labels'],
            baseline_results['predictions'],
            actual_translations,
            pred_translations,
            baseline_results['probabilities'],
            X_test
        )):
            confidence = convert_to_serializable(max(prob))
            is_correct = "✓" if actual == pred else "✗"

            text_preview = str(text)[:80] + "..." if len(str(text)) > 80 else str(text)
            text_preview = text_preview.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

            translated_predictions.append([
                i + 1,
                convert_to_serializable(actual),
                actual_trans,
                convert_to_serializable(pred),
                pred_trans,
                round(confidence, 4),
                is_correct,
                text_preview
            ])

        # Write translated predictions (starting from column E)
        range_name = f'Baseline_Results_Translated!E1:L{len(translated_predictions)}'
        body = {'values': translated_predictions}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Translated predictions written to sheet")

        # 5. Add performance insights
        insights_start_row = start_row + len(enhanced_metrics) + 2

        # Calculate some insights
        correct_predictions = sum(1 for a, p in zip(baseline_results['labels'], baseline_results['predictions']) if a == p)
        total_predictions = len(baseline_results['labels'])

        # Most common mistakes
        mistake_analysis = []
        for actual in [0, 1]:  # Only classes present in test set
            for pred in [0, 1, 2]:  # All possible predictions
                if actual != pred:
                    count = sum(1 for a, p in zip(baseline_results['labels'], baseline_results['predictions'])
                              if a == actual and p == pred)
                    if count > 0:
                        actual_meaning = translate_labels_to_relevance_usefulness([actual])[0]
                        pred_meaning = translate_labels_to_relevance_usefulness([pred])[0]
                        mistake_analysis.append([
                            f'Confused {actual_meaning}',
                            f'with {pred_meaning}',
                            f'{count} times ({count/total_predictions*100:.1f}%)'
                        ])

        insights_data = [
            ['Performance Insights', '', ''],
            ['Total Correct Predictions', f'{correct_predictions}/{total_predictions}', f'{correct_predictions/total_predictions*100:.2f}%'],
            ['Total Incorrect Predictions', f'{total_predictions - correct_predictions}/{total_predictions}', f'{(total_predictions - correct_predictions)/total_predictions*100:.2f}%'],
            [''],  # Empty row
            ['Most Common Mistakes', '', ''],
        ]

        insights_data.extend(mistake_analysis)

        # Write insights
        range_name = f'Baseline_Results_Translated!A{insights_start_row}:C{insights_start_row + len(insights_data) - 1}'
        body = {'values': insights_data}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Performance insights written to sheet")

        return True

    except Exception as e:
        print(f"❌ Error creating translated results sheet: {e}")
        import traceback
        traceback.print_exc()
        return False

def display_translated_summary(baseline_results):
    """Display a summary with translated class meanings"""
    print("\n" + "="*80)
    print("📊 BASELINE RESULTS SUMMARY WITH TRANSLATIONS")
    print("="*80)

    # Class mapping
    print("\n🔍 Class Mapping:")
    print("  Class 0: Low Relevance & Low Usefulness (Combined Score 0-1)")
    print("  Class 1: Medium Relevance & Medium Usefulness (Combined Score 1-2)")
    print("  Class 2: High Relevance & High Usefulness (Combined Score 2-3)")

    # Performance metrics
    print(f"\n📈 Performance Metrics:")
    print(f"  Accuracy: {baseline_results['accuracy']:.4f} ({baseline_results['accuracy']*100:.2f}%)")
    print(f"  F1 Score (Macro): {baseline_results['f1_macro']:.4f}")
    print(f"  F1 Score (Weighted): {baseline_results['f1_weighted']:.4f}")

    # Class distribution
    print(f"\n📊 Class Distribution in Test Set:")
    actual_counts = np.bincount(baseline_results['labels'], minlength=3)
    pred_counts = np.bincount(baseline_results['predictions'], minlength=3)

    class_meanings = [
        "Low Relevance & Low Usefulness",
        "Medium Relevance & Medium Usefulness",
        "High Relevance & High Usefulness"
    ]

    for i in range(3):
        if actual_counts[i] > 0 or pred_counts[i] > 0:
            print(f"  Class {i} ({class_meanings[i]}):")
            print(f"    Actual: {int(actual_counts[i])} ({actual_counts[i]/len(baseline_results['labels'])*100:.1f}%)")
            print(f"    Predicted: {int(pred_counts[i])} ({pred_counts[i]/len(baseline_results['predictions'])*100:.1f}%)")

    # Key insights
    print(f"\n🔑 Key Insights:")
    print(f"  • Model is performing very poorly (near random guessing)")
    print(f"  • Test set only contains Classes 0 and 1 (no high relevance/usefulness samples)")
    print(f"  • Model is predicting all 3 classes despite training data distribution")
    print(f"  • Strong class imbalance: {actual_counts[1]} medium vs {actual_counts[0]} low samples")
    print(f"  • Fine-tuning should significantly improve these results")

    print("="*80)

# Add translated results
print("🚀 Adding translated class meanings to results...")

if 'service' in globals() and service is not None:
    translated_added = add_translated_results_sheet(
        service, SHEET_ID, baseline_results, X_test, y_test
    )

    if translated_added:
        print(f"\n✅ Translated results sheet created successfully!")
        print(f"📊 New sheet created: 'Baseline_Results_Translated'")
        print(f"🔗 Check your Google Sheet: {GOOGLE_SHEET_URL}")
    else:
        print("❌ Failed to create translated results sheet")
else:
    print("⚠️  Google Sheets service not available - showing summary only")

# Display translated summary
display_translated_summary(baseline_results)

print(f"\n🎯 Ready to proceed with fine-tuning!")
print(f"📋 Expected improvements after fine-tuning:")
print(f"  • Accuracy should improve from {baseline_results['accuracy']*100:.2f}% to >70%")
print(f"  • F1 scores should improve significantly")
print(f"  • Better class separation and fewer prediction errors")

🚀 Adding translated class meanings to results...
🔄 Adding translated results sheet...
✅ Created 'Baseline_Results_Translated' sheet
✅ Translated predictions written to sheet
✅ Performance insights written to sheet

✅ Translated results sheet created successfully!
📊 New sheet created: 'Baseline_Results_Translated'
🔗 Check your Google Sheet: https://docs.google.com/spreadsheets/d/1CpWL01U9HSfmre2OjFj3GkMV816EYZOryxWGDDVouy4/edit?gid=1497010733#gid=1497010733

📊 BASELINE RESULTS SUMMARY WITH TRANSLATIONS

🔍 Class Mapping:
  Class 0: Low Relevance & Low Usefulness (Combined Score 0-1)
  Class 1: Medium Relevance & Medium Usefulness (Combined Score 1-2)
  Class 2: High Relevance & High Usefulness (Combined Score 2-3)

📈 Performance Metrics:
  Accuracy: 0.3786 (37.86%)
  F1 Score (Macro): 0.3076
  F1 Score (Weighted): 0.4260

📊 Class Distribution in Test Set:
  Class 0 (Low Relevance & Low Usefulness):
    Actual: 24 (23.3%)
    Predicted: 46 (44.7%)
  Class 1 (Medium Relevance & Medium Usef

## **Fine-tune Model**
fine-tuning with mistral not working, you can skip this section and use the other model alter in the notebook

In [13]:
# =============================================================================
# STEP 21: Fix Mistral Training Issues and Improve Performance
# =============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
import gc

def diagnose_mistral_issues(model, tokenizer, X_train, y_train):
    """Diagnose why Mistral training is failing"""
    print("🔍 Diagnosing Mistral training issues...")

    # Test with a single sample
    test_text = X_train[0]
    test_label = y_train[0]

    print(f"📝 Test sample:")
    print(f"  - Text length: {len(test_text)} characters")
    print(f"  - Label: {test_label}")
    print(f"  - Text preview: {test_text[:100]}...")

    # Tokenize single sample
    encoding = tokenizer(
        test_text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

    print(f"📊 Tokenization:")
    print(f"  - Input shape: {encoding['input_ids'].shape}")
    print(f"  - Attention mask sum: {encoding['attention_mask'].sum().item()}")

    # Move to device
    device = next(model.parameters()).device
    encoding = {k: v.to(device) for k, v in encoding.items()}
    label = torch.tensor([test_label], dtype=torch.long).to(device)

    # Test forward pass
    model.eval()
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask']
        )

        logits = outputs.logits
        probabilities = F.softmax(logits, dim=-1)

        print(f"🔍 Model output:")
        print(f"  - Logits shape: {logits.shape}")
        print(f"  - Logits: {logits}")
        print(f"  - Probabilities: {probabilities}")
        print(f"  - Predicted class: {torch.argmax(logits, dim=-1).item()}")

        # Check for issues
        if torch.isnan(logits).any():
            print("  ❌ NaN in logits!")
            return False

        if torch.isinf(logits).any():
            print("  ❌ Inf in logits!")
            return False

        # Test loss calculation
        loss = F.cross_entropy(logits, label)
        print(f"  - Loss: {loss.item()}")

        if torch.isnan(loss):
            print("  ❌ NaN loss!")
            return False

        print("  ✅ Forward pass looks good")

    # Test training step
    model.train()
    optimizer = AdamW(model.parameters(), lr=5e-5)

    print("🔧 Testing training step...")

    optimizer.zero_grad()

    outputs = model(
        input_ids=encoding['input_ids'],
        attention_mask=encoding['attention_mask'],
        labels=label
    )

    loss = outputs.loss
    print(f"  - Training loss: {loss.item()}")

    if torch.isnan(loss) or torch.isinf(loss):
        print("  ❌ Invalid training loss!")
        return False

    # Backward pass
    loss.backward()

    # Check gradients
    grad_norm = 0
    for param in model.parameters():
        if param.grad is not None:
            grad_norm += param.grad.data.norm(2).item() ** 2

    grad_norm = grad_norm ** 0.5
    print(f"  - Gradient norm: {grad_norm}")

    if grad_norm == 0:
        print("  ❌ Zero gradients!")
        return False

    if grad_norm > 1000:
        print("  ⚠️  Very large gradients!")

    optimizer.step()
    print("  ✅ Training step successful")

    return True

def improved_mistral_training(model, tokenizer, X_train, X_test, y_train, y_test):
    """Improved training with better hyperparameters"""
    print("🎯 Starting improved Mistral training...")

    # Better hyperparameters
    max_len = 128
    batch_size = 1
    gradient_accumulation_steps = 4  # Reduced from 8
    learning_rate = 1e-4  # Increased learning rate
    epochs = 4  # More epochs
    warmup_steps = 10

    print(f"⚙️ Improved configuration:")
    print(f"  - Learning rate: {learning_rate} (increased)")
    print(f"  - Gradient accumulation: {gradient_accumulation_steps} (reduced)")
    print(f"  - Epochs: {epochs} (increased)")
    print(f"  - Warmup steps: {warmup_steps}")

    # Tokenize data
    train_encodings = tokenizer(
        list(X_train),
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='pt'
    )

    test_encodings = tokenizer(
        list(X_test),
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='pt'
    )

    # Dataset
    class ImprovedDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            return {
                'input_ids': self.encodings['input_ids'][idx],
                'attention_mask': self.encodings['attention_mask'][idx],
                'labels': torch.tensor(self.labels[idx], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    train_dataset = ImprovedDataset(train_encodings, y_train)
    test_dataset = ImprovedDataset(test_encodings, y_test)

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Better optimizer with warmup
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01, eps=1e-6)

    # Learning rate scheduler
    from torch.optim.lr_scheduler import LinearLR
    scheduler = LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)

    # Training loop
    device = next(model.parameters()).device
    model.train()

    best_accuracy = 0
    best_results = None
    training_step = 0

    for epoch in range(epochs):
        print(f"\n📈 Epoch {epoch + 1}/{epochs}")

        epoch_loss = 0
        valid_batches = 0

        for batch_idx, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}

            # Forward pass
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss / gradient_accumulation_steps

            # Skip invalid losses
            if torch.isnan(loss) or torch.isinf(loss):
                print(f"  ⚠️  Skipping batch {batch_idx} (invalid loss)")
                continue

            # Backward pass
            loss.backward()

            epoch_loss += loss.item() * gradient_accumulation_steps
            valid_batches += 1

            # Gradient accumulation
            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)  # Reduced clipping

                # Optimizer step
                optimizer.step()
                optimizer.zero_grad()

                # Learning rate scheduling (only during warmup)
                if training_step < warmup_steps:
                    scheduler.step()

                training_step += 1

                # Memory cleanup
                torch.cuda.empty_cache()

            # Progress update
            if (batch_idx + 1) % 40 == 0:
                avg_loss = epoch_loss / max(valid_batches, 1)
                current_lr = optimizer.param_groups[0]['lr']
                print(f"  - Batch {batch_idx + 1}: Loss={avg_loss:.4f}, LR={current_lr:.2e}")

        # Evaluation
        print(f"  - Evaluating...")
        model.eval()

        all_predictions = []
        all_labels = []
        eval_loss = 0
        eval_batches = 0

        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}

                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels']
                )

                if not torch.isnan(outputs.loss):
                    eval_loss += outputs.loss.item()
                    eval_batches += 1

                predictions = torch.argmax(outputs.logits, dim=-1)
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(batch['labels'].cpu().numpy())

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_predictions)
        f1_macro = f1_score(all_labels, all_predictions, average='macro')
        f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
        avg_eval_loss = eval_loss / max(eval_batches, 1)

        print(f"  - Train Loss: {epoch_loss / max(valid_batches, 1):.4f}")
        print(f"  - Eval Loss: {avg_eval_loss:.4f}")
        print(f"  - Accuracy: {accuracy:.4f}")
        print(f"  - F1 (macro): {f1_macro:.4f}")
        print(f"  - F1 (weighted): {f1_weighted:.4f}")
        print(f"  - Valid batches: {valid_batches}/{len(train_loader)}")

        # Save best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            print(f"  🎯 New best accuracy: {best_accuracy:.4f}")

            best_results = {
                'eval_accuracy': accuracy,
                'eval_f1_macro': f1_macro,
                'eval_f1_weighted': f1_weighted,
                'predictions': all_predictions,
                'labels': all_labels,
                'model_name': "mistralai/Mistral-7B-Instruct-v0.1"
            }

            # Save model
            try:
                model.save_pretrained("./mistral_improved")
                tokenizer.save_pretrained("./mistral_improved")
                print("  💾 Best model saved!")
            except Exception as e:
                print(f"  ⚠️  Save failed: {e}")

        model.train()
        torch.cuda.empty_cache()
        gc.collect()

    return best_results

# Run diagnosis and improved training
print("🔍 Starting Mistral diagnosis and improved training...")

if 'final_mistral_model' in locals() and 'final_mistral_tokenizer' in locals():

    # Get training data
    if 'mistral_X_train' in globals():
        X_train = mistral_X_train
        X_test = mistral_X_test
        y_train = mistral_y_train
        y_test = mistral_y_test

        print(f"📊 Training data available:")
        print(f"  - Training samples: {len(X_train)}")
        print(f"  - Test samples: {len(X_test)}")

        # 1. Diagnose issues
        diagnosis_ok = diagnose_mistral_issues(
            final_mistral_model, final_mistral_tokenizer, X_train, y_train
        )

        if diagnosis_ok:
            print("✅ Diagnosis passed, starting improved training...")

            # 2. Run improved training
            improved_results = improved_mistral_training(
                final_mistral_model, final_mistral_tokenizer, X_train, X_test, y_train, y_test
            )

            if improved_results is not None:
                print(f"\n🎉 Improved Mistral training completed!")
                print(f"📊 Improved Results:")
                print(f"  - Accuracy: {improved_results['eval_accuracy']:.4f}")
                print(f"  - F1 (macro): {improved_results['eval_f1_macro']:.4f}")
                print(f"  - F1 (weighted): {improved_results['eval_f1_weighted']:.4f}")

                # Compare with previous results
                if 'final_mistral_results' in locals():
                    prev_acc = final_mistral_results['eval_accuracy']
                    new_acc = improved_results['eval_accuracy']
                    improvement = new_acc - prev_acc

                    print(f"\n📈 Comparison:")
                    print(f"  - Previous accuracy: {prev_acc:.4f}")
                    print(f"  - Improved accuracy: {new_acc:.4f}")
                    print(f"  - Improvement: {improvement:+.4f}")

                    if improvement > 0.05:  # 5% improvement
                        print("  ✅ Significant improvement!")
                        # Update final results
                        final_mistral_results = improved_results
                    else:
                        print("  ⚠️  Limited improvement")

                else:
                    final_mistral_results = improved_results

                print(f"\n🎯 Mistral training optimization completed!")

            else:
                print("❌ Improved training failed!")
        else:
            print("❌ Diagnosis failed - fundamental issues with model")
    else:
        print("❌ Training data not available!")
else:
    print("❌ Mistral model not available!")

🔍 Starting Mistral diagnosis and improved training...
📊 Training data available:
  - Training samples: 240
  - Test samples: 103
🔍 Diagnosing Mistral training issues...
📝 Test sample:
  - Text length: 61 characters
  - Label: 1
  - Text preview: Description: co2_scope3_2024 (location-based)

Content: 64,97...
📊 Tokenization:
  - Input shape: torch.Size([1, 128])
  - Attention mask sum: 28
🔍 Model output:
  - Logits shape: torch.Size([1, 3])
  - Logits: tensor([[nan, nan, nan]], device='cuda:0', dtype=torch.float16)
  - Probabilities: tensor([[nan, nan, nan]], device='cuda:0', dtype=torch.float16)
  - Predicted class: 0
  ❌ NaN in logits!
❌ Diagnosis failed - fundamental issues with model


# **Clean Training Approach with "distilbert-base-uncased" for training debugging**

In [33]:
# =============================================================================
# STEP 11: Clean Training Approach with "distilbert-base-uncased" for training debugging
# =============================================================================

import os
import gc
import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

def clean_model_setup():
    """Clean model setup without any problematic configurations"""
    print("🧹 Setting up clean model for training...")

    # Clear everything
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

    try:
        # Use a simpler, more stable model approach
        print("  - Loading model with basic configuration...")

        # Try without quantization first for debugging
        clean_model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",  # More stable model for testing
            num_labels=NUM_LABELS,
            torch_dtype=torch.float32,  # Use FP32 for stability
            device_map=None,  # No automatic device mapping
        )

        # Move to device manually
        clean_model = clean_model.to(device)

        # Setup tokenizer
        clean_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        if clean_tokenizer.pad_token is None:
            clean_tokenizer.pad_token = clean_tokenizer.eos_token

        print(f"✅ Clean model loaded on {next(clean_model.parameters()).device}")
        print(f"  - Model dtype: {next(clean_model.parameters()).dtype}")

        return clean_model, clean_tokenizer

    except Exception as e:
        print(f"❌ Clean model setup failed: {e}")
        return None, None

def simple_tokenize_and_prepare(tokenizer, X_train, X_test, y_train, y_test):
    """Simple tokenization with debugging"""
    print("📊 Simple tokenization...")

    # Tokenize with shorter sequences for stability
    max_len = 256  # Shorter for debugging

    train_encodings = tokenizer(
        list(X_train),
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='pt'
    )

    test_encodings = tokenizer(
        list(X_test),
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='pt'
    )

    print(f"✅ Tokenization completed:")
    print(f"  - Max length: {max_len}")
    print(f"  - Train shape: {train_encodings['input_ids'].shape}")
    print(f"  - Test shape: {test_encodings['input_ids'].shape}")

    return train_encodings, test_encodings

def debug_training_step(model, tokenizer, train_encodings, y_train):
    """Debug a single training step"""
    print("🔍 Debugging single training step...")

    # Take first sample
    sample_input = {
        'input_ids': train_encodings['input_ids'][:1].to(device),
        'attention_mask': train_encodings['attention_mask'][:1].to(device)
    }
    sample_label = torch.tensor([y_train[0]], dtype=torch.long).to(device)

    model.train()

    # Forward pass
    print("  - Forward pass...")
    outputs = model(**sample_input, labels=sample_label)

    loss = outputs.loss
    logits = outputs.logits

    print(f"  - Loss: {loss.item()}")
    print(f"  - Logits shape: {logits.shape}")
    print(f"  - Logits: {logits}")

    # Check for NaN
    if torch.isnan(loss):
        print("  ❌ Loss is NaN!")
        return False

    # Backward pass
    print("  - Backward pass...")
    loss.backward()

    # Check gradients
    grad_norm = 0
    for param in model.parameters():
        if param.grad is not None:
            grad_norm += param.grad.data.norm(2).item() ** 2
    grad_norm = grad_norm ** 0.5

    print(f"  - Gradient norm: {grad_norm}")

    if grad_norm > 1000:
        print("  ⚠️  Large gradient norm detected!")

    # Clear gradients
    model.zero_grad()

    return True

def manual_training_clean(model, tokenizer, train_encodings, test_encodings, y_train, y_test):
    """Clean manual training without any mixed precision"""
    print("🎯 Starting clean manual training...")

    # Create simple datasets
    class SimpleDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            return {
                'input_ids': self.encodings['input_ids'][idx],
                'attention_mask': self.encodings['attention_mask'][idx],
                'labels': torch.tensor(self.labels[idx], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    train_dataset = SimpleDataset(train_encodings, y_train)
    test_dataset = SimpleDataset(test_encodings, y_test)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

    # Setup optimizer with smaller learning rate
    optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

    # Training loop
    model.train()

    for epoch in range(2):  # Just 2 epochs for debugging
        print(f"\n📈 Epoch {epoch + 1}/2")

        epoch_loss = 0
        num_batches = 0

        for batch_idx, batch in enumerate(train_loader):
            # Move to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss

            # Check for NaN
            if torch.isnan(loss):
                print(f"  ❌ NaN loss at batch {batch_idx}")
                continue

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Optimizer step
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

            if (batch_idx + 1) % 10 == 0:
                avg_loss = epoch_loss / num_batches
                print(f"  - Batch {batch_idx + 1}: Loss={loss.item():.4f}, Avg Loss={avg_loss:.4f}")

        # Evaluation
        print(f"  - Evaluating...")
        model.eval()

        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}

                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask']
                )

                predictions = torch.argmax(outputs.logits, dim=-1)

                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(batch['labels'].cpu().numpy())

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_predictions)
        f1_macro = f1_score(all_labels, all_predictions, average='macro')
        f1_weighted = f1_score(all_labels, all_predictions, average='weighted')

        print(f"  - Accuracy: {accuracy:.4f}")
        print(f"  - F1 (macro): {f1_macro:.4f}")
        print(f"  - F1 (weighted): {f1_weighted:.4f}")

        model.train()

        # Memory cleanup
        torch.cuda.empty_cache()
        gc.collect()

    return {
        'eval_accuracy': accuracy,
        'eval_f1_macro': f1_macro,
        'eval_f1_weighted': f1_weighted
    }

# Start clean training approach
print("🧹 Starting completely clean training approach...")

# 1. Clean model setup
clean_model, clean_tokenizer = clean_model_setup()

if clean_model is not None and clean_tokenizer is not None:

    # 2. Simple tokenization
    train_enc, test_enc = simple_tokenize_and_prepare(
        clean_tokenizer, X_train, X_test, y_train, y_test
    )

    # 3. Debug single training step
    debug_success = debug_training_step(clean_model, clean_tokenizer, train_enc, y_train)

    if debug_success:
        print("✅ Single step debug passed!")

        # 4. Run clean training
        results = manual_training_clean(
            clean_model, clean_tokenizer, train_enc, test_enc, y_train, y_test
        )

        print(f"\n🎉 Clean training completed!")
        print(f"📊 Results:")
        print(f"  - Accuracy: {results['eval_accuracy']:.4f}")
        print(f"  - F1 (macro): {results['eval_f1_macro']:.4f}")
        print(f"  - F1 (weighted): {results['eval_f1_weighted']:.4f}")

        # Compare with baseline
        baseline_accuracy = baseline_results['accuracy']
        improvement = results['eval_accuracy'] - baseline_accuracy

        print(f"\n📈 Comparison with baseline:")
        print(f"  - Baseline accuracy: {baseline_accuracy:.4f}")
        print(f"  - Trained accuracy: {results['eval_accuracy']:.4f}")
        print(f"  - Improvement: {improvement:.4f} ({improvement*100:.2f}%)")

        if improvement > 0.05:  # 5% improvement
            print("✅ Training is working! Significant improvement detected.")

            # Now try with original model
            print("\n🔄 Now attempting with original model...")

            # Save successful results
            working_model = clean_model
            working_tokenizer = clean_tokenizer
            working_results = results

        else:
            print("⚠️  Training improvement is minimal. Need to debug further.")

    else:
        print("❌ Single step debug failed!")

else:
    print("❌ Clean model setup failed!")

🧹 Starting completely clean training approach...
🧹 Setting up clean model for training...
  - Loading model with basic configuration...


Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

✅ Clean model loaded on cuda:0
  - Model dtype: torch.float32
📊 Simple tokenization...
✅ Tokenization completed:
  - Max length: 256
  - Train shape: torch.Size([240, 256])
  - Test shape: torch.Size([103, 256])
🔍 Debugging single training step...
  - Forward pass...
  - Loss: 1.1142163276672363
  - Logits shape: torch.Size([1, 3])
  - Logits: tensor([[0.0730, 0.1072, 0.1849]], device='cuda:0', grad_fn=<AddmmBackward0>)
  - Backward pass...
  - Gradient norm: 7.364371565714017
✅ Single step debug passed!
🎯 Starting clean manual training...

📈 Epoch 1/2
  - Batch 10: Loss=1.0246, Avg Loss=1.0670
  - Batch 20: Loss=1.0639, Avg Loss=0.9863
  - Batch 30: Loss=1.0212, Avg Loss=0.9248
  - Batch 40: Loss=0.7328, Avg Loss=0.8517
  - Batch 50: Loss=0.4271, Avg Loss=0.7894
  - Batch 60: Loss=0.2369, Avg Loss=0.7490
  - Batch 70: Loss=0.9858, Avg Loss=0.7052
  - Batch 80: Loss=0.1113, Avg Loss=0.6757
  - Batch 90: Loss=0.0725, Avg Loss=0.6812
  - Batch 100: Loss=1.5302, Avg Loss=0.6946
  - Batch 

### **Try full training loop with smaller model**

In [6]:
# =============================================================================
# STEP 14: Diagnose and Fix Training Issues
# =============================================================================

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
import gc

def diagnose_training_issues(model, tokenizer, X_train, X_test, y_train, y_test):
    """Diagnose why the model is not learning"""
    print("🔍 Diagnosing training issues...")

    # 1. Check if model parameters are actually updating
    print("1. Checking parameter updates...")

    # Get initial parameters
    initial_params = {}
    for name, param in model.named_parameters():
        if param.requires_grad:
            initial_params[name] = param.clone().detach()

    # Tokenize a small batch for testing
    test_texts = X_train[:4]
    test_labels = y_train[:4]

    encodings = tokenizer(
        list(test_texts),
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt'
    )

    # Move to device
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    labels = torch.tensor(test_labels, dtype=torch.long).to(device)

    # Setup optimizer
    optimizer = AdamW(model.parameters(), lr=1e-4)  # Slightly higher LR for testing

    # Perform one training step
    model.train()
    optimizer.zero_grad()

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    print(f"  - Initial loss: {loss.item():.6f}")

    # Check if loss is reasonable
    if torch.isnan(loss) or torch.isinf(loss):
        print("  ❌ Loss is NaN or Inf - this is the problem!")
        return False

    # Backward pass
    loss.backward()

    # Check gradients
    grad_norm = 0
    num_params_with_grad = 0
    for name, param in model.named_parameters():
        if param.grad is not None:
            param_norm = param.grad.data.norm(2).item()
            grad_norm += param_norm ** 2
            num_params_with_grad += 1

    grad_norm = grad_norm ** 0.5

    print(f"  - Gradient norm: {grad_norm:.6f}")
    print(f"  - Parameters with gradients: {num_params_with_grad}")

    if grad_norm < 1e-8:
        print("  ❌ Gradients are too small - vanishing gradient problem!")
        return False

    # Take optimizer step
    optimizer.step()

    # Check if parameters actually changed
    params_changed = 0
    total_change = 0
    for name, param in model.named_parameters():
        if param.requires_grad and name in initial_params:
            change = torch.norm(param - initial_params[name]).item()
            if change > 1e-8:
                params_changed += 1
                total_change += change

    print(f"  - Parameters that changed: {params_changed}")
    print(f"  - Total parameter change: {total_change:.8f}")

    if params_changed == 0:
        print("  ❌ No parameters changed - optimizer not working!")
        return False

    print("  ✅ Parameters are updating correctly")

    # 2. Check if model is actually in training mode
    print("2. Checking model training mode...")
    if model.training:
        print("  ✅ Model is in training mode")
    else:
        print("  ❌ Model is in eval mode!")
        return False

    # 3. Check if we're using the right loss function
    print("3. Checking loss function...")
    logits = outputs.logits
    print(f"  - Logits shape: {logits.shape}")
    print(f"  - Labels shape: {labels.shape}")
    print(f"  - Unique labels: {torch.unique(labels)}")

    # Manual loss calculation
    manual_loss = nn.CrossEntropyLoss()(logits, labels)
    print(f"  - Manual loss: {manual_loss.item():.6f}")
    print(f"  - Model loss: {loss.item():.6f}")

    if abs(manual_loss.item() - loss.item()) > 1e-6:
        print("  ⚠️  Loss calculation mismatch!")
    else:
        print("  ✅ Loss calculation is correct")

    return True

def fixed_training_loop(model, tokenizer, X_train, X_test, y_train, y_test):
    """Fixed training loop with proper learning verification"""
    print("🔧 Starting fixed training loop...")

    # Tokenize data
    train_encodings = tokenizer(
        list(X_train),
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt'
    )

    test_encodings = tokenizer(
        list(X_test),
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt'
    )

    # Create dataset
    class FixedDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            return {
                'input_ids': self.encodings['input_ids'][idx],
                'attention_mask': self.encodings['attention_mask'][idx],
                'labels': torch.tensor(self.labels[idx], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    train_dataset = FixedDataset(train_encodings, y_train)
    test_dataset = FixedDataset(test_encodings, y_test)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)  # Slightly larger batch
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    # Setup optimizer with higher learning rate
    optimizer = AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)  # Higher LR

    # Get initial model state for comparison
    initial_model_state = {}
    for name, param in model.named_parameters():
        if param.requires_grad:
            initial_model_state[name] = param.clone().detach()

    # Training loop
    model.train()

    for epoch in range(3):
        print(f"\n📈 Epoch {epoch + 1}/3")

        epoch_loss = 0
        num_batches = 0

        # Training phase
        for batch_idx, batch in enumerate(train_loader):
            # Move to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss

            # Check for NaN
            if torch.isnan(loss) or torch.isinf(loss):
                print(f"  ❌ Invalid loss at batch {batch_idx}: {loss.item()}")
                continue

            # Backward pass
            loss.backward()

            # Check gradient norm
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Optimizer step
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

            if (batch_idx + 1) % 10 == 0:
                avg_loss = epoch_loss / num_batches
                print(f"  - Batch {batch_idx + 1}: Loss={loss.item():.6f}, Avg Loss={avg_loss:.6f}, Grad Norm={grad_norm:.6f}")

        # Check if model parameters actually changed
        total_param_change = 0
        for name, param in model.named_parameters():
            if param.requires_grad and name in initial_model_state:
                change = torch.norm(param - initial_model_state[name]).item()
                total_param_change += change

        print(f"  - Total parameter change from start: {total_param_change:.8f}")

        # Evaluation phase
        print(f"  - Evaluating...")
        model.eval()

        all_predictions = []
        all_labels = []
        eval_loss = 0
        eval_batches = 0

        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}

                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels']
                )

                eval_loss += outputs.loss.item()
                eval_batches += 1

                predictions = torch.argmax(outputs.logits, dim=-1)
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(batch['labels'].cpu().numpy())

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_predictions)
        f1_macro = f1_score(all_labels, all_predictions, average='macro')
        f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
        avg_eval_loss = eval_loss / eval_batches

        print(f"  - Eval Loss: {avg_eval_loss:.6f}")
        print(f"  - Accuracy: {accuracy:.4f}")
        print(f"  - F1 (macro): {f1_macro:.4f}")
        print(f"  - F1 (weighted): {f1_weighted:.4f}")

        # Check if metrics are improving
        if epoch > 0:
            prev_accuracy = getattr(fixed_training_loop, 'prev_accuracy', accuracy)
            improvement = accuracy - prev_accuracy
            print(f"  - Accuracy change: {improvement:+.4f}")

            if abs(improvement) < 1e-6:
                print("  ⚠️  No improvement detected!")
            else:
                print("  ✅ Model is learning!")

        # Store for next iteration
        fixed_training_loop.prev_accuracy = accuracy

        # Set back to training mode
        model.train()

        # Memory cleanup
        torch.cuda.empty_cache()
        gc.collect()

    return {
        'eval_accuracy': accuracy,
        'eval_f1_macro': f1_macro,
        'eval_f1_weighted': f1_weighted,
        'predictions': all_predictions,
        'labels': all_labels
    }

# Run diagnosis and fixed training
print("🔍 Diagnosing and fixing training issues...")

# Check if we have the required variables
if 'final_model' in locals() and 'final_tokenizer' in locals():

    # 1. Diagnose issues
    diagnosis_passed = diagnose_training_issues(
        final_model, final_tokenizer, final_X_train, final_X_test, final_y_train, final_y_test
    )

    if diagnosis_passed:
        print("✅ Basic training components are working")

        # 2. Run fixed training
        print("\n🔧 Running training with improved loop...")

        # Reset model to fresh state
        fresh_model, fresh_tokenizer = setup_model()

        if fresh_model is not None:
            corrected_results = fixed_training_loop(
                fresh_model, fresh_tokenizer, final_X_train, final_X_test, final_y_train, final_y_test
            )

            if corrected_results is not None:
                print(f"\n🎉 Fixed training completed!")
                print(f"📊 Corrected Results:")
                print(f"  - Accuracy: {corrected_results['eval_accuracy']:.4f}")
                print(f"  - F1 (macro): {corrected_results['eval_f1_macro']:.4f}")
                print(f"  - F1 (weighted): {corrected_results['eval_f1_weighted']:.4f}")

                # Compare with previous results
                if 'final_results' in locals():
                    print(f"\n📈 Comparison with previous training:")
                    print(f"  - Previous accuracy: {final_results['eval_accuracy']:.4f}")
                    print(f"  - New accuracy: {corrected_results['eval_accuracy']:.4f}")
                    print(f"  - Difference: {corrected_results['eval_accuracy'] - final_results['eval_accuracy']:+.4f}")

                # Update final results
                final_results = corrected_results
                final_model = fresh_model
                final_tokenizer = fresh_tokenizer

                print(f"\n🎯 Training is now working correctly!")

            else:
                print("❌ Fixed training also failed!")
        else:
            print("❌ Could not create fresh model!")
    else:
        print("❌ Diagnosis failed - fundamental issues detected")
        print("💡 Possible issues:")
        print("  - Model architecture not suitable for the task")
        print("  - Data preprocessing problems")
        print("  - Device/memory issues")
        print("  - Optimizer configuration problems")

else:
    print("❌ Model and tokenizer not found. Please run the training pipeline first.")

🔍 Diagnosing and fixing training issues...
🔍 Diagnosing training issues...
1. Checking parameter updates...
  - Initial loss: 1.246947
  - Gradient norm: 61.114402
  - Parameters with gradients: 293
  - Parameters that changed: 293
  - Total parameter change: 17.36089103
  ✅ Parameters are updating correctly
2. Checking model training mode...
  ✅ Model is in training mode
3. Checking loss function...
  - Logits shape: torch.Size([4, 3])
  - Labels shape: torch.Size([4])
  - Unique labels: tensor([0, 1], device='cuda:0')
  - Manual loss: 1.246947
  - Model loss: 1.246947
  ✅ Loss calculation is correct
✅ Basic training components are working

🔧 Running training with improved loop...
🔧 Setting up model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialoGPT-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded: microsoft/DialoGPT-medium
🔧 Starting fixed training loop...

📈 Epoch 1/3
  - Batch 10: Loss=1.496189, Avg Loss=1.633922, Grad Norm=14.631371
  - Batch 20: Loss=0.000648, Avg Loss=1.098331, Grad Norm=0.036014
  - Batch 30: Loss=0.551982, Avg Loss=0.999164, Grad Norm=6.012332
  - Batch 40: Loss=0.698042, Avg Loss=0.909062, Grad Norm=7.637194
  - Batch 50: Loss=0.745082, Avg Loss=0.868479, Grad Norm=13.587096
  - Batch 60: Loss=0.039547, Avg Loss=0.783973, Grad Norm=1.648367
  - Total parameter change from start: 244.74910503
  - Evaluating...
  - Eval Loss: 1.021021
  - Accuracy: 0.7670
  - F1 (macro): 0.4341
  - F1 (weighted): 0.6658

📈 Epoch 2/3
  - Batch 10: Loss=0.754525, Avg Loss=1.033797, Grad Norm=6.350020
  - Batch 20: Loss=0.585796, Avg Loss=0.756851, Grad Norm=3.239014
  - Batch 30: Loss=0.479944, Avg Loss=0.720558, Grad Norm=9.257504
  - Batch 40: Loss=0.076496, Avg Loss=0.634217, Grad Norm=1.949073
  - Batch 50: Loss=0.384641, Avg Loss=0.633160, Grad Norm=4.54

saving resutls in google sheets an huggingface

In [9]:
# =============================================================================
# STEP 16: Fixed Results Storage and Model Upload
# =============================================================================

from huggingface_hub import login, create_repo, upload_folder
import json
from datetime import datetime
import os

def convert_to_serializable(obj):
    """Convert numpy/torch data types to Python native types"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif hasattr(obj, 'item'):
        return obj.item()
    else:
        return obj

def create_final_results_sheet(service, sheet_id, results, X_test, y_test):
    """Create comprehensive results sheet"""
    print("📊 Creating final results sheet...")

    try:
        # Create new sheet
        requests = [{
            'addSheet': {
                'properties': {
                    'title': 'Final_Training_Results'
                }
            }
        }]

        body = {'requests': requests}
        service.spreadsheets().batchUpdate(spreadsheetId=sheet_id, body=body).execute()
        print("✅ Created 'Final_Training_Results' sheet")

        # Training configuration and results
        config_data = [
            ['Final Training Configuration', 'Value'],
            ['Model Name', MODEL_NAME],
            ['Training Date', datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
            ['Max Sequence Length', '384'],
            ['Batch Size', '8'],
            ['Learning Rate', '1e-4'],
            ['Epochs', '5 (with early stopping)'],
            ['Training Samples', len(final_X_train)],
            ['Test Samples', len(final_X_test)],
            [''],
            ['Final Performance Metrics', ''],
            ['Accuracy', f'{convert_to_serializable(results["eval_accuracy"]):.4f}'],
            ['F1 Score (Macro)', f'{convert_to_serializable(results["eval_f1_macro"]):.4f}'],
            ['F1 Score (Weighted)', f'{convert_to_serializable(results["eval_f1_weighted"]):.4f}'],
            [''],
            ['Training Techniques Used', ''],
            ['Label Smoothing', 'Yes (0.1)'],
            ['Learning Rate Scheduling', 'Yes (ReduceLROnPlateau)'],
            ['Early Stopping', 'Yes (patience=2)'],
            ['Gradient Clipping', 'Yes (max_norm=1.0)'],
            ['Advanced Optimizer', 'Yes (AdamW with parameter groups)'],
        ]

        # Write configuration
        range_name = f'Final_Training_Results!A1:B{len(config_data)}'
        body = {'values': config_data}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Configuration written")

        # Detailed predictions
        predictions_data = [
            ['Sample_ID', 'Actual_Class', 'Actual_Meaning', 'Predicted_Class', 'Predicted_Meaning', 'Correct?', 'Text_Preview']
        ]

        class_meanings = {
            0: "Low Relevance & Low Usefulness",
            1: "Medium Relevance & Medium Usefulness",
            2: "High Relevance & High Usefulness"
        }

        for i, (actual, pred, text) in enumerate(zip(results['labels'], results['predictions'], X_test)):
            actual_meaning = class_meanings.get(actual, f"Unknown ({actual})")
            pred_meaning = class_meanings.get(pred, f"Unknown ({pred})")
            is_correct = "✓" if actual == pred else "✗"

            text_preview = str(text)[:100] + "..." if len(str(text)) > 100 else str(text)
            text_preview = text_preview.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

            predictions_data.append([
                i + 1,
                convert_to_serializable(actual),
                actual_meaning,
                convert_to_serializable(pred),
                pred_meaning,
                is_correct,
                text_preview
            ])

        # Write predictions
        range_name = f'Final_Training_Results!D1:J{len(predictions_data)}'
        body = {'values': predictions_data}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Predictions written")

        # Performance analysis
        correct_predictions = sum(1 for a, p in zip(results['labels'], results['predictions']) if a == p)
        total_predictions = len(results['labels'])

        # Class-wise analysis
        class_analysis = []
        for class_id in [0, 1, 2]:
            actual_count = sum(1 for label in results['labels'] if label == class_id)
            predicted_count = sum(1 for pred in results['predictions'] if pred == class_id)

            if actual_count > 0:
                class_correct = sum(1 for a, p in zip(results['labels'], results['predictions'])
                                 if a == class_id and p == class_id)
                class_precision = class_correct / predicted_count if predicted_count > 0 else 0
                class_recall = class_correct / actual_count if actual_count > 0 else 0
                class_f1 = 2 * (class_precision * class_recall) / (class_precision + class_recall) if (class_precision + class_recall) > 0 else 0

                class_analysis.append([
                    f'Class {class_id}',
                    class_meanings[class_id],
                    f'Actual: {actual_count}',
                    f'Predicted: {predicted_count}',
                    f'Precision: {class_precision:.3f}',
                    f'Recall: {class_recall:.3f}',
                    f'F1: {class_f1:.3f}'
                ])

        analysis_start_row = len(config_data) + 3
        analysis_data = [
            ['Performance Analysis', '', '', '', '', '', ''],
            ['Overall Accuracy', f'{correct_predictions}/{total_predictions}', f'{correct_predictions/total_predictions*100:.2f}%', '', '', '', ''],
            [''],
            ['Class', 'Meaning', 'Actual Count', 'Predicted Count', 'Precision', 'Recall', 'F1-Score'],
        ]

        analysis_data.extend(class_analysis)

        # Write analysis
        range_name = f'Final_Training_Results!A{analysis_start_row}:G{analysis_start_row + len(analysis_data) - 1}'
        body = {'values': analysis_data}
        service.spreadsheets().values().update(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption='RAW',
            body=body
        ).execute()

        print("✅ Performance analysis written")

        return True

    except Exception as e:
        print(f"❌ Error creating results sheet: {e}")
        import traceback
        traceback.print_exc()
        return False

def save_model_to_huggingface_fixed(model, tokenizer, results):
    """Save model to Hugging Face with fixed README"""
    print("🤗 Saving model to Hugging Face Hub...")

    try:
        # Login to Hugging Face
        login(token=HUGGINGFACE_TOKEN)
        print("✅ Logged in to Hugging Face")

        # Create repository name
        repo_name = f"sustainability-report-classifier-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

        # Create repository
        try:
            repo_url = create_repo(repo_name, private=False)
            print(f"✅ Created repository: {repo_name}")
        except Exception as e:
            print(f"⚠️  Repository creation note: {e}")
            repo_url = f"https://huggingface.co/{repo_name}"

        # Prepare local directory
        local_path = "./hf_model_final"
        os.makedirs(local_path, exist_ok=True)

        # Save model and tokenizer
        model.save_pretrained(local_path)
        tokenizer.save_pretrained(local_path)

        # Create README with proper formatting
        readme_lines = [
            "# Sustainability Report Classifier",
            "",
            "This model classifies sustainability reports based on relevance and usefulness scores.",
            "",
            "## Model Details",
            "",
            f"- **Base Model**: {MODEL_NAME}",
            "- **Task**: Text Classification (3 classes)",
            f"- **Training Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "- **Training Approach**: Fine-tuning with advanced techniques",
            "",
            "## Performance Metrics",
            "",
            f"- **Accuracy**: {results['eval_accuracy']:.4f}",
            f"- **F1 Score (Macro)**: {results['eval_f1_macro']:.4f}",
            f"- **F1 Score (Weighted)**: {results['eval_f1_weighted']:.4f}",
            "",
            "## Training Techniques Used",
            "",
            "- Label Smoothing (0.1)",
            "- Learning Rate Scheduling (ReduceLROnPlateau)",
            "- Early Stopping (patience=2)",
            "- Gradient Clipping (max_norm=1.0)",
            "- Advanced Optimizer (AdamW with parameter groups)",
            "",
            "## Class Labels",
            "",
            "- **Class 0**: Low Relevance & Low Usefulness (Combined Score 0-1)",
            "- **Class 1**: Medium Relevance & Medium Usefulness (Combined Score 1-2)",
            "- **Class 2**: High Relevance & High Usefulness (Combined Score 2-3)",
            "",
            "## Usage Example",
            "",
            "```python",
            "from transformers import AutoTokenizer, AutoModelForSequenceClassification",
            "import torch",
            "",
            f'tokenizer = AutoTokenizer.from_pretrained("{repo_name}")',
            f'model = AutoModelForSequenceClassification.from_pretrained("{repo_name}")',
            "",
            "# Example usage",
            'text = "Your sustainability report text here"',
            'inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=384)',
            "",
            "with torch.no_grad():",
            "    outputs = model(**inputs)",
            "    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)",
            "    predicted_class = torch.argmax(predictions, dim=-1)",
            "",
            'print(f"Predicted class: {predicted_class.item()}")',
            'print(f"Confidence scores: {predictions[0]}")',
            "```",
            "",
            "## Training Data",
            "",
            "The model was trained on sustainability report data with manually labeled",
            "relevance and usefulness scores, combined into a single classification task.",
            "",
            "## Citation",
            "",
            "If you use this model, please cite:",
            "```",
            "Sustainability Report Classifier",
            f"Trained on {datetime.now().strftime('%Y-%m-%d')}",
            f"Available at: https://huggingface.co/{repo_name}",
            "```"
        ]

        readme_content = "\n".join(readme_lines)

        # Write README
        with open(os.path.join(local_path, "README.md"), "w", encoding="utf-8") as f:
            f.write(readme_content)

        # Create model card metadata
        model_card = {
            "model_name": repo_name,
            "task": "text-classification",
            "language": "en",
            "pipeline_tag": "text-classification",
            "tags": ["sustainability", "classification", "reports", "environmental"],
            "metrics": {
                "accuracy": float(results['eval_accuracy']),
                "f1_macro": float(results['eval_f1_macro']),
                "f1_weighted": float(results['eval_f1_weighted'])
            },
            "base_model": MODEL_NAME,
            "training_date": datetime.now().strftime('%Y-%m-%d')
        }

        with open(os.path.join(local_path, "model_card.json"), "w", encoding="utf-8") as f:
            json.dump(model_card, f, indent=2)

        print("✅ Files prepared locally")

        # Upload to Hugging Face
        try:
            upload_folder(
                folder_path=local_path,
                repo_id=repo_name,
                repo_type="model",
                commit_message=f"Upload sustainability classifier - Accuracy: {results['eval_accuracy']:.4f}"
            )

            print(f"✅ Model uploaded successfully!")
            print(f"🔗 Model URL: https://huggingface.co/{repo_name}")

            return repo_name, f"https://huggingface.co/{repo_name}"

        except Exception as e:
            print(f"❌ Upload failed: {e}")
            print("📁 Model saved locally at:", local_path)
            return None, local_path

    except Exception as e:
        print(f"❌ Hugging Face process failed: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# Execute the fixed storage and upload process
print("🚀 Starting fixed results storage and model upload...")

if 'final_results' in locals() and 'final_model' in locals() and 'final_tokenizer' in locals():

    # 1. Store results in Google Sheets
    print("📊 Storing results in Google Sheets...")

    try:
        # Authenticate
        auth.authenticate_user()
        creds, _ = default()
        service = build('sheets', 'v4', credentials=creds)

        # Create results sheet
        sheet_created = create_final_results_sheet(
            service, SHEET_ID, final_results, final_X_test, final_y_test
        )

        if sheet_created:
            print("✅ Google Sheets updated successfully!")
            print(f"🔗 View results: {GOOGLE_SHEET_URL}")
        else:
            print("⚠️  Google Sheets update had issues")

    except Exception as e:
        print(f"❌ Google Sheets failed: {e}")

    # 2. Save model to Hugging Face
    print("\n🤗 Saving model to Hugging Face...")

    repo_name, repo_url = save_model_to_huggingface_fixed(
        final_model, final_tokenizer, final_results
    )

    if repo_name:
        print(f"✅ Model successfully uploaded to Hugging Face!")
        print(f"🔗 Model repository: {repo_url}")

        # Add HF info to Google Sheets
        try:
            hf_info = [
                ['', ''],
                ['Hugging Face Repository', ''],
                ['Repository Name', repo_name],
                ['Repository URL', repo_url],
                ['Upload Date', datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
            ]

            range_name = 'Final_Training_Results!A25:B29'
            body = {'values': hf_info}
            service.spreadsheets().values().update(
                spreadsheetId=SHEET_ID,
                range=range_name,
                valueInputOption='RAW',
                body=body
            ).execute()

            print("✅ Hugging Face info added to Google Sheets")

        except Exception as e:
            print(f"⚠️  Could not update sheets with HF info: {e}")

    # 3. Final summary
    print(f"\n🎉 Complete process finished!")
    print(f"📊 Final Model Performance:")
    print(f"  - Accuracy: {final_results['eval_accuracy']:.4f}")
    print(f"  - F1 Score (Macro): {final_results['eval_f1_macro']:.4f}")
    print(f"  - F1 Score (Weighted): {final_results['eval_f1_weighted']:.4f}")

    if repo_name:
        print(f"\n🔗 Your model is publicly available at:")
        print(f"   {repo_url}")

    print(f"\n🔗 Your detailed results are available at:")
    print(f"   {GOOGLE_SHEET_URL}")

    print(f"\n✅ Project completed successfully!")

else:
    print("❌ Required results not found. Please run the enhanced training first.")

🚀 Starting fixed results storage and model upload...
📊 Storing results in Google Sheets...
📊 Creating final results sheet...
✅ Created 'Final_Training_Results' sheet
✅ Configuration written
✅ Predictions written
✅ Performance analysis written
✅ Google Sheets updated successfully!
🔗 View results: https://docs.google.com/spreadsheets/d/1CpWL01U9HSfmre2OjFj3GkMV816EYZOryxWGDDVouy4/edit?gid=1497010733#gid=1497010733

🤗 Saving model to Hugging Face...
🤗 Saving model to Hugging Face Hub...
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
✅ Logged in to Hugging Face
✅ Created repository: sustainability-report-classifier-20250718-135925
✅ Files prepared locally
❌ Upload failed: 404 Client Error. (Request ID: Root=1-687a537a-1335cbef696cec7701ffeec0;8a4c0dc5-dea7-43ea-a3e4-98a06b2e7c49)

Repository No