# Translation Quality Estimation Model Training - Colab Runner

This notebook runs the `train_model.py` script on Google Colab with GPU support.

## Features:
- ✅ Automatic GPU detection and configuration
- ✅ Configurable sample size
- ✅ Optimized for Colab's GPU environment
- ✅ Easy to use with minimal setup

## Instructions:
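1. Make sure your data files are in Google Drive inside the project folder, under `data/hi-zh.txt/` (the first code cell mounts Google Drive and switches to the project directory)
2. Set the `SAMPLE_SIZE` variable in the first code cell below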
3. Run all cells


In [None]:
# ============================================================================
# Configuration
# ============================================================================

import os

from google.colab import drive

# Number of sentence pairs used for training.
# None means "use the full dataset"; an integer (e.g. 200000) takes a sample.
SAMPLE_SIZE = 2000  # Change this value as needed

# Location of the data, expressed relative to the project directory
DATA_PATH = 'data/hi-zh.txt/'

# Mount Google Drive, then make the project folder the working directory so
# every relative path used by later cells resolves inside it.
print("üìÇ Mounting Google Drive...")
drive.mount('/content/drive', force_remount=True)

print("üìÅ Changing to project directory...")
os.chdir('/content/drive/MyDrive/DSAN6600final')

# Remember where we ended up, for reference
PROJECT_DIR = os.getcwd()

# Echo the effective settings
print("\n‚úÖ Configuration:")
print(f"  üìä SAMPLE_SIZE: {SAMPLE_SIZE or 'Full dataset'}")
print(f"  üìÅ DATA_PATH: {DATA_PATH}")
print(f"  üìÇ PROJECT_DIR: {PROJECT_DIR}")
print("  üíæ OUTPUT_DIR: models/")


In [None]:
# ============================================================================
# Install Dependencies
# ============================================================================

print("üì¶ Installing dependencies...")
print("  ‚Üí Installing core packages...")
%pip install -q pandas>=1.5.0 numpy>=1.23.0 scikit-learn>=1.2.0
print("  ‚Üí Installing ML libraries...")
%pip install -q sentence-transformers>=2.2.0
%pip install -q tensorflow>=2.10.0
print("  ‚Üí Installing transformers and PyTorch...")
%pip install -q transformers>=4.20.0 torch>=1.12.0
print("  ‚Üí Installing utilities...")
%pip install -q scipy>=1.9.0
%pip install -q tqdm  # Progress bars

print("\n‚úÖ All dependencies installed!")


In [None]:
# ============================================================================
# Check GPU Availability
# ============================================================================

import tensorflow as tf

# Ask TensorFlow which GPU devices it can see
gpus = tf.config.list_physical_devices('GPU')

if not gpus:
    print("‚ö† No GPU detected. Training will be slower on CPU.")
    print("  To enable GPU in Colab: Runtime ‚Üí Change runtime type ‚Üí GPU")
else:
    print(f"‚úì GPU detected: {len(gpus)} GPU(s)")
    for index, device in enumerate(gpus):
        print(f"  GPU {index}: {device.name}")

    # Turn on memory growth for every visible GPU; a RuntimeError raised by
    # TensorFlow here is reported as a warning instead of aborting the cell.
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        print(f"  Warning: {err}")
    else:
        print("‚úì GPU memory growth enabled")

# Report the TensorFlow version in use
print(f"\nTensorFlow version: {tf.__version__}")


In [None]:
# ============================================================================
# Setup Project Structure
# ============================================================================

import os
import sys
from pathlib import Path

# The working directory is already the project directory, so relative paths
# are used throughout. Make sure the output directory exists.
os.makedirs('models', exist_ok=True)

# Make the project's scripts importable. The membership check keeps this cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
scripts_dir = os.path.join(os.getcwd(), 'scripts')
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir)

print("‚úÖ Project structure verified")
print(f"  üìÇ Working directory: {os.getcwd()}")
print(f"  üìÅ Scripts directory: {scripts_dir}")
print(f"  üíæ Models directory: {os.path.join(os.getcwd(), 'models')}")


In [None]:
# ============================================================================
# Upload Data Files (if not already uploaded)
# ============================================================================

# The three files that must be present under DATA_PATH
data_files = [
    'CCMatrix.hi-zh.hi',
    'CCMatrix.hi-zh.zh',
    'CCMatrix.hi-zh.scores'
]

data_path = Path(DATA_PATH)
missing = [name for name in data_files if not (data_path / name).exists()]
all_exist = not missing

if not all_exist:
    # At least one file is absent: list the ways to provide the data
    print("‚ö† Data files not found. Please upload them:")
    print("  Option 1: Use Colab's file upload (Files ‚Üí Upload)")
    print("  Option 2: Mount Google Drive and copy files")
    print("  Option 3: Use wget/curl to download from URL")
    print(f"\n  Expected location: {DATA_PATH}")
    print(f"  Required files: {', '.join(data_files)}")

    # Option: Upload files interactively
    from google.colab import files
    print("\nüì§ Upload data files now? (Uncomment the line below)")
    # uploaded = files.upload()  # Uncomment to enable file upload
else:
    # Everything is in place: report each file with its size in MB
    print(f"‚úì Data files found at: {DATA_PATH}")
    for name in data_files:
        candidate = data_path / name
        if not candidate.exists():
            continue
        size_mb = candidate.stat().st_size / (1024 * 1024)
        print(f"  ‚úì {name} ({size_mb:.2f} MB)")


In [None]:
# ============================================================================
# Verify train_model.py exists and modify for Colab
# ============================================================================

import re
from pathlib import Path

# Script path relative to project directory
script_path = 'scripts/train_model.py'


def _override_assignments(source, name, value_regex, new_literal):
    """Rewrite every ``<name> = <value>`` occurrence found in ``source``.

    Args:
        source: Full text of the script being patched.
        name: Name of the constant to override (matched literally).
        value_regex: Regex fragment describing the value currently assigned.
        new_literal: Python source text of the new value (already quoted).

    Returns:
        A ``(updated_source, replacement_count)`` tuple.
    """
    pattern = re.compile(rf"{re.escape(name)} = (?:{value_regex})")
    replacement = f"{name} = {new_literal}"
    # A callable replacement is inserted verbatim. A plain replacement string
    # would have its backslashes interpreted as regex template escapes.
    return pattern.subn(lambda _match: replacement, source)


# Check if script exists
if os.path.exists(script_path):
    print(f"‚úÖ Found train_model.py at: {os.path.abspath(script_path)}")

    # Read the script
    with open(script_path, 'r', encoding='utf-8') as f:
        script_content = f.read()

    # Modify the script to use correct paths and SAMPLE_SIZE
    print("\nüìù Modifying script for Colab environment...")

    # (constant name, regex for its current value, new value as source text).
    # repr() yields a correctly quoted/escaped literal for any value.
    quoted_string = r"['\"].*?['\"]"
    overrides = [
        ('DATA_PATH', quoted_string, repr(DATA_PATH)),
        ('OUTPUT_DIR', quoted_string, repr('models/')),
        ('SAMPLE_SIZE', r"\d+|None", repr(SAMPLE_SIZE if SAMPLE_SIZE else None)),
    ]
    for setting_name, value_regex, new_literal in overrides:
        script_content, replaced = _override_assignments(
            script_content, setting_name, value_regex, new_literal
        )
        if replaced == 0:
            # Make a silent no-op visible: training would otherwise run with
            # whatever value the script already contains.
            print(f"  WARNING: no '{setting_name} = ...' assignment matched in "
                  f"{script_path}; it was left unchanged")

    # Write modified script back
    with open(script_path, 'w', encoding='utf-8') as f:
        f.write(script_content)

    print("‚úÖ Script modified for Colab")
    print(f"  üìÅ DATA_PATH: {DATA_PATH}")
    print(f"  üíæ OUTPUT_DIR: models/")
    print(f"  üìä SAMPLE_SIZE: {SAMPLE_SIZE if SAMPLE_SIZE else 'Full dataset'}")
else:
    print("‚ùå train_model.py not found!")
    print(f"  Expected location: {os.path.abspath(script_path)}")
    print("  Please ensure the script is in the scripts/ directory")


In [None]:
# ============================================================================
# Verify Training Configuration
# ============================================================================

# Location of the training script, relative to the project directory
script_path = 'scripts/train_model.py'

if not os.path.exists(script_path):
    print("‚ö† train_model.py not found. Please ensure it's in the scripts/ directory")
else:
    # Assemble the report first, then emit it line by line
    report_lines = [
        "‚úÖ train_model.py found. Ready to execute.",
        "\nüìä Training Configuration:",
        f"  üìä Sample Size: {SAMPLE_SIZE or 'Full dataset'}",
        f"  üìÅ Data Path: {os.path.abspath(DATA_PATH)}",
        f"  üíæ Output Directory: {os.path.abspath('models/')}",
        # `gpus` only exists if the GPU check cell has been run
        f"  üñ•Ô∏è  GPU Available: {len(gpus) > 0 if 'gpus' in locals() else 'Checking...'}",
        f"  üìÇ Working Directory: {os.getcwd()}",
    ]
    for report_line in report_lines:
        print(report_line)


In [None]:
# ============================================================================
# Run Training Script
# ============================================================================

import subprocess
import sys
from datetime import datetime

# Script path relative to project directory
script_path = 'scripts/train_model.py'
abs_script_path = os.path.abspath(script_path)

if os.path.exists(script_path):
    print("=" * 70)
    print("üöÄ STARTING TRAINING")
    print("=" * 70)
    print(f"üìÖ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"üìä Sample Size: {SAMPLE_SIZE if SAMPLE_SIZE else 'Full dataset'}")
    print(f"üìÅ Data Path: {os.path.abspath(DATA_PATH)}")
    print(f"üíæ Output Directory: {os.path.abspath('models/')}")
    print(f"üìÇ Working Directory: {os.getcwd()}")
    print("=" * 70)
    print()

    # Run the script from project root (script uses relative paths).
    # It is executed in this process so its output appears in real time.
    try:
        print("üìñ Reading training script...")
        with open(script_path, 'r', encoding='utf-8') as f:
            script_code = f.read()
        print("‚úÖ Script loaded. Starting execution...\n")

        # Compile with the script's real path so that tracebacks show the
        # file name and offending source line instead of an opaque "<string>".
        compiled_script = compile(script_code, abs_script_path, 'exec')

        # Execute in a fresh namespace that looks like a normal script run
        exec(compiled_script, {'__name__': '__main__', '__file__': abs_script_path})

        print("\n" + "=" * 70)
        print(f"‚úÖ TRAINING COMPLETED")
        print(f"üìÖ End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 70)

    except KeyboardInterrupt:
        print("\n\n‚ö†Ô∏è Training interrupted by user")
    except Exception as e:
        # Best effort: report the failure with a full traceback but keep the
        # notebook alive so the remaining cells can still be used.
        print(f"\n‚ùå Error during training: {e}")
        import traceback
        traceback.print_exc()
else:
    print("‚ùå train_model.py not found!")
    print(f"  Expected location: {abs_script_path}")
    print("  Please ensure the script is in the scripts/ directory")


In [None]:
# ============================================================================
# Check Training Results
# ============================================================================

import os
from pathlib import Path

# Models directory relative to project directory
models_dir = Path('models')

if models_dir.exists():
    print("üìÅ Training Output Files:")
    print("=" * 70)
    print(f"  Location: {os.path.abspath(models_dir)}")
    print()

    # Named `output_entries` rather than `files` so this cell does not clobber
    # the `files` module that other cells import from google.colab.
    output_entries = sorted(models_dir.glob('*'))
    if output_entries:
        for entry in output_entries:
            if entry.is_file():
                size_mb = entry.stat().st_size / (1024 * 1024)
                print(f"üìÑ {entry.name} ({size_mb:.2f} MB)")
            else:
                # Directories are listed by name only, with a trailing slash
                print(f"üìÅ {entry.name}/")
    else:
        print("  No files found yet. Training may still be in progress.")

    # Show the training summary, if one has been written to the models directory
    summary_file = models_dir / 'training_summary.txt'
    if summary_file.exists():
        print("\n" + "=" * 70)
        print("TRAINING SUMMARY")
        print("=" * 70)
        # Explicit encoding keeps the read independent of the platform
        # default; undecodable bytes are replaced instead of aborting a cell
        # that only displays text.
        with open(summary_file, 'r', encoding='utf-8', errors='replace') as summary_handle:
            print(summary_handle.read())
else:
    print("‚ö† Models directory not found")
    print(f"  Expected location: {os.path.abspath(models_dir)}")


In [None]:
# ============================================================================
# Download Results (Optional)
# ============================================================================

import os
import zipfile
from pathlib import Path

from google.colab import files

# Models directory relative to project directory
models_dir = Path('models')

if models_dir.exists() and any(models_dir.iterdir()):
    print("üì• Download training results?")
    print(f"  Models directory: {os.path.abspath(models_dir)}")
    print("  Uncomment the code below to download all model files as a zip")

    # Uncomment to enable download
    # zip_path = '/content/training_results.zip'
    # with zipfile.ZipFile(zip_path, 'w') as zipf:
    #     for file in models_dir.rglob('*'):
    #         if file.is_file():
    #             # `file` is already relative to the project directory
    #             # (models_dir is a relative path), so it is used directly as
    #             # the archive name. Calling file.relative_to(os.getcwd())
    #             # would raise ValueError: a relative path cannot be made
    #             # relative to an absolute one.
    #             zipf.write(file, arcname=str(file))
    #
    # files.download(zip_path)
    # print("‚úÖ Download started!")
else:
    print("‚ö† No results to download yet")
    print(f"  Models directory: {os.path.abspath(models_dir)}")


## Notes:

1. **GPU Setup**: Make sure to enable GPU in Colab (Runtime → Change runtime type → GPU)

2. **Project Structure**: After mounting Drive, the notebook changes to the project directory (`/content/drive/MyDrive/DSAN6600final`). All paths are relative to this directory.

3. **Data Path**: Data should be in `data/hi-zh.txt/` relative to project root

4. **Sample Size**: Adjust `SAMPLE_SIZE` in the first cell to control how many pairs to use

5. **Monitoring**: Training progress will be displayed in real-time in the notebook

6. **Results**: All trained models and results will be saved to `models/` directory (relative to project root)
 

## Prediction: Test Your Model

Use the cells below to predict alignment scores for Hindi-Chinese sentence pairs.


In [None]:
# ============================================================================
# Load Prediction Functions
# ============================================================================

import os
import sys
from pathlib import Path

# Register the project's scripts folder as an import location (only once)
scripts_dir = os.path.join(os.getcwd(), 'scripts')
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir)

# Pull in the prediction helpers; report a readable hint if the module
# cannot be imported.
try:
    from predict_quality import predict_single, load_model_and_artifacts, extract_features
except ImportError as import_error:
    print(f"‚ùå Error loading prediction functions: {import_error}")
    print("  Make sure predict_quality.py is in the scripts/ directory")
else:
    print("‚úÖ Prediction functions loaded successfully!")


In [None]:
# ============================================================================
# Predict Alignment Score for a Sentence Pair
# ============================================================================

import os
from pathlib import Path

# Input your sentences here.
# The samples are real UTF-8 text; the previous literals were mis-decoded
# (mojibake), so the model was being fed meaningless character sequences.
hindi_sentence = "मैं एक छात्र हूँ।"  # Change this to your Hindi sentence
chinese_sentence = "我是一名学生。"  # Change this to your Chinese sentence

# Check if model exists: look for saved model files in the models directory
models_dir = Path('models')
model_files = list(models_dir.glob('quality_estimation_*.h5')) + list(models_dir.glob('quality_estimation_*.pkl'))

if model_files:
    print("=" * 70)
    print("üîÆ PREDICTING ALIGNMENT SCORE")
    print("=" * 70)
    print(f"\nüìù Input Sentences:")
    print(f"  Hindi: {hindi_sentence}")
    print(f"  Chinese: {chinese_sentence}")
    print("\nüîÑ Computing prediction...")

    try:
        # predict_single is imported by the "Load Prediction Functions" cell
        score, confidence_interval = predict_single(hindi_sentence, chinese_sentence)

        print("\n" + "=" * 70)
        print("üìä PREDICTION RESULT")
        print("=" * 70)
        print(f"\n‚úÖ Predicted Alignment Score: {score:.4f}")
        print(f"üìà 95% Confidence Interval: [{confidence_interval[0]:.4f}, {confidence_interval[1]:.4f}]")
        print(f"\nüí° Interpretation:")
        print(f"  ‚Ä¢ Higher scores indicate better alignment/similarity")
        print(f"  ‚Ä¢ Typical range: ~1.06 to ~1.24 (based on CCMatrix scores)")
        # Bucket the score into a coarse verbal label
        if score > 1.15:
            print(f"  ‚Ä¢ This pair shows {'strong' if score > 1.20 else 'good'} alignment")
        elif score > 1.10:
            print(f"  ‚Ä¢ This pair shows moderate alignment")
        else:
            print(f"  ‚Ä¢ This pair shows lower alignment")
        print("=" * 70)

    except Exception as e:
        # Report the failure with a traceback instead of stopping the notebook
        print(f"\n‚ùå Error during prediction: {e}")
        import traceback
        traceback.print_exc()
else:
    print("‚ö†Ô∏è No trained model found!")
    print(f"  Expected location: {os.path.abspath(models_dir)}")
    print("  Please run the training cells first to train a model.")


In [None]:
# ============================================================================
# Batch Prediction (Multiple Pairs)
# ============================================================================

from pathlib import Path

# Example: Predict scores for multiple (Hindi, Chinese) sentence pairs.
# The samples are real UTF-8 text; the previous literals were mis-decoded
# (mojibake), so the model was being fed meaningless character sequences.
sentence_pairs = [
    ("मैं एक छात्र हूँ।", "我是一名学生。"),
    ("यह एक अच्छी किताब है।", "这是一本好书。"),
    # Add more pairs here
]

# Look for a trained model here rather than relying on `model_files` left
# behind by the single-pair cell, so this cell also works when run on its own.
models_dir = Path('models')
model_files = list(models_dir.glob('quality_estimation_*.h5')) + list(models_dir.glob('quality_estimation_*.pkl'))

if model_files:
    print("=" * 70)
    print("üîÆ BATCH PREDICTION")
    print("=" * 70)
    print(f"\nüìù Predicting scores for {len(sentence_pairs)} sentence pairs...")

    try:
        from predict_quality import predict_batch

        results = predict_batch(sentence_pairs)

        print("\n" + "=" * 70)
        print("üìä BATCH PREDICTION RESULTS")
        print("=" * 70)

        # One entry per pair; the keys read below are the ones this cell
        # expects predict_batch to provide.
        for i, result in enumerate(results, 1):
            print(f"\n{i}. Pair {i}:")
            print(f"   Hindi: {result['hindi']}")
            print(f"   Chinese: {result['chinese']}")
            print(f"   üìä Score: {result['predicted_score']:.4f}")
            print(f"   üìà CI: [{result['confidence_lower']:.4f}, {result['confidence_upper']:.4f}]")

        print("\n" + "=" * 70)

        # Create a summary DataFrame with descriptive statistics
        import pandas as pd
        df_results = pd.DataFrame(results)
        print("\nüìã Summary Table:")
        print(df_results[['predicted_score', 'confidence_lower', 'confidence_upper']].describe())

    except Exception as e:
        # Report the failure with a traceback instead of stopping the notebook
        print(f"\n‚ùå Error during batch prediction: {e}")
        import traceback
        traceback.print_exc()
else:
    print("‚ö†Ô∏è No trained model found! Please run training first.")
