# XGBoost Skill Model Training Notebook

This notebook provides an interactive interface for training, evaluating, and managing XGBoost models for skill inference.

## Workflow:
1. Data preparation and exploration
2. Model training for each skill type
3. Model evaluation against teacher ratings
4. Model versioning and metadata management

## Skills trained:
- Empathy
- Problem Solving
- Self-Regulation
- Resilience

## 1. Setup and Imports

import sys
import logging
from pathlib import Path

In [15]:
import sys
import logging
from pathlib import Path

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

In [19]:
import sys
import logging
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Add parent directory to path
sys.path.append(str(Path.cwd().parent.parent))

# Import our training modules=
from app.ml.train_models import SkillModelTrainer
from app.ml.evaluate_models import ModelEvaluator
from app.ml.model_metadata import ModelRegistry, ModelMetadata
from app.models.assessment import SkillType

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✓ Imports complete")

✓ Imports complete


## 2. Configuration

In [20]:
# Run configuration - UPDATE THESE before executing the rest of the notebook
TRAINING_DATA_PATH = "./data/training_data.csv"  # CSV the trainer reads
TEST_DATA_PATH = "./data/test_data.csv"  # CSV with teacher ratings, read by the evaluator
MODELS_DIR = "./models"  # Where trained models and reports are written
MODEL_VERSION = "1.0.0"  # Version label handed to the trainer for this run

# Ensure the output directory exists (no-op when it is already there)
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

# Echo the effective settings so they are recorded with the notebook output
for label, setting in (
    ("Training data", TRAINING_DATA_PATH),
    ("Test data", TEST_DATA_PATH),
    ("Models directory", MODELS_DIR),
    ("Model version", MODEL_VERSION),
):
    print(f"{label}: {setting}")

Training data: ./data/training_data.csv
Test data: ./data/test_data.csv
Models directory: ./models
Model version: 1.0.0


## 3. Data Exploration

In [None]:
# Load training data
# A relative TRAINING_DATA_PATH is resolved against the kernel's working
# directory, so a missing file is re-raised with that directory and a pointer
# to the setting to change. The exception type stays FileNotFoundError.
try:
    training_df = pd.read_csv(TRAINING_DATA_PATH)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Training data not found at '{TRAINING_DATA_PATH}' "
        f"(working directory: '{Path.cwd()}'). "
        "Update TRAINING_DATA_PATH in the Configuration section."
    ) from exc

print(f"Training dataset shape: {training_df.shape}")
print(f"\nColumns: {list(training_df.columns)}")
print("\nFirst few rows:")
display(training_df.head())

FileNotFoundError: [Errno 2] No such file or directory: './data/training_data.csv'

In [None]:
# Count nulls per column and keep only the columns that have any, worst first
null_counts = training_df.isnull().sum()
missing_data = null_counts[null_counts > 0].sort_values(ascending=False)

if missing_data.empty:
    print("✓ No missing values found")
else:
    print("Missing values per column:")
    print(missing_data)

In [None]:
# Histogram of each target score (one panel per skill) with its mean marked
target_cols = ['empathy_score', 'problem_solving_score', 'self_regulation_score', 'resilience_score']

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()

for ax, col in zip(axes, target_cols):
    if col not in training_df.columns:
        # Keep the panel, but say which column is absent
        ax.text(0.5, 0.5, f'{col} not found', ha='center', va='center')
        continue

    scores = training_df[col]
    ax.hist(scores, bins=20, edgecolor='black')
    ax.set_title(f'{col.replace("_", " ").title()} Distribution')
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    mean_val = scores.mean()
    ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.3f}')
    ax.legend()

plt.tight_layout()
plt.show()

# Descriptive statistics for the target columns that are present
print("\nTarget variable statistics:")
present_targets = [name for name in target_cols if name in training_df.columns]
for col in present_targets:
    print(f"\n{col}:")
    print(training_df[col].describe())

## 4. Model Training

Train XGBoost models for each skill type.

In [None]:
# Build the trainer from the values set in the configuration cell
trainer = SkillModelTrainer(
    data_path=TRAINING_DATA_PATH,
    models_dir=MODELS_DIR,
    model_version=MODEL_VERSION,
)

print("✓ Trainer initialized")
# Report the skills by their enum values rather than their reprs
skill_names = [skill.value for skill in trainer.skill_types]
print(f"Skills to train: {skill_names}")

In [None]:
# Train all skill models
# The whole run is delegated to the trainer built in the previous cell; what is
# trained and where it is saved is decided inside
# SkillModelTrainer.train_all_skills (implementation not visible here).
print("Starting training...\n")
trainer.train_all_skills()
print("\n✓ Training complete!")

### 4.1 Training Individual Skills (Optional)

Train individual skills if needed for debugging or experimentation.

In [None]:
# Train a specific skill (uncomment to use)
# skill_type = SkillType.EMPATHY  # Change this to train different skills
# 
# df = trainer.load_data()
# X, y, feature_names = trainer.prepare_data(df, skill_type)
# model, metrics = trainer.train_model(X, y, skill_type)
# trainer.save_model(model, feature_names, skill_type, metrics, len(X))
# 
# print(f"\n✓ {skill_type.value} model trained successfully")
# print(f"Metrics: {metrics}")

## 5. Model Evaluation

Evaluate trained models against teacher ratings.

In [None]:
# Create the evaluator for the models directory and the test data file
evaluator = ModelEvaluator(models_dir=MODELS_DIR, test_data_path=TEST_DATA_PATH)

print("✓ Evaluator initialized")
# Show which models the evaluator picked up
loaded_models = list(evaluator.models.keys())
print(f"Loaded models: {loaded_models}")

In [None]:
# Score every skill model; `results` is reused by the visualization cells below
results = evaluator.evaluate_all_skills()

# Write the report to evaluation_report.json inside the models directory
report_path = Path(MODELS_DIR, "evaluation_report.json")
evaluator.save_evaluation_report(results, str(report_path))

print(f"\n✓ Evaluation complete. Report saved to {report_path}")

### 5.1 Visualization of Results

In [None]:
# Extract metrics for visualization
# Keep the raw result keys so every metric is read straight from `results`,
# rather than rebuilding the key from its display label.
skill_keys = [
    key for key, metrics in results.items()
    if key != 'summary' and isinstance(metrics, dict)
]
skills = [key.replace('_', ' ').title() for key in skill_keys]


def _metric_values(metric_name, scale=1):
    """Return `metric_name` for each skill in plot order, multiplied by `scale`.

    Skills whose metrics dict lacks `metric_name` contribute 0.
    """
    return [results[key].get(metric_name, 0) * scale for key in skill_keys]


pearson_scores = _metric_values('pearson_r')
spearman_scores = _metric_values('spearman_r')
rmse_scores = _metric_values('rmse')
mae_scores = _metric_values('mae')
r2_scores = _metric_values('r2_score')

# Tolerance metrics are stored as fractions; plot them as percentages
tolerance_10 = _metric_values('within_0.1', scale=100)
tolerance_15 = _metric_values('within_0.15', scale=100)
tolerance_20 = _metric_values('within_0.2', scale=100)

# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
x = np.arange(len(skills))  # one tick position per skill, shared by all panels

# Correlation metrics
width = 0.35
ax = axes[0, 0]
ax.bar(x - width/2, pearson_scores, width, label='Pearson r', alpha=0.8)
ax.bar(x + width/2, spearman_scores, width, label='Spearman r', alpha=0.8)
# The target line is drawn before legend() so that its label is included
ax.axhline(y=0.7, color='r', linestyle='--', alpha=0.3, label='Target: 0.7')
ax.set_ylabel('Correlation Coefficient')
ax.set_title('Correlation with Teacher Ratings')
ax.set_xticks(x)
ax.set_xticklabels(skills, rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Error metrics
ax = axes[0, 1]
ax.bar(x - width/2, rmse_scores, width, label='RMSE', alpha=0.8)
ax.bar(x + width/2, mae_scores, width, label='MAE', alpha=0.8)
ax.set_ylabel('Error')
ax.set_title('Prediction Error Metrics')
ax.set_xticks(x)
ax.set_xticklabels(skills, rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# R² scores
ax = axes[1, 0]
ax.bar(x, r2_scores, color='green', alpha=0.7)
ax.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='Target: 0.5')
ax.set_ylabel('R² Score')
ax.set_title('R² Scores by Skill')
# Fix the tick positions before labelling them, as the other panels do
ax.set_xticks(x)
ax.set_xticklabels(skills, rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Tolerance metrics
width = 0.25
ax = axes[1, 1]
ax.bar(x - width, tolerance_10, width, label='Within ±0.1', alpha=0.8)
ax.bar(x, tolerance_15, width, label='Within ±0.15', alpha=0.8)
ax.bar(x + width, tolerance_20, width, label='Within ±0.2', alpha=0.8)
ax.set_ylabel('Percentage (%)')
ax.set_title('Prediction Tolerance')
ax.set_xticks(x)
ax.set_xticklabels(skills, rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# One formatted row per skill; the 'summary' entry is printed separately below
per_skill = {
    name: values
    for name, values in results.items()
    if name != 'summary' and isinstance(values, dict)
}

summary_data = [
    {
        'Skill': name.replace('_', ' ').title(),
        'Pearson r': f"{values.get('pearson_r', 0):.3f}",
        'Spearman r': f"{values.get('spearman_r', 0):.3f}",
        'RMSE': f"{values.get('rmse', 0):.3f}",
        'MAE': f"{values.get('mae', 0):.3f}",
        'R²': f"{values.get('r2_score', 0):.3f}",
        'Within ±0.1': f"{values.get('within_0.1', 0)*100:.1f}%"
    }
    for name, values in per_skill.items()
]

summary_df = pd.DataFrame(summary_data)
print("\nModel Performance Summary:")
display(summary_df)

# Cross-skill figures, shown only when the evaluator supplied a 'summary' entry
if 'summary' in results:
    print("\nOverall Averages:")
    for metric, value in results['summary'].items():
        print(f"  {metric}: {value:.3f}")

## 6. Model Registry and Metadata

View and manage model versions and metadata.

In [None]:
# Open the registry for the models directory and fetch every registered entry
registry = ModelRegistry(models_dir=MODELS_DIR)
models = registry.list_models()

print("Registered Models:")
print("=" * 80)

# (label, metadata attribute) pairs printed for each model, in display order
overview_fields = (
    ("Version", "version"),
    ("Training Date", "training_date"),
    ("Model Type", "model_type"),
    ("Training Samples", "training_samples"),
    ("Feature Count", "feature_count"),
)

for skill_type, metadata in models.items():
    print(f"\n{skill_type.upper()}:")
    for label, attr in overview_fields:
        print(f"  {label}: {getattr(metadata, attr)}")

    print("  Performance:")
    for metric, value in metadata.performance_metrics.items():
        print(f"    {metric}: {value:.4f}")

    # Only the first 16 characters of the checksum are shown
    print(f"  Checksum: {metadata.model_checksum[:16]}...")

    # Ask the registry to verify the stored model and report the outcome
    if registry.verify_model_integrity(skill_type):
        status = "✓ Valid"
    else:
        status = "✗ Corrupted"
    print(f"  Integrity: {status}")

In [None]:
# Print the hyperparameters recorded for each registered model, grouped by skill
print("\nModel Hyperparameters:", "=" * 80, sep="\n")

for skill_name, model_meta in models.items():
    print(f"\n{skill_name.upper()}:")
    for param, value in model_meta.hyperparameters.items():
        print(f"  {param}: {value}")

## 7. Feature Importance Analysis

In [None]:
import joblib

# Plot the 15 highest feature importances of each saved skill model, one panel per skill
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

skills_to_plot = [SkillType.EMPATHY, SkillType.PROBLEM_SOLVING,
                  SkillType.SELF_REGULATION, SkillType.RESILIENCE]

for ax, skill_type in zip(axes, skills_to_plot):
    model_path = Path(MODELS_DIR) / f"{skill_type.value}_model.pkl"
    features_path = Path(MODELS_DIR) / f"{skill_type.value}_features.pkl"

    if not (model_path.exists() and features_path.exists()):
        # Nothing saved for this skill: leave a placeholder message in its panel
        ax.text(0.5, 0.5, f'Model not found for {skill_type.value}',
                ha='center', va='center', transform=ax.transAxes)
        continue

    # joblib.load unpickles the file - only load artifacts from a trusted directory
    model = joblib.load(model_path)
    feature_names = joblib.load(features_path)

    importance = model.feature_importances_

    # Indices of the largest importances, in descending order (at most 15)
    top_indices = np.argsort(importance)[::-1][:15]
    bar_positions = range(len(top_indices))

    ax.barh(bar_positions, importance[top_indices], alpha=0.8)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels([feature_names[i] for i in top_indices])
    ax.set_xlabel('Importance')
    ax.set_title(f'{skill_type.value.replace("_", " ").title()} - Top 15 Features')
    ax.invert_yaxis()  # most important feature at the top
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Predictions on New Data

In [None]:
# Example: read the test set that the prediction cells below operate on
test_df = pd.read_csv(TEST_DATA_PATH)

# Shape first, then a preview of the leading rows
print(f"Test data shape: {test_df.shape}", "\nFirst few rows:", sep="\n")
display(test_df.head())

In [None]:
# Make predictions for all skills
# Results are stored in `predictions` (keyed by skill value) and also added to
# `test_df` as '<skill>_predicted' columns, which the later cells read.
predictions = {}
skipped_skills = []  # skills whose model or feature file is missing

for skill_type in [SkillType.EMPATHY, SkillType.PROBLEM_SOLVING,
                   SkillType.SELF_REGULATION, SkillType.RESILIENCE]:
    model_path = Path(MODELS_DIR) / f"{skill_type.value}_model.pkl"
    features_path = Path(MODELS_DIR) / f"{skill_type.value}_features.pkl"

    if not (model_path.exists() and features_path.exists()):
        # Record the gap instead of skipping silently
        skipped_skills.append(skill_type.value)
        continue

    # joblib.load unpickles the file - only load artifacts from a trusted directory
    model = joblib.load(model_path)

    # Extract features
    # NOTE(review): the saved feature-name list used to be loaded here but was
    # never used; column selection and order are presumably handled by
    # evaluator.extract_features - confirm they match what the model was trained on.
    X = evaluator.extract_features(test_df, skill_type)

    # Make predictions, clamped to the [0, 1] interval
    y_pred = np.clip(model.predict(X), 0.0, 1.0)

    predictions[skill_type.value] = y_pred
    test_df[f'{skill_type.value}_predicted'] = y_pred

# Report success only when at least one model actually produced predictions
if predictions:
    print("✓ Predictions complete")
else:
    print("✗ No predictions made - no trained models were found")
print(f"\nPrediction columns added: {[f'{s}_predicted' for s in predictions.keys()]}")
if skipped_skills:
    print(f"Skipped (model or feature file missing in {MODELS_DIR}): {skipped_skills}")

In [None]:
# Scatter each skill's model prediction against the teacher rating, one panel per skill
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

skill_types = [SkillType.EMPATHY, SkillType.PROBLEM_SOLVING,
               SkillType.SELF_REGULATION, SkillType.RESILIENCE]

for ax, skill_type in zip(axes, skill_types):
    teacher_col = f"{skill_type.value}_teacher"
    pred_col = f"{skill_type.value}_predicted"

    if not (teacher_col in test_df.columns and pred_col in test_df.columns):
        # Either the ratings or the predictions are missing for this skill
        ax.text(0.5, 0.5, 'Data not available',
                ha='center', va='center', transform=ax.transAxes)
        continue

    ax.scatter(test_df[teacher_col], test_df[pred_col], alpha=0.5)
    # Diagonal reference line: points on it are exact matches
    ax.plot([0, 1], [0, 1], 'r--', alpha=0.5, label='Perfect prediction')
    ax.set_xlabel('Teacher Rating')
    ax.set_ylabel('Model Prediction')
    ax.set_title(f'{skill_type.value.replace("_", " ").title()}')
    ax.legend()
    ax.grid(alpha=0.3)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])

plt.tight_layout()
plt.show()

## 9. Export Results

In [None]:
# Write the test frame to test_predictions.csv in the models directory, without the index
output_path = Path(MODELS_DIR, "test_predictions.csv")
test_df.to_csv(output_path, index=False)

saved_columns = list(test_df.columns)
print(f"✓ Predictions saved to {output_path}")
print(f"\nSaved columns: {saved_columns}")

## 10. Summary

The training workflow is complete once every cell above has run without errors. At that point, this notebook has:

1. ✓ Loaded and explored training data
2. ✓ Trained XGBoost models for all skill types
3. ✓ Evaluated models against teacher ratings
4. ✓ Visualized model performance
5. ✓ Analyzed feature importance
6. ✓ Made predictions on test data
7. ✓ Saved results and model metadata

### Next Steps:

- Review model performance metrics and identify areas for improvement
- Experiment with hyperparameter tuning
- Collect more training data to improve model accuracy
- Deploy models to production environment
- Set up monitoring for model performance drift