# IntelliStore ML: Model Training

This notebook trains machine learning models to predict hot/cold object tiering based on access patterns.

In [None]:
import pandas as pd
import numpy as np
import json
import joblib
import onnx
import onnxruntime as ort
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import warnings
warnings.filterwarnings('ignore')

# Set style
# Order matters: the matplotlib style sheet is applied first and the seaborn
# palette second, so the palette call is the last one to touch the colour cycle.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Reaching this line means every import in this cell resolved.
print("Libraries imported successfully!")

## 1. Load and Prepare Data

In [None]:
# Read the processed access-log table and report its dimensions.
df = pd.read_csv('../data/processed_access_logs.csv')
loaded_shape = df.shape
print(f"Loaded data shape: {loaded_shape}")

# Read the JSON side-car describing the features.
with open('../data/feature_metadata.json', 'r') as metadata_file:
    feature_metadata = json.load(metadata_file)

# Share of each `is_hot` class, as fractions rather than raw counts.
target_share = df['is_hot'].value_counts(normalize=True)
print("Target distribution:")
print(target_share)

# Column dtypes / non-null counts / memory footprint.
df.info()

## 2. Feature Engineering and Preprocessing

In [None]:
def prepare_features(df):
    """Build the model inputs from the processed access-log frame.

    Categorical columns that are present are label-encoded into
    ``<name>_encoded`` columns, the candidate feature list is narrowed to the
    columns that actually exist, and missing feature values are replaced by
    the per-column median.

    Args:
        df: DataFrame holding an ``is_hot`` column plus any subset of the
            candidate feature columns. It is copied, never modified.

    Returns:
        A 4-tuple ``(X, y, feature_columns, label_encoders)``: the feature
        frame, the ``is_hot`` target series, the list of feature names in
        ``X``, and a dict mapping each encoded categorical column name to its
        fitted ``LabelEncoder``.

    Raises:
        KeyError: if ``df`` has no ``is_hot`` column.
    """
    # All derived columns go on a private copy.
    data = df.copy()

    # Encode each categorical column that is present; absent ones are skipped.
    label_encoders = {}
    for name in ('size_category', 'current_tier'):
        if name not in data.columns:
            continue
        encoder = LabelEncoder()
        # Values are cast to str before fitting so the encoder sees one dtype.
        data[f'{name}_encoded'] = encoder.fit_transform(data[name].astype(str))
        label_encoders[name] = encoder

    # Candidate inputs, in the order they should appear in the feature matrix.
    candidate_features = (
        'hour_of_day', 'day_of_week', 'is_weekend', 'is_business_hours',
        'object_age_days', 'size', 'is_media', 'user_activity_level',
        'bucket_popularity', 'access_count_7d', 'download_count_7d',
        'unique_users_7d', 'avg_daily_access', 'last_access_hours_ago',
        'recent_access_trend', 'size_category_encoded', 'current_tier_encoded',
    )
    available = set(data.columns)
    feature_columns = [name for name in candidate_features if name in available]

    features = data[feature_columns]
    y = data['is_hot']

    # Median imputation, computed column by column over the whole frame.
    column_medians = features.median()
    X = features.fillna(column_medians)

    return X, y, feature_columns, label_encoders

# Build the feature matrix, target and preprocessing artefacts in one call.
prepared = prepare_features(df)
X, y, feature_columns, label_encoders = prepared
print(f"Feature matrix shape: {X.shape}")
print(f"Features: {feature_columns}")

# 80/20 hold-out split; stratifying on y keeps the hot/cold ratio the same in
# both parts, and the fixed seed makes the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_balance = y_train.value_counts(normalize=True)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
print(f"Training target distribution: {train_balance}")

## 3. Model Training and Comparison

In [None]:
# Candidate classifiers, each seeded so reruns give the same fit.
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1),
}

# Per-model fitted estimator, hold-out predictions and scores, keyed by name.
model_results = {}

for name in models:
    model = models[name]
    print(f"\n=== Training {name} ===")

    # Fit on the training split, then score the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # 5-fold ROC AUC on the training split only.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    roc_auc = roc_auc_score(y_test, y_pred_proba)

    model_results[name] = dict(
        model=model,
        y_pred=y_pred,
        y_pred_proba=y_pred_proba,
        roc_auc=roc_auc,
        cv_mean=cv_mean,
        cv_std=cv_std,
    )

    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"CV ROC AUC: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# One summary row per model, in training order.
comparison_df = pd.DataFrame(
    [
        {
            'Model': model_name,
            'ROC_AUC': scores['roc_auc'],
            'CV_Mean': scores['cv_mean'],
            'CV_Std': scores['cv_std'],
        }
        for model_name, scores in model_results.items()
    ],
    columns=['Model', 'ROC_AUC', 'CV_Mean', 'CV_Std'],
)

print("\n=== Model Comparison ===")
print(comparison_df.round(4))

## 4. Hyperparameter Tuning

In [None]:
# Select the model to tune (usually XGBoost or LightGBM).
# The choice is made on the cross-validated AUC from the training split, not
# on the test-set AUC: picking the winner on the test set and then reporting
# that same test-set score would make the final number optimistically biased.
best_model_name = comparison_df.loc[comparison_df['CV_Mean'].idxmax(), 'Model']
print(f"Best model: {best_model_name}")

# Search space shared by both gradient-boosting libraries; their sklearn-style
# wrappers accept the same parameter names.
boosting_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

if best_model_name == 'XGBoost':
    param_grid = boosting_param_grid
    base_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

elif best_model_name == 'LightGBM':
    param_grid = boosting_param_grid
    # LightGBM only applies `subsample` when bagging is switched on through
    # subsample_freq > 0. With the default of 0 the two `subsample` values in
    # the grid would train identical models and double the search cost.
    base_model = lgb.LGBMClassifier(random_state=42, verbose=-1, subsample_freq=1)

else:  # RandomForest
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    base_model = RandomForestClassifier(random_state=42)

# Exhaustive grid search: 3-fold CV on the training split, scored by ROC AUC,
# using every available core.
print(f"\nPerforming hyperparameter tuning for {best_model_name}...")
grid_search = GridSearchCV(
    base_model, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1
)

grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Evaluate the refitted best estimator once on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class.
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]
roc_auc_best = roc_auc_score(y_test, y_pred_proba_best)

print(f"\nBest model test ROC AUC: {roc_auc_best:.4f}")
print("\nBest model classification report:")
print(classification_report(y_test, y_pred_best))

## 5. Model Evaluation and Visualization

In [None]:
# These two metrics are only used by this cell, so they are imported once here
# rather than on every iteration of the threshold sweep below.
from sklearn.metrics import precision_score, recall_score

# --- Figure 1: ROC comparison | confusion matrix | feature importance ---
plt.figure(figsize=(15, 5))

# ROC curves for all baseline models, computed from their test-set probabilities.
plt.subplot(1, 3, 1)
for name, results in model_results.items():
    fpr, tpr, _ = roc_curve(y_test, results['y_pred_proba'])
    plt.plot(fpr, tpr, label=f"{name} (AUC = {results['roc_auc']:.3f})")

# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Model Comparison')
plt.legend()
plt.grid(True)

# Confusion matrix for the tuned best model (rows = actual, columns = predicted).
plt.subplot(1, 3, 2)
cm = confusion_matrix(y_test, y_pred_best)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Feature importance; the panel stays empty if the estimator does not expose
# `feature_importances_`.
plt.subplot(1, 3, 3)
if hasattr(best_model, 'feature_importances_'):
    # Sorted ascending so the most important feature ends up at the top of
    # the horizontal bar chart.
    importance_df = pd.DataFrame({
        'feature': feature_columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=True)

    bar_positions = range(len(importance_df))
    plt.barh(bar_positions, importance_df['importance'])
    plt.yticks(bar_positions, importance_df['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.grid(True, axis='x')

plt.tight_layout()
plt.show()

# --- Figure 2: predicted-probability distribution | precision/recall sweep ---
plt.figure(figsize=(12, 4))

# Predicted probabilities split by the actual class; density=True makes the two
# histograms comparable even when the classes differ in size.
plt.subplot(1, 2, 1)
plt.hist(y_pred_proba_best[y_test == 0], bins=30, alpha=0.7, label='Cold (Actual)', density=True)
plt.hist(y_pred_proba_best[y_test == 1], bins=30, alpha=0.7, label='Hot (Actual)', density=True)
plt.xlabel('Predicted Probability')
plt.ylabel('Density')
plt.title('Prediction Probability Distribution')
plt.legend()
plt.grid(True)

# Precision and recall at decision thresholds 0.1, 0.2, ..., 0.9.
plt.subplot(1, 2, 2)
thresholds = np.arange(0.1, 1.0, 0.1)
thresholded_predictions = [
    (y_pred_proba_best >= threshold).astype(int) for threshold in thresholds
]
# zero_division=0 pins the score to 0 when a threshold leaves no positive
# predictions (or no positive labels), instead of relying on the library's
# warn-and-return-0 default.
precisions = [
    precision_score(y_test, predicted, zero_division=0)
    for predicted in thresholded_predictions
]
recalls = [
    recall_score(y_test, predicted, zero_division=0)
    for predicted in thresholded_predictions
]

plt.plot(thresholds, precisions, 'o-', label='Precision')
plt.plot(thresholds, recalls, 's-', label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision-Recall vs Threshold')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

## 6. Model Export

In [None]:
# Save the best model
model_dir = '../models'
import os
os.makedirs(model_dir, exist_ok=True)

# Save as joblib (for Python)
joblib.dump(best_model, f'{model_dir}/tiering_model.joblib')
print(f"Saved model to {model_dir}/tiering_model.joblib")

# Save preprocessing objects: the feature order and fitted label encoders are
# needed to rebuild the exact input matrix at inference time.
preprocessing_objects = {
    'feature_columns': feature_columns,
    'label_encoders': label_encoders
}
joblib.dump(preprocessing_objects, f'{model_dir}/preprocessing.joblib')
print(f"Saved preprocessing objects to {model_dir}/preprocessing.joblib")

# Convert to ONNX for production inference. This step is best-effort: any
# failure is reported and the notebook carries on with the joblib artefacts.
# NOTE(review): skl2onnx ships converters for scikit-learn estimators; if the
# tuned model is XGBoost or LightGBM the conversion is expected to raise here
# unless their converters were registered elsewhere — confirm.
try:
    # One float32 input tensor: any batch size, one column per feature.
    initial_type = [('float_input', FloatTensorType([None, len(feature_columns)]))]

    # Disable the ZipMap post-processing step. By default skl2onnx wraps
    # classifier probabilities in a list of {label: prob} dicts, which cannot
    # be sliced as `[:5, 1]` below; with zipmap off the second output is a
    # plain (n_samples, n_classes) tensor.
    onnx_model = convert_sklearn(
        best_model,
        initial_types=initial_type,
        options={id(best_model): {'zipmap': False}},
    )

    # Save ONNX model
    with open(f'{model_dir}/tiering_model.onnx', 'wb') as f:
        f.write(onnx_model.SerializeToString())

    print(f"Saved ONNX model to {model_dir}/tiering_model.onnx")

    # Smoke-test the exported file by running it through onnxruntime.
    ort_session = ort.InferenceSession(f'{model_dir}/tiering_model.onnx')

    # The graph was declared with a float32 input, so cast the sample rows.
    test_input = X_test.iloc[:5].values.astype(np.float32)
    ort_inputs = {ort_session.get_inputs()[0].name: test_input}
    ort_outputs = ort_session.run(None, ort_inputs)

    print("ONNX model test successful!")
    # Output 0 is the predicted label, output 1 the class probabilities;
    # column 1 is the probability of the positive ("hot") class.
    print(f"Sample predictions: {ort_outputs[1][:5, 1]}")

except Exception as e:
    print(f"ONNX conversion failed: {e}")
    print("Continuing without ONNX model...")

# Save model metadata alongside the artefacts so a consumer can tell which
# model, features and scores a given file set corresponds to.
model_metadata = {
    'model_type': best_model_name,
    'model_version': '1.0.0',
    'training_date': pd.Timestamp.now().isoformat(),
    'features': feature_columns,
    'target': 'is_hot',
    'metrics': {
        # Cast to built-in float so the values are JSON-serialisable.
        'roc_auc': float(roc_auc_best),
        'cv_score': float(grid_search.best_score_)
    },
    'hyperparameters': grid_search.best_params_,
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'class_distribution': {
        # is_hot is summed directly, so the sum is the count of hot rows.
        'hot': int(y_train.sum()),
        'cold': int(len(y_train) - y_train.sum())
    }
}

with open(f'{model_dir}/model_metadata.json', 'w') as f:
    json.dump(model_metadata, f, indent=2)

print(f"Saved model metadata to {model_dir}/model_metadata.json")

print("\n=== Model Training Complete ===")
print(f"Best model: {best_model_name}")
print(f"ROC AUC: {roc_auc_best:.4f}")
print(f"Model files saved to: {model_dir}/")