# 🧠 AI Scheduler Recommender — Training & Analysis

This notebook generates synthetic CPU scheduling workloads, trains an ML model to predict the best scheduling algorithm, and provides interactive analysis tools.

**Pipeline:**
1. Generate 10,000 synthetic workloads
2. Extract features and label each with the best algorithm
3. Train Random Forest + XGBoost classifiers
4. Evaluate accuracy, confusion matrix, and feature importance
5. Interactive prediction for new workloads
6. Save trained model for the Flask API

In [None]:
import sys
sys.path.insert(0, '../backend')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

from ai.dataset_generator import DatasetGenerator
from ai.feature_engineering import FEATURE_NAMES

plt.style.use('dark_background')
sns.set_palette('husl')
%matplotlib inline

print('âœ… Imports ready')

## 1. Generate Training Dataset

In [None]:
# Build the synthetic dataset. seed=42 is passed to the generator
# (presumably so reruns produce the same workloads -- confirm in DatasetGenerator).
gen = DatasetGenerator(seed=42)
X_raw, y_raw, records = gen.generate(n_samples=10000)

# Copy the generator output into NumPy arrays for the estimators below.
X, y = np.array(X_raw), np.array(y_raw)

n_rows, n_cols = X.shape[:2]
label_summary = np.unique(y, return_counts=True)  # (distinct labels, their counts)

print(f'Dataset: {n_rows} samples, {n_cols} features')
print(f'Labels: {label_summary}')

## 2. Visualize Workload Distributions

In [None]:
# Labelled feature table: one column per FEATURE_NAMES entry plus the target label.
df = pd.DataFrame(X, columns=FEATURE_NAMES).assign(best_algorithm=y)

# Features shown in the overview grid (must be present in FEATURE_NAMES).
hist_features = [
    'mean_burst',
    'std_burst',
    'n_processes',
    'mean_arrival',
    'cv_burst',
    'cpu_bound_ratio',
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig.suptitle('Workload Feature Distributions', fontsize=14, fontweight='bold')

# One histogram per panel; the feature name goes in the title, so the x-label is blank.
for panel, feature in zip(axes.flat, hist_features):
    panel.hist(df[feature], bins=30, alpha=0.7, edgecolor='white', linewidth=0.5)
    panel.set_title(feature, fontsize=10)
    panel.set_xlabel('')

plt.tight_layout()
plt.show()

In [None]:
# Label distribution: how many workloads were labelled with each algorithm.
algo_counts = df['best_algorithm'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
algo_counts.plot.bar(ax=ax, edgecolor='white', linewidth=0.5)
ax.set_title('Best Algorithm Distribution', fontweight='bold')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3. Train Models

In [None]:
# Hold out 20% of the samples for evaluation; stratify=y is passed so the
# split is stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Random Forest: fit on the training split, then score on the held-out split.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

# Headline accuracy, a blank line, then the per-class report.
print(f'Random Forest Accuracy: {rf_acc:.4f}', end='\n\n')
print(classification_report(y_test, rf_pred))

In [None]:
# Cross-validation
# 5-fold cross-validation of the Random Forest configuration over the full
# dataset (X, y), not just the training split.
cv_scores = cross_val_score(rf, X, y, cv=5, n_jobs=-1)

# The plus-minus sign was stored as mis-decoded UTF-8; print the intended '±'.
print(f'5-Fold CV: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}')
print(f'Per-fold:  {[f"{s:.4f}" for s in cv_scores]}')

In [None]:
# XGBoost (optional)
# The whole experiment sits inside the try block, so a missing xgboost
# install only prints an installation hint instead of failing the cell.
try:
    from xgboost import XGBClassifier
    from sklearn.preprocessing import LabelEncoder

    # Labels are integer-encoded before fitting; the encoder's classes_ are
    # reused below as readable names in the report.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X,
        y_encoded,
        test_size=0.2,
        random_state=42,
        stratify=y_encoded,
    )

    xgb = XGBClassifier(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        random_state=42,
        eval_metric='mlogloss',
    )
    xgb.fit(X_tr, y_tr)
    xgb_pred = xgb.predict(X_te)
    xgb_acc = accuracy_score(y_te, xgb_pred)

    # Headline accuracy, a blank line, then the per-class report.
    print(f'XGBoost Accuracy: {xgb_acc:.4f}', end='\n\n')
    print(classification_report(y_te, xgb_pred, target_names=le.classes_))
except ImportError:
    print('XGBoost not installed. Run: pip install xgboost')

## 4. Accuracy Results Summary

In [None]:
# Model comparison table.
#
# Missing metrics are stored as NaN rather than the string 'N/A': a '{:.4f}'
# formatter raises ValueError on a str, so the styled table could never render
# while 'N/A' sat in a formatted column. `na_rep` restores the 'N/A' display.
#
# xgb_acc only exists when the optional XGBoost cell ran successfully, so it is
# looked up in the notebook namespace with a NaN fallback.
xgb_accuracy = globals().get('xgb_acc', np.nan)

results_df = pd.DataFrame({
    'Model': ['Random Forest', 'XGBoost'],
    'Accuracy': [rf_acc, xgb_accuracy],
    'CV Mean': [cv_scores.mean(), np.nan],  # cross-validation was only run for the Random Forest
})
results_df.style.format({'Accuracy': '{:.4f}', 'CV Mean': '{:.4f}'}, na_rep='N/A').set_caption('Model Comparison')

## 5. Confusion Matrix

In [None]:
# Fix the class order once so the matrix rows/columns and both sets of tick
# labels use the same ordering.
classes = sorted(np.unique(y))
cm = confusion_matrix(y_test, rf_pred, labels=classes)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,       # write the count in every cell
    fmt='d',          # counts are integers
    cmap='viridis',
    xticklabels=classes,
    yticklabels=classes,
    ax=ax,
)
ax.set_title('Random Forest Confusion Matrix', fontweight='bold')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.tight_layout()
plt.show()

## 6. Feature Importance

In [None]:
# Pair each Random Forest importance score with its feature name, sorted ascending.
importances = pd.Series(rf.feature_importances_, index=FEATURE_NAMES).sort_values()

fig, ax = plt.subplots(figsize=(10, 6))
importances.plot.barh(ax=ax, edgecolor='white', linewidth=0.5)
ax.set_title('Feature Importance (Random Forest)', fontweight='bold')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()

## 7. Predict Best Algorithm for New Workload

In [None]:
from ai.feature_engineering import extract_features, features_to_vector

# Example workload: one dict per process with its arrival time, CPU burst
# length and priority.
new_workload = [
    {'arrival': 0, 'burst': 8, 'priority': 2},
    {'arrival': 1, 'burst': 4, 'priority': 1},
    {'arrival': 2, 'burst': 9, 'priority': 3},
    {'arrival': 3, 'burst': 5, 'priority': 2},
    {'arrival': 4, 'burst': 2, 'priority': 4},
]

# Turn the workload into a model input vector
# (presumably ordered like FEATURE_NAMES -- confirm in ai.feature_engineering).
features = extract_features(new_workload, time_quantum=2)
feat_vec = features_to_vector(features)

# The estimator expects a batch of samples, so the single vector is wrapped in
# a list and the first (only) result is taken.
prediction = rf.predict([feat_vec])[0]
probas = rf.predict_proba([feat_vec])[0]

print(f'Recommended: {prediction}')
print(f'Confidence:  {max(probas)*100:.1f}%')
print()
# Per-class probabilities, most likely first, each with a text bar of up to 40 cells.
for cls, prob in sorted(zip(rf.classes_, probas), key=lambda x: -x[1]):
    # The bar glyph was stored as mis-decoded UTF-8; use the intended full block.
    bar = '█' * int(prob * 40)
    print(f'  {cls:12s} {prob*100:5.1f}% {bar}')

## 8. Save Model for API

In [None]:
# Serialize the fitted Random Forest into the backend package directory.
# The path is relative to the notebook's working directory.
model_path = '../backend/ai/model.joblib'
joblib.dump(rf, model_path)

print(
    f'Model saved to {model_path}',
    '   The Flask API will load this model automatically.',
    sep='\n',
)