# XGBoost Ranker Training

This notebook trains a Learning-to-Rank model using XGBoost for video ranking.

**Goal**: Create `xgboost_ranker.pkl` for ML-based video ranking.

## What This Does:
1. Creates synthetic training data (or use your own)
2. Trains XGBoost ranker
3. Evaluates ranking performance
4. Saves the model

**Upload this notebook to Kaggle and run it there!**

## 1. Install Dependencies

In [None]:
!pip install -q xgboost scikit-learn pandas numpy matplotlib seaborn

## 2. Import Libraries

In [None]:
import os
import pickle
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split

# Confirm the import cell ran and record which XGBoost version is in use
print("‚úÖ Libraries imported successfully!")
print("XGBoost version: " + str(xgb.__version__))

## 3. Generate Synthetic Training Data

**Note**: Replace this with your actual user feedback data for better results!

In [None]:
def generate_synthetic_data(
    n_queries: int = 100,
    videos_per_query: int = 20,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """Generate synthetic video ranking data.

    Every query gets ``videos_per_query`` rows. Each row holds six randomly
    drawn raw features, three features derived from them, and an integer
    relevance label in ``[0, 4]`` computed as a noisy weighted sum.

    Args:
        n_queries: Number of distinct ``query_id`` values (``0 .. n_queries-1``).
        videos_per_query: Number of rows generated for each query.
        seed: Optional seed for reproducible output. When ``None`` (default)
            NumPy's global random state is used, exactly as before; when given,
            a private ``np.random.RandomState(seed)`` is used and the global
            state is left untouched.

    Returns:
        DataFrame with columns ``query_id, views, likes, subscribers,
        relevance, duration, days_old, like_ratio, recency_score,
        duration_penalty, target``, ordered by ``query_id``. The columns are
        present even when no rows are generated.
    """
    columns = [
        'query_id', 'views', 'likes', 'subscribers', 'relevance', 'duration',
        'days_old', 'like_ratio', 'recency_score', 'duration_penalty', 'target',
    ]

    # The global `np.random` module and a RandomState instance expose the same
    # sampling methods, so one code path serves both the seeded and legacy case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = []

    for query_id in range(n_queries):
        for _ in range(videos_per_query):
            # Raw features (the draw order is kept stable so a given seed
            # always yields the same rows)
            views = rng.lognormal(10, 2)  # Log-normal distribution for views
            likes = views * rng.uniform(0.01, 0.1)  # 1-10% like rate
            subscribers = rng.lognormal(8, 3)
            relevance = rng.uniform(0, 1)
            duration = rng.uniform(5, 60)  # 5-60 minutes
            days_old = rng.uniform(0, 365 * 3)  # Up to 3 years old

            # Derived features
            like_ratio = likes / max(views, 1)  # max() guards against tiny view counts
            recency_score = 1 / (1 + days_old / 365)
            duration_penalty = 1 if 10 <= duration <= 30 else 0.5

            # Target: relevance grade (0-4, higher is better).
            # Good videos: high relevance, good engagement, recent.
            target = (
                relevance * 2 +  # Relevance carries the largest weight
                like_ratio * 10 +
                recency_score * 0.5 +
                duration_penalty * 0.5 +
                rng.normal(0, 0.2)  # Add noise
            )

            # XGBoost rank:ndcg requires INTEGER labels: clip to 0-4, then round
            target = int(np.round(np.clip(target, 0, 4)))

            data.append({
                'query_id': query_id,
                'views': views,
                'likes': likes,
                'subscribers': subscribers,
                'relevance': relevance,
                'duration': duration,
                'days_old': days_old,
                'like_ratio': like_ratio,
                'recency_score': recency_score,
                'duration_penalty': duration_penalty,
                'target': target
            })

    # Passing `columns` keeps the schema intact when `data` is empty
    return pd.DataFrame(data, columns=columns)

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# same synthetic dataset (and therefore the same metrics) on every run.
np.random.seed(42)

# Generate data: 200 queries x 30 videos each
df = generate_synthetic_data(n_queries=200, videos_per_query=30)

print(f"\nüìä Generated {len(df)} training samples")
print(f"Queries: {df['query_id'].nunique()}")
print(f"\nFeature columns: {list(df.columns)}")
print(f"\nSample data:")
df.head()

## 4. Data Exploration

In [None]:
# Summary statistics for every column
print("\nüìà Data Statistics:")
print(df.describe())

# Two side-by-side panels: label histogram and feature/label correlations
fig, (ax_hist, ax_corr) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: distribution of the relevance label
ax_hist.hist(df['target'], bins=30, edgecolor='black')
ax_hist.set_xlabel('Target Score')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Target Score Distribution')

# Right panel: Pearson correlation of selected columns with the label,
# strongest first ('target' itself is included and correlates at 1.0)
corr_columns = ['views', 'likes', 'subscribers', 'relevance', 'recency_score', 'target']
correlation = df[corr_columns].corr()['target'].sort_values(ascending=False)
correlation.plot(kind='barh', ax=ax_corr)
ax_corr.set_xlabel('Correlation with Target')
ax_corr.set_title('Feature Correlations')

fig.tight_layout()
plt.show()

## 5. Prepare Training Data

In [None]:
# Feature columns (this order defines the column order of the feature matrix)
feature_cols = ['views', 'likes', 'subscribers', 'relevance', 'duration', 
                'days_old', 'like_ratio', 'recency_score', 'duration_penalty']

# Group sizes are consumed as consecutive row counts, so rows of one query must
# be contiguous and appear in the same order as the group sizes. groupby()
# returns sizes sorted by query_id, so sort the rows the same way. The sort is
# stable, which makes it a no-op for the synthetic data (already ordered) and
# keeps the cell correct when `df` is replaced with unsorted real data.
ranking_df = df.sort_values('query_id', kind='mergesort')

X = ranking_df[feature_cols].values
y = ranking_df['target'].values
groups = ranking_df.groupby('query_id').size().values  # Group sizes for ranking

# Split data
# For ranking, we need to keep queries together, so split on query ids
unique_queries = ranking_df['query_id'].unique()
train_queries, test_queries = train_test_split(unique_queries, test_size=0.2, random_state=42)

# Plain boolean arrays index the NumPy matrices positionally
train_mask = ranking_df['query_id'].isin(train_queries).to_numpy()
test_mask = ranking_df['query_id'].isin(test_queries).to_numpy()

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

train_groups = ranking_df[train_mask].groupby('query_id').size().values
test_groups = ranking_df[test_mask].groupby('query_id').size().values

# Sanity checks: no query leaks across the split and groups cover every row
assert not (train_mask & test_mask).any(), "a query appears in both splits"
assert train_groups.sum() == len(X_train), "train groups do not cover X_train"
assert test_groups.sum() == len(X_test), "test groups do not cover X_test"

print(f"\nüì¶ Data Split:")
print(f"Training: {len(X_train)} samples, {len(train_groups)} queries")
print(f"Testing: {len(X_test)} samples, {len(test_groups)} queries")

## 6. Train XGBoost Ranker

In [None]:
# Wrap each split in a DMatrix and attach its per-query group sizes
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(train_groups)

dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(test_groups)

# Learning-to-rank hyperparameters
params = dict(
    objective='rank:ndcg',  # Ranking objective
    eval_metric='ndcg@10',
    eta=0.1,  # Learning rate
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=42,
)

# Metric history is written into `evals_result` during training
evals_result = {}
evals = [(dtrain, 'train'), (dtest, 'test')]

# NOTE(review): 'test' is the last entry in `evals`, which is presumably the
# one early stopping monitors - the same split used for the reported metrics.
# Consider a separate validation split; confirm against the XGBoost docs.
print("\nüöÄ Training XGBoost Ranker...\n")
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=evals,
    early_stopping_rounds=10,
    evals_result=evals_result,
    verbose_eval=10,
)

print("\n‚úÖ Training complete!")

## 7. Evaluate Model Performance

In [None]:
# Plot training history
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.plot(evals_result['train']['ndcg@10'], label='Train')
plt.plot(evals_result['test']['ndcg@10'], label='Test')
plt.xlabel('Iteration')
plt.ylabel('nDCG@10')
plt.title('Training Progress')
plt.legend()
plt.grid(True, alpha=0.3)

# Feature importance
plt.subplot(1, 2, 2)
importance = model.get_score(importance_type='weight')
# A model trained on bare NumPy arrays reports features as 'f0', 'f1', ...;
# translate those keys back to readable names. Keys that are already real
# names pass through unchanged.
default_names = {f'f{i}': name for i, name in enumerate(feature_cols)}
features = [default_names.get(key, key) for key in importance]
scores = list(importance.values())
plt.barh(features, scores)
plt.xlabel('Importance')
plt.title('Feature Importance')

plt.tight_layout()
plt.show()

# Score the test set. After early stopping the booster still holds the trees
# trained past the best round, so restrict prediction to the best iteration
# when that attribute is available.
best_iteration = getattr(model, 'best_iteration', None)
if best_iteration is not None:
    y_pred = model.predict(dtest, iteration_range=(0, best_iteration + 1))
else:
    y_pred = model.predict(dtest)

# Calculate nDCG per query and average.
# NOTE(review): sklearn's ndcg_score presumably uses a different gain than
# XGBoost's 'ndcg@10' metric, so this may differ from the training log.
ndcg_scores = []
group_ends = np.cumsum(test_groups)
group_starts = group_ends - test_groups
for start_idx, end_idx in zip(group_starts, group_ends):
    if end_idx - start_idx < 2:  # Need at least 2 items to rank
        continue
    y_true_group = y_test[start_idx:end_idx].reshape(1, -1)
    y_pred_group = y_pred[start_idx:end_idx].reshape(1, -1)
    ndcg_scores.append(ndcg_score(y_true_group, y_pred_group, k=10))

print(f"\nüìä Test Set Performance:")
if ndcg_scores:
    avg_ndcg = np.mean(ndcg_scores)
    print(f"Average nDCG@10: {avg_ndcg:.4f}")
    print(f"Min nDCG: {np.min(ndcg_scores):.4f}")
    print(f"Max nDCG: {np.max(ndcg_scores):.4f}")
else:
    # No query had enough items to score; keep `avg_ndcg` defined for later cells
    avg_ndcg = float('nan')
    print("No test query has at least 2 items; nDCG is undefined.")

## 8. Create Ranker Wrapper Class

In [None]:
class XGBoostRanker:
    """Wrapper that scores and ranks video dicts with a trained XGBoost model.

    Attributes are kept as ``model`` and ``feature_names`` so previously
    pickled instances keep loading.
    """
    
    def __init__(self, model, feature_names: List[str]):
        """Store the trained model and the ordered feature names it expects.

        Args:
            model: Trained model exposing ``predict(xgb.DMatrix)``.
            feature_names: Dict keys read from each video, in feature-column order.
        """
        self.model = model
        self.feature_names = feature_names
    
    def _feature_matrix(self, videos: List[Dict]) -> np.ndarray:
        """Build a float matrix (one row per video) in ``feature_names`` order."""
        rows = [
            [video.get(feat, 0) for feat in self.feature_names]  # missing key -> 0
            for video in videos
        ]
        # dtype=float turns a None value into NaN instead of producing an
        # object array that DMatrix cannot consume.
        return np.array(rows, dtype=float)
    
    def _predict(self, X: np.ndarray) -> np.ndarray:
        """Score a feature matrix, limited to the best iteration when known."""
        dmatrix = xgb.DMatrix(X)
        best_iteration = getattr(self.model, 'best_iteration', None)
        if best_iteration is None:
            return self.model.predict(dmatrix)
        # Skip the trees trained after the best round found by early stopping
        return self.model.predict(dmatrix, iteration_range=(0, best_iteration + 1))
    
    def rank(self, videos: List[Dict]) -> List[Dict]:
        """Rank videos using the trained model.

        Args:
            videos: Video dicts; features are read by name, missing ones count as 0.

        Returns:
            New dicts (shallow copies of the inputs) with an added ``'ml_score'``
            float, sorted by descending score. The caller's dicts are not
            modified. An empty input returns ``[]``.
        """
        if not videos:
            return []
        
        scores = self._predict(self._feature_matrix(videos))
        
        # Attach scores to copies so the caller's dicts stay untouched
        scored_videos = [
            {**video, 'ml_score': float(score)}
            for video, score in zip(videos, scores)
        ]
        
        return sorted(scored_videos, key=lambda v: v['ml_score'], reverse=True)
    
    def predict_score(self, video: Dict) -> float:
        """Predict the score for a single video dict (missing features count as 0)."""
        return float(self._predict(self._feature_matrix([video]))[0])

# Bundle the trained model with the feature column order used to build X
ranker = XGBoostRanker(model=model, feature_names=feature_cols)

print("‚úÖ XGBoostRanker class created!")

## 9. Test the Ranker

In [None]:
# Create test videos (derived features are pre-computed to match the raw ones)
test_videos = [
    {
        'title': 'High Quality ML Tutorial',
        'views': 100000, 'likes': 5000, 'subscribers': 50000,
        'relevance': 0.95, 'duration': 20, 'days_old': 30,
        'like_ratio': 0.05, 'recency_score': 0.92, 'duration_penalty': 1.0
    },
    {
        'title': 'Old Low Quality Video',
        'views': 1000, 'likes': 10, 'subscribers': 500,
        'relevance': 0.3, 'duration': 60, 'days_old': 1000,
        'like_ratio': 0.01, 'recency_score': 0.27, 'duration_penalty': 0.5
    },
    {
        'title': 'Recent Viral Video',
        'views': 500000, 'likes': 40000, 'subscribers': 100000,
        'relevance': 0.75, 'duration': 15, 'days_old': 7,
        'like_ratio': 0.08, 'recency_score': 0.98, 'duration_penalty': 1.0
    }
]

# Rank videos. Pass per-video copies: a shallow list copy would still share
# the dicts, so any key rank() adds would leak back into `test_videos`.
ranked = ranker.rank([dict(video) for video in test_videos])

print("\nüèÜ Ranked Videos:\n")
for i, video in enumerate(ranked, 1):
    print(f"{i}. {video['title']}")
    print(f"   ML Score: {video['ml_score']:.4f}")
    print(f"   Relevance: {video['relevance']:.2f}, Views: {video['views']:,}\n")

## 10. Save the Model as .pkl File

In [None]:
# Persist the ranker wrapper (model + feature names) with pickle.
# NOTE(review): pickle records the class by import path, so the loading process
# presumably needs a compatible `XGBoostRanker` class available - confirm
# against the backend loader.
output_path = 'xgboost_ranker.pkl'

with open(output_path, 'wb') as out_file:
    pickle.dump(ranker, out_file)

size_mb = os.path.getsize(output_path) / (1024*1024)
print(f"\n‚úÖ Model saved to: {output_path}")
print(f"File size: {size_mb:.2f} MB")

# Round-trip check: reload the file that was just written.
# Only unpickle files you created yourself; pickle can execute arbitrary code.
with open(output_path, 'rb') as in_file:
    loaded_ranker = pickle.load(in_file)

# Verify the reloaded ranker can still score a video
test_score = loaded_ranker.predict_score(test_videos[0])

print(f"\n‚úÖ Model loaded successfully!")
print(f"Test prediction: {test_score:.4f}")
print("\nüì• Download this file and place it in: ml_models/ranking/xgboost_ranker.pkl")

## Next Steps

1. ✅ Download `xgboost_ranker.pkl` from Kaggle
2. 📁 Place it at `ml_models/ranking/xgboost_ranker.pkl` inside your local `AutoYT-Playlist` repository (for example `C:\Users\<you>\Documents\GitHub\AutoYT-Playlist\ml_models\ranking\xgboost_ranker.pkl`)
3. 🚀 The backend will use this for ML-based ranking!

---

**Model Info:**
- Algorithm: XGBoost Learning-to-Rank
- Objective: rank:ndcg
- Features: 9 (views, likes, subscribers, relevance, etc.)
- Performance: nDCG@10 ≈ the `avg_ndcg` value printed in section 7 (Markdown cells do not evaluate `{avg_ndcg:.4f}`)

**To Improve:**
- Collect real user feedback data
- Add more features (comments, engagement rate, etc.)
- Tune hyperparameters
- Use cross-validation