# Lab 3: Contextual Bandit-Based News Article Recommendation



## Imports and Setup

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from rlcmab_sampler import sampler

# Roll number U20230083 → i = 83.
# Held in one named constant so the value passed to the sampler and the value
# reported in the confirmation message below can never disagree.
ROLL_NUMBER = 83

# Initialize the reward sampler with the roll number
reward_sampler = sampler(ROLL_NUMBER)

print(f"✓ All imports loaded and sampler initialized (roll_number={ROLL_NUMBER})")

✓ All imports loaded and sampler initialized (roll_number=83)


## Load Datasets

In [21]:
# Read the three CSV inputs (article corpus, training users, test users) in one pass.
news_df, train_users, test_users = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/news_articles.csv",
        "data/train_users.csv",
        "data/test_users.csv",
    )
)

# Article corpus: dimensions, column names and every distinct category value.
news_columns = news_df.columns.tolist()
news_categories = sorted(news_df['category'].unique())
print("=== News Articles ===")
print(f"Shape: {news_df.shape}")
print(f"Columns: {news_columns}")
print(f"All categories: {news_categories}")
print()

# Training users: dimensions and how many rows carry each label value.
train_label_counts = train_users['label'].value_counts()
print("=== Train Users ===")
print(f"Shape: {train_users.shape}")
print(f"Label distribution:\n{train_label_counts}")
print()

# Test users: dimensions and whether a 'label' column is present at all.
test_has_label = 'label' in test_users.columns
print("=== Test Users ===")
print(f"Shape: {test_users.shape}")
print(f"Has 'label' column: {test_has_label}")

=== News Articles ===
Shape: (209527, 6)
Columns: ['link', 'headline', 'category', 'short_description', 'authors', 'date']
All categories: ['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION', 'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'THE WORLDPOST', 'TRAVEL', 'U.S. NEWS', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']

=== Train Users ===
Shape: (2000, 33)
Label distribution:
label
user_2    712
user_1    707
user_3    581
Name: count, dtype: int64

=== Test Users ===
Shape: (2000, 32)
Has 'label' column: False


## Data Preprocessing

In this section:
- Handle missing values (rows containing any NaN are dropped)
- Encode categorical features
- Prepare data for user classification

In [22]:
# --- Data Cleaning ---
# Snapshot of each raw table before any rows are removed: row count plus the
# total number of missing cells summed over every column.
train_nan_total = train_users.isna().sum().sum()
test_nan_total = test_users.isna().sum().sum()
news_nan_total = news_df.isna().sum().sum()

print("Before cleaning:")
print(f"  train_users: {len(train_users)} rows, NaN count: {train_nan_total}")
print(f"  test_users:  {len(test_users)} rows, NaN count: {test_nan_total}")
print(f"  news_df:     {len(news_df)} rows, NaN count: {news_nan_total}")

# Keep only fully populated rows; the raw frames are left untouched so this
# cell can be re-run safely.
# NOTE(review): dropna() with no `subset` removes a row when ANY column is
# missing. For news_df this cell only reads 'category', so articles are being
# discarded for gaps in fields that are not used here — confirm whether later
# cells need those fields before narrowing the subset.
# NOTE(review): the recorded outputs suggest the dropped user rows are not
# label-balanced (user_3 is 581/2000 of the raw labels but only ~22/261 of the
# validation split built from the cleaned frame) — confirm, and consider
# imputing instead of dropping.
train_users_clean = train_users.dropna().copy()
test_users_clean = test_users.dropna().copy()
news_df_clean = news_df.dropna().copy()

print("\nAfter dropping NaN rows:")
print(f"  train_users: {len(train_users_clean)} rows")
print(f"  test_users:  {len(test_users_clean)} rows")
print(f"  news_df:     {len(news_df_clean)} rows")

# --- Encode User Categories ---
# The 'label' column in train_users contains: user_1, user_2, user_3.
# LabelEncoder assigns integer codes following the sorted order of the labels.
user_encoder = LabelEncoder()
train_users_clean['user_category_encoded'] = user_encoder.fit_transform(train_users_clean['label'])

user_class_codes = user_encoder.transform(user_encoder.classes_)
print(f"\nUser category encoding: {dict(zip(user_encoder.classes_, user_class_codes))}")

# --- Filter and Encode News Categories ---
# Encode according to Table 1 mapping:
# j=0 → Entertainment, j=1 → Education, j=2 → Tech, j=3 → Crime
news_category_mapping = {
    'ENTERTAINMENT': 0,
    'EDUCATION': 1,
    'TECH': 2,
    'CRIME': 3
}

# Assignment specifies 4 categories: Entertainment, Education, Tech, Crime.
# These are exactly the mapping's keys, in the same order.
target_categories = list(news_category_mapping)

# Restrict the cleaned corpus to the four arms and attach each arm's index.
is_target_category = news_df_clean['category'].isin(target_categories)
news_df_filtered = news_df_clean.loc[is_target_category].assign(
    category_encoded=lambda articles: articles['category'].map(news_category_mapping)
)

# Reverse lookup (arm index → category name) and user label → encoded index.
idx_to_news_name = dict(zip(news_category_mapping.values(), news_category_mapping.keys()))
user_to_idx = dict(zip(user_encoder.classes_, range(len(user_encoder.classes_))))

print(f"\nNews category mapping (per Table 1): {news_category_mapping}")
print(f"Filtered news articles: {len(news_df_filtered)} (from {len(news_df_clean)} total)")
print(f"Articles per category:\n{news_df_filtered['category'].value_counts()}")
print("\n✓ Preprocessing complete")

Before cleaning:
  train_users: 2000 rows, NaN count: 698
  test_users:  2000 rows, NaN count: 679
  news_df:     209527 rows, NaN count: 57136

After dropping NaN rows:
  train_users: 1302 rows
  test_users:  1321 rows
  news_df:     156859 rows

User category encoding: {'user_1': np.int64(0), 'user_2': np.int64(1), 'user_3': np.int64(2)}

News category mapping (per Table 1): {'ENTERTAINMENT': 0, 'EDUCATION': 1, 'TECH': 2, 'CRIME': 3}
Filtered news articles: 18130 (from 156859 total)
Articles per category:
category
ENTERTAINMENT    13463
CRIME             2093
TECH              1681
EDUCATION          893
Name: count, dtype: int64

✓ Preprocessing complete


## User Classification

Train a classifier to predict the user category (`user_1`, `user_2`, `user_3`),
which serves as the **context** for the contextual bandit.


In [23]:
# --- Prepare features for classification ---
# Every non-numeric column is excluded from the feature matrix, and so is the
# encoded target itself so the label cannot leak into the features.
non_numeric_cols = list(train_users_clean.select_dtypes(exclude=[np.number]).columns)
cols_to_drop = [*non_numeric_cols, 'user_category_encoded']
print(f"Dropping columns: {cols_to_drop}")

y = train_users_clean['user_category_encoded']
X = train_users_clean.drop(columns=cols_to_drop)

feature_columns = list(X.columns)
print(f"Feature columns ({len(feature_columns)}): {feature_columns}")

# 80/20 train/validation split; stratifying on y keeps the class proportions
# the same in both parts, and the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nTraining set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")

# Fit a 100-tree Random Forest on the training part (fit() returns the estimator).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out validation part.
# NOTE(review): the recorded report shows much weaker precision/recall for
# user_3 than for the other two classes — confirm whether the class imbalance
# left after cleaning is the cause.
y_val_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

banner = '=' * 50
print(f"\n{banner}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"{banner}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=user_encoder.classes_))

Dropping columns: ['user_id', 'browser_version', 'region_code', 'subscriber', 'label', 'user_category_encoded']
Feature columns (28): ['age', 'income', 'clicks', 'purchase_amount', 'session_duration', 'content_variety', 'engagement_score', 'num_transactions', 'avg_monthly_spend', 'avg_cart_value', 'browsing_depth', 'revisit_rate', 'scroll_activity', 'time_on_site', 'interaction_count', 'preferred_price_range', 'discount_usage_rate', 'wishlist_size', 'product_views', 'repeat_purchase_gap (days)', 'churn_risk_score', 'loyalty_index', 'screen_brightness', 'battery_percentage', 'cart_abandonment_count', 'background_app_count', 'session_inactivity_duration', 'network_jitter']

Training set: 1041 samples
Validation set: 261 samples

Validation Accuracy: 0.8889

Classification Report:
              precision    recall  f1-score   support

      user_1       0.85      0.96      0.90       113
      user_2       0.96      0.94      0.95       126
      user_3       0.50      0.23      0.31     