In [5]:
# ===============================
# Imports
# ===============================
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [6]:
# ===============================
# Load Datasets
# ===============================
train_users = pd.read_csv("data/train_users.csv")
test_users  = pd.read_csv("data/test_users.csv")
articles    = pd.read_csv("data/news_articles.csv")

print("Train shape:", train_users.shape)
print("Test shape :", test_users.shape)
print("\nColumns:", train_users.columns)


Train shape: (2000, 6)
Test shape : (2000, 6)

Columns: Index(['user_id', 'age', 'income', 'clicks', 'purchase_amount', 'label'], dtype='object')


In [7]:
# ===============================
# Data Cleaning (Missing Values)
# ===============================
def clean_dataframe(df):
    for col in df.columns:
        if df[col].dtype == "object":
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)
    return df

train_users = clean_dataframe(train_users)
test_users  = clean_dataframe(test_users)
articles    = clean_dataframe(articles)

print("\nMissing values after cleaning:")
print(train_users.isnull().sum())



Missing values after cleaning:
user_id            0
age                0
income             0
clicks             0
purchase_amount    0
label              0
dtype: int64


In [8]:
# ===============================
# Feature / Target Split
# Target column = 'label'
# ===============================
X_train = train_users.drop(columns=["label"])
y_train = train_users["label"]

X_test  = test_users.drop(columns=["label"])
y_test  = test_users["label"]


In [9]:
# ===============================
# Encode Target Labels
# ===============================
label_encoder = LabelEncoder()

y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc  = label_encoder.transform(y_test)

print("\nLabel Encoding:")
for cls, idx in zip(label_encoder.classes_,
                    label_encoder.transform(label_encoder.classes_)):
    print(f"{cls} → {idx}")



Label Encoding:
user1 → 0
user2 → 1
user3 → 2


In [10]:
# ===============================
# Encode Features (One-Hot if needed)
# ===============================
X_train_enc = pd.get_dummies(X_train, drop_first=True)
X_test_enc  = pd.get_dummies(X_test, drop_first=True)

# Align train and test feature columns
X_train_enc, X_test_enc = X_train_enc.align(
    X_test_enc,
    join="left",
    axis=1,
    fill_value=0
)

print("\nEncoded feature shape:", X_train_enc.shape)



Encoded feature shape: (2000, 5)


In [11]:
# ===============================
# Train Classifier (Context Detector)
# ===============================
classifier = LogisticRegression(
    max_iter=1000,
    multi_class="auto",
    random_state=42
)

classifier.fit(X_train_enc, y_train_enc)


In [12]:
# ===============================
# Evaluation on Test Set
# ===============================
y_pred = classifier.predict(X_test_enc)

accuracy = accuracy_score(y_test_enc, y_pred)

print("\nClassification Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(
    y_test_enc,
    y_pred,
    target_names=label_encoder.classes_
))



Classification Accuracy: 0.325

Classification Report:
              precision    recall  f1-score   support

       user1       0.34      0.40      0.36       672
       user2       0.32      0.44      0.37       679
       user3       0.30      0.13      0.18       649

    accuracy                           0.33      2000
   macro avg       0.32      0.32      0.31      2000
weighted avg       0.32      0.33      0.31      2000



In [13]:
# ===============================
# Function to Predict User Context
# (Used later in Bandit)
# ===============================
def predict_user_context(user_row):
    """
    Input: single user row (DataFrame with 1 row)
    Output: predicted user class (User1/User2/User3)
    """
    user_enc = pd.get_dummies(user_row, drop_first=True)
    user_enc = user_enc.reindex(columns=X_train_enc.columns, fill_value=0)
    
    pred = classifier.predict(user_enc)
    return label_encoder.inverse_transform(pred)[0]
