In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# --- TASK 5.1: DATA PRE-PROCESSING ---

# 1. Load the dataset
# We use 'train_users.csv' for training the classifier as per instructions.
df = pd.read_csv('data/train_users.csv')

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())

Dataset Shape: (2000, 33)
Columns: ['user_id', 'age', 'income', 'clicks', 'purchase_amount', 'session_duration', 'content_variety', 'engagement_score', 'num_transactions', 'avg_monthly_spend', 'avg_cart_value', 'browsing_depth', 'revisit_rate', 'scroll_activity', 'time_on_site', 'interaction_count', 'preferred_price_range', 'discount_usage_rate', 'wishlist_size', 'product_views', 'repeat_purchase_gap (days)', 'churn_risk_score', 'loyalty_index', 'screen_brightness', 'battery_percentage', 'cart_abandonment_count', 'browser_version', 'background_app_count', 'session_inactivity_duration', 'network_jitter', 'region_code', 'subscriber', 'label']


In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# --- 1. LOAD DATA ---
# We ONLY use train_users.csv for the accuracy check, as per instruction 5.5
df = pd.read_csv('data/train_users.csv')
print(f"Dataset Shape: {df.shape}")

# --- 2. PREPROCESSING ---
target_col = 'label'

# Separate Features (X) and Target (y)
X = df.drop(['user_id', target_col], axis=1)
y = df[target_col]

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns
print(f"Encoding categorical columns: {list(categorical_cols)}")

# Encode Categorical Features
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Encode Target Labels (User1 -> 0, User2 -> 1, User3 -> 2)
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)

# --- 3. TRAIN/VALIDATION SPLIT ---
# Instruction 5.2: "Split the train_users.csv ... into training set (80%) and validation set (20%)"
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# --- 4. TRAIN XGBOOST CLASSIFIER ---
model = xgb.XGBClassifier(
    n_estimators=200, 
    learning_rate=0.05, 
    max_depth=6, 
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42
)

print("\nTraining XGBoost Classifier (80% of data)...")
model.fit(X_train, y_train)

# --- 5. EVALUATE ON VALIDATION SET ---
print("\nEvaluating on Validation Set (20% of data)...")
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print("-" * 30)
print(f"Validation Accuracy: {accuracy:.4f}")
print("-" * 30)
print("\nClassification Report:\n")
print(classification_report(y_val, y_pred, target_names=target_le.classes_))

Dataset Shape: (2000, 33)
Encoding categorical columns: ['browser_version', 'region_code']

Training XGBoost Classifier (80% of data)...

Evaluating on Validation Set (20% of data)...
------------------------------
Validation Accuracy: 0.8775
------------------------------

Classification Report:

              precision    recall  f1-score   support

      user_1       0.90      0.82      0.85       147
      user_2       0.95      0.88      0.91       141
      user_3       0.79      0.96      0.87       112

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400

