In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# 1. Load the data
# Both splits are read from the relative 'data/' folder (location taken from the README).
train_df, test_df = (
    pd.read_csv('data/train_users.csv'),
    pd.read_csv('data/test_users.csv'),
)

# Report (rows, columns) for each split.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# 2. Inspect the columns to locate the target
# No target name is assumed here: the training columns are printed so the
# target column can be read off the output before it is used further down.
print("\nColumns:", list(train_df.columns))

Train shape: (2000, 6)
Test shape: (2000, 6)

Columns: ['user_id', 'age', 'income', 'clicks', 'purchase_amount', 'label']


In [10]:
# --- STEP 3: PREPROCESSING & TRAINING ---

# 1. Features (X) and target (y)
# 'user_id' is deliberately left out of the feature list, and 'label' is the
# column being predicted. The same column selection is applied to both splits.
features = ['age', 'income', 'clicks', 'purchase_amount']
target = 'label'

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# 2-3. Build and train the Random Forest classifier
# n_estimators=85 -> the forest contains 85 trees; random_state fixes the seed
# so repeated runs of this cell give the same model.
print("Training classifier...")
clf = RandomForestClassifier(n_estimators=85, random_state=62).fit(X_train, y_train)

# 4. Predict on the test set
y_pred = clf.predict(X_test)

# 5. Evaluate accuracy
# NOTE(review): judge the number against the chance level for the number of
# classes in 'label' (1 / n_classes if the classes are roughly balanced).
accuracy = accuracy_score(y_test, y_pred)
print("-" * 30, f"Final Classification Accuracy: {accuracy:.4f}", "-" * 30, sep="\n")

# Per-class precision / recall / f1 for the same predictions.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Training classifier...
------------------------------
Final Classification Accuracy: 0.3240
------------------------------

Classification Report:

              precision    recall  f1-score   support

       user1       0.32      0.34      0.33       672
       user2       0.32      0.31      0.32       679
       user3       0.33      0.32      0.33       649

    accuracy                           0.32      2000
   macro avg       0.32      0.32      0.32      2000
weighted avg       0.32      0.32      0.32      2000



In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# --- IMPROVED TRAINING PIPELINE ---

# 1. Prepare data
# Features AND labels are both derived from train_df / test_df inside this
# cell, so it does not depend on X/y variables left in the kernel by another
# cell. The feature frames are copies, leaving the loaded frames untouched.
feature_cols = ['age', 'income', 'clicks', 'purchase_amount']
label_col = 'label'

X_train_enhanced = train_df[feature_cols].copy()
X_test_enhanced = test_df[feature_cols].copy()
y_train_enhanced = train_df[label_col]
y_test_enhanced = test_df[label_col]

# 2. Feature scaling
# Standardize each feature to zero mean / unit variance. The scaler is fitted
# on the training split only and then applied to the test split, so no test
# statistics leak into training.
# NOTE(review): tree-based models split on thresholds, so this step is not
# expected to change what the boosted trees learn; it is kept so that
# X_train_scaled / X_test_scaled remain available to later cells.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_enhanced)
X_test_scaled = scaler.transform(X_test_enhanced)

# 3. Train a Gradient Boosting model
# 300 boosting stages, trees of depth 6, fixed seed for repeatability.
# NOTE(review): learning_rate=0.001 is very small for only 300 stages
# (learning_rate and n_estimators trade off against each other), so the
# ensemble may be heavily under-fitted -- confirm this value is intended.
clf_gbm = GradientBoostingClassifier(n_estimators=300, learning_rate=0.001, max_depth=6, random_state=62)

print("Training Gradient Boosting model...")
clf_gbm.fit(X_train_scaled, y_train_enhanced)

# 4. Evaluate on the test split
# NOTE(review): this is the same test set used to score the earlier model;
# choosing between models on it makes the reported number optimistic, and a
# small difference between the two accuracies may be within sampling noise.
y_pred_gbm = clf_gbm.predict(X_test_scaled)
new_accuracy = accuracy_score(y_test_enhanced, y_pred_gbm)

print("-" * 30)
print(f"Improved Accuracy: {new_accuracy:.4f}")
print("-" * 30)

Training Gradient Boosting model...
------------------------------
Improved Accuracy: 0.3475
------------------------------
