In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the labelled training users and the unlabelled test users.
train = pd.read_csv("data/train_users.csv")
test = pd.read_csv("data/test_users.csv")

# Quick structural overview, printed in a fixed order:
# shapes first, then column indexes, then the first five rows of each frame.
overview = (
    ("Train Shape:", train.shape),
    ("Test Shape :", test.shape),
    ("\nTrain Columns:\n", train.columns),
    ("\nTest Columns:\n", test.columns),
    ("\nFirst 5 rows of Train:\n", train.head()),
    ("\nFirst 5 rows of Test:\n", test.head()),
)
for caption, value in overview:
    print(caption, value)


Train Shape: (2000, 33)
Test Shape : (2000, 32)

Train Columns:
 Index(['user_id', 'age', 'income', 'clicks', 'purchase_amount',
       'session_duration', 'content_variety', 'engagement_score',
       'num_transactions', 'avg_monthly_spend', 'avg_cart_value',
       'browsing_depth', 'revisit_rate', 'scroll_activity', 'time_on_site',
       'interaction_count', 'preferred_price_range', 'discount_usage_rate',
       'wishlist_size', 'product_views', 'repeat_purchase_gap (days)',
       'churn_risk_score', 'loyalty_index', 'screen_brightness',
       'battery_percentage', 'cart_abandonment_count', 'browser_version',
       'background_app_count', 'session_inactivity_duration', 'network_jitter',
       'region_code', 'subscriber', 'label'],
      dtype='object')

Test Columns:
 Index(['user_id', 'age', 'income', 'clicks', 'purchase_amount',
       'session_duration', 'content_variety', 'engagement_score',
       'num_transactions', 'avg_monthly_spend', 'avg_cart_value',
       'browsing_

In [3]:
# Print the per-column dtypes of both frames, train first.
for caption, frame in (("\nTrain dtypes:\n", train), ("\nTest dtypes:\n", test)):
    print(caption, frame.dtypes)



Train dtypes:
 user_id                         object
age                            float64
income                           int64
clicks                           int64
purchase_amount                float64
session_duration               float64
content_variety                float64
engagement_score               float64
num_transactions                 int64
avg_monthly_spend              float64
avg_cart_value                 float64
browsing_depth                   int64
revisit_rate                   float64
scroll_activity                  int64
time_on_site                   float64
interaction_count                int64
preferred_price_range          float64
discount_usage_rate            float64
wishlist_size                    int64
product_views                    int64
repeat_purchase_gap (days)     float64
churn_risk_score               float64
loyalty_index                  float64
screen_brightness              float64
battery_percentage             float64
cart_aban

In [4]:
# Count null entries per column in both frames, train first.
for caption, frame in (
    ("\nMissing values in Train:\n", train),
    ("\nMissing values in Test:\n", test),
):
    print(caption, frame.isnull().sum())



Missing values in Train:
 user_id                          0
age                            698
income                           0
clicks                           0
purchase_amount                  0
session_duration                 0
content_variety                  0
engagement_score                 0
num_transactions                 0
avg_monthly_spend                0
avg_cart_value                   0
browsing_depth                   0
revisit_rate                     0
scroll_activity                  0
time_on_site                     0
interaction_count                0
preferred_price_range            0
discount_usage_rate              0
wishlist_size                    0
product_views                    0
repeat_purchase_gap (days)       0
churn_risk_score                 0
loyalty_index                    0
screen_brightness                0
battery_percentage               0
cart_abandonment_count           0
browser_version                  0
background_app_count        

In [5]:
# Target: the 'label' column of the training frame.
y = train['label']
# Features: every remaining column except the 'user_id' column.
X = train.drop(['label', 'user_id'], axis='columns')


In [6]:
# Cast the 'subscriber' column to int; all other columns keep their dtype.
X = X.astype({'subscriber': int})

In [7]:
# Fill missing 'age' values with the column median.
# NOTE(review): the median is learned from every row of X, including rows the
# later train_test_split holds out for evaluation. Consider fitting the imputer
# on the training split only to avoid leaking hold-out information.
age_frame = X[['age']]
imputer = SimpleImputer(strategy='median').fit(age_frame)
X['age'] = imputer.transform(age_frame)


In [8]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each so the indicator columns are not perfectly collinear.
categorical_cols = ['browser_version', 'region_code']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


In [9]:
# Map the target values to consecutive integer codes; `le` keeps the mapping
# so predictions can be translated back with le.inverse_transform.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)


In [10]:
# Hold out 20% of the rows for evaluation. Stratifying on the encoded target
# keeps the class proportions the same in both splits; the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)


In [11]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
# L1-regularised logistic regression on the standardised features.
#
# * random_state is pinned because the 'saga' solver shuffles the data while
#   optimising; without a seed, a re-run can give slightly different
#   coefficients and metrics. 42 matches the seed used for the train/test split.
# * multi_class='multinomial' is no longer passed: the parameter is deprecated
#   since scikit-learn 1.5, and with the 'saga' solver and the three classes in
#   this target the default already fits a multinomial (softmax) model.
clf_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.5,
    max_iter=1000,
    random_state=42,
)
clf_model.fit(X_train_scaled, y_train)



In [13]:
# Score the fitted classifier on the held-out split.
y_pred = clf_model.predict(X_test_scaled)

holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)

print("Accuracy:", holdout_accuracy)
print(holdout_report)


Accuracy: 0.925
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       142
           1       0.99      0.91      0.95       142
           2       0.88      1.00      0.94       116

    accuracy                           0.93       400
   macro avg       0.93      0.93      0.93       400
weighted avg       0.93      0.93      0.92       400

