In [6]:
import pandas as pd

# Load the training and test user tables from the local data directory.
train = pd.read_csv("data/train_users.csv")
test = pd.read_csv("data/test_users.csv")

# Each entry pairs a heading with the value printed after it, listed in display
# order: shapes first, then the column indexes, then the first five rows.
overview = [
    ("Train Shape:", train.shape),
    ("Test Shape :", test.shape),
    ("\nTrain Columns:\n", train.columns),
    ("\nTest Columns:\n", test.columns),
    ("\nFirst 5 rows of Train:\n", train.head()),
    ("\nFirst 5 rows of Test:\n", test.head()),
]

for heading, value in overview:
    print(heading, value)


Train Shape: (2000, 33)
Test Shape : (2000, 32)

Train Columns:
 Index(['user_id', 'age', 'income', 'clicks', 'purchase_amount',
       'session_duration', 'content_variety', 'engagement_score',
       'num_transactions', 'avg_monthly_spend', 'avg_cart_value',
       'browsing_depth', 'revisit_rate', 'scroll_activity', 'time_on_site',
       'interaction_count', 'preferred_price_range', 'discount_usage_rate',
       'wishlist_size', 'product_views', 'repeat_purchase_gap (days)',
       'churn_risk_score', 'loyalty_index', 'screen_brightness',
       'battery_percentage', 'cart_abandonment_count', 'browser_version',
       'background_app_count', 'session_inactivity_duration', 'network_jitter',
       'region_code', 'subscriber', 'label'],
      dtype='object')

Test Columns:
 Index(['user_id', 'age', 'income', 'clicks', 'purchase_amount',
       'session_duration', 'content_variety', 'engagement_score',
       'num_transactions', 'avg_monthly_spend', 'avg_cart_value',
       'browsing_

In [7]:
# Show the dtype pandas assigned to every column, training frame first.
for heading, frame in (("\nTrain dtypes:\n", train), ("\nTest dtypes:\n", test)):
    print(heading, frame.dtypes)



Train dtypes:
 user_id                         object
age                            float64
income                           int64
clicks                           int64
purchase_amount                float64
session_duration               float64
content_variety                float64
engagement_score               float64
num_transactions                 int64
avg_monthly_spend              float64
avg_cart_value                 float64
browsing_depth                   int64
revisit_rate                   float64
scroll_activity                  int64
time_on_site                   float64
interaction_count                int64
preferred_price_range          float64
discount_usage_rate            float64
wishlist_size                    int64
product_views                    int64
repeat_purchase_gap (days)     float64
churn_risk_score               float64
loyalty_index                  float64
screen_brightness              float64
battery_percentage             float64
cart_aban

In [8]:
# Per-column count of null entries in each frame.
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()

print("\nMissing values in Train:\n", train_missing)
print("\nMissing values in Test:\n", test_missing)



Missing values in Train:
 user_id                          0
age                            698
income                           0
clicks                           0
purchase_amount                  0
session_duration                 0
content_variety                  0
engagement_score                 0
num_transactions                 0
avg_monthly_spend                0
avg_cart_value                   0
browsing_depth                   0
revisit_rate                     0
scroll_activity                  0
time_on_site                     0
interaction_count                0
preferred_price_range            0
discount_usage_rate              0
wishlist_size                    0
product_views                    0
repeat_purchase_gap (days)       0
churn_risk_score                 0
loyalty_index                    0
screen_brightness                0
battery_percentage               0
cart_abandonment_count           0
browser_version                  0
background_app_count        

In [9]:
# Columns whose raw values are inspected below (and reused by later cells).
num_cols = ["age", "income", "clicks", "purchase_amount"]

for col in num_cols:
    print(f"\nUnique strange values in {col}:")
    # Cast to text before de-duplicating, then show at most the first 20
    # distinct values in order of appearance.
    values_as_text = train[col].astype(str)
    distinct_values = values_as_text.unique()
    print(distinct_values[:20])



Unique strange values in age:
['nan' '56.0' '32.0' '28.0' '45.0' '40.0' '44.0' '31.0' '49.0' '47.0'
 '34.0' '18.0' '50.0' '61.0' '46.0' '35.0' '30.0' '60.0' '48.0' '27.0']

Unique strange values in income:
['23053' '20239' '13907' '26615' '27958' '33387' '24650' '11445' '16881'
 '24562' '11687' '22568' '10000' '20249' '20138' '31057' '14696' '20686'
 '17813' '20007']

Unique strange values in clicks:
['10' '11' '9' '12' '13' '16' '14' '5' '7' '6' '15' '8' '17' '3' '18' '2'
 '4' '19' '20' '21']

Unique strange values in purchase_amount:
['500.0' '913.33' '1252.62' '1234.58' '1628.65' '626.37' '643.16' '731.87'
 '2459.74' '2776.48' '2974.56' '1219.79' '879.87' '770.15' '3600.81'
 '3529.73' '1560.12' '1729.67' '558.55' '1621.74']


In [10]:
# Count how many training rows carry each label value.
label_counts = train["label"].value_counts()
print("\nLabel distribution in Train:\n", label_counts)



Label distribution in Train:
 user_2    712
user_1    707
user_3    581
Name: label, dtype: int64


In [11]:
# Summary statistics for every column; include="all" also covers the
# non-numeric columns (count / unique / top / freq).
train_summary = train.describe(include="all")
print(train_summary)


       user_id          age        income       clicks  purchase_amount  \
count     2000  1302.000000   2000.000000  2000.000000      2000.000000   
unique    1804          NaN           NaN          NaN              NaN   
top      U8400          NaN           NaN          NaN              NaN   
freq         4          NaN           NaN          NaN              NaN   
mean       NaN    39.227343  21564.393500    10.526000       854.920700   
std        NaN    11.407103   6740.282694     3.299207       833.770472   
min        NaN    18.000000  10000.000000     1.000000       500.000000   
25%        NaN    31.000000  16742.500000     8.000000       500.000000   
50%        NaN    39.000000  21107.500000    10.000000       500.000000   
75%        NaN    47.000000  25892.500000    13.000000       596.295000   
max        NaN    70.000000  51776.000000    25.000000      4873.670000   

        session_duration  content_variety  engagement_score  num_transactions  \
count        2000.

In [12]:
# Build a throwaway frame for the correlation check only: the columns listed
# in num_cols are converted to numbers, with anything unparseable turned into
# NaN (errors="coerce"). `assign` returns a new frame, so `train` is untouched.
coerced_columns = {
    name: pd.to_numeric(train[name], errors="coerce") for name in num_cols
}
temp = train.assign(**coerced_columns)

correlations = temp.corr(numeric_only=True)

print("\nCorrelation matrix:\n")
print(correlations)



Correlation matrix:

                                  age    income    clicks  purchase_amount  \
age                          1.000000  0.005670  0.003360        -0.005573   
income                       0.005670  1.000000  0.954261         0.101317   
clicks                       0.003360  0.954261  1.000000         0.038872   
purchase_amount             -0.005573  0.101317  0.038872         1.000000   
session_duration            -0.024319  0.169398  0.143802         0.437852   
content_variety              0.020489  0.201925  0.171107         0.466997   
engagement_score             0.017858  0.204178  0.173684         0.482828   
num_transactions            -0.015881  0.015809  0.010301         0.017010   
avg_monthly_spend           -0.014433  0.097347  0.046843         0.800885   
avg_cart_value               0.017678  0.072587  0.023894         0.764723   
browsing_depth              -0.035764  0.214937  0.190855         0.431066   
revisit_rate                 0.009820  0.3

In [13]:
# Number of rows that exactly repeat an earlier row across all columns.
duplicate_row_count = train.duplicated().sum()
print("\nDuplicate rows in train:", duplicate_row_count)



Duplicate rows in train: 0


In [14]:
import pandas as pd

# Reload both files so this cell starts from the raw CSV contents.
train = pd.read_csv("data/train_users.csv")
test = pd.read_csv("data/test_users.csv")

# Drop junk: these columns are not used as features.
# Reassigning (instead of inplace=True) keeps the step explicit and chain-safe.
drop_cols = ["user_id", "browser_version", "region_code"]
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

# Fix age: fill the gaps with the median learned from the training data.
# - The same training median is applied to the test set, so both sets are
#   imputed with one consistent statistic and nothing is estimated from test.
# - The result is assigned back to the column. Calling fillna(inplace=True) on
#   train["age"] acts on an intermediate object; pandas deprecates that pattern
#   and, with Copy-on-Write enabled, it leaves the parent frame unchanged.
age_median = train["age"].median()
train["age"] = train["age"].fillna(age_median)
test["age"] = test["age"].fillna(age_median)

# Encode subscriber as an integer column.
train["subscriber"] = train["subscriber"].astype(int)
test["subscriber"] = test["subscriber"].astype(int)

# SELECT ONLY ONE FEATURE FROM EACH CORRELATED GROUP
selected_features = [
    "purchase_amount",          # spending
    "session_duration",         # engagement
    "content_variety",          # interest
    "repeat_purchase_gap (days)", # risk
    "income",                   # traffic
    "wishlist_size",            # unique
    "cart_abandonment_count",   # unique
    "age",                      # demographic
    "subscriber"                # boolean
]

# Feature matrix and target for training; the test frame gets the same
# column selection so both matrices share one layout.
X = train[selected_features]
y = train["label"]

X_test = test[selected_features]

print("Final feature shape:", X.shape)
print(X.head())


Final feature shape: (2000, 9)
   purchase_amount  session_duration  content_variety  \
0           500.00             17.34          0.36661   
1           913.33             22.22          0.61370   
2          1252.62             41.57          0.80368   
3           500.00             30.17          0.26499   
4           500.00             65.27          0.36385   

   repeat_purchase_gap (days)  income  wishlist_size  cart_abandonment_count  \
0                       133.0   23053              3                       8   
1                        92.0   20239              4                       5   
2                        67.0   13907              5                       2   
3                       149.0   26615              6                       9   
4                       117.0   27958              0                       9   

    age  subscriber  
0  39.0           0  
1  56.0           1  
2  39.0           1  
3  39.0           0  
4  32.0           0  


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Encode the string labels as integers. The encoder is fitted on the raw label
# column rather than on `y` itself, so re-running this cell cannot fit it on
# already-encoded integers and lose the original class names in le.classes_.
le = LabelEncoder()
y = le.fit_transform(train["label"])

# Hold out 20% of the rows for validation; stratify=y keeps the class
# proportions the same in both parts, and random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training part only, then apply the same transform to
# the validation part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

pred = model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, pred))
# Show per-class metrics under the original label names instead of 0/1/2.
print(classification_report(y_val, pred, target_names=le.classes_))


Validation Accuracy: 0.8625
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       142
           1       0.93      0.88      0.90       142
           2       0.85      0.80      0.83       116

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.87      0.86      0.86       400

