In [9]:
# Cell 1 - imports & config
import os
import gc
import time
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from sklearn.impute import SimpleImputer

# Pick the gradient-boosting backend: xgboost is preferred, lightgbm is the fallback.
# ML_LIB records which one was loaded so later cells can branch on it.
try:
    from xgboost import XGBClassifier
except Exception:
    # xgboost could not be loaded; see whether lightgbm is usable instead.
    try:
        import lightgbm as lgb
    except Exception:
        raise ImportError("Install xgboost or lightgbm before running notebook.")
    else:
        ML_LIB = "lightgbm"
else:
    ML_LIB = "xgboost"

print("ML_LIB:", ML_LIB)

# Seed shared by the train/test split, the models and the randomized search
RANDOM_STATE = 42

# Row budget passed to read_csv as nrows (None loads the whole file)
NROWS = 1_000_000  # drop to 200_000 on a memory-constrained machine

# Input file and the directory that receives the saved artifacts
DATA_PATH = "train.gz"   # point this at your copy of the gzipped CSV
OUT_DIR = "../backend"
os.makedirs(OUT_DIR, exist_ok=True)


ML_LIB: xgboost


In [10]:
# Cell 2 - load data straight from the gzip archive (consider chunked reads if memory is tight).
# No branch on NROWS is needed: nrows=None is read_csv's own "no row limit" value,
# so a single call covers both the capped and the full load.
print("Loading data...")
df = pd.read_csv(DATA_PATH, compression="gzip", nrows=NROWS)

print(df.shape)
df.head()


Loading data...
(1000000, 24)


Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [11]:
# Cell 3 - quick EDA
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" to the output.
df.info()

# Share of each target class (normalize=True gives fractions instead of counts).
print("Target distribution:")
print(df['click'].value_counts(normalize=True))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1000000 non-null  float64
 1   click             1000000 non-null  int64  
 2   hour              1000000 non-null  int64  
 3   C1                1000000 non-null  int64  
 4   banner_pos        1000000 non-null  int64  
 5   site_id           1000000 non-null  object 
 6   site_domain       1000000 non-null  object 
 7   site_category     1000000 non-null  object 
 8   app_id            1000000 non-null  object 
 9   app_domain        1000000 non-null  object 
 10  app_category      1000000 non-null  object 
 11  device_id         1000000 non-null  object 
 12  device_ip         1000000 non-null  object 
 13  device_model      1000000 non-null  object 
 14  device_type       1000000 non-null  int64  
 15  device_conn_type  1000000 non-null  int64  
 16  C

In [12]:
# Cell 4 - preprocessing & feature engineering
# Drop id (not useful)
if 'id' in df.columns:
    df = df.drop(columns=['id'])

# Convert hour (YYMMDDHH) -> datetime, extract hour/dayofweek.
# Guarded the same way as the id drop above: once 'hour' has been replaced by
# the engineered columns, re-running this cell is a no-op instead of a KeyError.
if 'hour' in df.columns:
    hour_ts = pd.to_datetime(df['hour'], format='%y%m%d%H', errors='coerce')
    # errors='coerce' turns unparseable stamps into NaT; those rows fall back to 0.
    df['hour_of_day'] = hour_ts.dt.hour.fillna(0).astype(int)
    df['dayofweek'] = hour_ts.dt.dayofweek.fillna(0).astype(int)
    df = df.drop(columns=['hour'])  # keep engineered ones

# Option: reduce cardinality for extremely high-card columns (device_ip, device_id)
# we'll keep them but encode later; for demo it's okay

# Make list of categorical string cols we need to encode
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", cat_cols)

# Numeric columns (keep as is): everything that is neither categorical nor the target.
# The exclusion set is built once rather than on every iteration of the comprehension.
non_numeric = set(cat_cols) | {'click'}
num_cols = [c for c in df.columns if c not in non_numeric]
print("Numeric columns:", num_cols)


Categorical columns: ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model']
Numeric columns: ['C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'hour_of_day', 'dayofweek']


In [14]:
# Cell 5 - label encoding (fit & save encoders)
# Every column in cat_cols is replaced in place by integer codes; the fitted
# LabelEncoder for each column is collected in `encoders` and written to disk.
ENCODERS_PATH = os.path.join(OUT_DIR, "encoders.joblib")

# Refuse to run on columns that already hold numbers. That happens when this cell
# is executed a second time on the same frame: refitting would then learn a
# mapping of integer codes and overwrite the saved encoders with it.
already_encoded = [col for col in cat_cols if pd.api.types.is_numeric_dtype(df[col])]
if already_encoded:
    raise RuntimeError(
        f"Columns already label-encoded: {already_encoded}. "
        "Reload the data (re-run from Cell 2) before fitting the encoders again."
    )

encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal string "nan", after which fillna("NA") has nothing left to replace.
    df[col] = df[col].fillna("NA").astype(str)
    le.fit(df[col].unique().tolist())
    df[col] = le.transform(df[col])
    encoders[col] = le

# Save encoders
joblib.dump(encoders, ENCODERS_PATH)
print("Saved encoders to:", ENCODERS_PATH)


Saved encoders to: ../backend\encoders.joblib


In [15]:
# Cell 6 - build the feature matrix / target vector and hold out a test split
# Every column except the target is a feature; the list is kept because the
# column order is saved next to the model later on.
features = df.columns[df.columns != 'click'].tolist()
X, y = df[features], df['click']

# Drop the reference to the full frame so its memory can be reclaimed.
del df
gc.collect()

# 80/20 split, stratified on the target so both parts keep the same click rate.
split_opts = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_opts)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (800000, 23) Test shape: (200000, 23)


In [16]:
# Cell 7 - quick baseline training (to ensure everything works)
# Fixed, untuned hyperparameters: the point is only to check that the pipeline
# runs end to end and to get a reference score before tuning.
if ML_LIB == "xgboost":
    # use_label_encoder is deliberately not passed: the installed xgboost does not
    # recognise it and only warns 'Parameters: { "use_label_encoder" } are not used.'
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE
    )
else:
    model = lgb.LGBMClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        num_leaves=31,
        random_state=RANDOM_STATE
    )

print("Training baseline model...")
t0 = time.time()
model.fit(X_train, y_train)
print("Done in", time.time() - t0, "seconds.")

# Eval on the held-out split; column 1 of predict_proba is the positive class.
y_proba = model.predict_proba(X_test)[:, 1]
print("AUC:", roc_auc_score(y_test, y_proba))
print("LogLoss:", log_loss(y_test, y_proba))


Training baseline model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Done in 5.633976936340332 seconds.
AUC: 0.7557556427412129
LogLoss: 0.38099316674781003


In [17]:
# Cell 8 - Randomized search (small budget for demo)
# NOTE: this is small — increase n_iter for better search but more time
# Candidate values sampled by the search; the keys are estimator parameter names.
param_dist = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

if ML_LIB == "xgboost":
    # use_label_encoder is deliberately not passed: the installed xgboost does not
    # recognise it and warns about the unused parameter on every fit.
    base = XGBClassifier(eval_metric="logloss", random_state=RANDOM_STATE)
else:
    base = lgb.LGBMClassifier(random_state=RANDOM_STATE)

# 8 sampled candidates x 3 folds = 24 fits, scored by ROC AUC, run on all cores.
rs = RandomizedSearchCV(
    base,
    param_distributions=param_dist,
    n_iter=8,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
print("Starting RandomizedSearchCV (this can take time)...")
rs.fit(X_train, y_train)
print("Best params:", rs.best_params_)
print("Best CV AUC:", rs.best_score_)

# use best estimator
best_model = rs.best_estimator_


Starting RandomizedSearchCV (this can take time)...
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Best CV AUC: 0.7609857128671768


In [20]:
# Cell 9 - choose the final estimator, fit it on the training split, report test metrics
try:
    final_model = best_model
except NameError:
    # No tuned estimator exists (the search cell was not run); use the baseline.
    final_model = model

# NOTE(review): both candidates appear to arrive here already fitted on X_train
# (the baseline is fitted in Cell 7, and the search in Cell 8 is created without
# a refit argument, so presumably it refits the best estimator) — this fit may
# be redundant; confirm before removing it.
print("Retraining final model (fit on X_train)...")
final_model.fit(X_train, y_train)

# Evaluate on the held-out split
y_proba = final_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)  # hard labels at the 0.5 cut-off
print("Final AUC:", roc_auc_score(y_test, y_proba))
print("Final LogLoss:", log_loss(y_test, y_proba))
print("Final Accuracy (threshold 0.5):", accuracy_score(y_test, y_pred))


Retraining final model (fit on X_train)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final AUC: 0.7671162589181011
Final LogLoss: 0.3746939560850871
Final Accuracy (threshold 0.5): 0.844215


In [21]:
# Cell 10 - persist the fitted model together with the feature column order
MODEL_PATH = os.path.join(OUT_DIR, "ctr_model.pkl")
FEATURE_ORDER_PATH = os.path.join(OUT_DIR, "feature_order.joblib")

# Model first, then the feature list, each into its own file under OUT_DIR.
for artifact, target in ((final_model, MODEL_PATH), (features, FEATURE_ORDER_PATH)):
    joblib.dump(artifact, target)

print("Saved model to:", MODEL_PATH)
print("Saved feature order to:", FEATURE_ORDER_PATH)


Saved model to: ../backend\ctr_model.pkl
Saved feature order to: ../backend\feature_order.joblib
