In [None]:
# Core libraries
import pandas as pd
import numpy as np

# Model and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from xgboost import XGBClassifier

# Save/load models
import joblib

# Warning configuration
import warnings
# Blanket filter: with no category/module arguments this ignores every
# warning raised through the `warnings` module after this point.
warnings.filterwarnings("ignore")


In [2]:
# Cell 2 – Load Dataset
# `pd` is already bound by the imports cell at the top of the notebook, so
# pandas is not re-imported here (keeps all imports in one place).

# Read the gzip-compressed Avazu training CSV into a single DataFrame.
data = pd.read_csv("train.gz", compression='gzip')  # or full path
print("✅ Data Loaded Successfully!")

# Last expression: rich preview of the first five rows.
data.head()


✅ Data Loaded Successfully!


Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [4]:
# Label column the model is trained to predict.
target = "click"

# Input columns kept for modelling. The order matters: this same list is
# used later to index the training frame.
features = [
    "hour",
    "banner_pos",
    "site_id",
    "site_domain",
    "site_category",
    "app_id",
    "app_domain",
    "app_category",
    "device_id",
    "device_model",
    "device_type",
    "device_conn_type",
]

# Keep only the chosen inputs followed by the label; everything else is dropped.
data = data[[*features, target]]
print("✅ Columns selected for training:")
print(data.head())


✅ Columns selected for training:
       hour  banner_pos   site_id site_domain site_category    app_id  \
0  14102100           0  1fbe01fe    f3845767      28905ebd  ecad2386   
1  14102100           0  1fbe01fe    f3845767      28905ebd  ecad2386   
2  14102100           0  1fbe01fe    f3845767      28905ebd  ecad2386   
3  14102100           0  1fbe01fe    f3845767      28905ebd  ecad2386   
4  14102100           1  fe8cc448    9166c161      0569f928  ecad2386   

  app_domain app_category device_id device_model  device_type  \
0   7801e8d9     07d7df22  a99f214a     44956a24            1   
1   7801e8d9     07d7df22  a99f214a     711ee120            1   
2   7801e8d9     07d7df22  a99f214a     8a4875bd            1   
3   7801e8d9     07d7df22  a99f214a     6332421a            1   
4   7801e8d9     07d7df22  a99f214a     779d90c2            1   

   device_conn_type  click  
0                 2      0  
1                 0      0  
2                 0      0  
3                 0  

In [5]:
# Columns that are label-encoded to integer codes below; the remaining
# feature columns are left untouched by this cell.
categorical_cols = [
    "site_id", "site_domain", "site_category",
    "app_id", "app_domain", "app_category",
    "device_id", "device_model"
]

# Guard against running this cell twice on the same frame. The loop below
# overwrites `data[col]` with integer codes, so a second run would fit new
# encoders on the stringified codes ("0", "1", "10", ...) and silently
# replace the real category -> code mapping that gets saved later.
# The check runs BEFORE `encoders` is reset so a previous good mapping survives.
already_encoded = [
    col for col in categorical_cols
    if pd.api.types.is_numeric_dtype(data[col])
]
if already_encoded:
    raise RuntimeError(
        "These columns are already numeric, so this cell appears to have been "
        f"run before: {already_encoded}. Reload the dataset and re-select the "
        "columns before encoding again."
    )

# One fitted LabelEncoder per categorical column, keyed by column name.
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Values are cast to str first so every entry is encoded as text.
    data[col] = le.fit_transform(data[col].astype(str))
    encoders[col] = le

print("✅ Categorical encoding complete. Example:")
data.head()


✅ Categorical encoding complete. Example:


Unnamed: 0,hour,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_model,device_type,device_conn_type,click
0,14102100,0,582,7339,2,7884,254,0,1780272,2187,1,2,0
1,14102100,0,582,7339,2,7884,254,0,1780272,3612,1,0,0
2,14102100,0,582,7339,2,7884,254,0,1780272,4460,1,0,0
3,14102100,0,582,7339,2,7884,254,0,1780272,3163,1,0,0
4,14102100,1,4695,4456,0,7884,254,0,1780272,3835,1,0,0


In [6]:
# Separate the model inputs from the label column.
X, y = data[features], data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"✅ Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


✅ Training samples: 32343173, Testing samples: 8085794


In [8]:
# Configure the gradient-boosted tree classifier.
# `use_label_encoder` is intentionally not passed: it is a deprecated no-op
# in XGBoost and newer releases report it as an unused parameter.
model = XGBClassifier(
    n_estimators=200,        # number of boosting rounds
    max_depth=8,             # maximum depth of each tree
    learning_rate=0.1,
    subsample=0.8,           # fraction of rows sampled per tree
    colsample_bytree=0.8,    # fraction of columns sampled per tree
    eval_metric='logloss'
)

# Fit on the training partition only; the test partition is kept for evaluation.
print("⏳ Training model, this may take a few minutes...")
model.fit(X_train, y_train)
print("✅ Model training complete!")


⏳ Training model, this may take a few minutes...
✅ Model training complete!


In [9]:
# Column 1 of predict_proba is taken as the score for the positive class.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Hard 0/1 labels: a row is predicted positive when its score exceeds 0.5.
y_pred = (y_pred_proba > 0.5).astype(int)

# Score-based metrics use the probabilities; accuracy uses the hard labels.
auc = roc_auc_score(y_test, y_pred_proba)
logloss = log_loss(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)

print(f"📊 Accuracy: {acc:.4f}")
print(f"📉 Log Loss: {logloss:.4f}")
print(f"📈 AUC Score: {auc:.4f}")


📊 Accuracy: 0.8337
📉 Log Loss: 0.4001
📈 AUC Score: 0.7424


In [10]:
# Persist the fitted model, the per-column encoders and the feature order
# to disk, one file per object, written in this order.
artifacts = (
    (model, "ctr_model.pkl"),
    (encoders, "encoders.pkl"),
    (features, "feature_order.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("📂 Model, encoders, and feature order saved.")


📂 Model, encoders, and feature order saved.
