In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None)

# load feature-engineered data
df = pd.read_csv("../data/social_media_ad_optimization_raw.csv")
df.shape
df.head()


Unnamed: 0,user_id,age,gender,location,interests,ad_id,ad_category,ad_platform,ad_type,impressions,clicks,conversion,time_spent_on_ad,day_of_week,device_type,engagement_score
0,U0001,58,M,USA,Food,A0001,Sportswear,Facebook,Image,3,0,0,3.38,Friday,Mobile,0.02
1,U0002,55,F,USA,Tech,A0002,Electronics,Facebook,Image,9,9,1,6.77,Saturday,Tablet,0.93
2,U0003,52,F,UK,Gaming,A0003,Luggage,Instagram,Image,13,12,1,13.26,Wednesday,Mobile,0.93
3,U0004,31,F,USA,Tech,A0004,Gadgets,Facebook,Video,14,5,0,24.41,Saturday,Desktop,0.28
4,U0005,52,M,India,Tech,A0005,Luggage,Instagram,Carousel,10,5,0,21.43,Monday,Tablet,0.35


In [3]:
df = df.drop(columns=["engagement_score"])

In [4]:
X = df.drop(columns=["user_id", "ad_id", "conversion"])
y = df["conversion"]


In [5]:
numeric_features = ["age", "impressions", "clicks", "time_spent_on_ad"]
categorical_features = [
    "gender", "location", "interests",
    "ad_category", "ad_platform",
    "ad_type", "day_of_week", "device_type"
]


**Preprocessing Pipeline**

In [6]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop="first", handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

**Logistic Regression Pipeline**

In [7]:
clf_lr = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", LogisticRegression(max_iter=300))
])


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
clf_lr.fit(X_train, y_train)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


In [9]:
preds = clf_lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

Accuracy: 0.51
              precision    recall  f1-score   support

           0       0.54      0.57      0.56        54
           1       0.47      0.43      0.45        46

    accuracy                           0.51       100
   macro avg       0.50      0.50      0.50       100
weighted avg       0.51      0.51      0.51       100



  ret = a @ b
  ret = a @ b
  ret = a @ b


In [10]:
clf_model = clf_lr.named_steps["clf"]

feature_names = []
for name, transformer, cols in clf_lr.named_steps["preprocessor"].transformers_:
    if hasattr(transformer, "get_feature_names_out"):
        names = transformer.get_feature_names_out(cols)
    else:
        names = cols
    feature_names.extend(names)

coef = clf_model.coef_[0]
feat_imp = pd.DataFrame({
    "feature": feature_names,
    "coefficient": coef
}).sort_values("coefficient", ascending=False)

feat_imp.head(20)


Unnamed: 0,feature,coefficient
17,ad_category_Food & Beverage,0.575884
5,gender_Other,0.473112
2,clicks,0.394502
3,time_spent_on_ad,0.359669
21,ad_platform_Instagram,0.167801
6,location_Canada,0.118574
25,day_of_week_Saturday,0.11167
22,ad_type_Image,0.086763
29,day_of_week_Wednesday,0.076435
12,interests_Food,0.052901


In [11]:
X_transformed = clf_lr.named_steps['preprocessor'].transform(df.drop(columns=["user_id","ad_id","conversion"]))


coef = clf_lr.named_steps['clf'].coef_[0]
intercept = clf_lr.named_steps['clf'].intercept_[0]
linear_score = np.dot(X_transformed, coef) + intercept

# Scale linear score to 0–1
scaler = MinMaxScaler()
df['engagement_score_supervised'] = scaler.fit_transform(linear_score.reshape(-1,1))

# Check results
df[['user_id','clicks','time_spent_on_ad','ad_platform','ad_category','engagement_score_supervised']].head()


Unnamed: 0,user_id,clicks,time_spent_on_ad,ad_platform,ad_category,engagement_score_supervised
0,U0001,0,3.38,Facebook,Sportswear,0.416418
1,U0002,9,6.77,Facebook,Electronics,0.427514
2,U0003,12,13.26,Instagram,Luggage,0.609831
3,U0004,5,24.41,Facebook,Gadgets,0.453463
4,U0005,5,21.43,Instagram,Luggage,0.329103
