In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score, precision_score, recall_score,
    mean_squared_error, r2_score
)

In [3]:
# Read the raw ad-interaction export; the path is relative to the notebook's folder.
raw_csv_path = "../data/social_media_ad_optimization_raw.csv"
df = pd.read_csv(raw_csv_path)

# Quick sanity check on size, then preview the first rows.
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (500, 16)


Unnamed: 0,user_id,age,gender,location,interests,ad_id,ad_category,ad_platform,ad_type,impressions,clicks,conversion,time_spent_on_ad,day_of_week,device_type,engagement_score
0,U0001,58,M,USA,Food,A0001,Sportswear,Facebook,Image,3,0,0,3.38,Friday,Mobile,0.02
1,U0002,55,F,USA,Tech,A0002,Electronics,Facebook,Image,9,9,1,6.77,Saturday,Tablet,0.93
2,U0003,52,F,UK,Gaming,A0003,Luggage,Instagram,Image,13,12,1,13.26,Wednesday,Mobile,0.93
3,U0004,31,F,USA,Tech,A0004,Gadgets,Facebook,Video,14,5,0,24.41,Saturday,Desktop,0.28
4,U0005,52,M,India,Tech,A0005,Luggage,Instagram,Carousel,10,5,0,21.43,Monday,Tablet,0.35


In [4]:
# Columns that hold labels rather than quantities; they are stored as pandas
# categoricals. Note that cat_cols is re-derived from the training frame later on.
cat_cols = [
    'user_id', 'gender', 'location', 'interests',
    'ad_id', 'ad_category', 'ad_platform', 'ad_type',
    'day_of_week', 'device_type'
]

# Convert every listed column in one call instead of looping column by column.
df = df.astype({name: 'category' for name in cat_cols})
df.dtypes

user_id             category
age                    int64
gender              category
location            category
interests           category
ad_id               category
ad_category         category
ad_platform         category
ad_type             category
impressions            int64
clicks                 int64
conversion             int64
time_spent_on_ad     float64
day_of_week         category
device_type         category
engagement_score     float64
dtype: object

In [12]:
# Label the classifier is trained to predict.
target = 'conversion'

# Predictors are every column except the identifiers and the label itself.
exclude = ['user_id','ad_id','conversion']
non_features = {*exclude, target}
features = [name for name in df.columns if name not in non_features]

# NOTE(review): the recorded run of the logistic-regression cell reports 1.0 on
# every metric and 'engagement_score' carries by far the largest coefficient.
# That pattern usually points to target leakage -- confirm how 'engagement_score'
# (and 'clicks') are produced before trusting these features.

# 70/30 hold-out split, stratified on the label, with a fixed seed.
feature_frame = df[features]
labels = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, labels,
    test_size=0.3, random_state=42,
    stratify=labels,
)

# Group the predictor columns by dtype so each group gets its own preprocessing.
cat_cols = list(X_train.select_dtypes(include=['object','category']).columns)
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

Categorical columns: ['gender', 'location', 'interests', 'ad_category', 'ad_platform', 'ad_type', 'day_of_week', 'device_type']
Numeric columns: ['age', 'impressions', 'clicks', 'time_spent_on_ad', 'engagement_score']


In [13]:
# Column-wise preprocessing shared by the models below:
#   - categorical columns are one-hot encoded into a dense array, and categories
#     not seen during fit are ignored at transform time;
#   - numeric columns are standardized.
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    cat_cols,
)
numeric_step = ('num', StandardScaler(), num_cols)

preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [14]:
# Preprocessing followed by a class-weight-balanced logistic regression.
clf_lr = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced')),
])
clf_lr.fit(X_train, y_train)

# Positive-class probability on the hold-out set, thresholded at 0.5 for hard labels.
y_proba = clf_lr.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)

# ROC-AUC is scored on the probabilities; the other metrics on the hard labels.
# NOTE(review): the recorded run prints 1.0 for all five metrics -- check the
# feature set for target leakage before reporting these numbers.
metric_table = [
    ("ROC-AUC:", roc_auc_score, y_proba),
    ("Accuracy:", accuracy_score, y_pred),
    ("Precision:", precision_score, y_pred),
    ("Recall:", recall_score, y_pred),
    ("F1 Score:", f1_score, y_pred),
]

print("Logistic Regression Performance:")
for label, metric_fn, estimate in metric_table:
    print(label, round(metric_fn(y_test, estimate), 4))


Logistic Regression Performance:
ROC-AUC: 1.0
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [15]:
# Sanity check: number of hold-out predictions produced by the classifier.
print(y_pred.shape[0])

150


In [16]:
# Build a data-driven engagement score from time_spent_on_ad and clicks only.
eng_cols = ['time_spent_on_ad','clicks']

# Standardize both inputs before fitting.
# NOTE(review): the scaler and the model below are fit on every row of df,
# including rows that the earlier split assigned to the test set. If this score
# is later used as a model feature, fit on the training rows only.
scaler = StandardScaler()
X_eng_scaled = scaler.fit_transform(df[eng_cols])

# Regress conversion on the two scaled inputs; the fitted coefficients are the weights.
clf_eng = LogisticRegression()
clf_eng.fit(X_eng_scaled, df['conversion'])
coef = clf_eng.coef_[0]

# Weighted sum of the scaled inputs, then min-max rescaled to the 0-1 range.
raw_score = pd.Series(X_eng_scaled @ coef, index=df.index)
lowest, highest = raw_score.min(), raw_score.max()
df['engagement_score_supervised'] = (raw_score - lowest) / (highest - lowest)

df[[*eng_cols, 'engagement_score_supervised']].head()


Unnamed: 0,time_spent_on_ad,clicks,engagement_score_supervised
0,3.38,0,0.05325
1,6.77,9,0.400387
2,13.26,12,0.613845
3,24.41,5,0.595266
4,21.43,5,0.540922


In [17]:
# Pair every logistic-regression coefficient with the name of the transformed
# feature it belongs to, and list the most positive coefficients first.

# Read both steps from the fitted pipeline rather than from the module-level
# `preprocessor`: that keeps this cell correct if the pipeline ever holds a
# cloned copy of the preprocessor (e.g. a GridSearchCV best estimator).
fitted_pre = clf_lr.named_steps['pre']
clf_model = clf_lr.named_steps['clf']

feature_names = []
for name, transformer, cols in fitted_pre.transformers_:
    # Dropped column groups (such as an implicit remainder) produce no output
    # columns, so they must not contribute any names.
    if isinstance(transformer, str) and transformer == 'drop':
        continue
    if hasattr(transformer, 'get_feature_names_out'):
        # One-hot encoding expands each input column into one name per category.
        names = transformer.get_feature_names_out(cols)
    else:
        # Transformers without name support are taken to keep their input names.
        names = cols
    feature_names.extend(list(names))

coef_all = clf_model.coef_[0]

# Fail loudly if the names and coefficients ever stop lining up one-to-one.
assert len(feature_names) == len(coef_all), (
    f"{len(feature_names)} feature names vs {len(coef_all)} coefficients"
)

feat_imp = pd.DataFrame({'feature': feature_names, 'coefficient': coef_all})
feat_imp = feat_imp.sort_values('coefficient', ascending=False)
feat_imp.head(20)

Unnamed: 0,feature,coefficient
40,engagement_score,5.210359
37,impressions,1.668142
5,location_Germany,0.358486
3,location_Australia,0.242415
12,interests_Gaming,0.235438
29,day_of_week_Sunday,0.230458
4,location_Canada,0.204357
20,ad_category_Sportswear,0.16841
1,gender_M,0.153193
28,day_of_week_Saturday,0.141707
