In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw ad-interaction export into `df`; the trailing expression
# previews the first rows in the cell output.
df = pd.read_csv("../data/social_media_ad_optimization_raw.csv", sep=",")
df.head()

Unnamed: 0,user_id,age,gender,location,interests,ad_id,ad_category,ad_platform,ad_type,impressions,clicks,conversion,time_spent_on_ad,day_of_week,device_type,engagement_score
0,U0001,58,M,USA,Food,A0001,Sportswear,Facebook,Image,3,0,0,3.38,Friday,Mobile,0.02
1,U0002,55,F,USA,Tech,A0002,Electronics,Facebook,Image,9,9,1,6.77,Saturday,Tablet,0.93
2,U0003,52,F,UK,Gaming,A0003,Luggage,Instagram,Image,13,12,1,13.26,Wednesday,Mobile,0.93
3,U0004,31,F,USA,Tech,A0004,Gadgets,Facebook,Video,14,5,0,24.41,Saturday,Desktop,0.28
4,U0005,52,M,India,Tech,A0005,Luggage,Instagram,Carousel,10,5,0,21.43,Monday,Tablet,0.35


**Key Engineered Features**

In [3]:
# Click-through rate: clicks per impression.
# Rows with zero impressions get CTR = 0 rather than the inf/NaN a division
# by zero would produce, mirroring the zero-click guard used for CVR below.
# Rows where `impressions` is missing are left as NaN (NaN != 0 is True, so
# `.where` keeps the computed value for them).
df["CTR"] = (
    df["clicks"] / df["impressions"].replace(0, np.nan)
).where(df["impressions"].ne(0), 0.0)

# Conversion rate: `conversion` divided by `clicks`.
# Zero clicks are swapped for NaN before dividing so the division never hits
# zero; the resulting NaNs are then set to 0.
df["CVR"] = (df["conversion"] / df["clicks"].replace(0, np.nan)).fillna(0)

**Interaction Features**

In [4]:
# Age x platform: each platform name is first turned into a fixed integer
# code, then multiplied by the user's age.
# NOTE(review): a platform missing from the mapping yields NaN in the product,
# and the integer codes impose an ordering on the platforms — confirm both
# are intended.
df["age_platform_interaction"] = df["age"].mul(
    df["ad_platform"].map(
        {
            "Facebook": 1,
            "Instagram": 2,
            "Twitter": 3,
            "LinkedIn": 4,
        }
    )
)

# Age x time spent on the ad.
df["age_time_spent_interaction"] = df["age"].mul(df["time_spent_on_ad"])

**Contextual Features**

In [5]:
# Binary (0/1) context flags derived from the day, device and platform columns.
df["is_weekend"] = df["day_of_week"].isin(["Saturday", "Sunday"]).astype(int)
df["is_mobile"] = (df["device_type"] == "Mobile").astype(int)
df["is_facebook"] = (df["ad_platform"] == "Facebook").astype(int)
df["is_instagram"] = (df["ad_platform"] == "Instagram").astype(int)

# Preview the frame with the newly added columns.
df.head()

Unnamed: 0,user_id,age,gender,location,interests,ad_id,ad_category,ad_platform,ad_type,impressions,clicks,conversion,time_spent_on_ad,day_of_week,device_type,engagement_score,CTR,CVR,age_platform_interaction,age_time_spent_interaction,is_weekend,is_mobile,is_facebook,is_instagram
0,U0001,58,M,USA,Food,A0001,Sportswear,Facebook,Image,3,0,0,3.38,Friday,Mobile,0.02,0.0,0.0,58,196.04,0,1,1,0
1,U0002,55,F,USA,Tech,A0002,Electronics,Facebook,Image,9,9,1,6.77,Saturday,Tablet,0.93,1.0,0.111111,55,372.35,1,0,1,0
2,U0003,52,F,UK,Gaming,A0003,Luggage,Instagram,Image,13,12,1,13.26,Wednesday,Mobile,0.93,0.923077,0.083333,104,689.52,0,1,0,1
3,U0004,31,F,USA,Tech,A0004,Gadgets,Facebook,Video,14,5,0,24.41,Saturday,Desktop,0.28,0.357143,0.0,31,756.71,1,0,1,0
4,U0005,52,M,India,Tech,A0005,Luggage,Instagram,Carousel,10,5,0,21.43,Monday,Tablet,0.35,0.5,0.0,104,1114.36,0,0,0,1


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, including the engineered features added above.
df.describe()

Unnamed: 0,age,impressions,clicks,conversion,time_spent_on_ad,engagement_score,CTR,CVR,age_platform_interaction,age_time_spent_interaction,is_weekend,is_mobile,is_facebook,is_instagram
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,39.41,8.282,4.52,0.47,14.0723,0.52022,0.539864,0.166738,59.61,545.3937,0.316,0.338,0.492,0.508
std,12.400219,4.217351,3.727383,0.499599,9.088768,0.306296,0.340362,0.273606,28.225152,395.170033,0.465379,0.473502,0.500437,0.500437
min,18.0,1.0,0.0,0.0,0.46,0.0,0.0,0.0,18.0,8.28,0.0,0.0,0.0,0.0
25%,28.0,4.75,1.0,0.0,4.99,0.27,0.2625,0.0,39.0,202.38,0.0,0.0,0.0,0.0
50%,39.0,9.0,4.0,0.0,13.285,0.55,0.533333,0.0,52.5,460.785,0.0,0.0,0.0,1.0
75%,51.0,12.0,7.0,1.0,21.7325,0.7725,0.857143,0.2,80.0,781.815,1.0,1.0,1.0,1.0
max,60.0,15.0,15.0,1.0,29.95,1.0,1.0,1.0,120.0,1691.28,1.0,1.0,1.0,1.0


**PCA-based Engagement Score**

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

In [12]:
# Build a single engagement score from clicks and time spent on the ad:
# standardise both columns, then project them onto their first principal
# component.
X_engg = df.loc[:, ["clicks", "time_spent_on_ad"]]

# Standardise first so neither column dominates the component through its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_engg)

pca = PCA(n_components=1)
# fit_transform returns an (n_rows, 1) array; take its only column so a plain
# 1-D vector is stored in the frame.
df["engagement_pca"] = pca.fit_transform(X_scaled)[:, 0]

df.head()

Unnamed: 0,user_id,age,gender,location,interests,ad_id,ad_category,ad_platform,ad_type,impressions,clicks,conversion,time_spent_on_ad,day_of_week,device_type,engagement_score,CTR,CVR,age_platform_interaction,age_time_spent_interaction,is_weekend,is_mobile,is_facebook,is_instagram,engagement_pca
0,U0001,58,M,USA,Food,A0001,Sportswear,Facebook,Image,3,0,0,3.38,Friday,Mobile,0.02,0.0,0.0,58,196.04,0,1,1,0,-1.691024
1,U0002,55,F,USA,Tech,A0002,Electronics,Facebook,Image,9,9,1,6.77,Saturday,Tablet,0.93,1.0,0.111111,55,372.35,1,0,1,0,0.282045
2,U0003,52,F,UK,Gaming,A0003,Luggage,Instagram,Image,13,12,1,13.26,Wednesday,Mobile,0.93,0.923077,0.083333,104,689.52,0,1,0,1,1.357161
3,U0004,31,F,USA,Tech,A0004,Gadgets,Facebook,Video,14,5,0,24.41,Saturday,Desktop,0.28,0.357143,0.0,31,756.71,1,0,1,0,0.896229
4,U0005,52,M,India,Tech,A0005,Luggage,Instagram,Carousel,10,5,0,21.43,Monday,Tablet,0.35,0.5,0.0,104,1114.36,0,0,0,1,0.664153


In [13]:
# Write the feature-engineered frame (including engagement_pca) to disk;
# index=False leaves the row index out of the file.
df.to_csv("../data/processed_data_with_pca.csv", index=False)
