# Import Libraries

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

# Loading Dataset

In [2]:
df = pd.read_csv("/kaggle/input/advertising-campaign-performance-dataset/ad_campaign_performance.csv")
df.head()

Unnamed: 0,Campaign_ID,Budget,Duration,Platform,Content_Type,Target_Age,Target_Gender,Region,Clicks,Conversions,CTR,CPC,Conversion_Rate,Success
0,CAMP-XAJI0Y,15895,39,Instagram,Video,35-44,Female,US,48297,2116,303.850267,0.329109,4.381225,1
1,CAMP-6DPBHS,960,8,LinkedIn,Video,45-54,Female,UK,15097,2340,1572.604167,0.063589,15.499768,1
2,CAMP-AHXTHV,38258,54,YouTube,Image,35-44,All,US,8134,2740,21.260913,4.703467,33.685763,1
3,CAMP-3A3ZMF,44832,28,Facebook,Text,25-34,Female,US,21801,4277,48.628212,2.056419,19.618366,1
4,CAMP-8MDD4V,11384,36,Google,Story,18-24,All,UK,16503,1488,144.96662,0.689814,9.016542,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Campaign_ID      1000 non-null   object 
 1   Budget           1000 non-null   int64  
 2   Duration         1000 non-null   int64  
 3   Platform         1000 non-null   object 
 4   Content_Type     1000 non-null   object 
 5   Target_Age       1000 non-null   object 
 6   Target_Gender    1000 non-null   object 
 7   Region           1000 non-null   object 
 8   Clicks           1000 non-null   int64  
 9   Conversions      1000 non-null   int64  
 10  CTR              1000 non-null   float64
 11  CPC              1000 non-null   float64
 12  Conversion_Rate  1000 non-null   float64
 13  Success          1000 non-null   int64  
dtypes: float64(3), int64(5), object(6)
memory usage: 109.5+ KB


In [4]:
df.isnull().sum()

Campaign_ID        0
Budget             0
Duration           0
Platform           0
Content_Type       0
Target_Age         0
Target_Gender      0
Region             0
Clicks             0
Conversions        0
CTR                0
CPC                0
Conversion_Rate    0
Success            0
dtype: int64

In [5]:
df['Platform'].unique()

array(['Instagram', 'LinkedIn', 'YouTube', 'Facebook', 'Google'],
      dtype=object)

# Distribution of Numerical Features

In [6]:
numerical_features = df.select_dtypes(include=['number']).columns
df[numerical_features].describe()

Unnamed: 0,Budget,Duration,Clicks,Conversions,CTR,CPC,Conversion_Rate,Success
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,24592.139,31.692,25130.927,2480.36,388.988301,2.968978,26.856985,0.897
std,14632.696566,16.542999,14205.44793,1449.470849,1544.579703,15.660721,88.893199,0.304111
min,109.0,3.0,98.0,13.0,0.232358,0.003153,0.036164,0.0
25%,11480.0,17.0,13225.25,1222.75,52.635433,0.468196,4.889723,1.0
50%,23965.0,32.0,25013.5,2407.0,102.213286,0.978351,9.847713,1.0
75%,37953.75,46.0,37386.5,3771.25,213.586455,1.899861,18.757727,1.0
max,49950.0,60.0,49820.0,4995.0,31711.612903,430.37069,1554.121864,1.0


In [7]:
rows,cols = 3,3
fig = sp.make_subplots(rows,cols,subplot_titles=[f"{feature}" for feature in numerical_features])

for i in range(len(numerical_features)):
    row = i // cols + 1
    col = i % cols + 1
    fig.add_trace(go.Histogram(x=df[numerical_features[i]],name=numerical_features[i]), row=row, col=col)

fig.update_layout(height=rows * 250, width=cols * 300, title_text="Distributions of Numerical Features")
fig.show()

# EDA

## Platform wise Success Rates 

In [8]:
platform_success_rates = df.groupby('Platform')['Success'].mean() * 100
print(platform_success_rates)

Platform
Facebook     91.709845
Google       92.941176
Instagram    86.633663
LinkedIn     89.952153
YouTube      88.053097
Name: Success, dtype: float64


In [9]:
fig = px.bar(platform_success_rates,x=platform_success_rates.index,y=platform_success_rates.values,
            labels={'x':'Platform','y':'Success Rate'},title='Success Rate by Platform',
            text=platform_success_rates.values.round(2))
fig.show()

## Budget wise Success Rate by Platform

In [10]:
fig = px.box(df, x="Platform", y="Budget", color="Success",
             title="Budget wise Success Rate by Platform",
             labels={"Budget": "Campaign Budget", "Platform": "Advertising Platform"},
             category_orders={"Platform": ["Facebook", "Instagram", "Google", "YouTube", "LinkedIn"]})
fig.show()

## Duration wise Success Rate by Content Type

In [11]:
fig = px.scatter(df, x="Duration", y="Content_Type", color="Success", size="Budget",
                 title="Duration wise Success Rate by Content Type",
                 labels={"Duration": "Campaign Duration (Days)", "Content_Type": "Ad Content Type"},
                 category_orders={"Content_Type": ["Image", "Video", "Carousel", "Story", "Text"]},
                 hover_data=['Budget']) 
fig.show()

## Conversion Rate by Content Type and Success

In [12]:
conversion_by_content_success = df.groupby(['Content_Type', 'Success'])['Conversion_Rate'].mean().reset_index()

fig = px.bar(conversion_by_content_success, x="Content_Type", y="Conversion_Rate", color="Success",
             title="Conversion Rate by Content Type and Success",
             labels={"Content_Type": "Ad Content Type", "Conversion_Rate": "Average Conversion Rate", "Success": "Campaign Success"},
             category_orders={"Content_Type": ["Image", "Video", "Carousel", "Story", "Text"]},
             barmode='group', 
             text=conversion_by_content_success['Conversion_Rate'].round(2))  
fig.show()

## Conversion Rate by Content Type, Success, and Platform

In [13]:
conversion_by_content_success_platform = df.groupby(['Content_Type', 'Success', 'Platform'])['Conversion_Rate'].mean().round(2).reset_index()

fig = px.bar(conversion_by_content_success_platform, x="Content_Type", y="Conversion_Rate", color="Success",
             facet_col="Platform",  
             title="Conversion Rate by Content Type, Success, and Platform",
             labels={"Content_Type": "Ad Content Type", "Conversion_Rate": "Average Conversion Rate", "Success": "Campaign Success", "Platform": "Advertising Platform"},
             category_orders={"Content_Type": ["Image", "Video", "Carousel", "Story", "Text"],
                               "Platform": ["Facebook", "Instagram", "Google", "YouTube", "LinkedIn"]},
             barmode='group',  
             text=conversion_by_content_success_platform['Conversion_Rate'].round(2))  # Display conversion rate as text
fig.show()

# Encode Categorical Features

In [14]:
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df,columns=categorical_cols,drop_first=True)

# Split the Dataset

In [15]:
X = df_encoded.drop('Success',axis=1)
Y = df_encoded['Success']

In [16]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.33,random_state=0)

# Handle Class Imbalance

We need to use SMOTE technique to balance Success Feature

In [17]:
smote = SMOTE(random_state=12)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train,y_train)

In [18]:
fig = px.histogram(y_train_resampled,x="Success",title="Distribution of Success")
fig.update_layout(bargap=0.2)
fig.show()

# Models

In [19]:
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(x_train_resampled, y_train_resampled)
y_pred = lr.predict(x_test)

lr_acc = accuracy_score(y_test, y_pred)
lr_r2 = r2_score(y_test, y_pred)

print(f"Accuracy: {lr_acc:.2f}")
print(f"R-squared: {lr_r2:.2f}")

Accuracy: 0.97
R-squared: 0.72


In [20]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train_resampled, y_train_resampled)
y_pred = rf.predict(x_test)

rf_acc = accuracy_score(y_test, y_pred)
rf_r2 = r2_score(y_test, y_pred)

print(f"Accuracy: {rf_acc:.2f}")
print(f"R-squared: {rf_r2:.2f}")

Accuracy: 0.99
R-squared: 0.91


In [21]:
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(x_train_resampled, y_train_resampled)
y_pred = gb.predict(x_test)

gb_acc = accuracy_score(y_test, y_pred)
gb_r2 = r2_score(y_test, y_pred)

print(f"Accuracy: {gb_acc:.2f}")
print(f"R-squared: {gb_r2:.2f}")

Accuracy: 0.99
R-squared: 0.94


In [22]:
xgb = XGBClassifier(random_state=42)
xgb.fit(x_train_resampled, y_train_resampled)
y_pred = xgb.predict(x_test)

xgb_acc = accuracy_score(y_test, y_pred)
xgb_r2 = r2_score(y_test, y_pred)

print(f"Accuracy: {xgb_acc:.2f}")
print(f"R-squared: {xgb_r2:.2f}")

Accuracy: 0.99
R-squared: 0.91


# Model Comparison

In [23]:
models = ['Logistic Regression','Random Forest', 'Gradient Boosting', 'XGBoost']
r2 = [lr_r2, rf_r2, gb_r2, xgb_r2]

Performance = [(x * 100).round(2) for x in r2]

fig = px.bar(x=models, y=Performance, labels={'x': 'Models', 'y': 'Perfromances'}, title="Comparison of Model Performance",
            text=Performance)
fig.show()

Hence, we can infer GradientBoostingClassifier model should be used to predict campaign success