In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [11]:
# Create sample marketing campaign data
np.random.seed(42)
n_samples = 2000

data = {
    'campaign_id': np.arange(1, n_samples + 1),
    'channel': np.random.choice(['Facebook', 'Google', 'Instagram', 'LinkedIn'], n_samples),
    'ad_type': np.random.choice(['Image', 'Video', 'Carousel', 'Text'], n_samples),
    'budget': np.random.uniform(100, 1000, n_samples),
    'impressions': np.random.randint(1000, 100000, n_samples),
    'clicks': np.random.randint(10, 2000, n_samples),
    'conversions': np.random.randint(1, 100, n_samples),  # Avoiding 0 for division
    'time_of_day': np.random.choice(['Morning', 'Afternoon', 'Evening', 'Night'], n_samples),
    'day_of_week': np.random.choice(['Weekday', 'Weekend'], n_samples),
    'target_audience': np.random.choice(['Young', 'Adult', 'Senior'], n_samples),
    'campaign_duration': np.random.randint(5, 30, n_samples),
}

# Calculate base metrics
data['ctr'] = (data['clicks'] / data['impressions']) * 100
data['conversion_rate'] = (data['conversions'] / data['clicks']) * 100
data['cost_per_click'] = data['budget'] / data['clicks']

# Add new metrics
data['frequency'] = np.random.randint(2, 8, n_samples)  # Average number of times a user sees the ad
data['cpm'] = (data['budget'] / data['impressions']) * 1000  # Cost per 1000 impressions
data['reach'] = data['impressions'] / data['frequency']  # Unique users reached
data['customer_acquisition_cost'] = data['budget'] / data['conversions']  # CAC
data['lifetime_value'] = np.random.uniform(50, 500, n_samples)  # Estimated customer LTV
data['roas'] = (data['conversions'] * data['lifetime_value']) / data['budget']  # Return on Ad Spend

# Calculate base metrics
data['ctr'] = (data['clicks'] / data['impressions']) * 100
data['conversion_rate'] = (data['conversions'] / data['clicks']) * 100
data['cost_per_click'] = data['budget'] / data['clicks']

# Add new metrics
data['frequency'] = np.random.randint(2, 8, n_samples)  # Average number of times a user sees the ad
data['cpm'] = (data['budget'] / data['impressions']) * 1000  # Cost per 1000 impressions
data['reach'] = data['impressions'] / data['frequency']  # Unique users reached
data['customer_acquisition_cost'] = data['budget'] / data['conversions']  # CAC
data['lifetime_value'] = np.random.uniform(50, 500, n_samples)  # Estimated customer LTV
data['roas'] = (data['conversions'] * data['lifetime_value']) / data['budget']  # Return on Ad Spend


In [12]:
# Create DataFrame
marketing_df = pd.DataFrame(data)

# Define success criteria incorporating new metrics
marketing_df['is_successful'] = np.where(
    (marketing_df['roas'] > 2) &  # ROAS > 200%
    (marketing_df['conversion_rate'] > 2) &  # Conversion rate > 2%
    (marketing_df['customer_acquisition_cost'] < 50) &  # CAC < $50
    (marketing_df['cpm'] < 10),  # CPM < $10
    1, 0
)

In [24]:
# After creating the DataFrame, simply use the to_csv method
marketing_df.to_csv('marketing_campaign_data.csv', index=False)

In [13]:
# Prepare data for modeling
X = marketing_df.drop(['is_successful', 'campaign_id', 'lifetime_value'], axis=1)
y = marketing_df['is_successful']

In [14]:
# Handle categorical variables
categorical_columns = ['channel', 'ad_type', 'time_of_day', 'day_of_week', 'target_audience']
X_encoded = pd.get_dummies(X, columns=categorical_columns)

In [15]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [19]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [20]:
# Define and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Performance Analysis
from sklearn.metrics import classification_report
print("\nModel Performance:")
print(classification_report(y_test, predictions))



Model Performance:
              precision    recall  f1-score   support

           0       0.90      0.93      0.92       237
           1       0.90      0.85      0.87       163

    accuracy                           0.90       400
   macro avg       0.90      0.89      0.90       400
weighted avg       0.90      0.90      0.90       400



In [22]:
# Metrics Analysis
print("\nKey Metrics Summary:")
metrics_summary = marketing_df.agg({
    'cpm': ['mean', 'median', 'std'],
    'reach': ['mean', 'median', 'std'],
    'customer_acquisition_cost': ['mean', 'median', 'std'],
    'roas': ['mean', 'median', 'std']
}).round(2)
print(metrics_summary)


Key Metrics Summary:
          cpm     reach  customer_acquisition_cost   roas
mean    29.06  13313.63                      28.34  35.69
median  10.93  11059.69                      10.85  21.95
std     65.64  10525.68                      76.89  44.16


In [23]:
# Channel Performance Analysis
channel_performance = marketing_df.groupby('channel').agg({
    'cpm': 'mean',
    'reach': 'mean',
    'customer_acquisition_cost': 'mean',
    'roas': 'mean',
    'is_successful': 'mean'
}).round(2)
print("\nChannel Performance:")
print(channel_performance)


Channel Performance:
             cpm     reach  customer_acquisition_cost   roas  is_successful
channel                                                                    
Facebook   29.82  13606.82                      25.45  36.41           0.39
Google     30.51  13276.03                      31.58  35.90           0.37
Instagram  27.33  13050.35                      30.38  36.06           0.39
LinkedIn   28.53  13301.16                      26.15  34.34           0.34
