<a href="https://colab.research.google.com/github/sarah-izzy/HNG-task2-tickect-generator/blob/master/FINAL_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw campaign export (Colab upload directory).
DATA_PATH = "/content/influencer_marketing_roi.csv"

# Read the whole CSV into memory and preview the first ten campaigns.
df = pd.read_csv(DATA_PATH)
df.head(10)


Unnamed: 0,campaign_id,platform,influencer_category,campaign_type,start_date,engagements,estimated_reach,product_sales,campaign_duration_days,end_date
0,CAMP100000,TikTok,Fitness,Giveaway,2022-01-01 00:00:00,79900,1892,2834,14,2022-01-15 00:00:00
1,CAMP100001,YouTube,Food,Product Launch,2022-01-02 00:00:00,47985,437228,165,13,2022-01-15 00:00:00
2,CAMP100002,TikTok,Travel,Brand Awareness,2022-01-03 00:00:00,13875,982513,2539,5,2022-01-08 00:00:00
3,CAMP100003,YouTube,Food,Brand Awareness,2022-01-04 00:00:00,41200,213400,100,20,2022-01-24 00:00:00
4,CAMP100004,Instagram,Food,Giveaway,2022-01-05 00:00:00,96998,42501,550,28,2022-02-02 00:00:00
5,CAMP100005,Twitter,Beauty,Brand Awareness,2022-01-06 00:00:00,76687,443289,2338,27,2022-02-02 00:00:00
6,CAMP100006,TikTok,Tech,Seasonal Sale,2022-01-07 00:00:00,8878,116825,1027,17,2022-01-24 00:00:00
7,CAMP100007,TikTok,Gaming,Brand Awareness,2022-01-08 00:00:00,74092,644094,121,13,2022-01-21 00:00:00
8,CAMP100008,Instagram,Travel,Event Promotion,2022-01-09 00:00:00,84035,66087,3405,14,2022-01-23 00:00:00
9,CAMP100009,YouTube,Food,Event Promotion,2022-01-10 00:00:00,74906,258767,1149,1,2022-01-11 00:00:00


In [None]:
# ---------------------------------------------------------------------------
# Business assumptions (all monetary values in NGN)
# ---------------------------------------------------------------------------
PRODUCT_SALE_PRICE = 40000.0       # gross revenue per unit sold

# Campaign spend is modelled as a CPM: cost per 1,000 people reached.
COST_PER_THOUSAND_REACH = 20000.0
COST_PER_REACH = COST_PER_THOUSAND_REACH / 1000.0

# Share of the sale price retained after cost of goods sold (COGS).
GROSS_PROFIT_MARGIN = 0.20
REVENUE_PER_SALE_NET = PRODUCT_SALE_PRICE * GROSS_PROFIT_MARGIN

# Revenue after COGS for each campaign: units sold x retained amount per unit.
df['Net_Revenue'] = df['product_sales'] * REVENUE_PER_SALE_NET

# Campaign cost: reach-based when the column is available, flat otherwise.
if 'estimated_reach' not in df.columns:
    print("Warning: 'estimated_reach' column not found. Using a flat cost.")
    FLAT_COST = 500000.0  # placeholder cost applied to every campaign
    df['Campaign_Cost'] = FLAT_COST
else:
    reach_based_cost = df['estimated_reach'] * COST_PER_REACH
    # Floor the cost at 1 so the ROI denominator can never be zero.
    df['Campaign_Cost'] = reach_based_cost.clip(lower=1)


# to calculate the Final ROI Label (Percentage)
def calculate_roi(row):
    """Return the ROI of one campaign as a percentage.

    ROI = (Net_Revenue - Campaign_Cost) / Campaign_Cost * 100

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'Net_Revenue' and 'Campaign_Cost'.

    Returns
    -------
    float
        ROI in percent, or NaN when the campaign cost is zero.
    """
    earned, spent = row['Net_Revenue'], row['Campaign_Cost']
    # A zero cost has no meaningful ROI; report it as missing.
    return np.nan if spent == 0 else ((earned - spent) / spent) * 100

# ROI (%) = (Net_Revenue - Campaign_Cost) / Campaign_Cost * 100, computed
# column-wise instead of with a row-by-row df.apply(..., axis=1): the values
# are the same but no per-row Python call is needed.
campaign_cost = df['Campaign_Cost']
roi_percent = ((df['Net_Revenue'] - campaign_cost) / campaign_cost) * 100
# Keep ROI missing (NaN) wherever the cost is zero, matching calculate_roi.
df['ROI'] = roi_percent.where(campaign_cost != 0)

print("\n--- DataFrame Head with Realistic ROI Label (NGN) ---")
print(f"Assumptions: Product Price = N{PRODUCT_SALE_PRICE:,.0f} | Campaign CPM = N{COST_PER_THOUSAND_REACH:,.0f}")
print(df[['product_sales', 'Net_Revenue', 'Campaign_Cost', 'ROI']].head(200))


--- DataFrame Head with Realistic ROI Label (NGN) ---
Assumptions: Product Price = N40,000 | Campaign CPM = N20,000
     product_sales  Net_Revenue  Campaign_Cost           ROI
0             2834   22672000.0        37840.0  59815.433404
1              165    1320000.0      8744560.0    -84.904901
2             2539   20312000.0     19650260.0      3.367589
3              100     800000.0      4268000.0    -81.255858
4              550    4400000.0       850020.0    417.634879
..             ...          ...            ...           ...
195            149    1192000.0      3089400.0    -61.416456
196           3036   24288000.0      7945680.0    205.675537
197           3374   26992000.0      9795480.0    175.555664
198            103     824000.0     15893600.0    -94.815523
199           1981   15848000.0     15674200.0      1.108829

[200 rows x 4 columns]


In [None]:
# Summarise the raw ROI distribution before any outlier handling.
roi_values = df['ROI']
print(roi_values.describe())

# Count campaigns for which no ROI could be computed.
print("Missing ROI values:", roi_values.isna().sum())

# Cap extreme values: ROI is limited to the range [-100, 500] percent.
df['ROI'] = roi_values.clip(lower=-100, upper=500)

count    150000.000000
mean        572.149574
std        3318.751033
min        -100.000000
25%           0.426260
50%          99.594457
75%         296.555270
max      145669.980507
Name: ROI, dtype: float64
Missing ROI values: 0


In [None]:
def roi_category(roi):
    """Map an ROI percentage to a coarse performance label.

    Parameters
    ----------
    roi : float
        ROI in percent. May be missing (NaN / None).

    Returns
    -------
    str or float
        "Loss"        for roi < 0
        "Low ROI"     for 0 <= roi < 50
        "Medium ROI"  for 50 <= roi < 150
        "High ROI"    for roi >= 150
        NaN when `roi` is missing, so an unknown ROI is never reported
        as a real class.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # ROI would fall through all branches and be labelled "High ROI".
    if pd.isna(roi):
        return np.nan

    if roi < 0:
        return "Loss"
    if roi < 50:
        return "Low ROI"
    if roi < 150:
        return "Medium ROI"
    return "High ROI"

# Attach the categorical label derived from each campaign's ROI.
df['ROI_Class'] = df['ROI'].map(roi_category)

# Show how many campaigns fall into each class.
class_counts = df['ROI_Class'].value_counts()
print(class_counts)


ROI_Class
High ROI      59611
Loss          37360
Medium ROI    34091
Low ROI       18938
Name: count, dtype: int64


In [None]:
# Columns excluded from the modelling frame, grouped by reason.
identifier_cols = ['campaign_id']
# product_sales feeds Net_Revenue, which together with Campaign_Cost
# produces ROI, and ROI_Class is derived from ROI.
target_source_cols = ['product_sales', 'Net_Revenue', 'Campaign_Cost', 'ROI']
date_cols = ['start_date', 'end_date']

cols_to_drop = identifier_cols + target_source_cols + date_cols

# errors='ignore' lets the drop succeed even if a listed column is absent.
df_model = df.drop(columns=cols_to_drop, errors='ignore')


In [None]:
from sklearn.preprocessing import LabelEncoder

# Text columns that must be converted to integer codes for the model.
categorical_cols = [
    'platform',
    'influencer_category',
    'campaign_type'
]

# One fitted encoder per column, kept so codes can be mapped back to labels.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the original string column in `df`, not on `df_model[col]`,
    # which this loop overwrites. Re-running the cell then gives the same
    # encoders instead of re-encoding already-encoded integers.
    # Assumes df_model still has exactly the rows of df (it was created with
    # df.drop(columns=...)) -- TODO confirm if rows are ever filtered.
    df_model[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [None]:
# Model inputs: the three encoded categorical columns plus numeric fields.
feature_cols = [
    'platform',
    'influencer_category',
    'campaign_type',
    'engagements',
    'estimated_reach',
    'campaign_duration_days',
]
X = df_model[feature_cols]

# Prediction target: the ROI class label.
y = df['ROI_Class']


In [None]:
# Quick sanity check of the modelling inputs: matrix size and class balance.
print("Feature shape:", X.shape)
print("Target distribution:", y.value_counts(), sep="\n")


Feature shape: (150000, 6)
Target distribution:
ROI_Class
High ROI      59611
Loss          37360
Medium ROI    34091
Low ROI       18938
Name: count, dtype: int64


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Reuse the encoders already fitted for the model features. This cell used to
# reset `label_encoders` to two columns, which threw away the fitted
# 'campaign_type' encoder even though campaign_type is one of the features
# in X.
# The *_encoded helper columns on `df` are not part of X; they are kept in
# case later cells read them -- TODO confirm and remove if unused.
for col, encoder in label_encoders.items():
    df[col + '_encoded'] = encoder.transform(df[col])

# Hold out 20% of the campaigns for evaluation; stratify so that every
# ROI class keeps the same proportion in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Random forest with a fixed seed so repeated runs give the same model.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the ROI class for the held-out campaigns.
y_pred = model.predict(X_test)

# zero_division=0 states explicitly that precision is reported as 0 for a
# class the model never predicts, rather than emitting a warning per metric.
print(classification_report(y_test, y_pred, zero_division=0))

# Label rows (actual) and columns (predicted) with the class names so the
# matrix can be read without knowing the class order.
class_labels = model.classes_
confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=class_labels,
    columns=class_labels,
).rename_axis(index="actual", columns="predicted")
print(confusion)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    High ROI       0.65      0.90      0.76     11922
        Loss       0.40      0.59      0.48      7472
     Low ROI       0.00      0.00      0.00      3788
  Medium ROI       0.33      0.11      0.17      6818

    accuracy                           0.53     30000
   macro avg       0.34      0.40      0.35     30000
weighted avg       0.43      0.53      0.46     30000

[[10773   659     0   490]
 [ 2305  4423     0   744]
 [ 1169  2248     0   371]
 [ 2314  3729     0   775]]
