In [1]:
# --- 1. IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Initialize the Faker library to generate fake data
fake = Faker()

print("Libraries imported successfully!")

# --- 2. SIMULATE THE 'influencers.csv' FILE ---
num_influencers = 50
influencer_data = []

# A list of our brands' core categories
categories = ['Fitness', 'Nutrition', 'Wellness', 'Lifestyle']
platforms = ['Instagram', 'YouTube', 'Twitter']

for i in range(num_influencers):
    # Choose random attributes for each influencer
    category = random.choice(categories)
    platform = random.choice(platforms)
    
    # Use a log-normal distribution for followers to get a realistic spread
    # (many small influencers, a few very large ones)
    followers = int(np.random.lognormal(mean=10, sigma=1.5) * 100)
    
    # This is our custom 'persona' logic to make insights richer later
    if followers < 50000:
        size = 'Micro'
    elif 50000 <= followers < 500000:
        size = 'Mid-Tier'
    else:
        size = 'Macro'
    persona = f"{size}-{category}"
    
    influencer_data.append({
        'influencer_id': f'HKI-{1001+i}',
        'name': fake.name(),
        'category': category,
        'gender': random.choice(['Male', 'Female']),
        'follower_count': followers,
        'platform': platform,
        'persona': persona  # Our unique, added column
    })

# Convert the list of data into a pandas DataFrame (which is like a table)
influencers_df = pd.DataFrame(influencer_data)

# Display the first 5 rows to check our work
influencers_df.head()

Libraries imported successfully!


Unnamed: 0,influencer_id,name,category,gender,follower_count,platform,persona
0,HKI-1001,Dr. Shawn Davis,Lifestyle,Male,2010183,Instagram,Macro-Lifestyle
1,HKI-1002,Joanna Nguyen,Wellness,Female,4577131,YouTube,Macro-Wellness
2,HKI-1003,Daniel Lewis,Lifestyle,Female,1142438,YouTube,Macro-Lifestyle
3,HKI-1004,Geoffrey Wilson,Fitness,Female,8250015,YouTube,Macro-Fitness
4,HKI-1005,Connie Nelson,Lifestyle,Male,6012265,Instagram,Macro-Lifestyle


In [2]:
# --- 3. SIMULATE THE 'posts.csv' FILE ---

post_data = []
post_id_counter = 1

# We'll base our simulation on a 90-day campaign period
campaign_start_date = datetime.now() - timedelta(days=90)

# Loop through each influencer in our DataFrame
for index, influencer in influencers_df.iterrows():
    # Each influencer will make between 2 and 8 posts
    num_posts = random.randint(2, 8)
    
    for _ in range(num_posts):
        # Post date is a random day within our 90-day campaign
        post_date = campaign_start_date + timedelta(days=random.randint(0, 89))
        
        # Engagement metrics are based on follower count, with randomness
        reach = int(influencer['follower_count'] * random.uniform(0.2, 0.6)) # Reach is 20-60% of followers
        likes = int(reach * random.uniform(0.02, 0.10)) # Likes are 2-10% of reach
        comments = int(likes * random.uniform(0.01, 0.05)) # Comments are 1-5% of likes
        
        post_data.append({
            'post_id': f'HKP-{post_id_counter}',
            'influencer_id': influencer['influencer_id'],
            'platform': influencer['platform'],
            'date': post_date.strftime('%Y-%m-%d'),
            'url': f'http://{influencer["platform"]}.com/post/{post_id_counter}',
            'caption': fake.sentence(nb_words=15),
            'reach': reach,
            'likes': likes,
            'comments': comments
        })
        post_id_counter += 1

posts_df = pd.DataFrame(post_data)

# Display the first 5 rows to check our work
posts_df.head()

Unnamed: 0,post_id,influencer_id,platform,date,url,caption,reach,likes,comments
0,HKP-1,HKI-1001,Instagram,2025-05-26,http://Instagram.com/post/1,Rather generation scientist while create he pr...,1079533,93768,2213
1,HKP-2,HKI-1001,Instagram,2025-06-30,http://Instagram.com/post/2,Similar first war be focus police southern int...,927046,80969,3625
2,HKP-3,HKI-1001,Instagram,2025-05-20,http://Instagram.com/post/3,Other food investment maybe attention parent a...,1045068,103812,5167
3,HKP-4,HKI-1002,YouTube,2025-04-23,http://YouTube.com/post/4,Value note million even spring wife speak real...,2086599,184634,8075
4,HKP-5,HKI-1002,YouTube,2025-07-07,http://YouTube.com/post/5,Lay performance force effort trip scene instea...,1686695,87170,4042


In [3]:
# --- 4. SIMULATE THE 'tracking_data.csv' FILE ---

tracking_data = []
num_transactions = 5000  # Let's simulate 5000 total transactions

# HealthKart's brands and some example products
brands = {
    'MuscleBlaze': ['Biozyme Whey', 'BCAA Pro', 'Creatine Monohydrate'],
    'HKVitals': ['Skin Radiance Collagen', 'Multivitamin', 'Fish Oil'],
    'Gritzo': ['SuperMilk for Kids', 'Active Teen Whey']
}
all_products = [prod for sublist in brands.values() for prod in sublist]

# Get a list of influencer IDs who are part of the campaign
campaign_influencer_ids = influencers_df['influencer_id'].tolist()

for i in range(num_transactions):
    # This is the key for incrementality: not all sales come from influencers!
    # We'll say 30% are from influencers, 60% organic, 10% direct.
    source = np.random.choice(
        ['influencer_campaign', 'organic_search', 'direct'],
        p=[0.3, 0.6, 0.1] 
    )
    
    transaction_date = campaign_start_date + timedelta(days=random.randint(0, 89))
    product = random.choice(all_products)
    revenue = round(random.uniform(500, 4000), 2)
    
    # If the sale came from an influencer, we need to attribute it to one
    influencer_id = None
    if source == 'influencer_campaign':
        influencer_id = random.choice(campaign_influencer_ids)
        
    tracking_data.append({
        'transaction_id': f'HKT-{10001+i}',
        'source': source,
        'campaign': 'Q4-2023-All-Brands',
        'influencer_id': influencer_id, # This will be None (or NaN) if not from an influencer
        'user_id': f'HKU-{random.randint(10000, 50000)}',
        'product': product,
        'date': transaction_date.strftime('%Y-%m-%d'),
        'orders': 1,
        'revenue': revenue
    })

tracking_df = pd.DataFrame(tracking_data)

# The 'display()' function is great for showing tables nicely in notebooks

# Display the first 5 rows to check our work
print("Tracking Data Head:")
display(tracking_df.head())

print("\nBreakdown of Revenue Sources:")
display(tracking_df.groupby('source')['revenue'].sum())

Tracking Data Head:


Unnamed: 0,transaction_id,source,campaign,influencer_id,user_id,product,date,orders,revenue
0,HKT-10001,organic_search,Q4-2023-All-Brands,,HKU-12362,BCAA Pro,2025-07-16,1,2750.44
1,HKT-10002,organic_search,Q4-2023-All-Brands,,HKU-44225,BCAA Pro,2025-07-08,1,3174.12
2,HKT-10003,direct,Q4-2023-All-Brands,,HKU-21224,Creatine Monohydrate,2025-05-04,1,1068.55
3,HKT-10004,influencer_campaign,Q4-2023-All-Brands,HKI-1023,HKU-33710,Fish Oil,2025-05-17,1,1536.65
4,HKT-10005,influencer_campaign,Q4-2023-All-Brands,HKI-1033,HKU-13196,Active Teen Whey,2025-04-30,1,2232.42



Breakdown of Revenue Sources:


source
direct                 1159146.68
influencer_campaign    3466551.98
organic_search         6519098.45
Name: revenue, dtype: float64

In [4]:
# --- 5. SIMULATE THE 'payouts.csv' FILE ---

payout_data = []

# Filter tracking data to only include sales from influencers
# .dropna() removes rows with missing values in the specified column
influencer_sales = tracking_df.dropna(subset=['influencer_id'])

for index, influencer in influencers_df.iterrows():
    # Randomly assign a payment model: 50% get paid per post, 50% per order
    basis = random.choice(['Per Post', 'Per Order'])
    
    total_payout = 0
    orders = 0
    rate = 0
    
    if basis == 'Per Post':
        # Payout is based on follower count tier, plus some randomness
        rate = 500 + (influencer['follower_count'] // 1000) * random.uniform(0.5, 1.5)
        num_posts = posts_df[posts_df['influencer_id'] == influencer['influencer_id']].shape[0]
        total_payout = round(rate * num_posts, 2)
    
    else: # Basis is 'Per Order'
        # Commission rate between 5% and 15%
        rate = random.uniform(0.05, 0.15) 
        
        # Get all sales attributed to this influencer
        sales_for_influencer = influencer_sales[influencer_sales['influencer_id'] == influencer['influencer_id']]
        orders = sales_for_influencer.shape[0]
        total_revenue = sales_for_influencer['revenue'].sum()
        total_payout = round(total_revenue * rate, 2)
        
    payout_data.append({
        'influencer_id': influencer['influencer_id'],
        'basis': basis,
        'rate': round(rate, 4), # Store the rate (either flat fee or percentage)
        'orders': orders,
        'total_payout': total_payout
    })

payouts_df = pd.DataFrame(payout_data)

print("Payout Data Head:")
display(payouts_df.head())


# --- 6. SAVE ALL DATAFRAMES TO CSV FILES ---

# The '..' means 'go up one directory' from /notebooks to the main project folder
output_path = '../data/'

influencers_df.to_csv(output_path + 'influencers.csv', index=False)
posts_df.to_csv(output_path + 'posts.csv', index=False)
tracking_df.to_csv(output_path + 'tracking_data.csv', index=False)
payouts_df.to_csv(output_path + 'payouts.csv', index=False)

print(f"\nSUCCESS: All 4 CSV files have been saved to the '{output_path}' directory.")

Payout Data Head:


Unnamed: 0,influencer_id,basis,rate,orders,total_payout
0,HKI-1001,Per Post,2784.3816,0,8353.14
1,HKI-1002,Per Order,0.1034,39,9388.73
2,HKI-1003,Per Post,1474.8852,0,5899.54
3,HKI-1004,Per Order,0.0701,27,4339.64
4,HKI-1005,Per Order,0.059,33,4517.33



SUCCESS: All 4 CSV files have been saved to the '../data/' directory.
