In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Define marketing channels
marketing_channels = ['google_search', 'youtube', 'google_display_apps', 'meta', 'tiktok','youtube_retargeting']
remarketing_channels = ['youtube_retargeting', 'GDN_retargeting', 'gmail_retargeting', 'tiktok_retargeting', 'meta_retargeting']

# Function to generate user paths
# Function to generate user paths
def generate_user_path():
    # Probability distribution for starting with 1, 2, or 3 marketing channels
    start_distribution = [60, 20, 20]

    # Weights for marketing_channels
    marketing_weights = [38, 42, 11, 5, 4]

    # Choose the starting marketing channels
    start_length = random.choices([1, 2, 3], start_distribution, k=1)[0]
    start_channels = random.choices(marketing_channels, weights=marketing_weights, k=start_length)

    # Weights for remarketing_channels
    remarketing_weights = [45, 35, 10, 5, 5]

    # Choose 0, 1 or 2 remarketing channels
    end_length = random.randint(0, 2)
    end_channels = random.choices(remarketing_channels, weights=remarketing_weights, k=end_length)

    return start_channels + end_channels


# Function to generate data
def generate_data(num_users=10000):
    data = []

    for i in range(num_users):
        user_id = i + 1
        user_path = generate_user_path()
        click_date = datetime.utcnow() - timedelta(days=random.randint(1, 90))
        # Calculate the probability of installing based on the number of touchpoints
        touchpoints = len(user_path)
        installed_prob = [max(100 - 15 * touchpoints, 30), min(15 * touchpoints, 70)]
        installed = random.choices([0, 1], installed_prob, k=1)[0]

        data.append({
            'user_id': user_id,
            'path': ' > '.join(user_path),
            'first_click_date': click_date,
            'installed': installed
        })

    return pd.DataFrame(data)
df=generate_data()

In [8]:
import numpy as np
from collections import defaultdict

# Function to calculate first touch, last touch, and linear touch attribution
def first_last_linear_attributions(df, journey_length):
    df_journey_length = df[df['path'].str.count('>') + 1 == journey_length]
    first_touch_counts = defaultdict(int)
    last_touch_counts = defaultdict(int)
    linear_touch_counts = defaultdict(int)

    for _, row in df_journey_length.iterrows():
        channels = row['path'].split(' > ')
        first_touch_counts[channels[0]] += 1
        last_touch_counts[channels[-1]] += 1

        for channel in channels:
            linear_touch_counts[channel] += 1

    total_journeys = df_journey_length.shape[0]
    all_channels = marketing_channels + remarketing_channels
    return {
        channel: {
            'first_touch': 100 * first_touch_counts[channel] / total_journeys if total_journeys > 0 else 0,
            'last_touch': 100 * last_touch_counts[channel] / total_journeys if total_journeys > 0 else 0,
            'linear_touch': 100 * linear_touch_counts[channel] / (journey_length * total_journeys) if total_journeys > 0 else 0
        }
        for channel in all_channels
    }


# Function to calculate Markov model fractional credit
def markov_model_attributions(df, journey_length):
    df_journey_length = df[df['path'].str.count('>') + 1 == journey_length]
    total_journeys = df_journey_length.shape[0]

    if total_journeys == 0:
        return {channel: {'markov': 0} for channel in marketing_channels}

    all_channels = marketing_channels + remarketing_channels
    transition_matrix = np.zeros((len(all_channels), len(all_channels)))
    channel_indices = {channel: i for i, channel in enumerate(all_channels)}

    for _, row in df_journey_length.iterrows():
        channels = row['path'].split(' > ')

        for i in range(len(channels) - 1):
            transition_matrix[channel_indices[channels[i]], channel_indices[channels[i + 1]]] += 1

    row_sums = np.sum(transition_matrix, axis=1, keepdims=True)
    transition_matrix = np.divide(transition_matrix, row_sums, out=np.zeros_like(transition_matrix), where=row_sums!=0)

    markov_counts = defaultdict(int)
    num_iterations = 10000
    for _ in range(num_iterations):
        journey = [np.random.choice(all_channels)]
        for _ in range(journey_length - 1):
            probs = transition_matrix[channel_indices[journey[-1]], :]
            epsilon = 1e-9  # Small epsilon value to avoid division by zero
            probs /= (probs.sum() + epsilon)  # Normalize the probabilities

            journey.append(np.random.choice(all_channels, p=probs))

        for channel in journey:
            markov_counts[channel] += 1

    return {
        channel: {
            'markov': 100 * markov_counts[channel] / (journey_length * num_iterations)
        }
        for channel in all_channels
    }

# Generate tables based on journey lengths
for journey_length in range(1, 6):
    first_last_linear = first_last_linear_attributions(df, journey_length)
    markov = markov_model_attributions(df, journey_length)

    for channel in marketing_channels:
        first_last_linear[channel].update(markov[channel])

    table = pd.DataFrame.from_dict(first_last_linear, orient='index')
    table.index.name = f'Journey Length {journey_length}'
    table.columns.name = 'Attribution Model'
    table.to_csv(f'journey_length_{journey_length}_attribution.csv')



ValueError: probabilities do not sum to 1