In [1]:
import numpy as np
import pandas as pd

## Function Definitions

#### Supporting Functions

In [2]:
def make_mask(data, match, target_column):
    """
    Build a boolean filter marking the rows of 'data' whose 'target_column' equals 'match'.

    Parameters:
        data (pandas.DataFrame): DataFrame whose rows are being filtered
        match (str): Value to look for in 'target_column'; the special value "All"
            selects every row
        target_column (str): Name of the column compared against 'match'

    Returns:
        list of bool: One entry per row, True where the row is selected
    """

    column = data[target_column]

    # "All" switches filtering off: every row is kept
    if match == "All":
        return [True for _ in range(len(column))]

    # Element-wise comparison, converted from a Series to a plain list
    return list(column == match)


In [3]:
def split_pages(data, target_column = "user_journey"):
    """
    Break every user journey string of a DataFrame column into its individual pages.

    Parameters:
        data (pandas.DataFrame): DataFrame holding the user journey data
        target_column (str): Column with "-"-separated journey strings (default: 'user_journey')

    Returns:
        numpy.array: One entry per journey; each entry is itself a numpy array of page names
    """

    # A numpy array (rather than a list) so that callers can filter it with a boolean mask.
    # np.array() copies the column, so the DataFrame itself is left untouched.
    journeys = np.array(data[target_column])

    # Swap each journey string for its array of pages -> [["page1", "page2", ..."pageN"], ...]
    for position, journey_string in enumerate(list(journeys)):
        journeys[position] = np.array(journey_string.split("-"))

    return journeys


In [4]:
def get_pages_set(data, target_column = "user_journey"):
    """
    Collect every distinct page that occurs anywhere in the user journeys.

    Parameters:
        data (pandas.DataFrame): DataFrame containing the user journey data
        target_column (str): Column with "-"-separated journey strings (default: 'user_journey')

    Returns:
        set: All unique page names found in the journey strings
    """

    # Split each journey on "-" and flatten the pages straight into a set
    return {
        page
        for journey in list(data[target_column])
        for page in journey.split("-")
    }


#### Main Functions

In [5]:
def page_count(data, target_column = 'user_journey', plan = "All", mask = None, sort = True):
    """
    Tally how many times each page is visited across the selected user journeys.

    Parameters:
        data (pandas.DataFrame): DataFrame with user journeys
        target_column (str): Column with user journey strings (default: 'user_journey')
        plan (str): Subscription plan to filter by; "All" keeps every journey (default: "All")
        mask (list of bool): Boolean mask selecting the journeys to use
            (default: None, in which case the 'plan' filter is applied)
        sort (bool): Order the result by count, largest first (default: True)

    Returns:
        dict: {page: count}, plus the key "Total pages" holding the sum of all page visits
    """

    # Without an explicit mask, select rows by subscription plan
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")

    # Page arrays of the selected journeys only
    selected_journeys = split_pages(data, target_column)[mask]

    # The grand total lives in the same dictionary as the per-page tallies
    counts = {"Total pages": 0}
    for pages in selected_journeys:
        for page in pages:
            counts[page] = counts.get(page, 0) + 1
            counts["Total pages"] += 1

    if sort:
        # Stable sort on the count: ties keep the order in which pages were first seen
        ranked = sorted(counts.items(), key = lambda item: item[1], reverse = True)
        return dict(ranked)

    return counts
    

In [6]:
def page_presence(data, target_column = 'user_journey', plan = "All", mask = None, sort = True):
    """
    Counts, for every page, the number of journeys in which it appears at least once.

    Parameters:
        data (pandas.DataFrame): DataFrame with user journey data
        target_column (str): Column with user journey strings (default: 'user_journey')
        plan (str): Subscription plan to filter by; "All" keeps every journey (default: "All")
        mask (list of bool): Boolean mask selecting the journeys to use
            (default: None, in which case the 'plan' filter is applied)
        sort (bool): Order the result by count, largest first (default: True)

    Returns:
        dict: {page: count_journeys}, plus the key "Total journeys" holding the number
              of journeys that were analysed
    """

    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")

    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]

    results = {"Total journeys": len(user_journeys)}

    for journey in user_journeys:
        # dict.fromkeys de-duplicates the pages while keeping first-visit order.
        # A set would also de-duplicate, but its iteration order for strings changes
        # between interpreter sessions, which made the key order (and the order of
        # pages with equal counts after sorting) differ from one kernel restart to the next.
        for page in dict.fromkeys(journey):
            results[page] = results.get(page, 0) + 1

    if not sort:
        return results

    # Stable sort by count, largest first; ties keep their insertion order
    sorted_keys = sorted(results, key = results.get, reverse = True)
    return {key: results[key] for key in sorted_keys}


In [7]:
def page_destinations(data, target_column = 'user_journey', plan = "All", mask = None, sort = True):
    """
    Finds, for each page, the pages visited directly after it and how often.

    Parameters:
        data (pandas.DataFrame): DataFrame with user journey data
        target_column (str): Column with user journey strings (default: 'user_journey')
        plan (str): Subscription plan to filter by; "All" keeps every journey (default: "All")
        mask (list of bool): Boolean mask selecting the journeys to use
            (default: None, in which case the 'plan' filter is applied)
        sort (bool): Order each page's follow-ups by count, largest first (default: True)

    Returns:
        dict: {starting_page: {next_page: count}}. Every page found anywhere in 'data'
              is present as a starting page (listed alphabetically), even when the
              selected journeys give it no follow-ups; its sub-dictionary is then empty.
    """

    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")

    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]

    # One (initially empty) sub-dictionary per page in the whole data set.
    # The page set is sorted because set iteration order for strings changes between
    # interpreter sessions, which would reshuffle the output on every kernel restart.
    results = {page: {} for page in sorted(get_pages_set(data, target_column))}

    # Walk over each pair of consecutive pages and count the transition
    for journey in user_journeys:
        for page_source, page_next in zip(journey[:-1], journey[1:]):
            followers = results[page_source]
            followers[page_next] = followers.get(page_next, 0) + 1

    if not sort:
        return results

    # Sort every sub-dictionary by transition count, largest first (ties keep first-seen order)
    sorted_results = {}
    for page, followers in results.items():
        ranked = sorted(followers.items(), key = lambda item: item[1], reverse = True)
        sorted_results[page] = dict(ranked)

    return sorted_results


In [8]:
def page_sequences(data, number_of_pages = 3, show_results = 10, target_column = 'user_journey',
                      plan = "All", mask = None, sort = True):
    """
    Count how many journeys contain each run of 'number_of_pages' consecutive pages.

    Parameters:
        data (pandas.DataFrame): DataFrame with user journey data
        number_of_pages (int): Length of the page sequences to look for (default: 3)
        show_results (int): How many of the top sequences to keep when sorting (default: 10)
        target_column (str): Column with user journey strings (default: 'user_journey')
        plan (str): Subscription plan to filter by; "All" keeps every journey (default: "All")
        mask (list of bool): Boolean mask selecting the journeys to use
            (default: None, in which case the 'plan' filter is applied)
        sort (bool): Order the result by count, largest first (default: True)

    Returns:
        dict: {(page1, page2, ... pageN): count}, where a sequence is counted at most
              once per journey. Only the top 'show_results' sequences when sorted,
              otherwise every sequence found.
    """

    # Without an explicit mask, select rows by subscription plan
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")

    # Page arrays of the selected journeys only
    selected_journeys = split_pages(data, target_column)[mask]

    sequence_counts = {}
    for pages in selected_journeys:
        # Every window of 'number_of_pages' consecutive pages in this journey
        last_start = len(pages) - number_of_pages
        windows = [tuple(pages[start : start + number_of_pages]) for start in range(last_start + 1)]

        # dict.fromkeys drops repeats in first-seen order, so a sequence that occurs
        # several times within one journey is still counted only once for it
        for window in dict.fromkeys(windows):
            sequence_counts[window] = sequence_counts.get(window, 0) + 1

    if sort:
        # Stable sort by count, largest first, then keep the leading 'show_results' entries
        ranked = sorted(sequence_counts.items(), key = lambda item: item[1], reverse = True)
        return dict(ranked[:show_results])

    return sequence_counts


In [9]:
def avg_journey_length(data, target_column = 'user_journey', plan = "All", mask = None):
    """
    Calculates the average number of pages in the selected user journeys.

    Parameters:
        data (pandas.DataFrame): DataFrame with user journey data
        target_column (str): Column with user journey strings (default: 'user_journey')
        plan (str): Subscription plan to filter by; "All" keeps every journey (default: "All")
        mask (list of bool): Boolean mask selecting the journeys to use
            (default: None, in which case the 'plan' filter is applied)

    Returns:
        float: Average number of pages per selected journey; NaN when no journey is selected
    """

    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")

    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]

    # Nothing selected (e.g. a plan that does not occur in the data): the mean is
    # undefined, so report NaN rather than failing with a division by zero
    if len(user_journeys) == 0:
        return float("nan")

    # Average = total number of pages divided by the number of journeys
    total_pages = sum(len(journey) for journey in user_journeys)
    return total_pages / len(user_journeys)



## Main

In [10]:
# Load the user journey data.
# NOTE(review): this is an absolute, machine-specific path - point DATA_PATH at your
# own copy of the CSV before running the notebook.
DATA_PATH = '/Users/premann/Downloads/user_journey.csv'
data = pd.read_csv(DATA_PATH)

In [11]:
# Preview the first rows to check which columns the data contains
data.head()

Unnamed: 0,user_id,subscription_type,user_journey
0,1516,Annual,Homepage-Log in-Other-Sign up-Log in-Homepage-...
1,3395,Annual,Other-Pricing-Sign up-Log in-Homepage-Pricing-...
2,10107,Annual,Homepage-Career tracks-Homepage-Career tracks-...
3,11145,Monthly,Homepage-Log in-Homepage-Log in-Homepage-Log i...
4,12400,Monthly,Homepage-Career tracks-Sign up-Log in-Other-Ca...


### Most popular pages

In [12]:
# Total visits per page across all subscription plans, most visited first
page_count(data, plan = "All", sort = True)

{'Total pages': 14098,
 'Homepage': 2679,
 'Log in': 2234,
 'Checkout': 1351,
 'Sign up': 1247,
 'Other': 1189,
 'Courses': 1087,
 'Career tracks': 1070,
 'Pricing': 1053,
 'Coupon': 720,
 'Resources center': 546,
 'Career track certificate': 468,
 'Course certificate': 212,
 'Upcoming courses': 110,
 'Success stories': 49,
 'Instructors': 43,
 'About us': 20,
 'Blog': 20}

### Number of journeys that include each page

In [13]:
# Number of journeys in which each page appears (all plans), largest first
page_presence(data, plan = "All", sort = True)

{'Total journeys': 1350,
 'Homepage': 843,
 'Checkout': 821,
 'Log in': 756,
 'Sign up': 738,
 'Other': 623,
 'Coupon': 606,
 'Pricing': 476,
 'Courses': 453,
 'Career tracks': 380,
 'Career track certificate': 228,
 'Resources center': 184,
 'Course certificate': 151,
 'Upcoming courses': 83,
 'Success stories': 38,
 'Instructors': 25,
 'About us': 17,
 'Blog': 13}

### Most frequent follow-up pages (shown here for 'Homepage')

In [14]:
# Pages visited directly after 'Homepage' (all plans), most frequent first
page_destinations(data, plan = "All", sort = True)['Homepage']

{'Log in': 953,
 'Pricing': 449,
 'Career tracks': 357,
 'Sign up': 341,
 'Courses': 246,
 'Career track certificate': 117,
 'Course certificate': 66,
 'Resources center': 51,
 'Other': 33,
 'Instructors': 25,
 'Coupon': 12,
 'Upcoming courses': 10,
 'About us': 5,
 'Checkout': 5,
 'Success stories': 4,
 'Blog': 4}

### Most popular sequences of N consecutive pages

In [15]:
# Ten most common sequences of three consecutive pages (all plans);
# a sequence is counted at most once per journey
page_sequences(data, number_of_pages = 3, show_results = 10, plan = "All", sort = True)

{('Homepage', 'Log in', 'Checkout'): 239,
 ('Log in', 'Homepage', 'Log in'): 220,
 ('Homepage', 'Log in', 'Homepage'): 180,
 ('Sign up', 'Log in', 'Checkout'): 123,
 ('Sign up', 'Log in', 'Homepage'): 104,
 ('Homepage', 'Pricing', 'Checkout'): 100,
 ('Homepage', 'Sign up', 'Log in'): 97,
 ('Sign up', 'Homepage', 'Log in'): 94,
 ('Homepage', 'Sign up', 'Homepage'): 90,
 ('Log in', 'Other', 'Log in'): 83}

### Average length of a journey (pages)

In [16]:
# Mean number of pages per journey across all plans
avg_journey_length(data, plan = "All")

10.442962962962962