In [5]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import pymfinder as py
import json
import math

# Load the transaction data
df = pd.read_csv('./Debit_Transactions4.csv')

# Extract relevant columns
df = df[['customerId', 'externalParty.merchantCategoryCode', 'createdAt', 'transactionFootPrint.carbonEmissionInGrams']]

# Convert 'createdAt' to datetime and sort by customer and timestamp
df['createdAt'] = pd.to_datetime(df['createdAt'])
df = df.sort_values(by=['customerId', 'createdAt'])

# Create transaction sequences by customer with carbon emissions
transaction_sequences = df.groupby('customerId').apply(
    lambda x: list(zip(x['externalParty.merchantCategoryCode'], x['transactionFootPrint.carbonEmissionInGrams']))
).reset_index(name='sequences')

# Initialize a dictionary to store results for each customer
all_customer_results = {}

# Mapping of motifs to their initial nodes
initial_nodes = {
    (6, 0, 1),    # S1
    (12, 0, 1),   # S3
    (14, 0, 1),   # S7
    (36, 0, 2),   # S9
    (38, 0, 1),   # S11
    (46, 1, 1),   # S14 or S15 (both are initial and have the same structure)
    (74, 0, 1),   # S16
    (78, 0, 2),   # S19
    (98, 0, 2),   # S21
    (102, 0, 1),  # S23
    (108, 1, 1),  # S25 or S26 (both are initial and have the same structure)
    (238, 1, 1)   # S29 or S30 (both are initial and have the same structure)
}

# Convert initial_nodes to a set of motif IDs
initial_node_ids = {id for id, *_ in initial_nodes}

# Function to process a single customer
def process_customer(customer_id, sequences):
    G = nx.DiGraph()
    edge_freq = defaultdict(int)
    edge_emissions = defaultdict(float)
    node_emissions = defaultdict(list)  # Store all emissions for calculating average

    # Validate sequences format
    if not isinstance(sequences, list):
        raise TypeError(f"Sequences for customer {customer_id} are not in the expected format: {type(sequences)}")

    # Add edges, their weights, and carbon emissions
    for seq in sequences:
        if not isinstance(seq, tuple) or len(seq) != 2:
            raise ValueError(f"Invalid sequence format for customer {customer_id}: {seq}")
        
        node_id = seq[0]
        emission = seq[1]
        if pd.notna(emission):  # Check for NaN emissions
            node_emissions[node_id].append(emission)  # Collect emissions for average calculation
            # Add edges to the graph
            for i in range(len(sequences) - 1):
                source = sequences[i][0]
                target = sequences[i + 1][0]
                edge_freq[(source, target)] += 1
                edge_emissions[(source, target)] += sequences[i + 1][1]

    # Add edges to the graph with weights and emissions
    for (source, target), weight in edge_freq.items():
        emission = edge_emissions.get((source, target), 0)
        G.add_edge(source, target, weight=weight, emissions=emission)

    # Save the network to an edge list file (without emissions for pymfinder)
    network_file = f'network_edges_{customer_id}.txt'
    with open(network_file, 'w') as file:
        for edge in G.edges(data=True):
            source, target, data = edge
            weight = data['weight']
            file.write(f"{source} {target} {weight}\n")

    # Run pymfinder to detect motifs
    results = py.pymfinder(
        network=network_file,
        links=True,
        motifsize=3,
        stoufferIDs=False,
        allmotifs=False,
        nrandomizations=0,
        randomize=False,
        usemetropolis=False,
        networktype="unipartite"
    )

    def node_link_to_dict(node_link):
        return {
            "id": node_link.id,
            "motifs": node_link.motifs,
            "roles": node_link.roles,
            "weight": node_link.weight,
            "weighted_motifs": node_link.weighted_motifs,
            "weighted_roles": node_link.weighted_roles
        }

    # Extract the data to save
    results_dict = {
        "motifs": {motif_id: motif.real for motif_id, motif in results.motifs.items()},
        "nodes": {
            node_id: {
                "id": node.id,
                "motifs": node.motifs,
                "roles": node.roles,
                "weighted_motifs": node.weighted_motifs,
                "weighted_roles": node.weighted_roles
            } for node_id, node in results.nodes.items()
        },
        "links": [node_link_to_dict(link) for _, link in results.links.items()]  # Convert NodeLink to dict
    }

    # Function to convert tuple keys to strings
    def convert_tuple_keys(d):
        if isinstance(d, dict):
            new_dict = {}
            for k, v in d.items():
                if isinstance(k, tuple):
                    k = str(k)
                new_dict[k] = convert_tuple_keys(v)
            return new_dict
        elif isinstance(d, list):
            return [convert_tuple_keys(i) for i in d]
        else:
            return d

    # Convert any tuple keys to strings
    results_dict = convert_tuple_keys(results_dict)

    # Save the results to a text file
    results_file = f'results_{customer_id}.txt'
    with open(results_file, 'w') as f:
        json.dump(results_dict, f, indent=4)

    # Calculate total emission per motif
    return calculate_total_emission_per_motif(G, results_dict['motifs'], results_dict['nodes'], results_dict['links'])

# Function to calculate total emissions per motif for a customer
def calculate_total_emission_per_motif(G, motifs, nodes, links):
    motif_emissions = defaultdict(float)
    added_nodes = defaultdict(set)
    motif_link_counts = defaultdict(int)
 
    # Create a dictionary to map links to their motifs
    link_motifs = {tuple(link['id']): link['motifs'] for link in links}

    # Function to convert role string to a tuple
    def role_str_to_tuple(role_str):
        return tuple(map(int, role_str.strip('()').split(', ')))

    # Iterate through links to calculate emissions for each motif
    for link in links:
        edge_id = tuple(link['id'])
        associated_motifs = link['motifs']
        emission = G[edge_id[0]][edge_id[1]].get('emissions', 0)
        weight = G[edge_id[0]][edge_id[1]].get('weight', 1)

        for motif_id in associated_motifs:
            count = link['motifs'][motif_id]
            motif_link_counts[motif_id] += count
            motif_id_int = int(motif_id)  # Ensure motif_id is an integer
            # Determine the multiplier based on the motif ID
            multiplier = 2 if motif_id_int in {36, 78} else 1

            if motif_id_int in initial_node_ids:
                # Convert node IDs to string format to access the nodes dictionary
                node_id_str_1 = edge_id[0]
                node_id_str_2 = edge_id[1]

                # Check if node_id_str_1 is in nodes by comparing `id`
                node_in_nodes = next((node for node in nodes.values() if node['id'] == int(node_id_str_1)), None)
                
                if node_in_nodes:
                    # Get roles with count > 0
                    node_roles = {role: count for role, count in node_in_nodes['roles'].items() if count > 0}
                    # Convert roles to tuples for comparison
                    node_roles_tuples = {role_str_to_tuple(role) for role in node_roles}
                    # Check if any role matches the initial node role for the motif
                    for role_tuple in node_roles_tuples:
                        if role_tuple in initial_nodes:
                            if not (edge_id[0] in added_nodes[motif_id_int]):
                                # Calculate initial emission
                                if (role_tuple[0] == motif_id_int):
                                    initial_emission = sum(node_emissions.get(edge_id[0], [])) / len(node_emissions[edge_id[0]])
                                    initial_emission *= node_roles.get(str(role_tuple), 0)  # Multiply by count
                                    initial_emission *= multiplier
                                    motif_emissions[motif_id_int] += initial_emission
                                    added_nodes[motif_id_int].add(edge_id[0])

            # Accumulate emission for the motif, but only if it's a valid number
            if not math.isnan(emission):
                motif_emissions[motif_id_int] += emission * (link['motifs'][motif_id] / weight)
  
    # Return results
    return motif_emissions

# Process each customer and store results
for customer_id, sequences in transaction_sequences.values:
    print(f"Processing customer {customer_id}")
    try:
        customer_results = process_customer(customer_id, sequences)
        all_customer_results[customer_id] = customer_results
    except Exception as e:
        print(f"Error processing customer {customer_id}: {e}")

# Save all results to a file
with open('all_customer_results.json', 'w') as f:
    json.dump(all_customer_results, f, indent=4)

print("All customer results have been processed and saved.")


Processing customer 3253926e-9c0d-4fd9-8248-eb7c2f457b99
Error processing customer 3253926e-9c0d-4fd9-8248-eb7c2f457b99: name 'node_emissions' is not defined
All customer results have been processed and saved.


In [1]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import pymfinder as py
import json
import math

# Load the transaction data
df = pd.read_csv('./Debit_Transactions4.csv')

# Extract relevant columns
df = df[['customerId', 'externalParty.merchantCategoryCode', 'createdAt', 'transactionFootPrint.carbonEmissionInGrams']]

# Convert 'createdAt' to datetime and sort by customer and timestamp
df['createdAt'] = pd.to_datetime(df['createdAt'])
df = df.sort_values(by=['customerId', 'createdAt'])

# Create transaction sequences by customer with carbon emissions
transaction_sequences = df.groupby('customerId').apply(
    lambda x: list(zip(x['externalParty.merchantCategoryCode'], x['transactionFootPrint.carbonEmissionInGrams']))
).reset_index(name='sequences')

# Initialize a dictionary to store results for each customer
all_customer_results = {}

# Mapping of motifs to their initial nodes
initial_nodes = {
    (6, 0, 1),    # S1
    (12, 0, 1),   # S3
    (14, 0, 1),   # S7
    (36, 0, 2),   # S9
    (38, 0, 1),   # S11
    (46, 1, 1),   # S14 or S15 (both are initial and have the same structure)
    (74, 0, 1),   # S16
    (78, 0, 2),   # S19
    (98, 0, 2),   # S21
    (102, 0, 1),  # S23
    (108, 1, 1),  # S25 or S26 (both are initial and have the same structure)
    (238, 1, 1)   # S29 or S30 (both are initial and have the same structure)
}

# Convert initial_nodes to a set of motif IDs
initial_node_ids = {id for id, *_ in initial_nodes}
# Function to process a single customer
def process_customer(customer_id, sequences):
    G = nx.DiGraph()
    edge_freq = defaultdict(int)
    edge_emissions = defaultdict(float)
    node_emissions = defaultdict(list)  # Store all emissions for calculating average

    # Validate sequences format
    if not isinstance(sequences, list):
        raise TypeError(f"Sequences for customer {customer_id} are not in the expected format: {type(sequences)}")

    # Add edges, their weights, and carbon emissions
    for seq in sequences:
        if not isinstance(seq, tuple) or len(seq) != 2:
            raise ValueError(f"Invalid sequence format for customer {customer_id}: {seq}")
        
        node_id = seq[0]
        emission = seq[1]
        if pd.notna(emission):  # Check for NaN emissions
            node_emissions[node_id].append(emission)  # Collect emissions for average calculation
            # Add edges to the graph
            for i in range(len(sequences) - 1):
                source = sequences[i][0]
                target = sequences[i + 1][0]
                edge_freq[(source, target)] += 1
                edge_emissions[(source, target)] += sequences[i + 1][1]

    # Add edges to the graph with weights and emissions
    for (source, target), weight in edge_freq.items():
        emission = edge_emissions.get((source, target), 0)
        G.add_edge(source, target, weight=weight, emissions=emission)

    # Save the network to an edge list file (without emissions for pymfinder)
    network_file = f'network_edges_{customer_id}.txt'
    with open(network_file, 'w') as file:
        for edge in G.edges(data=True):
            source, target, data = edge
            weight = data['weight']
            file.write(f"{source} {target} {weight}\n")

    def node_link_to_dict(node_link):
        return {
            "id": node_link.id,
            "motifs": node_link.motifs,
            "roles": node_link.roles,
            "weight": node_link.weight,
            "weighted_motifs": node_link.weighted_motifs,
            "weighted_roles": node_link.weighted_roles
        }

    # Run pymfinder to detect motifs
    results = py.pymfinder(
        network=network_file,
        links=True,
        motifsize=3,
        stoufferIDs=False,
        allmotifs=False,
        nrandomizations=0,
        randomize=False,
        usemetropolis=False,
        networktype="unipartite"
    )

    # Extract the data to save
    results_dict = {
        "motifs": {motif_id: motif.real for motif_id, motif in results.motifs.items()},
        "nodes": {
            node_id: {
                "id": node.id,
                "motifs": node.motifs,
                "roles": node.roles,
                "weighted_motifs": node.weighted_motifs,
                "weighted_roles": node.weighted_roles
            } for node_id, node in results.nodes.items()
        },
        "links": [node_link_to_dict(link) for _, link in results.links.items()]  # Convert NodeLink to dict
    }

    # Function to convert tuple keys to strings
    def convert_tuple_keys(d):
        if isinstance(d, dict):
            new_dict = {}
            for k, v in d.items():
                if isinstance(k, tuple):
                    k = str(k)
                new_dict[k] = convert_tuple_keys(v)
            return new_dict
        elif isinstance(d, list):
            return [convert_tuple_keys(i) for i in d]
        else:
            return d

    # Convert any tuple keys to strings
    results_dict = convert_tuple_keys(results_dict)

    # Save the results to a text file
    results_file = f'results_{customer_id}.txt'
    with open(results_file, 'w') as f:
        json.dump(results_dict, f, indent=4)

    # Calculate total emission per motif
    return calculate_total_emission_per_motif(G, results_dict['motifs'], results_dict['nodes'], results_dict['links'], node_emissions)

# Function to calculate total emissions per motif for a customer
def calculate_total_emission_per_motif(G, motifs, nodes, links, node_emissions):
    motif_emissions = defaultdict(float)
    added_nodes = defaultdict(set)
    motif_link_counts = defaultdict(int)
 
    # Create a dictionary to map links to their motifs
    link_motifs = {tuple(link['id']): link['motifs'] for link in links}

    # Function to convert role string to a tuple
    def role_str_to_tuple(role_str):
        return tuple(map(int, role_str.strip('()').split(', ')))

    # Iterate through links to calculate emissions for each motif
    for link in links:
        edge_id = tuple(link['id'])
        associated_motifs = link['motifs']
        emission = G[edge_id[0]][edge_id[1]].get('emissions', 0)
        weight = G[edge_id[0]][edge_id[1]].get('weight', 1)

        for motif_id in associated_motifs:
            count = link['motifs'][motif_id]
            motif_link_counts[motif_id] += count
            motif_id_int = int(motif_id)  # Ensure motif_id is an integer
            # Determine the multiplier based on the motif ID
            multiplier = 2 if motif_id_int in {36, 78} else 1

            if motif_id_int in initial_node_ids:
                # Convert node IDs to string format to access the nodes dictionary
                node_id_str_1 = edge_id[0]
                node_id_str_2 = edge_id[1]

                # Check if node_id_str_1 is in nodes by comparing `id`
                node_in_nodes = next((node for node in nodes.values() if node['id'] == int(node_id_str_1)), None)
                
                if node_in_nodes:
                    # Get roles with count > 0
                    node_roles = {role: count for role, count in node_in_nodes['roles'].items() if count > 0}
                    # Convert roles to tuples for comparison
                    node_roles_tuples = {role_str_to_tuple(role) for role in node_roles}
                    # Check if any role matches the initial node role for the motif
                    for role_tuple in node_roles_tuples:
                        if role_tuple in initial_nodes:
                            if not (edge_id[0] in added_nodes[motif_id_int]):
                                # Calculate initial emission
                                if (role_tuple[0] == motif_id_int):
                                    initial_emission = sum(node_emissions.get(edge_id[0], [])) / len(node_emissions.get(edge_id[0], [1]))
                                    initial_emission *= node_roles.get(str(role_tuple), 0)  # Multiply by count
                                    initial_emission *= multiplier
                                    motif_emissions[motif_id_int] += initial_emission
                                    added_nodes[motif_id_int].add(edge_id[0])

            # Accumulate emission for the motif, but only if it's a valid number
            if not math.isnan(emission):
                motif_emissions[motif_id_int] += emission * (link['motifs'][motif_id] / weight)
  
    # Return results
    return motif_emissions

# Process each customer and store results
for customer_id, sequences in transaction_sequences.values:
    print(f"Processing customer {customer_id}")
    try:
        customer_results = process_customer(customer_id, sequences)
        all_customer_results[customer_id] = customer_results
    except Exception as e:
        print(f"Error processing customer {customer_id}: {e}")

# Save all results to a file
with open('all_customer_results.json', 'w') as f:
    json.dump(all_customer_results, f, indent=4)

print("All customer results have been processed and saved.")


Processing customer 2aa2a980-470d-470a-8379-ec51e5036ee4
Processing customer 3253926e-9c0d-4fd9-8248-eb7c2f457b99
All customer results have been processed and saved.
