In [97]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import pymfinder as py
import json
import math

# Load the transaction data
df = pd.read_csv('./Debit_Transactions4.csv')

# Keep only the columns the analysis needs. `.copy()` makes the subset an
# independent frame, so the column assignment below is never treated by pandas
# as a chained assignment on a slice of the raw frame.
df = df[['customerId', 'externalParty.merchantCategoryCode', 'createdAt', 'transactionFootPrint.carbonEmissionInGrams']].copy()

# Convert 'createdAt' to datetime and order each customer's transactions in time
df['createdAt'] = pd.to_datetime(df['createdAt'])
df = df.sort_values(by=['customerId', 'createdAt'])

# One row per customer: the time-ordered list of
# (merchant category code, carbon emission) pairs.
# The two value columns are selected before `.apply` so the grouping column is
# not part of the frame handed to the lambda (pandas >= 2.2 emits a
# DeprecationWarning when `apply` operates on the grouping columns).
transaction_sequences = (
    df.groupby('customerId')[
        ['externalParty.merchantCategoryCode', 'transactionFootPrint.carbonEmissionInGrams']
    ]
    .apply(
        lambda x: list(zip(x['externalParty.merchantCategoryCode'], x['transactionFootPrint.carbonEmissionInGrams']))
    )
    .reset_index(name='sequences')
)

# Initialize a directed graph whose nodes are merchant category codes and whose
# edges are consecutive-transaction transitions within one customer's sequence
G = nx.DiGraph()

# Create dictionaries to store edge frequencies and carbon emissions
edge_freq = defaultdict(int)        # (source, target) -> number of transitions
edge_emissions = defaultdict(float)  # (source, target) -> summed emission of the target transactions
node_emissions = defaultdict(list)  # Store all emissions for calculating average

# Add edges, their weights, and carbon emissions
for sequences in transaction_sequences['sequences']:
    for i in range(len(sequences)):
        node_id = sequences[i][0]
        emission = sequences[i][1]
        if pd.notna(emission):  # Check for NaN emissions
            node_emissions[node_id].append(emission)  # Collect emissions for average calculation
            # NOTE(review): as before, a transition is only counted when the
            # *source* transaction has an emission value; transitions leaving a
            # transaction with a missing emission are dropped from the graph.
            # Confirm this is intended.
            if i < len(sequences) - 1:
                source = node_id
                target, target_emission = sequences[i + 1]
                edge_freq[(source, target)] += 1
                # Guard the value that is actually summed. The check above only
                # covers the source transaction; adding a NaN target emission
                # would turn this edge's total into NaN for good and discard
                # every valid emission already accumulated on it.
                if pd.notna(target_emission):
                    edge_emissions[(source, target)] += target_emission

# Add edges to the graph with weights and emissions
for (source, target), weight in edge_freq.items():
    # An edge whose target emissions were all missing has no entry and gets 0.0
    emission = edge_emissions.get((source, target), 0.0)
    G.add_edge(source, target, weight=weight, emissions=emission)

# Write the graph as an edge list for pymfinder: one "source target weight"
# line per edge, separated by single spaces (emissions are left out of the file)
with open('network_edges.txt', 'w') as file:
    file.writelines(
        f"{source} {target} {data['weight']}\n"
        for source, target, data in G.edges(data=True)
    )

# Function to convert NodeLink objects to a serializable format
def node_link_to_dict(node_link):
    """Copy the fields of a NodeLink-like object into a plain dict.

    Reads the attributes ``id``, ``motifs``, ``roles``, ``weight``,
    ``weighted_motifs`` and ``weighted_roles`` from ``node_link`` and returns
    them under keys of the same names, in that order. The attribute values
    are passed through unchanged.
    """
    exported_fields = (
        "id",
        "motifs",
        "roles",
        "weight",
        "weighted_motifs",
        "weighted_roles",
    )
    return {field: getattr(node_link, field) for field in exported_fields}

# Run pymfinder on the saved edge list: size-3 motifs on a unipartite network,
# link-level results requested, no randomizations
results = py.pymfinder(
    network='network_edges.txt',
    networktype="unipartite",
    motifsize=3,
    links=True,
    stoufferIDs=False,
    allmotifs=False,
    randomize=False,
    nrandomizations=0,
    usemetropolis=False
)

# Assemble a plain-Python snapshot of the results, one section at a time
results_dict = {}

# Motif ID -> the motif's `real` value
results_dict["motifs"] = {}
for motif_id, motif in results.motifs.items():
    results_dict["motifs"][motif_id] = motif.real

# Node ID -> the node's id, motif/role values and their weighted variants
results_dict["nodes"] = {}
for node_id, node in results.nodes.items():
    results_dict["nodes"][node_id] = {
        "id": node.id,
        "motifs": node.motifs,
        "roles": node.roles,
        "weighted_motifs": node.weighted_motifs,
        "weighted_roles": node.weighted_roles
    }

# Links become a list of dicts (their keys in results.links are dropped)
results_dict["links"] = []
for _, link in results.links.items():
    results_dict["links"].append(node_link_to_dict(link))

# Function to convert tuple keys to strings
def convert_tuple_keys(d):
    """Return a copy of ``d`` in which every tuple dictionary key is a string.

    Dicts and lists are rebuilt recursively; a tuple key ``k`` is replaced by
    ``str(k)`` while other keys are kept as they are. Any value that is
    neither a dict nor a list (including tuples used as values) is returned
    unchanged.
    """
    if isinstance(d, dict):
        return {
            (str(key) if isinstance(key, tuple) else key): convert_tuple_keys(value)
            for key, value in d.items()
        }
    if isinstance(d, list):
        return [convert_tuple_keys(item) for item in d]
    return d

# JSON object keys cannot be tuples, so stringify every tuple key first
results_dict = convert_tuple_keys(results_dict)

# Persist the converted results as indented JSON text in 'results.txt'
with open('results.txt', mode='w') as f:
    json.dump(results_dict, fp=f, indent=4)

# Set of role tuples treated as the "initial" node role of a motif.
# The first element of each tuple is the motif ID; whole tuples are compared
# against the role keys reported for each node.
# NOTE(review): the remaining two elements presumably follow pymfinder's role
# notation -- confirm against the pymfinder documentation.
initial_nodes = set([
    (6, 0, 1),    # S1
    (12, 0, 1),   # S3
    (14, 0, 1),   # S7
    (36, 0, 2),   # S9
    (38, 0, 1),   # S11
    (46, 1, 1),   # S14 or S15 (both are initial and have the same structure)
    (74, 0, 1),   # S16
    (78, 0, 2),   # S19
    (98, 0, 2),   # S21
    (102, 0, 1),  # S23
    (108, 1, 1),  # S25 or S26 (both are initial and have the same structure)
    (238, 1, 1),  # S29 or S30 (both are initial and have the same structure)
])

# Motif IDs that have an initial-node role defined above
initial_node_ids = set(role[0] for role in initial_nodes)


# Parsing results and calculating total emission per motif
def parse_results_from_file(file_path):
    """Load a saved results JSON file and return its three sections.

    Args:
        file_path: Path of a JSON file containing the top-level keys
            ``'motifs'``, ``'nodes'`` and ``'links'``.

    Returns:
        A ``(motifs, nodes, links)`` tuple holding the values stored under
        those keys.

    Raises:
        KeyError: If one of the three keys is missing from the file.
    """
    with open(file_path, 'r') as handle:
        payload = json.load(handle)

    return tuple(payload[section] for section in ('motifs', 'nodes', 'links'))


In [104]:
def calculate_total_emission_per_motif(motifs, nodes, links):
    """Print the total carbon emission attributed to each motif.

    For every link, the emission total stored on the corresponding edge of the
    module-level graph ``G`` is shared out to the motifs the link takes part
    in, scaled by ``occurrences / edge weight``. For motifs listed in the
    module-level ``initial_node_ids``, the average emission of the link's
    source node (from the module-level ``node_emissions``) is added as well,
    at most once per (motif, node), when the node holds that motif's role
    from ``initial_nodes``.

    Args:
        motifs: Motif section of the parsed results. Not used by the
            calculation; kept for interface compatibility.
        nodes: Mapping whose values are dicts with at least ``'id'`` and
            ``'roles'`` (role string such as ``"(12, 0, 1)"`` -> count).
        links: List of dicts with ``'id'`` (source, target) and ``'motifs'``
            (motif ID -> occurrence count).

    Returns:
        None. Progress lines and the per-motif totals are printed.

    Raises:
        KeyError: If a link refers to an edge that is not present in ``G``.
    """
    motif_emissions = defaultdict(float)
    added_nodes = defaultdict(set)
    motif_link_counts = defaultdict(int)

    # Index the nodes by id once instead of scanning nodes.values() for every
    # (link, motif) pair. setdefault keeps the first entry for a repeated id,
    # which matches the first-match behaviour of the scan it replaces.
    nodes_by_id = {}
    for node in nodes.values():
        nodes_by_id.setdefault(node['id'], node)

    # Function to convert role string to a tuple, e.g. "(12, 0, 1)" -> (12, 0, 1)
    def role_str_to_tuple(role_str):
        return tuple(map(int, role_str.strip('()').split(', ')))

    # Iterate through links to calculate emissions for each motif
    for link in links:
        edge_id = tuple(link['id'])
        edge_attrs = G[edge_id[0]][edge_id[1]]
        emission = edge_attrs.get('emissions', 0)
        weight = edge_attrs.get('weight', 1)

        for motif_id in link['motifs']:
            count = link['motifs'][motif_id]
            motif_link_counts[motif_id] += count
            motif_id_int = int(motif_id)  # Keys may be strings after the JSON round trip
            # Initial-node emissions of motifs 36 and 78 are counted twice
            multiplier = 2 if motif_id_int in {36, 78} else 1

            if motif_id_int in initial_node_ids:
                node_in_nodes = nodes_by_id.get(int(edge_id[0]))

                if node_in_nodes:
                    # Roles this node actually plays (count > 0)
                    node_roles = {
                        role: role_count
                        for role, role_count in node_in_nodes['roles'].items()
                        if role_count > 0
                    }
                    node_roles_tuples = {role_str_to_tuple(role) for role in node_roles}

                    for role_tuple in node_roles_tuples:
                        # Only the initial role of *this* motif qualifies, and
                        # each node contributes at most once per motif.
                        if role_tuple not in initial_nodes:
                            continue
                        if edge_id[0] in added_nodes[motif_id_int]:
                            continue
                        if role_tuple[0] != motif_id_int:
                            continue

                        recorded = node_emissions.get(edge_id[0], [])
                        if not recorded:
                            # No emission was ever recorded for this node, so
                            # its average is undefined. Skip the contribution
                            # instead of dividing by zero (and without
                            # inserting an empty entry into node_emissions).
                            continue

                        initial_emission = sum(recorded) / len(recorded)
                        initial_emission *= node_roles.get(str(role_tuple), 0)  # Multiply by count
                        initial_emission *= multiplier
                        print(f"Adding initial node emission:{role_tuple}{edge_id[0]} {initial_emission} for motif {motif_id_int}")
                        motif_emissions[motif_id_int] += initial_emission
                        added_nodes[motif_id_int].add(edge_id[0])
                        print (added_nodes[motif_id_int])

            # Accumulate emission for the motif, but only if it's a valid number
            if not math.isnan(emission):
                print(f"Edge {edge_id} -> Emission: {emission}, Weight: {weight} occurence:{link['motifs'][motif_id]} motif_id :{motif_id}")
                motif_emissions[motif_id_int] += emission * (link['motifs'][motif_id] / weight)

    # Print total emissions for each motif
    print("Motif ID\tTotal Emission")
    print ( motif_link_counts)
    for motif_id, total_emission in motif_emissions.items():
        print(f"{motif_id}\t{total_emission:.4f}")

# Reload the three result sections saved in 'results.txt' and print the
# per-motif emission totals. The calculation also reads the notebook-level
# G, node_emissions, initial_nodes and initial_node_ids defined in earlier code.
motifs, nodes, links = parse_results_from_file('results.txt')
calculate_total_emission_per_motif(motifs, nodes, links)


Adding initial node emission:(12, 0, 1)8398 0.0 for motif 12
{8398}
Edge (8398, 7399) -> Emission: 0.0, Weight: 1 occurence:1 motif_id :12
Edge (8398, 7399) -> Emission: 0.0, Weight: 1 occurence:0 motif_id :74
Adding initial node emission:(12, 0, 1)7399 0.0 for motif 12
{8398, 7399}
Edge (7399, 5712) -> Emission: 0.0, Weight: 1 occurence:2 motif_id :12
Edge (7399, 5712) -> Emission: 0.0, Weight: 1 occurence:0 motif_id :74
Adding initial node emission:(12, 0, 1)5712 0.0 for motif 12
{5712, 8398, 7399}
Edge (5712, 5940) -> Emission: 18225.91, Weight: 1 occurence:2 motif_id :12
Edge (5712, 5940) -> Emission: 18225.91, Weight: 1 occurence:0 motif_id :74
Adding initial node emission:(12, 0, 1)5940 18225.91 for motif 12
{5712, 5940, 8398, 7399}
Edge (5940, 7221) -> Emission: 4873.5, Weight: 1 occurence:2 motif_id :12
Edge (5940, 7221) -> Emission: 4873.5, Weight: 1 occurence:0 motif_id :74
Adding initial node emission:(12, 0, 1)7221 4873.5 for motif 12
{7399, 8398, 5712, 5940, 7221}
Edge (72

In [103]:
# Print the text representation of the pymfinder results object
# (`results` must already exist from the cell that runs py.pymfinder)
print (results)

motif real rand srand zscore weight-mean weight-sd
12 8 0.000 0.000 888888.000 0.000 0.000
74 1 0.000 0.000 888888.000 0.000 0.000

node 12 74
8398 1 0
7399 2 0
5712 3 0
5940 3 0
7221 3 0
5399 3 0
5732 3 0
7922 3 0
6011 2 1
5411 1 1
5541 0 1

link 12 74
(8398, 7399) 1 0
(7399, 5712) 2 0
(5712, 5940) 2 0
(5940, 7221) 2 0
(7221, 5399) 2 0
(5399, 5732) 2 0
(5732, 7922) 2 0
(7922, 6011) 2 0
(6011, 5411) 1 1
(5411, 5541) 0 1
(5541, 5411) 0 1

node (12, 0, 1) (12, 1, 0) (12, 1, 1) (74, 0, 1) (74, 1, 1) (74, 2, 1)
8398 1 0 0 0 0 0
7399 1 0 1 0 0 0
5712 1 1 1 0 0 0
5940 1 1 1 0 0 0
7221 1 1 1 0 0 0
5399 1 1 1 0 0 0
5732 1 1 1 0 0 0
7922 1 1 1 0 0 0
6011 0 1 1 1 0 0
5411 0 1 0 0 0 1
5541 0 0 0 0 1 0

link (12, (0, 1), (1, 1)) (12, (1, 1), (1, 0)) (74, (0, 1), (2, 1)) (74, (1, 1), (2, 1))
(8398, 7399) 1 0 0 0
(7399, 5712) 1 1 0 0
(5712, 5940) 1 1 0 0
(5940, 7221) 1 1 0 0
(7221, 5399) 1 1 0 0
(5399, 5732) 1 1 0 0
(5732, 7922) 1 1 0 0
(7922, 6011) 1 1 0 0
(6011, 5411) 0 1 1 0
(5411, 5541) 0 0 0 1
