# Creating a financial transaction dataset using RMAT
The following are configurable:
* Range of the random transaction amounts
* Date range of the transactions
* Total number of transactions

Basic imports needed for RMAT

In [None]:
import gc
import os
from time import perf_counter
import numpy as np
import random
import math

# rapids
import cugraph
import cudf

# NetworkX libraries
import networkx as nx

# RMAT data generator
from cugraph.generators import rmat
from datetime import datetime

Wrapper function around the RMAT generator that exposes the scale and edge factor as configurable parameters

In [None]:
def generate_data(scale, edgefactor=16):
    """Generate an RMAT edge list and report how many edges it contains.

    Parameters
    ----------
    scale : int
        Passed to ``rmat`` as its first argument; the requested edge count
        is ``(2 ** scale) * edgefactor``.
    edgefactor : int, optional
        Multiplier applied to ``2 ** scale`` to get the requested number of
        edges. Defaults to 16.

    Returns
    -------
    The object returned by ``rmat`` when ``create_using=None`` (an edge
    list rather than a Graph instance).
    """
    requested_edges = (2 ** scale) * edgefactor

    # NOTE(review): the three floats and the 42 are presumably the RMAT
    # a/b/c probabilities and the random seed -- confirm against the
    # ``rmat`` signature. They are passed positionally, so order matters.
    edgelist = rmat(
        scale,
        requested_edges,
        0.57,
        0.19,
        0.19,
        42,
        clip_and_flip=False,
        scramble_vertex_ids=True,
        create_using=None,  # return edgelist instead of Graph instance
        mg=False,
    )

    print(f'Generating a dataframe of {len(edgelist)} edges')
    return edgelist

Generate a column holding a random date-time for each transaction, drawn from within the specified range

In [None]:
def gen_times(count, start_date, end_date):
    """Generate random transaction timestamps inside a date range.

    Parameters
    ----------
    count : int
        Number of timestamps to generate.
    start_date : datetime.datetime
        Lower bound of the range (inclusive, truncated to whole seconds).
    end_date : datetime.datetime
        Upper bound of the range (inclusive, truncated to whole seconds).

    Returns
    -------
    cudf.Series
        Integer series named 'Date' with one epoch-seconds value per
        transaction. Use ``datetime.fromtimestamp`` on a value if a
        ``datetime`` object is needed instead.
    """
    # datetime.timestamp() returns a float, while random.randint() needs
    # integer bounds: float arguments are deprecated since Python 3.10 and
    # raise TypeError from 3.12. Convert both ends, not just the upper one.
    range_start = int(start_date.timestamp())
    range_end = int(end_date.timestamp())
    random_list = [
        random.randint(range_start, range_end) for _ in range(count)
    ]
    return cudf.Series(random_list, name='Date', dtype=int)

Create the dollar amount column for transactions.

In [None]:
def gen_amounts(count,value_range):
    """Generate random transaction amounts with two decimal places.

    Parameters
    ----------
    count : int
        Number of amounts to generate.
    value_range : int
        Largest amount that can be produced; the smallest is 0.

    Returns
    -------
    cudf.Series
        Float series built under the name 'amount', one value per
        transaction.
    """
    # Draw whole hundredths first, then scale down by 100 on the series so
    # every amount has at most two decimal places.
    upper_bound = value_range * 100
    hundredths = [random.randint(0, upper_bound) for _ in range(count)]
    as_series = cudf.Series(hundredths, name='amount', dtype=float)
    return as_series.divide(100)

Create and write out the csv data file.

Verified to generate a file containing 33554432 edges (scale 21 with the default edge factor of 16) on a single GPU, which takes roughly 90 seconds. Note that the cell below is configured with scale 15.

In [None]:
# Configuration: transaction date window, maximum amount, and RMAT scale.
start_time = '1/1/2022 01:00:00 AM'
end_time =   '7/1/2022 01:00:00 AM'
amount_range = 25000
# Both bounds use the same month/day/year, 12-hour-clock format.
d1 = datetime.strptime(start_time, '%m/%d/%Y %I:%M:%S %p')
d2 = datetime.strptime(end_time, '%m/%d/%Y %I:%M:%S %p')
scale = 15
# Edge list from RMAT; edgefactor is left at generate_data's default.
df = generate_data(scale)

# One random timestamp and one random amount per generated edge.
dates = gen_times(len(df),d1, d2)
amounts = gen_amounts(len(df),amount_range)
# The columns are stored under the names 'amounts' and 'date'.
df['amounts'] = amounts
df['date'] = dates
# The output file name embeds the scale, e.g. transaction_data_scale15.csv.
filename = "transaction_data_scale"+str(scale)+".csv"
df.to_csv('../data/'+filename) # NOTE(review): was labelled "append mode", but no mode argument is passed -- confirm intended write behavior
print (len(df))
# Last expression of the cell: the first five rows of the result.
df.head(5)

___
Copyright (c) 2023-2025, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
___