# Setup

In [None]:
pip install google-generativeai faker pandas numpy


Collecting faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.2


In [None]:
import google.generativeai as genai
from google.colab import userdata

# Read the Gemini key from Colab Secrets so it never appears in the notebook source.
api_key = userdata.get('GEMINI_API_KEY')
if not api_key:
    # Fail fast: an empty secret would presumably only surface later, on the
    # first generate_content() call, as a less obvious API error.
    raise ValueError("Colab secret 'GEMINI_API_KEY' is empty; set it in the Colab Secrets panel.")

genai.configure(api_key=api_key)
# `model` is the shared client used by every generation cell below.
model = genai.GenerativeModel("gemini-2.5-flash")
print("Gemini API configured successfully using Colab Secrets.")


Gemini API configured successfully using Colab Secrets.


## Prompt Template for One Transaction Record

In [None]:
# Prompt sent to Gemini for every synthetic record. Each "- key: ..." bullet names
# one JSON key the model must emit. The transaction-type key is spelled
# `transaction_type` (singular) in every place it is mentioned, so the model is not
# given two competing spellings of the same column.
prompt = """

You are a JSON API simulator for a synthetic transaction generator within the Vybe app.

Return ONLY a **single line minified JSON object**, no markdown, no explanation.

Generate a single JSON object representing a high-fidelity simulated transaction with the labels and format specified below.

The object should include the following keys:

Transaction Metadata:
- transaction_id: UUID
- user_id: user_{number}
- timestamp_initiated: YYYY-MM-DD HH:MM:SS format
- amount: float value between 10.0 and 10000.0
- transaction_type: One of [
    "Bank to Bank (InstaPay)",
    "Bank to Bank (PESONet)",
    "Bank to e-Wallet (GCash)",
    "Bank to e-Wallet (Maya)",
    "Bank to e-Wallet (ShopeePay)",
    "BPI to Vybe Wallet",
    "Vybe Wallet to GCash",
    "Vybe Wallet to Maya",
    "Vybe Wallet to ShopeePay",
    "Vybe Wallet to Vybe Wallet",
    "Vybe Wallet to Bank (BPI)",
    "Internal Vybe App Transfer",
    "Internal Cashback Credit",
    "Auto-Reversal Processed",
    "Auto-Retry Triggered",
    "Manual Escalation Triggered",
    "QR Payment (Merchant)",
    "QR Payment (P2P)",
    "Bills Payment (via Vybe Wallet)",
    "Bills Payment (via BPI Linked)",
    "Scheduled Transfer (Future Dated)",
    "Cash-In via Partner Outlet",
    "Cash-Out via ATM or OTC"
]
- recipient_type: "New Recipient" or "Frequent Recipient"
- recipient_account_id: a 12-digit number as string
- recipient_bank_name/e-wallet_name: matching the transaction_type
- device_id: UUID
- location_coordinates: [latitude, longitude] floats
- simulated_network_latency: int in milliseconds (50-5000)

Transaction Status Progression:
- status_timestamp_1: Timestamp for "Initiated"
- status_1: "Initiated"
- status_timestamp_2: Timestamp for "Debit Confirmed (BPI)"
- status_2: "Debit Confirmed (BPI)"
- status_timestamp_3: Timestamp for "Processing (Recipient Bank/e-Wallet)"
- status_3: "Processing (Recipient Bank/e-Wallet)"
- status_timestamp_4: Final timestamp ("Credit Confirmed", "Failed", or "Reversed")
- status_4: One of ["Credit Confirmed (Recipient)", "Failed (Network Error)", "Failed (Timeout)", "Reversed (User Cancelled)"]
- expected_completion_time: based on transaction_type (add 2 to 10 minutes to initiation)

Anomaly/Failure Indicators:
- is_floating_cash: Boolean
- floating_duration_minutes: Int (0 if False)
- is_fraudulent_attempt: Boolean
- is_cancellation: Boolean
- is_retry_successful: Boolean
- manual_escalation_needed: Boolean

Ensure timestamps are logical (progress chronologically).

Only respond with minified JSON. No extra text, markdown, or explanation.
Respond with only one object, no text or comments.

"""

# Smoke-test the prompt: request a single record and print the raw response text
# so the returned JSON can be inspected by eye.
response = model.generate_content(prompt)
print(response.text)


{"transaction_id":"19b0d616-241f-4318-912b-34a1b0ac62e0","user_id":"user_36411","timestamp_initiated":"2023-10-27 10:33:04","amount":4670.36,"transaction_types":"Bank to Bank (InstaPay)","recipient_type":"Frequent Recipient","recipient_account_id":"117467657158","recipient_bank_name/e-wallet_name":"Metrobank","device_id":"90e292a1-2856-4c9f-8c38-8959f6b92a4e","location_coordinates":[13.344445,122.956637],"simulated_network_latency":1003,"status_timestamp_1":"2023-10-27 10:33:04","status_1":"Initiated","status_timestamp_2":"2023-10-27 10:33:07","status_2":"Debit Confirmed (BPI)","status_timestamp_3":"2023-10-27 10:33:17","status_3":"Processing (Recipient Bank/e-Wallet)","status_timestamp_4":"2023-10-27 10:33:40","status_4":"Credit Confirmed (Recipient)","expected_completion_time":"2023-10-27 10:39:04","is_floating_cash":false,"floating_duration_minutes":0,"is_fraudulent_attempt":false,"is_cancellation":false,"is_retry_successful":false,"manual_escalation_needed":false}


# Generation Trial

In [None]:
import json

# Trial run: request a handful of records to confirm the responses parse as JSON.
# `synthetic_data` is the shared list that later cells keep appending to.
synthetic_data = []
for attempt in range(5):
    response = model.generate_content(prompt)
    try:
        entry = json.loads(response.text)
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass. Catching ValueError
        # (instead of a bare `except`) keeps KeyboardInterrupt and programming
        # errors visible while still skipping unparseable responses.
        print(f"[X] Attempt {attempt + 1}: response was not valid JSON ({err}). Skipping.")
        continue
    synthetic_data.append(entry)

print(f"Total entries generated: {len(synthetic_data)}")


Total entries generated: 5


In [None]:
import pandas as pd

# Display settings: show every column, but cap rendered rows at 20.
for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 20)):
    pd.set_option(option_name, option_value)

# Turn the trial records into a table and persist it before previewing.
df = pd.DataFrame(synthetic_data)
df.to_csv("synthetic_transactions_gemini.csv", index=False)

df.head(20)


Unnamed: 0,transaction_id,user_id,timestamp_initiated,amount,transaction_types,recipient_type,recipient_account_id,recipient_bank_name/e-wallet_name,device_id,location_coordinates,simulated_network_latency,status_timestamp_1,status_1,status_timestamp_2,status_2,status_timestamp_3,status_3,status_timestamp_4,status_4,expected_completion_time,is_floating_cash,floating_duration_minutes,is_fraudulent_attempt,is_cancellation,is_retry_successful,manual_escalation_needed
0,90d71597-2a6c-4828-b80c-e2f750e4171d,user_387910,2023-11-19 20:31:02,3095.53,Vybe Wallet to GCash,New Recipient,185011494951,GCash,594d4d62-1172-4746-836e-5e3650207a97,"[17.514749, 120.35478]",1965,2023-11-19 20:31:02,Initiated,2023-11-19 20:32:00,Debit Confirmed (BPI),2023-11-19 20:33:04,Processing (Recipient Bank/e-Wallet),2023-11-19 20:34:52,Credit Confirmed (Recipient),2023-11-19 20:38:02,False,0,True,False,False,False
1,7c7c10b7-4f93-4a6c-9226-9f4a2119c8f0,user_48332,2023-10-27 10:24:59,7153.29,Bank to Bank (InstaPay),New Recipient,671802996545,Metrobank,8c313a89-231a-4710-947b-117cf49e2908,"[12.355172, 123.63351]",4619,2023-10-27 10:24:59,Initiated,2023-10-27 10:25:07,Debit Confirmed (BPI),2023-10-27 10:25:56,Processing (Recipient Bank/e-Wallet),2023-10-27 10:27:08,Failed (Timeout),2023-10-27 10:30:59,True,103,False,False,False,True
2,e2e2a143-7f21-4f1e-82b5-520e5e40e2d3,user_43615,2024-06-25 15:07:34,816.03,Cash-In via Partner Outlet,Frequent Recipient,380665319889,7-Eleven,8c5d95fe-a78b-4977-9878-a35987a022b7,"[12.28821, 123.687786]",3105,2024-06-25 15:07:34,Initiated,2024-06-25 15:07:44,Debit Confirmed (BPI),2024-06-25 15:08:04,Processing (Recipient Bank/e-Wallet),2024-06-25 15:09:59,Credit Confirmed (Recipient),2024-06-25 15:15:34,False,0,False,False,False,False
3,71ae1a35-154a-4c22-9212-32130e43d2ac,user_3879,2023-10-27 10:30:13,9415.53,Bank to Bank (InstaPay),Frequent Recipient,310815777174,RCBC,5668d2aa-4f81-422f-b4fc-7a70a84e902c,"[41.365315, -83.477023]",4810,2023-10-27 10:30:13,Initiated,2023-10-27 10:30:52,Debit Confirmed (BPI),2023-10-27 10:31:07,Processing (Recipient Bank/e-Wallet),2023-10-27 10:31:17,Credit Confirmed (Recipient),2023-10-27 10:39:13,False,0,False,False,False,False
4,c1a9c148-3560-49e0-8451-344498308e6f,user_21590,2024-07-27 10:25:35,6220.89,Bank to e-Wallet (Maya),Frequent Recipient,180126935749,Maya,620586e9-38b4-42f4-a2b1-6b2191599a0e,"[-69.835474, 98.711833]",1345,2024-07-27 10:25:35,Initiated,2024-07-27 10:25:46,Debit Confirmed (BPI),2024-07-27 10:26:27,Processing (Recipient Bank/e-Wallet),2024-07-27 10:27:14,Credit Confirmed (Recipient),2024-07-27 10:30:35,False,0,False,False,False,False


# Actual Generation

In [None]:
batch_size = 10
target_total = 100
# Abort instead of looping forever when every request keeps failing
# (e.g. invalid key, exhausted quota, prolonged outage).
max_consecutive_failures = 20

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

# Counts failed/invalid responses since the last accepted record.
consecutive_failures = 0

#MAIN LOOP
# Continues from whatever `synthetic_data` already holds (the trial cell's records).
while len(synthetic_data) < target_total:
    print(f"\nGenerating batch... Current count: {len(synthetic_data)}")

    # Never request more records than are still missing, so the run cannot
    # overshoot target_total.
    remaining = target_total - len(synthetic_data)
    for i in range(min(batch_size, remaining)):
        try:
            response = model.generate_content(prompt)
            entry = json.loads(response.text)
        except json.JSONDecodeError:
            print("[X] JSON decoding failed. Skipping.")
            consecutive_failures += 1
            continue
        except Exception as e:
            print(f"[X] Error: {e}")
            consecutive_failures += 1
            continue

        # Basic schema validation: must be a JSON object carrying a transaction_id.
        if isinstance(entry, dict) and "transaction_id" in entry:
            synthetic_data.append(entry)
            consecutive_failures = 0
            print(f"[✓] Entry #{len(synthetic_data)} added.")
        else:
            print("[!] Skipped: Incomplete or invalid JSON structure.")
            consecutive_failures += 1

    # Checkpoint after every batch so an interrupted run loses at most one batch.
    # (A `len % 100 == 0` test only fires when the count lands exactly on a multiple.)
    pd.DataFrame(synthetic_data).to_csv("synthetic_transactions_gemini.csv", index=False)
    print(f"Saved progress: {len(synthetic_data)} entries.")

    if consecutive_failures >= max_consecutive_failures:
        raise RuntimeError(
            f"Stopped after {consecutive_failures} consecutive failed requests; "
            f"{len(synthetic_data)} entries were saved to 'synthetic_transactions_gemini.csv'."
        )

print(f"\n Done! Total entries generated: {len(synthetic_data)}")

# Final save
df = pd.DataFrame(synthetic_data)
df.to_csv("synthetic_transactions_gemini_final.csv", index=False)
print("Final CSV saved as 'synthetic_transactions_gemini_final.csv'.")

# Optional: preview
df.head(20)



Generating batch... Current count: 71
[✓] Entry #72 added.
[✓] Entry #73 added.
[✓] Entry #74 added.
[✓] Entry #75 added.
[✓] Entry #76 added.
[✓] Entry #77 added.
[✓] Entry #78 added.
[✓] Entry #79 added.
[✓] Entry #80 added.
[✓] Entry #81 added.

Generating batch... Current count: 81
[✓] Entry #82 added.
[✓] Entry #83 added.
[✓] Entry #84 added.
[✓] Entry #85 added.
[✓] Entry #86 added.


ERROR:tornado.access:500 POST /v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 23840.33ms


[X] Error: 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
[✓] Entry #87 added.
[✓] Entry #88 added.
[✓] Entry #89 added.
[✓] Entry #90 added.

Generating batch... Current count: 90
[✓] Entry #91 added.
[✓] Entry #92 added.
[✓] Entry #93 added.
[✓] Entry #94 added.
[✓] Entry #95 added.
[✓] Entry #96 added.
[✓] Entry #97 added.
[✓] Entry #98 added.
[✓] Entry #99 added.
[✓] Entry #100 added.
Saved progress: 100 entries.

 Done! Total entries generated: 100
Final CSV saved as 'synthetic_transactions_gemini_final.csv'.


Unnamed: 0,transaction_id,user_id,timestamp_initiated,amount,transaction_types,recipient_type,recipient_account_id,recipient_bank_name/e-wallet_name,device_id,location_coordinates,simulated_network_latency,status_timestamp_1,status_1,status_timestamp_2,status_2,status_timestamp_3,status_3,status_timestamp_4,status_4,expected_completion_time,is_floating_cash,floating_duration_minutes,is_fraudulent_attempt,is_cancellation,is_retry_successful,manual_escalation_needed,transaction_type
0,90d71597-2a6c-4828-b80c-e2f750e4171d,user_387910,2023-11-19 20:31:02,3095.53,Vybe Wallet to GCash,New Recipient,185011494951,GCash,594d4d62-1172-4746-836e-5e3650207a97,"[17.514749, 120.35478]",1965,2023-11-19 20:31:02,Initiated,2023-11-19 20:32:00,Debit Confirmed (BPI),2023-11-19 20:33:04,Processing (Recipient Bank/e-Wallet),2023-11-19 20:34:52,Credit Confirmed (Recipient),2023-11-19 20:38:02,False,0,True,False,False,False,
1,7c7c10b7-4f93-4a6c-9226-9f4a2119c8f0,user_48332,2023-10-27 10:24:59,7153.29,Bank to Bank (InstaPay),New Recipient,671802996545,Metrobank,8c313a89-231a-4710-947b-117cf49e2908,"[12.355172, 123.63351]",4619,2023-10-27 10:24:59,Initiated,2023-10-27 10:25:07,Debit Confirmed (BPI),2023-10-27 10:25:56,Processing (Recipient Bank/e-Wallet),2023-10-27 10:27:08,Failed (Timeout),2023-10-27 10:30:59,True,103,False,False,False,True,
2,e2e2a143-7f21-4f1e-82b5-520e5e40e2d3,user_43615,2024-06-25 15:07:34,816.03,Cash-In via Partner Outlet,Frequent Recipient,380665319889,7-Eleven,8c5d95fe-a78b-4977-9878-a35987a022b7,"[12.28821, 123.687786]",3105,2024-06-25 15:07:34,Initiated,2024-06-25 15:07:44,Debit Confirmed (BPI),2024-06-25 15:08:04,Processing (Recipient Bank/e-Wallet),2024-06-25 15:09:59,Credit Confirmed (Recipient),2024-06-25 15:15:34,False,0,False,False,False,False,
3,71ae1a35-154a-4c22-9212-32130e43d2ac,user_3879,2023-10-27 10:30:13,9415.53,Bank to Bank (InstaPay),Frequent Recipient,310815777174,RCBC,5668d2aa-4f81-422f-b4fc-7a70a84e902c,"[41.365315, -83.477023]",4810,2023-10-27 10:30:13,Initiated,2023-10-27 10:30:52,Debit Confirmed (BPI),2023-10-27 10:31:07,Processing (Recipient Bank/e-Wallet),2023-10-27 10:31:17,Credit Confirmed (Recipient),2023-10-27 10:39:13,False,0,False,False,False,False,
4,c1a9c148-3560-49e0-8451-344498308e6f,user_21590,2024-07-27 10:25:35,6220.89,Bank to e-Wallet (Maya),Frequent Recipient,180126935749,Maya,620586e9-38b4-42f4-a2b1-6b2191599a0e,"[-69.835474, 98.711833]",1345,2024-07-27 10:25:35,Initiated,2024-07-27 10:25:46,Debit Confirmed (BPI),2024-07-27 10:26:27,Processing (Recipient Bank/e-Wallet),2024-07-27 10:27:14,Credit Confirmed (Recipient),2024-07-27 10:30:35,False,0,False,False,False,False,
5,84a7e94f-4422-4824-a745-f048d601b22e,user_195589,2023-10-27 10:37:37,6734.42,Bank to e-Wallet (Maya),New Recipient,590740698114,Maya,b188c039-38e2-48b4-9273-075e8ee4917a,"[15.22851, 126.311333]",4545,2023-10-27 10:37:37,Initiated,2023-10-27 10:37:40,Debit Confirmed (BPI),2023-10-27 10:37:49,Processing (Recipient Bank/e-Wallet),2023-10-27 10:38:00,Failed (Network Error),2023-10-27 10:43:37,True,51,False,False,True,True,
6,c85d7756-11f8-456d-88ec-a393967d603a,user_4958,2023-10-27 10:30:15,8573.06,Bank to Bank (InstaPay),Frequent Recipient,890123456789,Metrobank,89b3d047-9752-4416-815d-854e488f72a9,"[14.599512, 120.984222]",1234,2023-10-27 10:30:15,Initiated,2023-10-27 10:30:46,Debit Confirmed (BPI),2023-10-27 10:32:01,Processing (Recipient Bank/e-Wallet),2023-10-27 10:33:05,Credit Confirmed (Recipient),2023-10-27 10:39:15,False,0,False,False,False,False,
7,a87071e2-b352-4dd2-824c-1d3744ac7c8b,user_36498,2023-11-19 19:42:01,7497.02,Bank to e-Wallet (Maya),New Recipient,346985012378,Maya,5669f64c-70e0-4a81-9f20-149dd7a47731,"[14.881476, 120.301309]",1936,2023-11-19 19:42:01,Initiated,2023-11-19 19:42:15,Debit Confirmed (BPI),2023-11-19 19:43:08,Processing (Recipient Bank/e-Wallet),2023-11-19 19:45:51,Credit Confirmed (Recipient),2023-11-19 19:45:01,False,0,False,False,False,False,
8,90e4871d-557d-4573-a292-127e997f0e08,user_85860,2024-07-28 17:34:10,3080.76,QR Payment (P2P),Frequent Recipient,310842517650,Vybe User,5454b5ee-06b2-4d76-8883-93f8e5ee7606,"[-41.761801, -166.495066]",1307,2024-07-28 17:34:10,Initiated,2024-07-28 17:34:25,Vybe Wallet Debit Confirmed,2024-07-28 17:35:10,Processing (Recipient Vybe Wallet),2024-07-28 17:35:46,Credit Confirmed (Recipient),2024-07-28 17:39:10,False,0,False,False,False,False,
9,7c7e0971-8726-4ed9-8f0a-1153d1007997,user_48037,2023-10-27 10:34:02,2534.61,QR Payment (Merchant),New Recipient,341103606782,Merchant JLS,845d4f3b-52c7-433e-b873-10705f15456f,"[34.693897, -159.261829]",3075,2023-10-27 10:34:02,Initiated,2023-10-27 10:34:10,Debit Confirmed (BPI),2023-10-27 10:34:39,Processing (Recipient Bank/e-Wallet),2023-10-27 10:36:14,Credit Confirmed (Recipient),2023-10-27 10:37:02,False,0,True,False,False,True,


# Combining CSVs

In [None]:
import pandas as pd
import io
from google.colab import files

# Upload multiple CSVs
uploaded = files.upload()

# Collect the parsed frames and concatenate once at the end; concatenating inside
# the loop re-copies all previous rows on every iteration.
frames = []

for filename in uploaded:
    content = uploaded[filename].decode('utf-8')

    try:
        df = pd.read_csv(io.StringIO(content), delimiter=',')
        # A semicolon-delimited file normally parses without error as a single
        # wide column, so a ParserError alone is not enough to detect it.
        comma_ok = not (df.shape[1] == 1 and ';' in str(df.columns[0]))
    except pd.errors.ParserError:
        comma_ok = False

    if comma_ok:
        print(f" {filename} read successfully with comma delimiter.")
    else:
        try:
            df = pd.read_csv(io.StringIO(content), delimiter=';')
            print(f"{filename} read successfully with semicolon delimiter.")
        except Exception as e:
            print(f"❌ Failed to read {filename}. Error: {e}")
            continue

    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing could be read.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Final sanity check
print(f"\n Final combined shape: {combined_df.shape}")

# Save to CSV
combined_df.to_csv('dataset.csv', index=False)
files.download('dataset.csv')


Saving DONE.csv to DONE (1).csv
Saving synthetic_transactions_progress_100.csv to synthetic_transactions_progress_100 (3).csv
 DONE (1).csv read successfully with comma delimiter.
 synthetic_transactions_progress_100 (3).csv read successfully with comma delimiter.

 Final combined shape: (12100, 28)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Checking Number of Rows

In [None]:
import pandas as pd

# Reload the merged file from disk to confirm what was actually written.
df = pd.read_csv("dataset.csv")
row_count, column_count = df.shape
print(f"✅ Rows: {row_count}, Columns: {column_count}")
df.head()


✅ Rows: 12100, Columns: 28


Unnamed: 0,transaction_id,user_id,timestamp_initiated,amount,transaction_type,recipient_type,recipient_account_id,recipient_bank_name_or_ewallet,device_id,location_coordinates,...,status_4,expected_completion_time,is_floating_cash,floating_duration_minutes,is_fraudulent_attempt,is_cancellation,is_retry_successful,manual_escalation_needed,transaction_types,recipient_bank_name/e-wallet_name
0,5ef64426-5ee8-44fb-9ce6-538562d9894e,user_21932,2024-04-03 12:21:38,5000.0,Bills Payment (via Vybe Wallet),Frequent Recipient,408432314981,BPI,42a22f3e-436f-45b1-ab98-4ed086a9f0fe,"[14.67228, 121.03756]",...,Credit Confirmed (Recipient),2024-04-20 03:22:20,False,104,False,True,True,False,,
1,90d5cd5e-333e-4b6e-bb16-c956c34091a1,user_43501,2024-07-28 14:07:05,1003.55,Bank to e-Wallet (ShopeePay),Frequent Recipient,375058098317,GCash,84c8a5a4-c7b4-4b55-bf7d-c2057393d25d,"[14.475456, 120.931707]",...,Credit Confirmed (Recipient),2024-05-18 11:32:40,False,0,False,False,False,True,,
2,f39f7278-f71f-49b4-a6fc-6e792c3d5268,user_4,2024-05-24 16:03:00,6759.56,Vybe Wallet to Bank (BPI),New Recipient,409395995722,Merchant QR,1b43b67e-28b3-4676-9d33-9118c7c91d8e,"[9.488339, 122.956799]",...,Credit Confirmed (Recipient),2024-07-28 17:03:21,False,0,False,False,False,False,,
3,220f18d7-402a-436d-99d6-f35f29910d9e,user_5966,2024-06-25 19:15:38,7218.49,Vybe Wallet to Bank (BPI),Frequent Recipient,349842603347,BPI,59cf059d-d6a8-48b6-963d-424a7ef751cf,"[16.035767, 115.820293]",...,Credit Confirmed (Recipient),2024-05-13 09:16:03,False,0,False,False,False,False,,
4,5f51086a-7407-4a0d-83ec-1087e504c568,user_1,2023-10-27 10:27:03,830.41,Vybe Wallet to Vybe Wallet,Frequent Recipient,863897722137,Other Bank (InstaPay),0364d27f-0546-4c9c-b166-574f7d451a2d,"[12.355106, 122.996156]",...,Failed (Network Error),2024-07-29 02:47:48,False,56,False,False,False,False,,


In [None]:
# Summary of column types, non-null values, and memory usage.
# info() prints its report directly rather than returning a value.
df.info()

# Summary statistics for numeric columns; as the last expression in the cell,
# this table is what the notebook displays.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12100 entries, 0 to 12099
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   transaction_id                     12100 non-null  object 
 1   user_id                            12100 non-null  object 
 2   timestamp_initiated                12100 non-null  object 
 3   amount                             12100 non-null  float64
 4   transaction_type                   11809 non-null  object 
 5   recipient_type                     11959 non-null  object 
 6   recipient_account_id               11924 non-null  object 
 7   recipient_bank_name_or_ewallet     11420 non-null  object 
 8   device_id                          12100 non-null  object 
 9   location_coordinates               12100 non-null  object 
 10  simulated_network_latency          12100 non-null  int64  
 11  status_timestamp_1                 12100 non-null  obj

Unnamed: 0,amount,simulated_network_latency,floating_duration_minutes
count,12100.0,12100.0,12100.0
mean,4101.254234,2727.853884,11.729669
std,3124.587994,1558.027934,69.46117
min,10.0,50.0,0.0
25%,830.41,1500.0,0.0
50%,3671.3,2941.0,0.0
75%,7152.06,4369.0,0.0
max,25000.0,4991.0,1369.0


In [None]:
# Missing-value count per column; only columns with at least one gap are shown.
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)]


Unnamed: 0,0
transaction_type,291
recipient_type,141
recipient_account_id,176
recipient_bank_name_or_ewallet,680
status_timestamp_4,10
transaction_types,11809
recipient_bank_name/e-wallet_name,11803


In [None]:
# How many rows remain once fully identical rows are collapsed
unique_rows_total = len(df.drop_duplicates())
print(f"Number of unique rows: {unique_rows_total}")


Number of unique rows: 11997


In [None]:
# Collapse fully identical rows into a single copy.
df_unique = df.drop_duplicates()

# Report how many rows survived, then write the de-duplicated table to disk.
remaining_rows = len(df_unique)
print(f"Number of rows after removing duplicates: {remaining_rows}")
df_unique.to_csv("syn_dataset.csv", index=False)


Number of rows after removing duplicates: 11997


In [None]:
import pandas as pd

# Re-read the de-duplicated dataset from disk.
df = pd.read_csv('syn_dataset.csv')

# Boolean mask selecting the rows flagged as floating cash.
is_floating = df['is_floating_cash'].eq(True)
floating_cash_df = df.loc[is_floating]

# Report the count, then render the first 10 matching rows.
print(f"Number of floating cash transactions: {len(floating_cash_df)}")
display(floating_cash_df.head(10))


Number of floating cash transactions: 1835


Unnamed: 0,transaction_id,user_id,timestamp_initiated,amount,transaction_type,recipient_type,recipient_account_id,recipient_bank_name_or_ewallet,device_id,location_coordinates,...,status_4,expected_completion_time,is_floating_cash,floating_duration_minutes,is_fraudulent_attempt,is_cancellation,is_retry_successful,manual_escalation_needed,transaction_types,recipient_bank_name/e-wallet_name
6,7c43df5e-2fce-4c28-9ed7-9a4f61f714c3,user_818,2024-06-03 14:38:29,9660.84,Auto-Reversal Processed,New Recipient,421290610344,GCash,89012345-6789-0123-4567-890123456789,"[7.0673, 125.6058]",...,Reversed (User Cancelled),2023-10-27 15:03:40,True,53,False,False,False,False,,
9,e2bb256f-2396-410c-99f6-02e079cd5f8b,user_6,2024-05-16 11:36:26,8858.74,Bank to e-Wallet (GCash),Frequent Recipient,171447953258,BPI,71434c7a-514d-4876-b60b-8d5423fef6b0,"[14.473552, 121.077227]",...,Credit Confirmed (Recipient),2024-06-25 16:24:31,True,118,False,False,False,False,,
17,7023c72b-8a8f-4ed1-8551-71e84a275218,user_9,2023-10-27 09:20:00,1758.21,Manual Escalation Triggered,Frequent Recipient,736696708681,Maya,79737190-252a-43cf-82b5-e6a351187d96,"[14.887856, 124.708899]",...,Failed (Network Error),2024-06-25 15:13:40,True,0,False,False,False,False,,
22,846b0a1d-720c-4394-a169-c00fdfcf2825,user_10,2024-07-28 10:14:13,774.28,Cash-In via Partner Outlet,New Recipient,370355416075,Vybe Wallet,8a3c8e5e-5264-42b7-a3a7-e92534f59345,"[17.514013, 124.607425]",...,Credit Confirmed (Recipient),2024-04-27 10:29:43,True,0,False,False,False,True,,
25,5c12808c-66f8-4503-b0e6-a83d7f02306d,user_3943,2023-10-27 10:27:32,2222.97,Auto-Reversal Processed,New Recipient,387123490786,Partner Outlet,cfb56c42-263a-4467-b50e-b06f85078500,"[17.589851, 120.730248]",...,Credit Confirmed (Recipient),2023-11-20 11:43:55,True,0,False,False,False,False,,
31,8a425313-ec84-4861-a5c9-1bb6bb2013f9,user_58702,2024-05-09 04:36:20,34.82,Bank to Bank (InstaPay),Frequent Recipient,466089201646,Maya,594d618d-6ac2-498c-9c78-ddb2c12579b1,"[17.653303, 116.891244]",...,Failed (Network Error),2024-07-28 01:39:04,True,0,False,False,False,False,,
32,987d3a04-5827-4c07-b649-14a01c43d8a6,user_6,2024-05-13 14:00:23,150.31,Vybe Wallet to ShopeePay,,892404071337,,59580b54-9456-4c40-a10c-f377a06a3861,"[6.027059, 121.751307]",...,Credit Confirmed (Recipient),2024-07-28 17:41:25,True,98,False,False,False,True,,
40,f6c919d3-35f1-432a-9289-5f2128a30d5d,user_43653,2024-06-19 07:10:00,4378.89,Vybe Wallet to Vybe Wallet,Frequent Recipient,445566778899,Maya,f0e4b77d-741c-4b5c-b179-8d19760773d2,"[14.549226, 120.940561]",...,Reversed (User Cancelled),2024-07-30 14:26:18,True,0,False,False,True,False,,
44,f666f4ef-ec8e-49b0-9b4b-9118d0f19a4e,user_8586,2024-05-13 07:12:09,5299.78,Vybe Wallet to Bank (BPI),Frequent Recipient,677843398902,Partner Outlet,995b00c3-f018-4a6f-a185-32e92c2a04ec,"[17.587372, 126.331495]",...,Credit Confirmed (Recipient),2023-10-27 09:11:42,True,0,False,False,False,False,,
53,55331ac7-d64e-4f01-8b21-4f910403756c,user_48689,2024-04-20 04:30:11,7497.66,Cash-In via Partner Outlet,Frequent Recipient,609564619934,Vybe Internal User,6e7f8a9b-0c1d-2e3f-4a5b-6c7d8e9f0a1b,"[17.513289, 122.999617]",...,Failed (Timeout),2023-11-01 19:50:08,True,0,False,False,True,False,,


In [None]:
from google.colab import files
import pandas as pd
import random

# Seed for the column shuffle so re-running the cell on the same file
# reproduces the same randomized output.
SHUFFLE_SEED = 42

# Step 1: Upload CSV
uploaded = files.upload()
if not uploaded:
    # Presumably an empty mapping means the dialog was cancelled; stop with a
    # clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; nothing to randomize.")
filename = next(iter(uploaded))

# Step 2: Read the CSV
df = pd.read_csv(filename)
print("Original Data (First 5 Rows):")
print(df.head())

# Step 3: Randomize each column independently while preserving structure.
# Every column gets its own seed drawn from one seeded generator: reusing a single
# seed would apply the same permutation to all columns and merely reorder rows.
seed_source = random.Random(SHUFFLE_SEED)
shuffled_df = df.copy()
for column in shuffled_df.columns:
    column_seed = seed_source.randrange(2**32)
    shuffled_df[column] = (
        shuffled_df[column]
        .sample(frac=1, random_state=column_seed)
        .reset_index(drop=True)  # drop the shuffled index so values land in new rows
    )

print("\nRandomized Data (First 5 Rows):")
print(shuffled_df.head())

# Step 4: Save to new CSV
output_filename = "randomized_" + filename
shuffled_df.to_csv(output_filename, index=False)

# Step 5: Offer download
files.download(output_filename)


Saving ra.csv to ra.csv
Original Data (First 5 Rows):
                         transaction_id    user_id  timestamp_initiated  \
0  55403e5c-9c71-4687-8461-9f93510e1a12     user_7  2024-05-18 21:10:07   
1  673a3885-f5b9-4a41-b844-486927a44f51     user_6  2023-11-19 22:50:53   
2  500dd70d-f28b-4a87-b129-c1878d6b9d62  user_4387  2024-07-25 21:02:45   
3  54b8a7f4-f3a7-4b72-a720-33303c7069dd    user_10  2023-10-26 10:30:17   
4  671a5c6d-961d-4054-9988-8255866175b9  user_4387  2024-06-25 15:15:37   

    amount                 transaction_type      recipient_type  \
0  1880.50             Auto-Retry Triggered  Frequent Recipient   
1  4378.07         Vybe Wallet to ShopeePay  Frequent Recipient   
2  3300.95  Bills Payment (via Vybe Wallet)  Frequent Recipient   
3  5298.67          Bank to e-Wallet (Maya)  Frequent Recipient   
4   401.76  Bills Payment (via Vybe Wallet)       New Recipient   

  recipient_account_id recipient_bank_name_or_ewallet  \
0       123456789012.0             

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from google.colab import files
import io

# Step 0: Upload the file
uploaded = files.upload()
if not uploaded:
    # Without this guard the loop below would not run, and the remaining steps
    # would silently operate on whatever `df` an earlier cell left behind.
    raise ValueError("No file was uploaded; nothing to clean.")

# Step 1: Read the uploaded CSV (if several files are uploaded, the last one wins)
for filename in uploaded:
    df = pd.read_csv(io.StringIO(uploaded[filename].decode('utf-8')))

# Step 2: Count NaNs per row
df["__missing_count"] = df.isna().sum(axis=1)

# Step 3: Sort by missing count (ascending: fewer NaNs first).
# A stable sort keeps the file's original order among rows with equal counts,
# so the 10,000-row cut below is deterministic.
df_sorted = df.sort_values(by="__missing_count", ascending=True, kind="stable")

# Step 4: Keep only top 10,000 rows
df_cleaned = df_sorted.head(10000).drop(columns=["__missing_count"])

# Step 5: Save cleaned dataset
df_cleaned.to_csv("cleaned_10000_rows.csv", index=False)

# Step 6: Down


Saving transactions_fixed.csv to transactions_fixed.csv


In [None]:
import pandas as pd
import io
from google.colab import files
import numpy as np # Import numpy for random number generation

# Seed for the balance generator so re-running the cell reproduces the same file.
WALLET_BALANCE_SEED = 42

# Step 1: Ask for the transactions file through Colab's upload dialog.
print("Please upload your 'transactions_fixed.csv' file.")
uploaded = files.upload()

# Step 2: Locate the uploaded file and decode its bytes as UTF-8.
# Colab may store a re-uploaded file under a suffixed name such as
# "transactions_fixed (1).csv", so match on the stem instead of the exact name.
matching_names = [
    name for name in uploaded
    if name.startswith('transactions_fixed') and name.endswith('.csv')
]
if not matching_names:
    # If the expected file is not found, raise an error to inform the user.
    raise FileNotFoundError("transactions_fixed.csv not found in uploaded files. Please ensure you upload the correct file.")
csv_content = uploaded[matching_names[0]].decode('utf-8')

# Step 3: Reject empty content up front; pandas would otherwise fail with a
# less helpful parsing error.
if not csv_content.strip():
    raise ValueError("Uploaded CSV content is empty or contains only whitespace. Cannot parse.")

# Step 4: Load the CSV text into a DataFrame via an in-memory file object.
df = pd.read_csv(io.StringIO(csv_content), engine='python')

# Step 5: Collect every distinct user_id; rows with a missing user_id are ignored
# so that no balance is generated for a null user.
unique_user_ids = df['user_id'].dropna().unique()

# Step 6: Draw one positive balance per user from a uniform distribution on
# [1.00, 10000.00), rounded to two decimals because the values represent currency.
rng = np.random.default_rng(WALLET_BALANCE_SEED)
random_wallet_balances = np.round(
    rng.uniform(low=1.00, high=10000.00, size=len(unique_user_ids)), 2
)

# Step 7: Pair each user_id with its generated balance.
wallet_balances = pd.DataFrame({
    'user_id': unique_user_ids,
    'wallet_balance': random_wallet_balances
})

# Step 8: Preview the result. The complete table goes to the CSV file below
# rather than being dumped into the cell output.
print("Generated Wallet Balances (first 5 rows with random positive balances):")
print(wallet_balances.head())
print(f"\nGenerated {len(wallet_balances)} wallet balances in total.")

# Step 9: Serialize the table to CSV text.
output_csv_string = wallet_balances.to_csv(index=False)

# Step 10: Write the CSV to the Colab filesystem and trigger a browser download.
try:
    # Define the filename for the output CSV.
    output_filename = 'user_wallet_balances.csv'
    # Write the CSV content to a local file in the Colab environment.
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(output_csv_string)
    # Trigger the download of the file.
    files.download(output_filename)
    print(f"\n'{output_filename}' has been generated and is ready for download.")
except Exception as e:
    print(f"\nError exporting CSV: {e}. This feature is intended for Google Colab environment.")


Please upload your 'transactions_fixed.csv' file.


Saving transactions_fixed.csv to transactions_fixed.csv
Generated Wallet Balances (first 5 rows with random positive balances):
      user_id  wallet_balance
0   user_7758       10.888344
1  user_19632     3773.758348
2  user_38848     8493.209273
3      user_8     9089.821685
4  user_43694     1773.442334

--- Full CSV Data for Wallet Balances ---
user_id,wallet_balance
user_7758,10.888344180977649
user_19632,3773.7583478812985
user_38848,8493.209272567832
user_8,9089.821685448847
user_43694,1773.4423338535373
user_65306,6238.42771778131
user_7368,6770.095620267329
user_3640,7473.388888200896
user_5,7061.5810948084245
user_8112,6088.942495914934
user_48529,6123.479355386687
user_2689,2463.879365363138
user_2109,4213.5038603741405
user_1562,3860.9276161135626
user_9037,1105.0076283870897
user_7609,3240.9170370835436
user_8750,6027.41173113966
user_8586,8482.572204489015
user_22,4707.146769010307
user_132205,2143.1079351726275
user_2,5365.125438666228
user_819448,6381.898184703481
user_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


'user_wallet_balances.csv' has been generated and is ready for download.


In [None]:
import pandas as pd
import io
# Removed: from google.colab import files (as it's no longer needed for automatic reading)

# File written by the wallet-balance generation cell; it must already exist in
# the working directory of the current session.
output_filename = 'user_wallet_balances.csv'

# Load the file, reporting the failure reason before re-raising so the cell stops.
try:
    df_check = pd.read_csv(output_filename, engine='python')
except FileNotFoundError:
    print(f"Error: '{output_filename}' not found. Please ensure the file is generated and present in the current Colab environment.")
    raise
except Exception as read_error:
    print(f"An error occurred while reading the CSV file: {read_error}")
    raise
else:
    print(f"Successfully loaded '{output_filename}' for data shape check.")

# Dimensions: the (rows, columns) tuple first, then the row count on its own.
row_total = len(df_check)
print(f"\nShape of the data (rows, columns): {df_check.shape}")
print(f"Number of rows: {row_total}")

# Column names, dtypes and non-null counts.
print("\nData Info:")
df_check.info()

# Quick look at the first records.
print("\nFirst 5 rows of the data:")
print(df_check.head())


Successfully loaded 'user_wallet_balances.csv' for data shape check.

Shape of the data (rows, columns): (1635, 2)
Number of rows: 1635

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1635 entries, 0 to 1634
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   user_id         1635 non-null   object 
 1   wallet_balance  1635 non-null   float64
dtypes: float64(1), object(1)
memory usage: 25.7+ KB

First 5 rows of the data:
      user_id  wallet_balance
0   user_7758       10.888344
1  user_19632     3773.758348
2  user_38848     8493.209273
3      user_8     9089.821685
4  user_43694     1773.442334
