<a href="https://colab.research.google.com/github/santwan/upi-fraud-transaction-detection/blob/main/Dataset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install Faker

Collecting Faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-37.4.2


# 1. Imports and Global Config

## Importing all the necessary libraries for data generation

In [None]:

import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

## **Initializing Faker**

In [None]:
# Initialize Faker for India
fake = Faker('en_IN')

# Seed both random sources used by this notebook so that Restart & Run All
# regenerates the same dataset: the stdlib `random` module drives amounts,
# choices and timing, while Faker has its own generator for UUIDs and IPs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)

## **CONFIGURATION PARAMETERS**

In [None]:
# --- Configuration Parameters ---

# Number of synthetic sender profiles to create.
NUM_USERS = 5_000

# Number of iterations of the generation loop. Fraud scenarios may append
# extra companion records, so the final row count can be higher.
TOTAL_TRANSACTIONS = 150_000

# Probability that a single loop iteration is turned into a fraud scenario (1%).
FRAUD_RATIO = 0.01


## **BANK LIST IN INDIA**

In [None]:
# Bank names sampled uniformly for both the sender and the receiver side.
banks = [
    'HDFC Bank', 'ICICI Bank', 'SBI',
    'Axis Bank', 'Kotak Mahindra Bank', 'Punjab National Bank',
    'Bank of Baroda', 'IDFC FIRST Bank', 'IndusInd Bank',
    'Yes Bank', 'Union Bank of India', 'Canara Bank',
    'Bank of India', 'Federal Bank', 'Central Bank of India',
    'UCO Bank', 'Indian Bank', 'South Indian Bank',
    'Karur Vysya Bank', 'RBL Bank', 'Bandhan Bank',
    'AU Small Finance Bank', 'Jana Small Finance Bank', 'City Union Bank',
]


## **UPI DOMAIN, DEVICE TYPE, LOCATION, & FOREIGN CITIES**

In [None]:
# Handle suffixes appended to every generated UPI ID.
upi_id_domains = [
    '@ybl',
    '@okaxis',
    '@apl',
    '@sbi',
    '@hdfcbank',
    '@ptsbi',
]

# Client platforms a transaction can originate from.
device_types = [
    'Android',
    'iOS',
    'Web',
]

# Categories assigned to merchant (P2M) transactions.
merchant_categories = [
    'Food', 'Grocery', 'Fuel',
    'Entertainment', 'Shopping', 'Healthcare',
    'Education', 'Transport', 'Utilities',
]

# Home cities for user profiles (and for some phishing location changes).
indian_cities = [
    'Mumbai', 'Delhi', 'Bangalore', 'Kolkata', 'Chennai',
    'Hyderabad', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow',
    'Surat', 'Kanpur', 'Nagpur', 'Visakhapatnam', 'Bhopal',
    'Patna', 'Vadodara', 'Ludhiana', 'Agra', 'Nashik',
    'Faridabad', 'Meerut', 'Rajkot', 'Kalyan', 'Vasai-Virar',
    'Varanasi', 'Srinagar', 'Aurangabad', 'Dhanbad', 'Amritsar',
    'Allahabad', 'Ranchi', 'Gwalior', 'Jabalpur', 'Coimbatore',
    'Vijayawada', 'Jodhpur', 'Madurai', 'Raipur', 'Kota',
    'Guwahati', 'Chandigarh', 'Solapur', 'Hubballi-Dharwad', 'Tiruchirappalli',
    'Bareilly', 'Mysore', 'Tiruppur', 'Moradabad', 'Jalandhar',
    'Bhubaneswar', 'Salem', 'Warangal', 'Guntur', 'Bhiwandi',
    'Saharanpur', 'Gorakhpur', 'Bikaner', 'Amravati', 'Noida',
]

# Locations used by the account-takeover and international-anomaly scenarios.
foreign_cities = [
    'New York',
    'London',
    'Dubai',
    'Singapore',
]

## **User generation logic**

In [None]:
# Build the sender population and the pool of possible receivers.

# One profile per user. The index in the ID keeps IDs unique; the 4-digit
# suffix and the handle domain are random. Dict values are evaluated top to
# bottom, so the order of random draws per user is: suffix, domain, city, device.
users = [
    {
        'user_upi_id': f"user{idx}_{random.randint(1000,9999)}{random.choice(upi_id_domains)}",
        'typical_location': random.choice(indian_cities),
        'typical_device': random.choice(device_types),
    }
    for idx in range(NUM_USERS)
]

# Flat list of sender IDs, in the same order as `users`.
user_ids = [entry['user_upi_id'] for entry in users]

# Lookup from UPI ID to the full profile.
user_map = dict(zip(user_ids, users))

# Receivers: twice as many receive-only IDs as users, followed by the users
# themselves so that user-to-user transfers are possible.
receiver_ids_pool = []
for idx in range(NUM_USERS * 2):
    receiver_ids_pool.append(
        f"rec{idx}_{random.randint(1000,9999)}{random.choice(upi_id_domains)}"
    )
receiver_ids_pool.extend(user_ids)


In [None]:
# Sanity check: population size plus a few sample IDs and profiles.
summary_lines = [
    f"Total users generated: {len(users)}",
    f"Sample user IDs: {user_ids[:5]}",
    f"Sample receiver IDs: {receiver_ids_pool[:5]}",
    f"Sample user profiles:",
]
summary_lines.extend(str(sample_profile) for sample_profile in users[:3])
print("\n".join(summary_lines))


Total users generated: 5000
Sample user IDs: ['user0_7717@apl', 'user1_9527@sbi', 'user2_5134@okaxis', 'user3_4064@apl', 'user4_2799@apl']
Sample receiver IDs: ['rec0_1128@ptsbi', 'rec1_5541@sbi', 'rec2_2832@apl', 'rec3_9392@hdfcbank', 'rec4_3299@hdfcbank']
Sample user profiles:
{'user_upi_id': 'user0_7717@apl', 'typical_location': 'Meerut', 'typical_device': 'Android'}
{'user_upi_id': 'user1_9527@sbi', 'typical_location': 'Mumbai', 'typical_device': 'Web'}
{'user_upi_id': 'user2_5134@okaxis', 'typical_location': 'Guwahati', 'typical_device': 'Web'}


In [None]:
# Simulation window for transaction timestamps.
start_date = datetime(2025, 1, 1)
# The previous value, datetime(2025, 2, 30), raises ValueError because February
# has no 30th day. The recorded sample output contains timestamps into June
# 2025, which is consistent with a window that ends on 30 June.
end_date = datetime(2025, 6, 30)

# Mean spacing (in seconds) between consecutive loop iterations so that
# TOTAL_TRANSACTIONS steps span the whole window.
avg_time_per_txn = (end_date - start_date).total_seconds() / TOTAL_TRANSACTIONS

In [None]:
# Fraud scenario catalogue: each entry names a scenario and the relative
# weight with which it is drawn once a transaction is marked as fraud.
fraud_scenarios = [
    dict(type='ATO', weight=0.3),
    dict(type='Phishing', weight=0.2),
    dict(type='Small-Large', weight=0.2),
    dict(type='High_Velocity', weight=0.15),
    dict(type='International_Anomaly', weight=0.1),
    dict(type='Failed_Attempts_Burst', weight=0.05),
]

# Parallel lists in the form expected by random.choices(population, weights=...).
scenario_types = [entry['type'] for entry in fraud_scenarios]
scenario_weights = [entry['weight'] for entry in fraud_scenarios]


In [None]:
def _txn_record(*, timestamp, user_upi_id, receiver_upi_id, amount,
                sender_bank, receiver_bank, device_type, ip_address, location,
                is_fraud, transaction_type='P2P', status='SUCCESS',
                merchant_id=np.nan, merchant_category=np.nan):
    """Build one transaction row with a freshly generated ``transaction_id``.

    All arguments are keyword-only and are copied verbatim into the returned
    dict. The key order is fixed here so that every row produces the same
    DataFrame column order. The defaults (P2P, SUCCESS, no merchant) match the
    companion records emitted by the fraud scenarios below.

    Returns:
        dict: a single transaction record ready to append to ``transactions``.
    """
    return {
        'transaction_id': fake.uuid4(),
        'timestamp': timestamp,
        'user_upi_id': user_upi_id,
        'receiver_upi_id': receiver_upi_id,
        'amount': amount,
        'transaction_type': transaction_type,
        'status': status,
        'sender_bank': sender_bank,
        'receiver_bank': receiver_bank,
        'device_type': device_type,
        'ip_address': ip_address,
        'location': location,
        'merchant_id': merchant_id,
        'merchant_category': merchant_category,
        'is_fraud': is_fraud,
    }


transactions = []
current_timestamp = start_date
# Length of the simulation window in whole seconds (loop-invariant).
window_seconds = int((end_date - start_date).total_seconds())

for _ in range(TOTAL_TRANSACTIONS):
    # Move forward in time by a jittered step (50%-150% of the mean spacing).
    current_timestamp += timedelta(seconds=int(avg_time_per_txn * random.uniform(0.5, 1.5)))
    if current_timestamp > end_date:
        # Ran past the window: restart from a random instant inside it.
        current_timestamp = start_date + timedelta(seconds=random.randint(0, window_seconds))

    is_fraud = 1 if random.random() < FRAUD_RATIO else 0
    chosen_user = random.choice(user_ids)
    sender = user_map[chosen_user]

    # Baseline (legitimate-looking) attributes; fraud scenarios override some.
    transaction_type = random.choices(['P2P', 'P2M', 'Bill Payment', 'Recharge'], weights=[0.6, 0.2, 0.1, 0.1])[0]
    amount = round(random.uniform(10, 5000), 2)
    status = random.choices(['SUCCESS', 'FAILED'], weights=[0.98, 0.02])[0]
    sender_bank = random.choice(banks)
    receiver_bank = random.choice(banks)
    device_type = sender['typical_device']
    ip = fake.ipv4()
    location = f"{sender['typical_location']}_{random.uniform(20.0, 29.0):.4f},{random.uniform(70.0, 79.0):.4f}"
    receiver_upi_id = random.choice(receiver_ids_pool)

    # Apply fraud scenarios
    if is_fraud:
        scenario = random.choices(scenario_types, weights=scenario_weights, k=1)[0]
        receiver_upi_id = f"fraud_rec_{fake.uuid4().split('-')[0]}@fraud"

        # Fields shared by every companion record of this iteration. They are
        # captured before any scenario override; the scenarios that emit
        # companions (Small-Large, High_Velocity, Failed_Attempts_Burst) do not
        # change device, IP or location, so the values match the main record.
        companion = dict(
            user_upi_id=chosen_user,
            sender_bank=sender_bank,
            device_type=device_type,
            ip_address=ip,
            location=location,
            is_fraud=1,
        )

        if scenario == 'ATO':
            # Account takeover: different device, new IP, foreign location, large amount.
            device_type = random.choice([d for d in device_types if d != sender['typical_device']])
            ip = fake.ipv4()
            location = f"{random.choice(foreign_cities)}_{random.uniform(0, 90):.4f},{random.uniform(-180, 180):.4f}"
            amount = round(random.uniform(50000, 300000), 2)

        elif scenario == 'Phishing':
            # Large amount; device and/or city change with 20% probability each.
            amount = round(random.uniform(10000, 200000), 2)
            if random.random() < 0.2:
                device_type = random.choice([d for d in device_types if d != sender['typical_device']])
            if random.random() < 0.2:
                location = f"{random.choice(indian_cities)}_{random.uniform(20.5, 30.0):.4f},{random.uniform(70.5, 80.0):.4f}"

        elif scenario == 'Small-Large':
            # A small successful transfer 1-5 minutes before the large one.
            small_amt = round(random.uniform(1, 100), 2)
            transactions.append(_txn_record(
                timestamp=current_timestamp - timedelta(minutes=random.randint(1, 5)),
                receiver_upi_id=receiver_upi_id,
                amount=small_amt,
                receiver_bank=receiver_bank,
                **companion,
            ))
            amount = round(random.uniform(50000, 500000), 2)

        elif scenario == 'High_Velocity':
            # 3-6 extra transfers within a minute, each to a new receiver.
            burst_count = random.randint(3, 6)
            for _burst_idx in range(burst_count):
                transactions.append(_txn_record(
                    timestamp=current_timestamp + timedelta(seconds=random.randint(1, 60)),
                    receiver_upi_id=f"burst_rec_{fake.uuid4().split('-')[0]}@fraud",
                    amount=round(random.uniform(50, 5000), 2),
                    receiver_bank=random.choice(banks),
                    **companion,
                ))
            amount = round(random.uniform(100, 5000), 2)

        elif scenario == 'International_Anomaly':
            location = f"{random.choice(foreign_cities)}_{random.uniform(0, 90):.4f},{random.uniform(-180, 180):.4f}"
            amount = round(random.uniform(20000, 1000000), 2)

        elif scenario == 'Failed_Attempts_Burst':
            # 3-7 failed attempts in the preceding minute, then the main
            # transaction, which succeeds with probability 0.7.
            for _attempt_idx in range(random.randint(3, 7)):
                transactions.append(_txn_record(
                    timestamp=current_timestamp - timedelta(seconds=random.randint(10, 60)),
                    receiver_upi_id=receiver_upi_id,
                    amount=round(random.uniform(100, 50000), 2),
                    status='FAILED',
                    receiver_bank=receiver_bank,
                    **companion,
                ))
            status = 'SUCCESS' if random.random() < 0.7 else 'FAILED'
            amount = round(random.uniform(1000, 100000), 2)

    # Only merchant (P2M) payments carry merchant details.
    if transaction_type == 'P2M':
        merchant_id = fake.uuid4()
        merchant_category = random.choice(merchant_categories)
    else:
        merchant_id = np.nan
        merchant_category = np.nan

    # Append the main transaction
    transactions.append(_txn_record(
        timestamp=current_timestamp,
        user_upi_id=chosen_user,
        receiver_upi_id=receiver_upi_id,
        amount=amount,
        transaction_type=transaction_type,
        status=status,
        sender_bank=sender_bank,
        receiver_bank=receiver_bank,
        device_type=device_type,
        ip_address=ip,
        location=location,
        merchant_id=merchant_id,
        merchant_category=merchant_category,
        is_fraud=is_fraud,
    ))

In [None]:
# Materialize the generated records and report the overall fraud share.
# The count covers every row labelled is_fraud == 1, including the companion
# records that some fraud scenarios append.
df = pd.DataFrame(transactions)
fraud_total = df['is_fraud'].sum()
fraud_percent = df['is_fraud'].mean() * 100
print(f"✅ Generated {len(df)} transactions.")
print(f"🟥 Fraud cases (random only): {fraud_total} ({fraud_percent:.2f}%)")
print(df.sample(5))


✅ Generated 151728 transactions.
🟥 Fraud cases (random only): 3242 (2.14%)
                              transaction_id           timestamp  \
93324   8f714da0-0ecd-4fd6-ba4e-797461421902 2025-04-21 00:52:03   
133154  89326272-8d5f-4f7c-a040-bc5a777a13b0 2025-06-07 04:12:35   
71532   a9125941-7071-4751-9f40-fca3c2d674f7 2025-03-26 10:15:16   
52533   41ba9ae8-ec0a-4480-9509-572378a5e0f5 2025-03-04 01:46:53   
108014  440b9d37-2610-4581-80ad-002de6591ca5 2025-05-08 08:59:03   

                   user_upi_id         receiver_upi_id   amount  \
93324   user2964_8754@hdfcbank     user3095_6659@ptsbi  1097.70   
133154       user2678_9021@ybl  user1338_7063@hdfcbank  3580.84   
71532         user911_1955@ybl        rec8188_7113@apl   741.73   
52533     user1549_1576@okaxis    user4677_2093@okaxis  4437.33   
108014       user3206_9219@apl        rec4266_9586@sbi  1615.85   

       transaction_type   status        sender_bank    receiver_bank  \
93324               P2P  SUCCESS         

In [None]:
import pandas as pd

# Rebuild the frame from the raw records, order it chronologically, and
# renumber the index so row 0 is the earliest transaction.
df = (
    pd.DataFrame(transactions)
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

# Persist the labelled dataset without the index column.
csv_name = "upi_labeled_transactions.csv"
df.to_csv(csv_name, index=False)

labelled_fraud = df['is_fraud'].sum()
labelled_fraud_pct = df['is_fraud'].mean() * 100
print(f"✅ CSV saved as '{csv_name}'")
print(f"Total transactions: {len(df)} | Fraud: {labelled_fraud} ({labelled_fraud_pct:.2f}%)")


✅ CSV saved as 'upi_labeled_transactions.csv'
Total transactions: 151728 | Fraud: 3242 (2.14%)


In [None]:
# Preview the first ten rows of the transaction table.
df.head(10)


Unnamed: 0,transaction_id,timestamp,user_upi_id,receiver_upi_id,amount,transaction_type,status,sender_bank,receiver_bank,device_type,ip_address,location,merchant_id,merchant_category,is_fraud
0,d1fd3795-15b0-4672-a50f-2fe79667eff2,2025-01-01 00:01:23,user2656_5140@sbi,rec9851_6824@ybl,243.0,P2M,SUCCESS,Central Bank of India,ICICI Bank,Android,31.241.151.217,"Aurangabad_27.1549,74.5861",8da987a9-28e5-44a2-8b1e-350f3f9f476b,Healthcare,0
1,7c48985b-57e6-4282-aa6a-5718656d1277,2025-01-01 00:02:37,user4356_2704@sbi,rec6165_1227@okaxis,4748.0,Recharge,SUCCESS,Central Bank of India,Central Bank of India,Android,16.2.244.93,"Hubballi-Dharwad_21.6947,76.4588",,,0
2,7e6c13eb-55cf-4b2e-bfe1-104b9b0cdb6d,2025-01-01 00:03:41,user1672_8466@okaxis,rec1269_4826@okaxis,190.54,P2M,SUCCESS,UCO Bank,Kotak Mahindra Bank,Android,189.101.213.17,"Gorakhpur_21.6322,73.9702",6c1ca586-43e3-4a6e-85b2-c50fe8c25194,Transport,0
3,a440a63a-2f57-4d9d-8ca4-3bfc5c61d7f8,2025-01-01 00:06:00,user3485_1392@hdfcbank,rec2363_8801@ptsbi,4349.76,Bill Payment,SUCCESS,Punjab National Bank,City Union Bank,iOS,88.173.155.93,"Coimbatore_21.1142,74.4812",,,0
4,5b8fadb8-b694-4447-888a-8943656f3acb,2025-01-01 00:08:30,user1856_4500@ybl,rec8419_2563@sbi,4455.56,P2P,SUCCESS,ICICI Bank,Federal Bank,Web,95.137.121.143,"Moradabad_25.3105,75.3766",,,0
5,7075a1b6-5816-480e-bceb-d78cd2aa7d8b,2025-01-01 00:10:46,user1233_9853@okaxis,rec4326_6136@ybl,1598.81,P2M,SUCCESS,Kotak Mahindra Bank,City Union Bank,Web,48.136.222.231,"Nashik_22.2231,77.7004",d5dc19ff-8881-43f6-9b15-c9b391f87a8d,Transport,0
6,1ecde23e-882f-4119-8432-cddaad62a329,2025-01-01 00:11:55,user2046_1343@apl,rec9248_6694@sbi,3288.04,P2P,SUCCESS,Indian Bank,RBL Bank,Android,153.167.253.255,"Visakhapatnam_20.9825,74.6007",,,0
7,f0c23887-78bd-4909-b6db-6fd6a0d1a02e,2025-01-01 00:14:25,user2478_3200@ybl,rec2290_2012@ybl,575.88,P2P,SUCCESS,Bank of Baroda,Bank of Baroda,Web,63.109.202.96,"Lucknow_26.0084,70.7220",,,0
8,032e6a0b-24c2-4434-9d39-8965c8f9d067,2025-01-01 00:15:32,user2465_3660@ptsbi,user1201_1257@hdfcbank,4695.04,P2P,SUCCESS,Indian Bank,HDFC Bank,iOS,157.191.72.176,"Srinagar_25.1608,76.1615",,,0
9,9c203de7-4630-483c-8bd0-2726377b42aa,2025-01-01 00:17:57,user1869_6346@hdfcbank,rec8064_4491@apl,563.28,P2P,SUCCESS,South Indian Bank,City Union Bank,iOS,59.29.192.30,"Ahmedabad_25.3392,72.8148",,,0


In [None]:
from tabulate import tabulate

In [None]:

# Inspect one sender's full history. User IDs contain a random suffix, so a
# hard-coded ID will not exist after regeneration; use the sender of the
# first row instead, which is always present in the frame.
target_upi_id = df['user_upi_id'].iloc[0]

# Filter transactions for that user
user_txns = df[df['user_upi_id'] == target_upi_id]

# Display as pretty table
print(f"\n Showing {len(user_txns)} transactions for: {target_upi_id}\n")
print(tabulate(user_txns, headers='keys', tablefmt='grid', showindex=False))


 Showing 34 transactions for: user2656_5140@sbi

+--------------------------------------+---------------------+-------------------+--------------------------+-----------+--------------------+----------+-------------------------+-----------------------+---------------+-----------------+----------------------------+--------------------------------------+---------------------+------------+
| transaction_id                       | timestamp           | user_upi_id       | receiver_upi_id          |    amount | transaction_type   | status   | sender_bank             | receiver_bank         | device_type   | ip_address      | location                   | merchant_id                          | merchant_category   |   is_fraud |
| d1fd3795-15b0-4672-a50f-2fe79667eff2 | 2025-01-01 00:01:23 | user2656_5140@sbi | rec9851_6824@ybl         |    243    | P2M                | SUCCESS  | Central Bank of India   | ICICI Bank            | Android       | 31.241.151.217  | Aurangabad_27.1549,74.5861 | 8

In [None]:
# Offer the CSV as a browser download when running in Google Colab. Outside
# Colab the `google.colab` package is missing, so skip the download instead of
# aborting a Run All with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('upi_labeled_transactions.csv')
else:
    print("Not running in Google Colab; skipping browser download of 'upi_labeled_transactions.csv'.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Summarize column dtypes and non-null counts of the transaction table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151728 entries, 0 to 151727
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   transaction_id     151728 non-null  object        
 1   timestamp          151728 non-null  datetime64[ns]
 2   user_upi_id        151728 non-null  object        
 3   receiver_upi_id    151728 non-null  object        
 4   amount             151728 non-null  float64       
 5   transaction_type   151728 non-null  object        
 6   status             151728 non-null  object        
 7   sender_bank        151728 non-null  object        
 8   receiver_bank      151728 non-null  object        
 9   device_type        151728 non-null  object        
 10  ip_address         151728 non-null  object        
 11  location           151728 non-null  object        
 12  merchant_id        29949 non-null   object        
 13  merchant_category  29949 non-null   object  