In [None]:
# Enhanced Financial Data Generation Pipeline for M.Tech Project

## Key Improvements:
1. **Scale**: Generate 1M+ transactions per user, 1000+ users
2. **Realism**: Advanced behavioral patterns, seasonal trends, economic events
3. **Complexity**: Multi-account, investment data, loan management
4. **Query Diversity**: 50+ query types, complex financial reasoning
5. **Commercial Features**: Risk scoring, fraud detection, personalization

## Technical Innovation:
- **Temporal Financial Embeddings**: Custom representation learning
- **Behavioral Modeling**: Markov chains for realistic spending patterns  
- **Economic Event Simulation**: Market crashes, inflation, job changes
- **Multi-Modal Data**: Text, numerical, categorical, time-series



# **Install Dependencies**




In [None]:
!pip install faker pandas tqdm numpy scikit-learn seaborn matplotlib plotly
!pip install yfinance requests beautifulsoup4 nltk transformers
!pip install pymc3 arviz  # For advanced statistical modeling




# Import Libraries + Set Config

In [9]:
import pandas as pd
import random
import json
from faker import Faker
from datetime import datetime, timedelta
from tqdm import tqdm

fake = Faker()
Faker.seed(42)
random.seed(42)

# Configuration
NUM_MONTHS = 60
TRANSACTIONS_PER_MONTH = 100000
from datetime import date
today = datetime.today()
START_DATE = today - pd.DateOffset(months=60)  # Go 60 months back

CATEGORIES = {
    'groceries': ['Big Bazaar', 'Reliance Fresh', 'DMart'],
    'rent': ['Landlord Transfer'],
    'travel': ['Uber', 'Indigo', 'Ola'],
    'food': ['Zomato', 'Swiggy', 'KFC'],
    'utilities': ['Electricity Bill', 'Airtel', 'Water Bill'],
    'shopping': ['Amazon', 'Flipkart'],
    'emi': ['HDFC EMI', 'Bajaj Finance'],
    'salary': ['Company Payroll'],
}
PAYMENT_MODES = ['NetBanking', 'UPI', 'Card', 'Cash']


# Generate Monthly Transactions

In [10]:
from collections import defaultdict

    ## this is done so that model doesn't train on unrealistic data Unrealistic amounts (like ₹14,80,159 on food)
    ## Model may think “grocery spend = lakhs or crores” is normal - AVOID THIS USING BELOW LOGIC

def generate_monthly_transactions(month_offset):
    transactions = []
    month_start = START_DATE + pd.DateOffset(months=month_offset)
    current_date = month_start

    category_amounts = {
        'groceries': (500, 3000),
        'rent': (8000, 25000),
        'travel': (1000, 8000),
        'food': (300, 2000),
        'utilities': (500, 3000),
        'shopping': (500, 10000),
        'emi': (1000, 20000),
        'salary': (30000, 80000)
    }

    max_txn_per_category = defaultdict(lambda: 0)
    max_txn_limit = {
        'groceries': 20,
        'food': 15,
        'rent': 1,
        'salary': 1,
        'travel': 10,
        'shopping': 10,
        'utilities': 6,
        'emi': 2
    }

    for _ in range(TRANSACTIONS_PER_MONTH):
        category = random.choices(list(CATEGORIES.keys()), weights=[20, 15, 10, 15, 10, 10, 10, 10])[0]
        if max_txn_per_category[category] >= max_txn_limit[category]:
            continue  # skip this category if too many entries

        merchant = random.choice(CATEGORIES[category])
        min_amt, max_amt = category_amounts[category]
        amount = round(random.uniform(min_amt, max_amt), 2)
        trans_type = 'credit' if category == 'salary' else 'debit'
        payment_mode = random.choice(PAYMENT_MODES)
        txn_date = current_date + timedelta(days=random.randint(0, 27))

        txn = {
            "transaction_id": fake.uuid4(),
            "date": txn_date.strftime('%Y-%m-%d'),
            "merchant": merchant,
            "category": category,
            "amount": amount,
            "currency": "INR",
            "payment_mode": payment_mode,
            "type": trans_type
        }
        transactions.append(txn)
        max_txn_per_category[category] += 1

    return transactions




# Run this for X months and save to CSV

In [11]:
all_transactions = []

for m in tqdm(range(NUM_MONTHS), desc="Generating monthly transactions"):
    month_data = generate_monthly_transactions(m)
    all_transactions.extend(month_data)

df = pd.DataFrame(all_transactions)
df.to_csv("user1_bank_statement.csv", index=False)

print(f"✅ Generated {len(df)} transactions across {NUM_MONTHS} months.")
df.head()


Generating monthly transactions: 100%|██████████| 60/60 [00:14<00:00,  4.28it/s]

✅ Generated 3900 transactions across 60 months.





Unnamed: 0,transaction_id,date,merchant,category,amount,currency,payment_mode,type
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,2020-08-19,Electricity Bill,utilities,2353.88,INR,UPI,debit
1,23b8c1e9-3924-46de-beb1-3b9046685257,2020-08-30,Big Bazaar,groceries,2191.75,INR,NetBanking,debit
2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,2020-08-28,Uber,travel,1655.87,INR,UPI,debit
3,972a8469-1641-4f82-8b9d-2434e465e150,2020-08-19,Water Bill,utilities,997.09,INR,Cash,debit
4,17fc695a-07a0-4a6e-8822-e8f36c031199,2020-09-05,Indigo,travel,6666.01,INR,NetBanking,debit


# Generate Q&A Pairs

In [12]:
from collections import defaultdict
from dateutil.parser import parse

# Load the data
df = pd.read_csv("user1_bank_statement.csv")
df["date"] = pd.to_datetime(df["date"])
df["month"] = df["date"].dt.strftime('%B %Y')

# Group by month and category
monthly_summary = defaultdict(lambda: defaultdict(float))

for _, row in df.iterrows():
    if row['type'] == 'debit':
        monthly_summary[row['month']][row['category']] += row['amount']

# Sort months chronologically
months = sorted(monthly_summary.keys(), key=lambda x: parse(f"01 {x}"))

qa_data = []

for i in range(1, len(months)):
    this_month = months[i]
    prev_month = months[i - 1]

    for cat in CATEGORIES.keys():
        current_spend = monthly_summary[this_month].get(cat, 0.0)
        previous_spend = monthly_summary[prev_month].get(cat, 0.0)

        # 1. Total spent this month
        qa_data.append({
            "prompt": f"What was my total spend on {cat} in {this_month}?",
            "response": f"You spent ₹{current_spend:.2f} on {cat} in {this_month}."
        })

        # 2. Comparison to previous month (Skip comparison if previous spend is tiny)
        if previous_spend > 100:
            change = ((current_spend - previous_spend) / previous_spend) * 100
            if change > 0:
                resp = f"You spent ₹{current_spend:.2f} on {cat} in {this_month}, which is {abs(change):.1f}% more than {prev_month}."
            else:
                resp = f"You spent ₹{current_spend:.2f} on {cat} in {this_month}, which is {abs(change):.1f}% less than {prev_month}."

            qa_data.append({
                "prompt": f"Did I overspend on {cat} in {this_month} compared to {prev_month}?",
                "response": resp
            })


# Save the Q&A as JSONL

In [13]:
with open("user1_qa_dataset.jsonl", "w") as f:
    for qa in qa_data:
        f.write(json.dumps(qa) + "\n")

print(f"✅ Generated {len(qa_data)} Q&A pairs and saved to user1_qa_dataset.jsonl")


✅ Generated 879 Q&A pairs and saved to user1_qa_dataset.jsonl


In [14]:
# save to Google Drive

from google.colab import drive
drive.mount('/content/drive')

!cp /content/user1_qa_dataset.jsonl /content/drive/MyDrive/user1_qa_dataset.json


Mounted at /content/drive
