In [131]:
!pip install faker

import pandas as pd
import numpy as np
import random
from faker import Faker
from geopy.distance import geodesic



In [132]:
# Initialize Faker
fake = Faker()

# Parameters
NUM_USERS = 1000
FRAUD_RATIO = 0.04  # 4% fraud

# Seed every random source this notebook draws from (stdlib `random`, NumPy's
# global generator and Faker) so that Restart & Run All regenerates the same
# synthetic dataset. Change SEED to obtain a different sample.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Bengaluru-specific regions: region name -> (latitude, longitude) tuple.
# These coordinates are passed to geopy's `geodesic` further down the notebook.
BENGALURU_REGIONS = {
    'Koramangala': (12.9288, 77.6228),'Jayanagar': (12.9333, 77.5833),'Whitefield': (12.9764, 77.7513),'Indiranagar': (12.9701, 77.6402),'Malleshwaram': (13.0034, 77.5723),
    'Hebbal': (13.0312, 77.5924),'Hennur': (13.0245, 77.6247),'Sarjapur Road': (12.9121, 77.6774),'Bannerghatta Road': (12.8786, 77.5900),'Electronic City': (12.8543, 77.6780),
    'Kalyan Nagar': (13.0272, 77.6463),'BTM Layout': (12.9341, 77.5910),'Vijayanagar': (12.9557, 77.5500),'Bellandur': (12.9336, 77.6543),'Kengeri': (12.9202, 77.4856),
    'Yelahanka': (13.1008, 77.5963),'Rajajinagar': (12.9917, 77.5568),'Marathahalli': (12.9561, 77.7017),'HSR Layout': (12.9121, 77.6446),'Nagawara': (13.0452, 77.6226),
    'Devanahalli': (13.2485, 77.7132),'Attibele': (12.7762, 77.7672),'Nelamangala': (13.0982, 77.3935),'Hoskote': (13.0707, 77.7850),'Anekal': (12.7110, 77.6956)
}

# Region sampling weights, one table per role (user / order / receiver).
# NOTE: the values in each table do NOT sum to 1, so they act as relative
# weights rather than probabilities; `random.choices(..., weights=...)`
# accepts relative weights. Every key below also exists in BENGALURU_REGIONS,
# but not every BENGALURU_REGIONS key appears in every table.
USER_REGION_CHOICES = {
    "Jayanagar": 0.15, "Malleshwaram": 0.15, "Rajajinagar": 0.15,
    "Whitefield": 0.07, "Marathahalli": 0.07, "Electronic City": 0.07,
    "Bellandur": 0.06, "HSR Layout": 0.06, "Sarjapur Road": 0.05,
    "Kengeri": 0.06, "Yelahanka": 0.05, "Hebbal": 0.05, "Hennur": 0.04, "Kalyan Nagar": 0.04,
    "Devanahalli": 0.03, "Attibele": 0.03, "Nelamangala": 0.02, "Hoskote": 0.02, "Anekal": 0.02
}
# Weights for the region an order is placed from.
ORDER_REGION_CHOICES = {
    "Whitefield": 0.15, "Marathahalli": 0.15, "Bellandur": 0.15, "Electronic City": 0.14, "Indiranagar": 0.13,
    "HSR Layout": 0.10, "Koramangala": 0.10, "Sarjapur Road": 0.10, "BTM Layout": 0.10, "Kalyan Nagar": 0.08,
    "Jayanagar": 0.07, "Malleshwaram": 0.07, "Rajajinagar": 0.07, "Hebbal": 0.06, "Yelahanka": 0.05,
    "Devanahalli": 0.02, "Anekal": 0.02, "Attibele": 0.02, "Nelamangala": 0.01
}
# Weights for the region of the receiving party.
RECEIVER_REGION_CHOICES = {
    "Whitefield": 0.18, "Marathahalli": 0.18, "Electronic City": 0.18, "Bellandur": 0.17, "HSR Layout": 0.16,
    "Koramangala": 0.12, "Indiranagar": 0.12, "BTM Layout": 0.12, "Sarjapur Road": 0.12,
    "Jayanagar": 0.08, "Malleshwaram": 0.08, "Rajajinagar": 0.08, "Yelahanka": 0.07, "Hebbal": 0.07,
    "Devanahalli": 0.04, "Anekal": 0.03, "Attibele": 0.03, "Nelamangala": 0.02, "Hoskote": 0.02
}

# Flat list of every known region name (in BENGALURU_REGIONS insertion order).
BENGALURU_REGION_NAMES = list(BENGALURU_REGIONS.keys())



# First names used as the local part of generated sender addresses
INDIAN_FIRST_NAMES = [
    'Amit', 'Ramesh', 'Suresh', 'Rajesh', 'Priya',
    'Kiran', 'Sunil', 'Anjali', 'Neha', 'Manish',
    'Meena', 'Pooja', 'Vikas', 'Ravi', 'Rahul',
    'Sonia', 'Deepak', 'Shreya', 'Kavita',
]
# Mail providers used as the domain part of generated sender addresses
INDIAN_EMAIL_DOMAINS = [
    'gmail.com', 'yahoo.co.in', 'rediffmail.com',
    'outlook.co.in', 'indiatimes.com', 'aol.in',
    'airtelmail.com', 'bsnl.in', 'zoho.com',
]


def generate_indian_email():
    """Return a random address shaped like ``<lower-cased first name>@<domain>``.

    The name is drawn first and the domain second, both uniformly via
    ``np.random.choice``. Addresses are not unique: only
    ``len(INDIAN_FIRST_NAMES) * len(INDIAN_EMAIL_DOMAINS)`` combinations exist.
    """
    local_part = np.random.choice(INDIAN_FIRST_NAMES).lower()
    provider = np.random.choice(INDIAN_EMAIL_DOMAINS)
    return "{}@{}".format(local_part, provider)



# Function to generate unique Indian phone number
def generate_unique_indian_phone_number(existing_numbers):
    """Draw a ``"+91 <10 digits>"`` number that is not yet in ``existing_numbers``.

    Args:
        existing_numbers: mutable set of numbers already handed out. The new
            number is added to it before returning, so the caller's set grows.

    Returns:
        The newly reserved phone number string.
    """
    candidate = f"+91 {random.randint(7000000000, 9999999999)}"
    # Redraw on collision until an unused number turns up.
    while candidate in existing_numbers:
        candidate = f"+91 {random.randint(7000000000, 9999999999)}"
    existing_numbers.add(candidate)
    return candidate


# Pre-build a pool of NUM_USERS * 3 distinct phone numbers (three per user on
# average) and expose it as a list under `unique_phone_numbers`.
phone_pool = set()
pool_size = NUM_USERS * 3
while len(phone_pool) < pool_size:
    # Each call adds exactly one new number to the pool.
    generate_unique_indian_phone_number(phone_pool)
unique_phone_numbers = list(phone_pool)


# Card-network sampling weights: exponential decay over the order in which the
# networks are listed, so earlier entries are drawn more often.
CARD_NETWORKS = ["Visa", "Mastercard", "American Express", "Rupay"]
decay_factor = 1.0  # Larger values skew harder towards the first entry
synthetic_distribution = np.exp(np.arange(len(CARD_NETWORKS)) * -decay_factor)
synthetic_distribution = synthetic_distribution / synthetic_distribution.sum()  # Normalize


# Card-type sampling weights: same exponential decay, then every category whose
# share is strictly below the median share is boosted by (1 + damping_factor).
CARD_TYPES = ['Debit', 'Credit', 'Prepaid']
decay_factor = 0.8  # Skew for the card-type decay
raw_counts = np.exp(np.arange(len(CARD_TYPES)) * -decay_factor)
raw_counts = raw_counts / raw_counts.sum()  # Normalize to sum to 1
damping_factor = 2.0  # Boost applied to below-median categories
is_below_median = raw_counts < np.median(raw_counts)
adjusted_counts = np.where(is_below_median, raw_counts * (1 + damping_factor), raw_counts)
card_probabilities = adjusted_counts / adjusted_counts.sum()  # Normalize final probabilities


# Generate user data
# One primary card per user. The BIN is the first six digits of that same card,
# so it is derived from the list instead of being generated from a second,
# unrelated batch of fake card numbers and overwritten afterwards.
primary_card_numbers = [fake.credit_card_number(card_type="visa16") for _ in range(NUM_USERS)]

users_data = {
    "User_ID": np.arange(1000, NUM_USERS + 1000),  # IDs run from 1000 to 1000 + NUM_USERS - 1
    "Sender_email": [generate_indian_email() for _ in range(NUM_USERS)],
    "Card Number": primary_card_numbers,
    "BIN Number": [str(card)[:6] for card in primary_card_numbers],
    "Card Network": np.random.choice(CARD_NETWORKS, NUM_USERS, p=synthetic_distribution),
    "Card Tier": np.random.choice(['Silver', 'Gold', 'Platinum', 'Black'], NUM_USERS),
    "Card Type": np.random.choice(CARD_TYPES, size=NUM_USERS, p=card_probabilities),
    "User_Region": np.random.choice(BENGALURU_REGION_NAMES, NUM_USERS),
    # Number of transactions to generate per user: Poisson(10), forced into [1, 50]
    "transactions": np.random.poisson(lam=10, size=NUM_USERS).clip(1, 50),
}


# Assign cards: every user gets between one and three fake Visa card numbers
users_df = pd.DataFrame(users_data)
user_card_mapping = {
    user_id: [
        fake.credit_card_number(card_type="visa16")
        for _ in range(np.random.randint(1, 4))
    ]
    for user_id in users_df["User_ID"]
}


# Step 1: give every user between one and three phone numbers that no other
# user shares (uniqueness is tracked through `used_phone_numbers`).
used_phone_numbers = set()
user_phone_mapping = {
    user_id: [
        generate_unique_indian_phone_number(used_phone_numbers)
        for _ in range(np.random.randint(1, 4))
    ]
    for user_id in users_df["User_ID"]
}


# Step 2: link each card to one of its owner's phone numbers; several cards of
# the same user may end up sharing a number.
card_phone_mapping = {
    card: random.choice(user_phone_mapping[user_id])
    for user_id, cards in user_card_mapping.items()
    for card in cards
}


# Ensure correct card network assignment
def get_card_network(card_number, user_card_mapping, users_df):
    """Look up the card network of the user who owns ``card_number``.

    Args:
        card_number: card to look up.
        user_card_mapping: dict of user id -> collection of that user's cards.
        users_df: frame with ``User_ID`` and ``Card Network`` columns.

    Returns:
        The ``Card Network`` value of the first user (in mapping order) whose
        cards contain ``card_number``. When no user owns the card, a network
        is drawn at random from ``CARD_NETWORKS`` using
        ``synthetic_distribution``.
    """
    owner_id = next(
        (uid for uid, owned_cards in user_card_mapping.items() if card_number in owned_cards),
        None,
    )
    if owner_id is None:
        return np.random.choice(CARD_NETWORKS, p=synthetic_distribution)  # Default if not found
    owner_rows = users_df['User_ID'] == owner_id
    return users_df.loc[owner_rows, 'Card Network'].values[0]


# Product-category sampling weights: exponential decay over the listed order
# (so "Wallet" is the most likely), after which every category whose share is
# strictly below the median share is boosted by (1 + damping_factor).
PRODUCT_CATEGORIES = ['Wallet', 'Consumable', 'Retail', 'Household', 'Services', 'Miscellaneous']
decay_factor = 0.9  # Larger values skew harder towards the first entry
raw_counts = np.exp(np.arange(len(PRODUCT_CATEGORIES)) * -decay_factor)
raw_counts = raw_counts / raw_counts.sum()  # Normalize to sum to 1
damping_factor = 1.5  # Boost applied to below-median categories
is_below_median = raw_counts < np.median(raw_counts)
adjusted_counts = np.where(is_below_median, raw_counts * (1 + damping_factor), raw_counts)
product_probabilities = adjusted_counts / adjusted_counts.sum()  # Normalize final probabilities




#Assigning regions
user_region_mapping = {}  # user id -> the User_Region fixed for that user


def _sample_region(region_weights, exclude=()):
    """Draw one region from ``region_weights`` while skipping ``exclude``.

    Filtering the candidates up front yields the same distribution as
    redrawing until a non-excluded region appears, without an open-ended loop.

    Args:
        region_weights: dict of region name -> relative weight.
        exclude: region names that must not be returned.

    Raises:
        ValueError: if every region in ``region_weights`` is excluded.
    """
    candidates = [name for name in region_weights if name not in exclude]
    if not candidates:
        raise ValueError("No region left to sample after applying exclusions")
    weights = [region_weights[name] for name in candidates]
    return random.choices(candidates, weights=weights, k=1)[0]


def assign_regions(user_id):
    """
    Assign User_Region, Order_Region, and Receiver_Region based on realistic scenarios.
    Ensures that each user has a fixed User_Region.

    The user's region is drawn once from USER_REGION_CHOICES and cached in the
    module-level ``user_region_mapping``. One of five scenarios is then drawn;
    its name states which of the three regions coincide, and every region that
    the scenario does not declare equal is guaranteed to differ:

    * ``all_same``            - user == order == receiver
    * ``user_order_same``     - user == order, receiver differs
    * ``order_receiver_same`` - order == receiver, both differ from user
    * ``user_receiver_same``  - user == receiver, order differs
    * ``all_different``       - three pairwise different regions

    Returns:
        Tuple ``(user_region, order_region, receiver_region)``.
    """
    # Assign User_Region permanently if not already assigned
    if user_id not in user_region_mapping:
        user_region_mapping[user_id] = _sample_region(USER_REGION_CHOICES)
    user_region = user_region_mapping[user_id]

    # Define probabilities for different scenarios
    scenario = random.choices(
        ["all_same", "user_order_same", "order_receiver_same", "user_receiver_same", "all_different"],
        weights=[0.35, 0.20, 0.15, 0.15, 0.15], k=1)[0]

    if scenario == "all_same":
        order_region = user_region
        receiver_region = user_region

    elif scenario == "user_order_same":
        order_region = user_region
        receiver_region = _sample_region(RECEIVER_REGION_CHOICES, exclude={user_region})

    elif scenario == "order_receiver_same":
        # Pick the shared region away from the user's region. Previously only
        # the receiver was redrawn on a clash, which left order != receiver.
        order_region = _sample_region(ORDER_REGION_CHOICES, exclude={user_region})
        receiver_region = order_region

    elif scenario == "user_receiver_same":
        receiver_region = user_region
        order_region = _sample_region(ORDER_REGION_CHOICES, exclude={user_region})

    else:  # "all_different"
        # The order region must also avoid the user's region; previously it
        # was drawn unconstrained and could coincide with it.
        order_region = _sample_region(ORDER_REGION_CHOICES, exclude={user_region})
        receiver_region = _sample_region(
            RECEIVER_REGION_CHOICES, exclude={user_region, order_region})

    return user_region, order_region, receiver_region

# Generate transaction data

# Merchants available under each ProductCD category. Defined once, outside the
# transaction loop, so that merchants introduced by `assign_merchant` persist
# and the per-category counters keep advancing across transactions.
MERCHANT_MAPPING = {
    "Wallet": ["Flipkart", "Amazon","Google Play", "BigBasket", "Uber", "Zomato", "Swiggy Instamart"],  # Merchants paid through a wallet
    "Consumable": ["BigBasket", "Blinkit", "DMart", "JioMart", "Swiggy Instamart", "Zepto", "Nature’s Basket", "MilkBasket"],
    "Retail": ["Flipkart", "Amazon", "Reliance Digital", "Croma", "Tata Cliq", "Myntra", "Nykaa", "Ajio", "Meesho", "Snapdeal"],
    "Household": ["Pepperfry", "Urban Ladder", "IKEA", "Wakefit", "Home Centre", "Nilkamal", "Durian", "Godrej Interio", "Hometown"],
    "Services": ["Netflix", "Amazon Prime", "Hotstar", "Spotify", "Zee5", "JioSaavn", "Unacademy", "Byju's", "ALT Balaji", "Sony LIV", "Audible", "Coursera", "Udemy", "Skillshare"],
    "Miscellaneous": ["Dream11", "RummyCircle", "PokerBaazi", "MPL", "Decathlon", "FirstCry", "Tata 1mg", "1x BET", "Betway", "Lottoland", "WinZO", "Nazara Games", "Netmeds", "Practo", "PharmEasy"]
}

# Next sequence number for a newly introduced merchant, per category (starts at 1)
new_merchant_counters = {category: 1 for category in MERCHANT_MAPPING}

# Probability of introducing a new merchant
new_merchant_prob = 0.05  # 5% chance


def assign_merchant(product_cd):
    """Pick a merchant for ``product_cd``.

    With probability ``new_merchant_prob`` a new merchant named
    ``<product_cd>_Merchant<NN>`` is created, registered in MERCHANT_MAPPING
    and returned; otherwise an existing merchant of the category is drawn
    uniformly.
    """
    if random.random() < new_merchant_prob:
        merchant_name = f"{product_cd}_Merchant{new_merchant_counters[product_cd]:02d}"  # Example: Wallet_Merchant01
        new_merchant_counters[product_cd] += 1  # Increment counter for next merchant
        MERCHANT_MAPPING[product_cd].append(merchant_name)  # Store it for future use
        return merchant_name
    return random.choice(MERCHANT_MAPPING[product_cd])


def generate_merchant_email(product_category, merchant_name):
    """Generate an email address in the format category@merchant.com."""
    # Uses the `product_category` argument rather than reading the loop's
    # `transaction` variable, so the function depends only on its parameters.
    category_prefix = product_category.lower()
    domain = merchant_name.lower().replace(" ", "") + ".com"  # Lower-case and strip spaces
    return f"{category_prefix}@{domain}"


def assign_merchant_email(transaction):
    """Set ``transaction["Merchant_email"]`` in place and return the transaction.

    If the transaction's merchant is not listed under its ProductCD, a random
    merchant of that category is used for the address. Unknown categories get
    ``unknown@unknown.com``.
    """
    product_category = transaction.get("ProductCD", None)
    merchant_name = transaction.get("Merchant", None)
    if product_category in MERCHANT_MAPPING:
        valid_merchants = MERCHANT_MAPPING[product_category]
        if merchant_name in valid_merchants:
            selected_merchant = merchant_name
        else:
            selected_merchant = random.choice(valid_merchants)  # Pick a random one if invalid/missing
        transaction["Merchant_email"] = generate_merchant_email(product_category, selected_merchant)
    else:
        transaction["Merchant_email"] = "unknown@unknown.com"  # Fallback for unknown categories

    return transaction


# Loop-invariant parameters of the amount distribution and the date window
mu = np.log(135.03)  # Log-mean of the lognormal amount distribution
fraud_sigma = np.log(1 + (500 / 135.03))  # Wider spread for fraud rows
legit_sigma = np.log(1 + (239.16 / 135.03))  # Spread for legitimate rows
start_date = pd.to_datetime("2024-01-01")
end_date = pd.to_datetime("2024-12-31")

data = []
transaction_id = 10000

for _, user in users_df.iterrows():
    user_id = user["User_ID"]
    sender_email = user["Sender_email"]  # Use the same email for all cards of the user
    num_user_transactions = int(user["transactions"])

    for _ in range(num_user_transactions):
        # Pick a card randomly for this transaction
        card_number = random.choice(user_card_mapping[user_id])

        # The regions come from assign_regions, which keeps one fixed
        # User_Region per user; users_df["User_Region"] is not used here.
        user_region, order_region, receiver_region = assign_regions(user_id)

        transaction = {
            "TransactionID": transaction_id,
            "User_ID": user_id,
            "User_Region": user_region,
            "Sender_email": sender_email,
            "Card Number": card_number,
            # BIN = first six digits of the card actually used, so it always
            # agrees with "Card Number".
            "BIN Number": str(card_number)[:6],
            # The card was drawn from this user's own cards, so the network is
            # the user's; no scan over every user's cards is needed.
            "Card Network": user["Card Network"],
            "Card Tier": user["Card Tier"],
            "Card Type": user["Card Type"],
            "Phone Numbers": card_phone_mapping[card_number],
            "Order_Region": order_region,
            "Receiver_Region": receiver_region,
        }
        transaction_id += 1

        # Distance Calculation (Between Order_Region and Receiver_Region)
        if order_region == receiver_region:
            transaction["Distance"] = np.round(np.random.uniform(0.1, 5), 2)  # Small within-region distance
        else:
            order_coords = BENGALURU_REGIONS[order_region]
            receiver_coords = BENGALURU_REGIONS[receiver_region]
            transaction["Distance"] = np.round(geodesic(order_coords, receiver_coords).km, 2)  # Actual distance

        # Every row starts out labelled legitimate; FRAUD_RATIO is not applied
        # in this cell.
        transaction["isFraud"] = 0

        # Assign TransactionAmt with different variance for fraud vs legit
        amount = np.random.lognormal(
            mean=mu,
            sigma=fraud_sigma if transaction["isFraud"] else legit_sigma,
            size=1)[0]
        transaction['TransactionAmt'] = np.clip(amount, 0.251, 31937.391)  # Clip to the allowed min/max range

        transaction["ProductCD"] = np.random.choice(PRODUCT_CATEGORIES, p=product_probabilities)

        # Assign Merchant based on ProductCD, then the matching merchant email
        transaction["Merchant"] = assign_merchant(transaction["ProductCD"])
        assign_merchant_email(transaction)

        # Random timestamp between start_date and end_date, stored as a string
        timestamp = start_date + (end_date - start_date) * np.random.rand()
        transaction["TransactionDT"] = timestamp.strftime('%Y-%m-%d %H:%M:%S')
        data.append(transaction)

# Convert list to DataFrame
df = pd.DataFrame(data)

# Candidate DeviceInfo strings, listed from most to least common
device_info_list = [
    "Windows", "iOS Device", "MacOS", "Trident/7.0", "rv:11.0", "rv:57.0",
    "SM-J700M Build/MMB29K", "SM-G610M Build/MMB29K", "SM-G531H Build/LMY48B",
    "rv:59.0", "SM-G935F Build/NRD90M", "SM-G955U Build/NRD90M", "SM-G532M Build/MMB29T",
    "ALE-L23 Build/HuaweiALE-L23", "SM-G950U Build/NRD90M", "SM-G930V Build/NRD90M",
    "rv:58.0", "rv:52.0", "SAMSUNG", "SM-G950F Build/NRD90M"
]

# Build sampling weights: exponential decay over list position, scaled to the
# number of transactions and truncated to whole counts, plus a flat
# `damping_factor` added to every device so rare entries keep a non-zero share.
decay_factor = 1.2  # Larger values skew harder towards the first entries
raw_counts = np.exp(np.arange(len(device_info_list)) * -decay_factor)
raw_counts = raw_counts / raw_counts.sum()  # Normalize to sum to 1
total_transactions = len(df)
damping_factor = 10
device_info_counts = {
    device: int(share * total_transactions) + damping_factor
    for device, share in zip(device_info_list, raw_counts)
}
total_count = sum(device_info_counts.values())
device_info_probabilities = {
    device: count / total_count for device, count in device_info_counts.items()
}

# Assign DeviceInfo based on generated probabilities
device_names, device_weights = zip(*device_info_probabilities.items())
df["DeviceInfo"] = np.random.choice(
    list(device_names),
    size=len(df),
    p=list(device_weights))

# Infer DeviceType based on DeviceInfo
def infer_device_type(device_info):
    """Classify a DeviceInfo string as ``"desktop"`` or ``"mobile"``.

    Desktop markers are checked first, then mobile markers; a string matching
    neither group is assigned at random (70% desktop, 30% mobile).
    """
    desktop_markers = ("Windows", "MacOS", "rv:")
    mobile_markers = ("SM-", "SAMSUNG", "Build/", "iOS")

    if any(marker in device_info for marker in desktop_markers):
        return "desktop"
    if any(marker in device_info for marker in mobile_markers):
        return "mobile"
    # No recognisable marker: fall back to a draw skewed towards desktop
    return np.random.choice(["desktop", "mobile"], p=[0.7, 0.3])

# Derive DeviceType element-wise from the DeviceInfo assigned above
df["DeviceType"] = df["DeviceInfo"].map(infer_device_type)

In [133]:
# Blank out a fixed share of rows in selected columns.
# Maps column name -> percentage of rows to set to NaN. Rows are drawn
# independently for each column, so DeviceType and DeviceInfo are not
# necessarily missing on the same rows.
MISSING_PERCENTAGES = {
    "Sender_email": 16,
    "DeviceType": 15,
    "DeviceInfo": 15
}

for column, percentage in MISSING_PERCENTAGES.items():
    rows_to_blank = np.random.choice(
        df.index,
        int(len(df) * (percentage / 100)),  # number of rows, truncated to an integer
        replace=False,                      # never pick the same row twice
    )
    df.loc[rows_to_blank, column] = np.nan

**D columns**

In [134]:
df['TransactionDT'] = pd.to_datetime(df['TransactionDT'], format='%Y-%m-%d %H:%M:%S') # Ensure TransactionDT is in datetime format


def _days_since_previous(frame, group_col):
    """Whole days between each row and the previous row of the same group.

    Rows are ordered chronologically before differencing, so "previous" always
    means the preceding transaction in time within ``group_col``. The result
    keeps ``frame``'s index labels, so assigning it back as a column aligns it
    regardless of the frame's current row order.

    The first row of every group, and rows whose ``group_col`` is NaN
    (groupby drops NaN keys), come out as NaN. Values are floored at 0.
    """
    chronological = frame.sort_values('TransactionDT')
    gaps = chronological.groupby(group_col)['TransactionDT'].diff().dt.days
    return gaps.clip(lower=0)


# Index-aligned assignment below needs unambiguous row labels.
assert df.index.is_unique, "df index must be unique to align the D features"

# Keep the frame ordered by card and time, as this cell always left it.
df = df.sort_values(by=['Card Number', 'TransactionDT'])

# 'D2' - days since the same user's previous transaction
df['Days_Since_LastTransac(D2)'] = _days_since_previous(df, 'User_ID')

# 'D3' - days since the previous transaction on the same card
df['SameCard_DaysDiff(D3)'] = _days_since_previous(df, 'Card Number')

# 'D4' - days since the previous transaction from the same Order_Region.
# D4, D10 and D11 used to be differenced on the card-sorted frame, so
# consecutive rows of a group were not in time order and the resulting
# negative gaps were clipped to 0.
df['SameAddress_DaysDiff(D4)'] = _days_since_previous(df, 'Order_Region')

# 'D10' - days since the previous transaction to the same Merchant_email
df['SameReceiverEmail_DaysDiff(D10)'] = _days_since_previous(df, 'Merchant_email')

# 'D11' - days since the previous transaction with the same DeviceType
df['SameDeviceType_DaysDiff(D11)'] = _days_since_previous(df, 'DeviceType')

**M columns**

In [135]:
def _most_frequent(values):
    """Return the first mode of ``values``, or 'Unknown' when there is none."""
    modes = values.mode()
    return 'Unknown' if modes.empty else modes[0]


# M4
# Per user, the most frequently seen DeviceType and DeviceInfo
most_used_device_info = (
    df.groupby('User_ID')
      .agg({'DeviceType': _most_frequent, 'DeviceInfo': _most_frequent})
      .reset_index()
)

# Attach the per-user favourites as *_MostUsed columns
df = pd.merge(df, most_used_device_info, on='User_ID', how='left', suffixes=('', '_MostUsed'))

# 'Device Matching(M4)' is 1 only when both DeviceType and DeviceInfo equal the
# user's most used values
same_type = df['DeviceType'] == df['DeviceType_MostUsed']
same_info = df['DeviceInfo'] == df['DeviceInfo_MostUsed']
df['Device Matching(M4)'] = (same_type & same_info).astype(int)
df = df.drop(columns=['DeviceType_MostUsed', 'DeviceInfo_MostUsed'])  # Helper columns no longer needed


# M6
# 'Device Mismatch(M6)' is 1 when the row's DeviceType differs from the user's
# most frequent DeviceType. A missing DeviceType compares as different.
most_used_device = (
    df.groupby('User_ID')['DeviceType']
      .agg(_most_frequent)
      .rename('MostUsedDevice')
      .reset_index()
)
df = pd.merge(df, most_used_device, on='User_ID', how='left')

df['Device Mismatch(M6)'] = (df['DeviceType'] != df['MostUsedDevice']).astype(int)
df = df.drop(columns=['MostUsedDevice'])  # Helper column no longer needed

In [136]:
# M8
def calculate_user_order_distance(user_region, receiver_region, threshold_km=40):
    """Flag whether the user's and the receiver's regions are far apart.

    Despite the name, the comparison is between the user region and the
    receiver region.

    Args:
        user_region: region name of the user.
        receiver_region: region name of the receiver.
        threshold_km: distance above which the pair counts as a mismatch.

    Returns:
        1 if the geodesic distance (rounded to 2 decimals) exceeds
        ``threshold_km``, 0 otherwise, and NaN when either region is not a key
        of BENGALURU_REGIONS.
    """
    endpoints = [BENGALURU_REGIONS.get(name) for name in (user_region, receiver_region)]
    if any(point is None for point in endpoints):
        return np.nan
    separation_km = np.round(geodesic(*endpoints).km, 2)
    return int(separation_km > threshold_km)

# Region Mismatch (M8): one flag per row
df['RegionMismatch(M8)'] = [
    calculate_user_order_distance(user_region, receiver_region)
    for user_region, receiver_region in zip(df['User_Region'], df['Receiver_Region'])
]


# M9

def _typical_value(values):
    """Return the first mode of ``values``, or 'Unknown' when there is none."""
    modes = values.mode()
    return 'Unknown' if modes.empty else modes[0]


df['TransactionDT'] = pd.to_datetime(df['TransactionDT'])  # Ensure 'TransactionDT' is a datetime object
# Scratch column: seconds elapsed since the earliest transaction in the frame
seconds_from_start = (df['TransactionDT'] - df['TransactionDT'].min()).dt.total_seconds()
df['_TransactionDT_numeric'] = seconds_from_start

# Per-user reference profile. Note that despite the "_mode" naming, the amount
# and the time references are per-user means; only the categorical columns
# use the mode.
user_patterns = (
    df.groupby('User_ID')
      .agg({
          'TransactionAmt': 'mean',
          'ProductCD': _typical_value,
          'DeviceType': _typical_value,
          'DeviceInfo': _typical_value,
          '_TransactionDT_numeric': 'mean',
      })
      .reset_index()
      .rename(columns={'_TransactionDT_numeric': 'TransactionDT_mode'})
)
df = df.merge(user_patterns, on='User_ID', suffixes=('', '_mode'))  # Reference columns get a "_mode" suffix

# Thresholds for the two numeric deviations
threshold_amount = 0.2  # Relative deviation from the user's mean amount
threshold_time = 6 * 3600  # 6 hours in seconds

# A row is flagged 1 only when ALL five deviations hold at once.
# NOTE(review): confirm that AND (rather than OR) is the intended combination.
amount_deviates = (df['TransactionAmt'] - df['TransactionAmt_mode']).abs() > threshold_amount * df['TransactionAmt_mode']
product_differs = df['ProductCD'] != df['ProductCD_mode']
device_type_differs = df['DeviceType'] != df['DeviceType_mode']
device_info_differs = df['DeviceInfo'] != df['DeviceInfo_mode']
time_deviates = (df['_TransactionDT_numeric'] - df['TransactionDT_mode']).abs() > threshold_time

df['TransactionConsistency(M9)'] = (
    amount_deviates
    & product_differs
    & device_type_differs
    & device_info_differs
    & time_deviates
).astype(int)  # 1 if inconsistent, 0 if consistent

# Remove the scratch and reference columns
df = df.drop(columns=['_TransactionDT_numeric', 'TransactionAmt_mode', 'ProductCD_mode', 'TransactionDT_mode',
                      'DeviceType_mode', 'DeviceInfo_mode'])

**C columns**

In [137]:
# C1 / C5 / C6: number of transactions sharing the listed key columns.
#   C1  - same card and same Order_Region
#   C5  - same user and same Order_Region
#   C6  - same user, DeviceType and DeviceInfo
transaction_count_keys = {
    'Transaction Count(card,U_Region C1)': ['Card Number', 'Order_Region'],
    'Same B_region count(C5)': ['User_ID', 'Order_Region'],
    'Same Device count(C6)': ['User_ID', 'DeviceType', 'DeviceInfo'],
}
for feature_name, key_columns in transaction_count_keys.items():
    df[feature_name] = df.groupby(key_columns)['TransactionID'].transform('count')

# C4 / C11: number of distinct values seen per card.
#   C4  - distinct merchants
#   C11 - distinct Order_Regions
distinct_per_card = {
    'Unique Merchants(per card C4)': 'Merchant',
    'Unique B_region(same card C11)': 'Order_Region',
}
for feature_name, source_column in distinct_per_card.items():
    df[feature_name] = df.groupby('Card Number')[source_column].transform('nunique')


df['TransactionDT'] = pd.to_datetime(df['TransactionDT'], format='%Y-%m-%d %H:%M:%S')  # Ensure 'TransactionDT' is in datetime format


# Final column layout: identifiers and raw fields, then the D, M and C
# features, with the label last.
desired_order = [
    'TransactionID', 'TransactionAmt', 'TransactionDT', 'ProductCD', 'User_ID', 'Merchant',
    'Card Number', 'BIN Number', 'Card Network', 'Card Tier', 'Card Type', 'Phone Numbers',
    'User_Region', 'Order_Region', 'Receiver_Region', 'Distance', 'Sender_email', 'Merchant_email',
    'DeviceType', 'DeviceInfo',
    'Days_Since_LastTransac(D2)', 'SameCard_DaysDiff(D3)', 'SameAddress_DaysDiff(D4)',
    'SameReceiverEmail_DaysDiff(D10)', 'SameDeviceType_DaysDiff(D11)',
    'Device Matching(M4)', 'Device Mismatch(M6)', 'RegionMismatch(M8)', 'TransactionConsistency(M9)',
    'Transaction Count(card,U_Region C1)', 'Unique Merchants(per card C4)', 'Same B_region count(C5)',
    'Same Device count(C6)', 'Unique B_region(same card C11)',
    'isFraud',
]

# Reorder DataFrame columns
df = df.loc[:, desired_order]

In [138]:
# Display the column labels of the reordered frame
df.columns

Index(['TransactionID', 'TransactionAmt', 'TransactionDT', 'ProductCD',
       'User_ID', 'Merchant', 'Card Number', 'BIN Number', 'Card Network',
       'Card Tier', 'Card Type', 'Phone Numbers', 'User_Region',
       'Order_Region', 'Receiver_Region', 'Distance', 'Sender_email',
       'Merchant_email', 'DeviceType', 'DeviceInfo',
       'Days_Since_LastTransac(D2)', 'SameCard_DaysDiff(D3)',
       'SameAddress_DaysDiff(D4)', 'SameReceiverEmail_DaysDiff(D10)',
       'SameDeviceType_DaysDiff(D11)', 'Device Matching(M4)',
       'Device Mismatch(M6)', 'RegionMismatch(M8)',
       'TransactionConsistency(M9)', 'Transaction Count(card,U_Region C1)',
       'Unique Merchants(per card C4)', 'Same B_region count(C5)',
       'Same Device count(C6)', 'Unique B_region(same card C11)', 'isFraud'],
      dtype='object')

In [139]:
# Display how many columns the frame has
len(df.columns)

35

In [140]:
# Round amounts to two decimals, then display the frame's (rows, columns)
df["TransactionAmt"] = df["TransactionAmt"].round(2)
df.shape

(9954, 35)

In [141]:
# Parse TransactionDT with the notebook's '%Y-%m-%d %H:%M:%S' format so the
# time-based E features below can use the .dt accessor and Timestamp.hour.
df['TransactionDT'] = pd.to_datetime(df['TransactionDT'], format='%Y-%m-%d %H:%M:%S') # Ensure 'TransactionDT' is in datetime format


# E2 - Time Range Encoding (Time of Day)
def time_to_range(dt):
    """Map a timestamp to one of six 4-hour slots by its hour.

    Slot codes: 0 = 10:00-13:59, 1 = 14:00-17:59, 2 = 18:00-21:59,
    3 = 22:00-01:59, 4 = 02:00-05:59, 5 = 06:00-09:59.
    """
    # The slots are consecutive 4-hour windows starting at 02:00, so shifting
    # the hour back by two and dividing by four gives the window position.
    window_position = ((dt.hour - 2) % 24) // 4
    slot_by_position = (4, 5, 0, 1, 2, 3)  # windows starting at 02, 06, 10, 14, 18, 22
    return slot_by_position[window_position]
# One slot code per transaction, derived from its timestamp
df['TransactionTimeSlot(E2)'] = df['TransactionDT'].map(time_to_range)


# E3 - Hour Difference from Range Start
def time_to_range12(dt):
    """Return how many whole hours ``dt`` lies past the start of its 4-hour slot.

    The slots start at 02, 06, 10, 14, 18 and 22 o'clock, so the result is
    always 0, 1, 2 or 3 (e.g. 22:xx -> 0, 23:xx -> 1, 00:xx -> 2, 01:xx -> 3).
    """
    # Every slot boundary is congruent to 2 modulo 4, hence the offset inside
    # the slot is the hour shifted by two, modulo four.
    return (dt.hour - 2) % 4
df['HourWithinSlot(E3)'] = df['TransactionDT'].apply(time_to_range12)


def _quantile_codes(values, q):
    """Bin ``values`` into quantile buckets labelled 0, 1, 2, ...

    Behaves like ``pd.qcut(values, q, labels=range(q), duplicates='drop')`` when
    all ``q`` bucket edges are distinct. When duplicate edges are dropped, the
    labels are shortened to the number of buckets that remain; a fixed
    ``range(q)`` would make ``qcut`` raise in that case.
    """
    _, edges = pd.qcut(values, q=q, retbins=True, duplicates='drop')
    return pd.qcut(values, q=q, labels=range(len(edges) - 1), duplicates='drop')


# E4 - Day of the Week
df['TransactionWeekday(E4)'] = df['TransactionDT'].dt.weekday


# E5 - Average Transaction Interval
# Minutes between consecutive transactions of a card. The rows are ordered by
# card and time explicitly here, and the result is aligned back by index, so
# the feature no longer depends on whatever row order earlier cells left.
card_time_order = df.sort_values(['Card Number', 'TransactionDT'])
df['TimeDiff'] = card_time_order.groupby('Card Number')['TransactionDT'].diff().dt.total_seconds() / 60
df['AvgTransactionInterval(E5)'] = df.groupby('Card Number')['TimeDiff'].transform('mean')
# Three buckets split at the 33% and 66% quantiles of the per-card average
interval_q33 = df['AvgTransactionInterval(E5)'].quantile(0.33)
interval_q66 = df['AvgTransactionInterval(E5)'].quantile(0.66)
df['AvgTransactionInterval(E5)'] = pd.cut(
    df['AvgTransactionInterval(E5)'],
    bins=[-np.inf, interval_q33, interval_q66, np.inf],
    labels=['0', '1', '2'],
    right=False)
df = df.drop(columns=['TimeDiff'])


# E6 - Transaction Amount Variance (per card, decile-coded)
df['TransactionAmountVariance(E6)'] = df.groupby('Card Number')['TransactionAmt'].transform(lambda x: x.var())
df['TransactionAmountVariance(E6)'] = _quantile_codes(df['TransactionAmountVariance(E6)'], q=10)


# E7 - Transaction Amount Ratio (amount / card's mean amount, 7 quantile buckets)
df['AvgTransactionAmount'] = df.groupby('Card Number')['TransactionAmt'].transform('mean')
# Infinite ratios (zero mean) and undefined ratios are both mapped to 0
df['TransactionRatio(E7)'] = (df['TransactionAmt'] / df['AvgTransactionAmount']).replace([np.inf, -np.inf], np.nan).fillna(0)
df['TransactionRatio(E7)'] = _quantile_codes(df['TransactionRatio(E7)'], q=7)
df = df.drop(columns=['AvgTransactionAmount'])


# E8 - Median Transaction Amount (per card, 9 quantile buckets)
df['MedianTransactionAmount(E8)'] = df.groupby('Card Number')['TransactionAmt'].transform('median')
df['MedianTransactionAmount(E8)'] = _quantile_codes(df['MedianTransactionAmount(E8)'], q=9)


# E11
def add_anomaly_group(df, device_col='DeviceType', dist_col='Distance', time_col='TransactionDT', speed_kmh=10, user_col='Order_Region'):
    """Flag rows that follow the previous row of the same device group too quickly (E11).

    Rows are ordered by ``device_col`` then ``time_col``. A row is an anomaly
    when the hours elapsed since the previous row in its ``device_col`` group
    are fewer than ``dist_col / speed_kmh``. The first row of each group has no
    predecessor, so its comparison is False and it is never flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    device_col, dist_col, time_col : str
        Column names for the grouping key, the distance and the timestamp.
    speed_kmh : float
        Speed used to turn the distance into an expected travel time in hours.
    user_col : str
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy sorted by ``[device_col, time_col]`` with stripped column names and
        two extra columns: boolean ``'TimingAnomaly'`` and its 0/1 integer
        version ``'Timing Anomaly(E11)'``.
    """
    scratch_columns = ['PrevTime', 'TimeDiffHours', 'ExpectedTimeHours']

    frame = df.copy()
    frame.columns = frame.columns.str.strip()  # tolerate padded column names
    frame[time_col] = pd.to_datetime(frame[time_col])
    frame = frame.sort_values(by=[device_col, time_col])

    # Timestamp of the preceding row within the same device group.
    frame['PrevTime'] = frame.groupby(device_col)[time_col].shift(1)
    elapsed = frame[time_col] - frame['PrevTime']
    frame['TimeDiffHours'] = elapsed.dt.total_seconds() / 3600
    # Hours needed to cover the row's distance at the assumed speed.
    frame['ExpectedTimeHours'] = frame[dist_col] / speed_kmh

    frame['TimingAnomaly'] = frame['TimeDiffHours'].lt(frame['ExpectedTimeHours'])
    frame['Timing Anomaly(E11)'] = frame['TimingAnomaly'].astype(int)

    return frame.drop(columns=scratch_columns)

# Apply E11. The returned frame is sorted by DeviceType then TransactionDT and
# also carries the boolean 'TimingAnomaly' column next to 'Timing Anomaly(E11)'.
df = add_anomaly_group(df)


# E12
def add_region_anomaly_v34(df, sender_col='Order_Region', receiver_col='Receiver_Region', dist_col='Distance', time_col='TransactionDT', speed_kmh=30):
    """Flag rows that follow the previous row of the same sender region too quickly (E12).

    A row gets ``1`` when the hours elapsed since the previous row with the
    same ``sender_col`` value are fewer than ``dist_col / speed_kmh``; the first
    row of each region has no predecessor and gets ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    sender_col, dist_col, time_col : str
        Column names for the grouping key, the distance and the timestamp.
    receiver_col : str
        Accepted but not used by this function.
    speed_kmh : float
        Speed used to turn the distance into an expected travel time in hours.

    Returns
    -------
    pandas.DataFrame
        Copy sorted by ``[sender_col, time_col]`` with the integer column
        ``'Region Anomaly(E12)'`` added.
    """
    ordered = df.copy()
    ordered[time_col] = pd.to_datetime(ordered[time_col])
    ordered = ordered.sort_values(by=[sender_col, time_col])

    required_hours = ordered[dist_col] / speed_kmh
    last_seen = ordered.groupby(sender_col)[time_col].shift(1)
    gap_hours = (ordered[time_col] - last_seen).dt.total_seconds() / 3600

    # Comparisons against a missing gap are False, so group leaders become 0.
    ordered['Region Anomaly(E12)'] = gap_hours.lt(required_hours).astype(int)
    return ordered

# Apply E12. The returned frame is sorted by Order_Region then TransactionDT.
df = add_region_anomaly_v34(df)



#E13
def add_hourly_transaction_count(df, time_col="TransactionDT", group_col="Merchant_email"):
    """Count transactions per clock hour and e-mail domain (E13).

    The domain is the text after the first ``'@'`` in ``group_col``; values
    without a domain (missing or no ``'@'``) are grouped under ``"Unknown"``.
    Each row receives the number of rows sharing its hourly bucket and domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    time_col : str
        Timestamp column; converted with ``pd.to_datetime`` if not already datetime.
    group_col : str
        Column holding the e-mail addresses.

    Returns
    -------
    pandas.DataFrame
        Copy with ``"HourlyTransactionCount(E13)"`` added. The helper column
        ``"Receiver_emaildomain"`` is created and dropped again, so a column of
        that name in the input does not survive.
    """
    domain_key = "Receiver_emaildomain"

    result = df.copy()
    already_datetime = pd.api.types.is_datetime64_any_dtype(result[time_col])
    if not already_datetime:
        result[time_col] = pd.to_datetime(result[time_col])

    # Second piece of the '@' split; NaN when there is no such piece.
    result[domain_key] = result[group_col].str.split('@').str.get(1).fillna("Unknown")

    hourly_buckets = result.groupby([pd.Grouper(key=time_col, freq="1h"), domain_key])
    result["HourlyTransactionCount(E13)"] = hourly_buckets[domain_key].transform("count")

    return result.drop(columns=[domain_key])

# Apply E13 (the helper keeps the incoming row order), then put the frame back
# into chronological order.
df = add_hourly_transaction_count(df)
df = df.sort_values(by="TransactionDT")

In [142]:
#E9

def add_avg_transaction_amt_24hrs(df, group_col='Card Number', amount_col='TransactionAmt', time_col='TransactionDT'):
    """Bucket each row by the card's trailing 24-hour transaction total (E9).

    For every row, the amounts of all rows of the same ``group_col`` whose
    timestamp lies in ``[t - 24h, t]`` (both ends inclusive, the row itself
    included) are summed. Despite the column name, the bucketed quantity is
    this sum, not an average. The sums are split into terciles with
    ``pd.qcut`` and labelled 0, 1, 2.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    group_col, amount_col, time_col : str
        Column names for the card key, the amount and the timestamp.

    Returns
    -------
    pandas.DataFrame
        Copy sorted by ``[group_col, time_col]`` with the categorical column
        ``'AvgTransactionAmt_24Hrs(E9)'`` added.
    """
    one_day = pd.Timedelta(hours=24)

    work = df.copy()
    work[time_col] = pd.to_datetime(work[time_col])
    work = work.sort_values(by=[group_col, time_col])
    work['Last24hTransactionSum'] = 0.0  # scratch column, removed before returning

    for _, card_rows in work.groupby(group_col):
        stamps = card_rows[time_col]
        amounts = card_rows[amount_col]
        # One trailing-window total per row of this card, in row order.
        window_totals = [
            amounts[(stamps >= moment - one_day) & (stamps <= moment)].sum()
            for moment in stamps
        ]
        work.loc[card_rows.index, 'Last24hTransactionSum'] = window_totals

    totals = work['Last24hTransactionSum'].fillna(0)
    work['Last24hTransactionSum'] = totals
    work['AvgTransactionAmt_24Hrs(E9)'] = pd.qcut(totals, q=3, labels=[0, 1, 2], duplicates='drop')
    return work.drop(columns=['Last24hTransactionSum'])

# Apply E9. The returned frame is sorted by Card Number then TransactionDT.
df = add_avg_transaction_amt_24hrs(df)


#E10

def add_transaction_velocity(df, time_col='TransactionDT', id_col='Card Number'):
    """Count each card's transactions in the hour before every row (E10).

    For a row with timestamp ``t`` the result is the number of rows with the
    same ``id_col`` whose timestamp lies in ``[t - 1h, t)``: the lower bound is
    inclusive, the row itself and rows sharing its timestamp are excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    time_col : str
        Timestamp column; parsed with ``pd.to_datetime(errors='coerce')``.
    id_col : str
        Column identifying the card.

    Returns
    -------
    pandas.DataFrame
        Copy sorted by ``[id_col, time_col]`` with the integer column
        ``'Transaction Velocity(E10)'`` added.

    Raises
    ------
    ValueError
        If any value of ``time_col`` is missing or cannot be parsed.
    """
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
    if df[time_col].isna().any():   # Check for invalid datetime values
        raise ValueError(f"Some values in {time_col} could not be converted to datetime.")
    df = df.sort_values(by=[id_col, time_col])
    df['Transaction Velocity(E10)'] = 0     # Initialize new column

    one_hour = np.timedelta64(1, 'h')
    for _, stamps in df.groupby(id_col)[time_col]:
        # The sort above makes each card's timestamps ascending, so the window
        # bounds can be found by binary search instead of rescanning the whole
        # group for every row.
        ts = stamps.to_numpy(dtype='datetime64[ns]')
        window_start = np.searchsorted(ts, ts - one_hour, side='left')  # first row >= t - 1h
        window_end = np.searchsorted(ts, ts, side='left')               # first row >= t
        df.loc[stamps.index, 'Transaction Velocity(E10)'] = window_end - window_start

    return df

# Apply E10. The returned frame is sorted by Card Number then TransactionDT.
df = add_transaction_velocity(df)

In [143]:
# Inspect the column set after the feature-engineering cells.
df.columns

Index(['TransactionID', 'TransactionAmt', 'TransactionDT', 'ProductCD',
       'User_ID', 'Merchant', 'Card Number', 'BIN Number', 'Card Network',
       'Card Tier', 'Card Type', 'Phone Numbers', 'User_Region',
       'Order_Region', 'Receiver_Region', 'Distance', 'Sender_email',
       'Merchant_email', 'DeviceType', 'DeviceInfo',
       'Days_Since_LastTransac(D2)', 'SameCard_DaysDiff(D3)',
       'SameAddress_DaysDiff(D4)', 'SameReceiverEmail_DaysDiff(D10)',
       'SameDeviceType_DaysDiff(D11)', 'Device Matching(M4)',
       'Device Mismatch(M6)', 'RegionMismatch(M8)',
       'TransactionConsistency(M9)', 'Transaction Count(card,U_Region C1)',
       'Unique Merchants(per card C4)', 'Same B_region count(C5)',
       'Same Device count(C6)', 'Unique B_region(same card C11)', 'isFraud',
       'TransactionTimeSlot(E2)', 'HourWithinSlot(E3)',
       'TransactionWeekday(E4)', 'AvgTransactionInterval(E5)',
       'TransactionAmountVariance(E6)', 'TransactionRatio(E7)',
       'MedianTr

In [144]:
# Row and column count before the columns are reordered below.
df.shape

(9954, 48)

In [145]:
# Final column layout: 47 columns, label last. Selecting with this list also
# discards any column that is not named here.
desired_order = [
    # Transaction, user and card identity
    'TransactionID', 'TransactionAmt', 'TransactionDT', 'ProductCD',
    'User_ID', 'Merchant', 'Card Number', 'BIN Number',
    'Card Network', 'Card Tier', 'Card Type', 'Phone Numbers',
    # Regions, contact and device
    'User_Region', 'Order_Region', 'Receiver_Region', 'Distance',
    'Sender_email', 'Merchant_email', 'DeviceType', 'DeviceInfo',
    # D features
    'Days_Since_LastTransac(D2)',
    'SameCard_DaysDiff(D3)',
    'SameAddress_DaysDiff(D4)',
    'SameReceiverEmail_DaysDiff(D10)',
    'SameDeviceType_DaysDiff(D11)',
    # M features
    'Device Matching(M4)',
    'Device Mismatch(M6)',
    'RegionMismatch(M8)',
    'TransactionConsistency(M9)',
    # C features
    'Transaction Count(card,U_Region C1)',
    'Unique Merchants(per card C4)',
    'Same B_region count(C5)',
    'Same Device count(C6)',
    'Unique B_region(same card C11)',
    # E features
    'TransactionTimeSlot(E2)',
    'HourWithinSlot(E3)',
    'TransactionWeekday(E4)',
    'AvgTransactionInterval(E5)',
    'TransactionAmountVariance(E6)',
    'TransactionRatio(E7)',
    'MedianTransactionAmount(E8)',
    'AvgTransactionAmt_24Hrs(E9)',
    'Transaction Velocity(E10)',
    'Timing Anomaly(E11)',
    'Region Anomaly(E12)',
    'HourlyTransactionCount(E13)',
    # Label
    'isFraud',
]

df = df[desired_order]

In [146]:
def assign_fraud_label(row, fraud_probabilities, df_grouped_by_user):
    """Draw a random 0/1 fraud label for one transaction row.

    The fraud probability starts at the module-level ``FRAUD_RATIO`` and is
    only ever raised (via ``max``) by the rules below; a mismatch-based bonus
    is then added and the total capped at 1. The label is sampled with
    ``np.random.choice`` using that probability.

    Parameters
    ----------
    row : pandas.Series
        One transaction. Must provide 'User_ID', 'TransactionAmt',
        'TransactionDT', 'ProductCD', 'Card Network', 'Card Type',
        'Days_Since_LastTransac(D2)', 'Device Matching(M4)',
        'Device Mismatch(M6)', 'RegionMismatch(M8)' and
        'TransactionConsistency(M9)'. Optional 'AvgTransactionAmt_24Hrs' and
        'Avg_TransactionAmt' entries are used instead of the values computed
        here. The row is not modified.
    fraud_probabilities : dict
        ``{feature: {str(value): probability}}`` as produced by
        ``calculate_fraud_probabilities``.
    df_grouped_by_user : pandas GroupBy
        Transactions grouped by 'User_ID'; used to look up the user's history.

    Returns
    -------
    int
        0 or 1 (a NumPy integer).
    """
    base_prob = FRAUD_RATIO
    final_prob = base_prob

    user_id = row['User_ID']

    # Bound before the try block so Rule 1 can test it even when the lookup
    # fails; previously Rule 1 could read it while it was still unbound.
    user_data = None

    try:
        user_data = df_grouped_by_user.get_group(user_id)
        user_avg_amt = user_data['TransactionAmt'].mean()

        current_time = row['TransactionDT']
        user_times = user_data['TransactionDT']
        # Look backwards only: without this upper bound every later transaction
        # of the user would be counted as "recent".
        not_in_future = user_times <= current_time

        in_last_30_sec = not_in_future & (user_times >= current_time - pd.Timedelta(seconds=30))
        num_transactions_last_30_sec = int(in_last_30_sec.sum())

        # Trailing 24-hour mean amount, unless the row already supplies one.
        if "AvgTransactionAmt_24Hrs" in row:  # membership test on the ROW's index
            avg_amt_24hrs = row['AvgTransactionAmt_24Hrs']
        else:
            in_last_24h = not_in_future & (user_times >= current_time - pd.Timedelta(hours=24))
            amounts_24h = user_data.loc[in_last_24h, 'TransactionAmt']
            avg_amt_24hrs = amounts_24h.mean() if len(amounts_24h) > 0 else 0

        # Overall mean amount of the user, unless the row already supplies one.
        avg_amt = row['Avg_TransactionAmt'] if "Avg_TransactionAmt" in row else user_avg_amt

    except KeyError:
        # Unknown user (or a missing key): fall back to neutral values.
        user_avg_amt = row['TransactionAmt']
        num_transactions_last_30_sec = 0
        avg_amt_24hrs = 0
        avg_amt = 0

    # New Rule 1: dormant account, unfamiliar device, unusually large amount.
    if (row["Days_Since_LastTransac(D2)"] > 30 and
        row["Device Matching(M4)"] == 0 and
        row["TransactionAmt"] > 3 * user_avg_amt and
        user_data is not None and
        (user_data['TransactionAmt'] > 3 * user_data['TransactionAmt'].mean()).any() and
        (avg_amt_24hrs < avg_amt or row["RegionMismatch(M8)"] == 1)):
        final_prob = max(final_prob, 0.02)

    # New Rule 2: burst of activity with a large amount from a mismatched device and region.
    if (row["Days_Since_LastTransac(D2)"] < 1 and
        row["TransactionAmt"] > 3 * user_avg_amt and
        num_transactions_last_30_sec > 3 and
        row["Device Mismatch(M6)"] == 1 and
        row["RegionMismatch(M8)"] == 1):
        final_prob = max(final_prob, 0.05)

    # New Rule 3: selected product categories with device and region mismatch and a large amount.
    if (row['ProductCD'] in ["Household", "Consumable", "Services"] and
        row['Device Mismatch(M6)'] == 1 and
        row['RegionMismatch(M8)'] == 1 and row['TransactionAmt'] > 3 * avg_amt):
        final_prob = max(final_prob, 0.34)

    # Card / amount rules: only the first matching branch applies.
    if row["Card Network"] == "Americanexpress" and row["Card Type"] == "credit":
        final_prob = max(final_prob, 0.09)
    elif row["ProductCD"] == "Consumable" and row["Card Network"] == "Mastercard" and row["Card Type"] == "debit":
        final_prob = max(final_prob, 0.02)
    elif row["TransactionAmt"] > 10000:
        final_prob = max(final_prob, 0.015)

    # Probabilistic Adjustments: lift to the empirical rate of the row's feature values.
    for feature, probabilities in fraud_probabilities.items():
        if feature in row and str(row[feature]) in probabilities:
            feature_prob = probabilities[str(row[feature])]
            final_prob = max(final_prob, feature_prob)

    # M Feature Influence: additive bonus per raised mismatch flag.
    m_score = 0
    if row["Device Mismatch(M6)"] == 1:
        m_score += 0.02
    if row["RegionMismatch(M8)"] == 1:
        m_score += 0.015
    if row["TransactionConsistency(M9)"] == 1:
        m_score += 0.01

    final_prob = min(1, final_prob + m_score)

    return np.random.choice([0, 1], p=[1 - final_prob, final_prob])


def calculate_fraud_probabilities(df, features):
    """Compute the empirical fraud rate for every value of the given features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and, normally, an 'isFraud' column.
    features : iterable of str
        Column names to summarise.

    Returns
    -------
    dict
        ``{feature: {str(value): mean of isFraud for that value}}``. If the
        rate cannot be computed, a warning is printed and every value of the
        feature maps to the overall 'isFraud' mean (or to the module-level
        ``FRAUD_RATIO`` when 'isFraud' is absent). A feature that is not a
        column of ``df`` maps to an empty dict.
    """
    fraud_probabilities = {}
    for feature in features:
        try:
            probabilities = df.groupby(feature)["isFraud"].mean().to_dict()
            # Keys are stringified so lookups can use str(row[feature]).
            fraud_probabilities[feature] = {str(k): v for k, v in probabilities.items()}
        except (KeyError, TypeError) as e:
            print(f"Warning: Could not calculate probabilities for {feature}. Error: {e}")
            if feature not in df:
                # No column to enumerate values from; reading df[feature] here
                # would raise a second KeyError inside the handler.
                fraud_probabilities[feature] = {}
                continue
            default_prob = df["isFraud"].mean() if "isFraud" in df else FRAUD_RATIO
            fraud_probabilities[feature] = {str(val): default_prob for val in df[feature].unique()}
    return fraud_probabilities


# --- Main Execution ---
# Baseline fraud probability read by assign_fraud_label and by the fallback in
# calculate_fraud_probabilities.
FRAUD_RATIO = 0.04

# Categorical features whose per-value fraud rates may raise a row's probability.
# The rates are taken from the 'isFraud' column as it stands before relabelling.
features_for_prob = ["Card Network", "Card Type", "ProductCD", "Order_Region", "DeviceType"]
fraud_probabilities = calculate_fraud_probabilities(df, features_for_prob)

# Per-user transaction history used by the rule checks.
df_grouped_by_user = df.groupby('User_ID')

# Overwrite the label with one random draw per row.
df["isFraud"] = df.apply(
    lambda txn: assign_fraud_label(txn, fraud_probabilities, df_grouped_by_user),
    axis=1,
)

In [147]:
# Keep the frame in chronological order.
df = df.sort_values(by="TransactionDT")


In [148]:
# Sanity check: desired_order lists 47 columns.
print(len(df.columns))

47


In [149]:
# Class balance after the labels were drawn.
df['isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,9491
1,463


In [150]:
# Append perturbed duplicates of existing transactions: draw (with replacement)
# as many legit rows as 30% of the legit count and as many fraud rows as 40% of
# the fraud count.
# NOTE(review): every draw in this cell uses the global NumPy RNG, so the result
# depends on the order of the statements and on whatever seed (if any) was set
# earlier in the notebook.
num_legit_repeats = int(0.3 * len(df[df['isFraud'] == 0]))
num_fraud_repeats = int(0.4 * len(df[df['isFraud'] == 1]))

repeat_legit = df[df['isFraud'] == 0].sample(num_legit_repeats, replace=True).copy()
repeat_fraud = df[df['isFraud'] == 1].sample(num_fraud_repeats, replace=True).copy()

# Give the copies fresh, consecutive TransactionIDs: legit copies continue after
# the current maximum ID, fraud copies continue after the legit copies.
new_legit_ids = range(df['TransactionID'].max() + 1, df['TransactionID'].max() + 1 + num_legit_repeats)
repeat_legit['TransactionID'] = new_legit_ids

new_fraud_ids = range(repeat_legit['TransactionID'].max() + 1, repeat_legit['TransactionID'].max() + 1 + num_fraud_repeats)
repeat_fraud['TransactionID'] = new_fraud_ids


# Perturb the copies so they are not exact clones.
# Timestamps: legit copies move 10-1439 minutes later, fraud copies 1-59 minutes later.
repeat_legit['TransactionDT'] += pd.to_timedelta(np.random.randint(10, 1440, num_legit_repeats), unit='m')
repeat_fraud['TransactionDT'] += pd.to_timedelta(np.random.randint(1, 60, num_fraud_repeats), unit='m')

# Amounts: scaled by a uniform factor within 2% (legit) or 5% (fraud) of the original.
repeat_legit['TransactionAmt'] *= np.random.uniform(0.98, 1.02, num_legit_repeats)
repeat_fraud['TransactionAmt'] *= np.random.uniform(0.95, 1.05, num_fraud_repeats)

# DeviceInfo: shuffled among the fraud copies only.
repeat_fraud['DeviceInfo'] = repeat_fraud['DeviceInfo'].sample(frac=1).values

# Align the copies' column order with df before concatenating.
# All other columns of the copies keep the values of the sampled rows; nothing
# is recomputed in this cell.
repeat_legit = repeat_legit[df.columns]
repeat_fraud = repeat_fraud[df.columns]

# Append the copies and rebuild a 0..n-1 index.
df = pd.concat([df, repeat_legit, repeat_fraud], ignore_index=True)

# Restore chronological order.
df = df.sort_values(by="TransactionDT")


In [151]:
# Class balance after appending the perturbed duplicates.
df['isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,12338
1,648


In [152]:
import pandas as pd

# Column names as they are at this point of the notebook, in the same order as
# the replacement names below (the two lists are paired position by position).
old_columns = [
    'TransactionID', 'TransactionAmt', 'TransactionDT', 'ProductCD',
    'User_ID', 'Merchant',
    'Card Number', 'BIN Number', 'Card Network', 'Card Tier', 'Card Type',
    'Phone Numbers',
    'User_Region', 'Order_Region', 'Receiver_Region', 'Distance',
    'Sender_email', 'Merchant_email', 'DeviceType', 'DeviceInfo',
    'Days_Since_LastTransac(D2)',
    'SameCard_DaysDiff(D3)',
    'SameAddress_DaysDiff(D4)',
    'SameReceiverEmail_DaysDiff(D10)',
    'SameDeviceType_DaysDiff(D11)',
    'Device Matching(M4)',
    'Device Mismatch(M6)',
    'RegionMismatch(M8)',
    'TransactionConsistency(M9)',
    'Transaction Count(card,U_Region C1)',
    'Unique Merchants(per card C4)',
    'Same B_region count(C5)',
    'Same Device count(C6)',
    'Unique B_region(same card C11)',
    'TransactionTimeSlot(E2)',
    'HourWithinSlot(E3)',
    'TransactionWeekday(E4)',
    'AvgTransactionInterval(E5)',
    'TransactionAmountVariance(E6)',
    'TransactionRatio(E7)',
    'MedianTransactionAmount(E8)',
    'AvgTransactionAmt_24Hrs(E9)',
    'Transaction Velocity(E10)',
    'Timing Anomaly(E11)',
    'Region Anomaly(E12)',
    'HourlyTransactionCount(E13)',
    'isFraud',
]

# Replacement names: no spaces or parentheses, feature code kept as a suffix.
new_columns = [
    'TransactionID', 'TransactionAmt', 'TransactionDT', 'ProductCD',
    'User_ID', 'Merchant',
    'Card_Number', 'BIN_Number', 'Card_Network', 'Card_Tier', 'Card_Type',
    'Phone_Numbers',
    'User_Region', 'Order_Region', 'Receiver_Region', 'Distance',
    'Sender_email', 'Merchant_email', 'DeviceType', 'DeviceInfo',
    'Days_Since_LastTransac_D2',
    'SameCard_DaysDiff_D3',
    'SameAddress_DaysDiff_D4',
    'SameReceiverEmail_DaysDiff_D10',
    'SameDeviceType_DaysDiff_D11',
    'Device_Matching_M4',
    'Device_Mismatch_M6',
    'RegionMismatch_M8',
    'TransactionConsistency_M9',
    'Transaction_Count_C1',
    'Unique_Merchants_C4',
    'Same_B_region_count_C5',
    'Same_Device_count_C6',
    'Unique_B_region_C11',
    'TransactionTimeSlot_E2',
    'HourWithinSlot_E3',
    'TransactionWeekday_E4',
    'AvgTransactionInterval_E5',
    'TransactionAmountVariance_E6',
    'TransactionRatio_E7',
    'MedianTransactionAmount_E8',
    'AvgTransactionAmt_24Hrs_E9',
    'TransactionVelocity_E10',
    'TimingAnomaly_E11',
    'RegionAnomaly_E12',
    'HourlyTransactionCount_E13',
    'isFraud',
]

# Old name -> new name, paired by position.
rename_dict = {old_name: new_name for old_name, new_name in zip(old_columns, new_columns)}

# Rename in place on the existing frame.
df.rename(columns=rename_dict, inplace=True)

# Round amounts to two decimals (the duplication step scaled them by random factors).
df["TransactionAmt"] = df["TransactionAmt"].apply(lambda amount: round(amount, 2))

# Display the new column names
print(df.columns)


Index(['TransactionID', 'TransactionAmt', 'TransactionDT', 'ProductCD',
       'User_ID', 'Merchant', 'Card_Number', 'BIN_Number', 'Card_Network',
       'Card_Tier', 'Card_Type', 'Phone_Numbers', 'User_Region',
       'Order_Region', 'Receiver_Region', 'Distance', 'Sender_email',
       'Merchant_email', 'DeviceType', 'DeviceInfo',
       'Days_Since_LastTransac_D2', 'SameCard_DaysDiff_D3',
       'SameAddress_DaysDiff_D4', 'SameReceiverEmail_DaysDiff_D10',
       'SameDeviceType_DaysDiff_D11', 'Device_Matching_M4',
       'Device_Mismatch_M6', 'RegionMismatch_M8', 'TransactionConsistency_M9',
       'Transaction_Count_C1', 'Unique_Merchants_C4', 'Same_B_region_count_C5',
       'Same_Device_count_C6', 'Unique_B_region_C11', 'TransactionTimeSlot_E2',
       'HourWithinSlot_E3', 'TransactionWeekday_E4',
       'AvgTransactionInterval_E5', 'TransactionAmountVariance_E6',
       'TransactionRatio_E7', 'MedianTransactionAmount_E8',
       'AvgTransactionAmt_24Hrs_E9', 'TransactionVelocity

In [153]:
# Write the final dataset to the working directory, without the index column.
df.to_csv("synthetic_dataset.csv", index=False)