In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Fix both RNG seeds up front so repeated runs emit the same dataset.
np.random.seed(42)
random.seed(42)

# Region type -> communities sampled within that region.
regions = {
    "Urban Slum": ["Makoko", "Ajegunle", "Kibera", "Mathare"],
    "Peri-Urban": ["Ikorodu", "Nsawam", "Entebbe", "Kasoa"],
    "Rural": ["Gboko", "Nzega", "Chibombo", "Lokoja"],
    "Coastal": ["Takoradi", "Lamu", "Bonny", "Bagamoyo"],
    "Dryland": ["Garissa", "Maradi", "Zinder", "Dori"],
}

# Per-region means fed to the normal draws, as (turbidity, E. coli, nitrate).
region_means = {
    "Urban Slum": (5, 100, 20),
    "Peri-Urban": (3, 60, 15),
    "Rural": (2, 30, 10),
    "Coastal": (4, 80, 18),
    "Dryland": (6, 120, 25),
}

# One sampling date every 3.5 days: 2 per week for 5 years (520 dates).
start_date = datetime.strptime("2019-01-01", "%Y-%m-%d")
dates = [start_date + timedelta(days=3.5 * step) for step in range(5 * 52 * 2)]
# Format each date once instead of once per community; the half-day offsets
# are dropped by the day-resolution format.
date_labels = [moment.strftime("%Y-%m-%d") for moment in dates]


def _risk_label(case_total):
    """Bucket a combined case count: "High" (>= 10), "Medium" (>= 5), else "Low"."""
    if case_total >= 10:
        return "High"
    if case_total >= 5:
        return "Medium"
    return "Low"


data = []

for region_name, community_names in regions.items():
    turbidity_mean, ecoli_mean, nitrate_mean = region_means[region_name]

    for community_name in community_names:
        for date_label in date_labels:
            # The draw order (turbidity, E. coli, nitrate, pH, then the three
            # case counts) must stay fixed: it determines which values the
            # seeded global stream hands to each field.
            turbidity_draw = max(0, np.random.normal(loc=turbidity_mean, scale=1.5))
            ecoli_draw = max(0, np.random.normal(loc=ecoli_mean, scale=20))
            nitrate_draw = max(0, np.random.normal(loc=nitrate_mean, scale=5))
            ph_value = round(np.random.normal(loc=6.8, scale=0.3), 2)

            # Poisson rates rise linearly with contamination and are floored
            # at 0.1 so every disease keeps a small baseline rate.
            disease_rates = (
                max(0.1, 0.05 * ecoli_draw + 0.2 * turbidity_draw),   # cholera
                max(0.1, 0.03 * ecoli_draw + 0.1 * turbidity_draw),   # typhoid
                max(0.1, 0.04 * ecoli_draw + 0.15 * turbidity_draw),  # diarrhea
            )
            cholera_count, typhoid_count, diarrhea_count = [
                int(np.random.poisson(lam=rate)) for rate in disease_rates
            ]
            combined_cases = cholera_count + typhoid_count + diarrhea_count

            record = {
                "Date": date_label,
                "Region": region_name,
                "Community": community_name,
                "Turbidity(NTU)": round(turbidity_draw, 2),
                "Ecoli_Count(CFU/100ml)": int(ecoli_draw),
                "Nitrate(mg/L)": round(nitrate_draw, 2),
                "pH": ph_value,
                "Cholera_Cases": cholera_count,
                "Typhoid_Cases": typhoid_count,
                "Diarrhea_Cases": diarrhea_count,
                "Total_Waterborne_Cases": combined_cases,
                "Risk_Level": _risk_label(combined_cases),
            }
            data.append(record)

# Build the frame once from the collected records, then persist it.
df = pd.DataFrame(data)
df.to_csv("large_water_disease_dataset.csv", index=False)
print("Generated:", len(df), "rows")


Generated: 10400 rows


In [None]:
# Export `df_kenya` to 'kenya_fraud_detection.csv'. `index=False` is not passed,
# so the pandas default of writing the index as the first column applies
# (assuming `df_kenya` is a pandas DataFrame — TODO confirm).
# NOTE(review): `df_kenya` is not defined in any cell visible here and this cell
# has no execution count; confirm where it is created, otherwise this line
# raises NameError on a fresh Restart & Run All.
df_kenya.to_csv('kenya_fraud_detection.csv')

In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Simulate a synthetic mobile-money transaction log and write it to
# 'mobile_money_transactions.csv'. All randomness comes from NumPy's global
# generator, seeded below, so the output is reproducible.

# Set random seed for reproducibility
np.random.seed(42)

# Simulate parameters
n_transactions = 50000
user_ids = [f"U{str(i).zfill(4)}" for i in range(1, 1001)]
device_ids = [f"D{str(i).zfill(3)}" for i in range(1, 201)]
agent_ids = [f"A{str(i).zfill(3)}" for i in range(1, 301)]
# NOTE(review): "Portharcout" looks like a misspelling of Port Harcourt. It is
# left unchanged because it is an emitted data value that downstream code may
# match on — confirm before correcting.
locations = ["Ikeja", "Yaba", "Ajah", "Kano", "Aba", "Makurdi", "Portharcout", "Owerri"]
transaction_types = ["send", "receive", "cash_in", "cash_out"]
# Transaction types that add money to the user's balance; all others are debits.
credit_types = ["receive", "cash_in"]
channels = ["USSD", "App", "Agent", "Web"]

# Generate timestamps: a uniformly random minute offset within the 180 days
# following start_date (randint's upper bound is exclusive).
start_date = datetime(2024, 1, 1)
timestamps = [start_date + timedelta(minutes=np.random.randint(0, 60*24*180)) for _ in range(n_transactions)]

# Generate transactions. The `p=` weights line up positionally with
# `transaction_types` and `channels`. The order of the entries fixes the order
# of the random draws, so reordering them changes the generated values.
data = {
    "transaction_id": [f"T{str(i).zfill(6)}" for i in range(n_transactions)],
    "user_id": np.random.choice(user_ids, n_transactions),
    "transaction_type": np.random.choice(transaction_types, n_transactions, p=[0.35, 0.35, 0.15, 0.15]),
    "amount": np.round(np.random.exponential(scale=5000, size=n_transactions), 2),
    "device_id": np.random.choice(device_ids, n_transactions),
    "location": np.random.choice(locations, n_transactions),
    "timestamp": timestamps,
    "channel": np.random.choice(channels, n_transactions, p=[0.5, 0.3, 0.15, 0.05]),
    "agent_id": np.random.choice(agent_ids, n_transactions),
    "sim_swap_flag": np.random.choice([0, 1], n_transactions, p=[0.98, 0.02])
}

df = pd.DataFrame(data)

# Derive balance fields
df["balance_before"] = np.round(np.random.uniform(1000, 100000, size=n_transactions), 2)

# Apply the amount in the direction implied by the transaction type: credits
# raise the balance, debits lower it. (Previously every row subtracted the
# amount, so "receive" and "cash_in" rows showed a falling balance.)
is_credit = df["transaction_type"].isin(credit_types)
signed_amount = df["amount"].where(is_credit, -df["amount"])
balance_after = df["balance_before"] + signed_amount

# A debit larger than the available balance would leave a non-positive balance;
# as before, such rows get a small random leftover balance in [0, 1000) instead.
# Only the affected rows consume random draws.
overdrawn = balance_after <= 0
balance_after.loc[overdrawn] = np.random.uniform(0, 1000, size=int(overdrawn.sum()))

# Round to 2 decimals like the other monetary columns so float subtraction
# noise and unrounded fallback values do not end up in the CSV.
df["balance_after"] = balance_after.round(2)
assert df["balance_after"].ge(0).all(), "balance_after must never be negative"


# Add transaction velocity: simulate past 1-hour tx count for each user
# (drawn independently per row from Poisson(1.2); not derived from timestamps).
df["transaction_velocity"] = np.random.poisson(lam=1.2, size=n_transactions)

# Save to CSV
df.to_csv('mobile_money_transactions.csv', index=False)
