In [None]:
!pip install faker

Collecting faker
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 37.1 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.8.0


In [None]:
import random
import pandas as pd
import numpy as np

# Number of households to simulate; each household contributes one row per
# appliance it ends up owning, so the final dataset has several times this
# many records.
NUM_RECORDS = 11000

# Master list of common household appliances in Rwanda.
# "AC" is listed (after "Tube Light", mirroring the key order of power_ranges)
# so that this list agrees with the keys of power_ranges, usage_days_ranges
# and the per-income usage_patterns / quantity_ranges tables.
appliances = ["Iron", "Laptop", "Blender", "Fan", "Electric Kettle", "Heater",
              "CFL Lamp", "LED Lamp", "Rice Cooker", "Incandescent Lamp",
              "Washing Machine", "Tube Light", "AC", "TV", "Microwave", "Refrigerator",
              "Radio", "Phone Charger", "Water Heater"]

# Candidate appliance pools per income level. Despite the name, these are
# plain lists (no probabilities): a household samples without replacement
# from the pool that matches its income level.
_medium_high_core = [
    "LED Lamp", "TV", "Laptop", "Refrigerator", "Iron", "Blender",
    "Electric Kettle",
]
appliance_probability = {
    "Low": ["LED Lamp", "Phone Charger", "Radio", "Fan", "Iron", "TV"],
    "Medium": _medium_high_core + ["Fan", "Washing Machine"],
    "High": _medium_high_core + [
        "Microwave", "Water Heater", "AC", "Washing Machine",
    ],
}

# Regions in Rwanda with their population-share weights
# (Kigali carries the largest share).
_region_share = {
    "Kigali": 0.35,
    "Huye": 0.10,
    "Musanze": 0.12,
    "Rubavu": 0.11,
    "Nyagatare": 0.10,
    "Rusizi": 0.11,
    "Muhanga": 0.11,
}
regions = list(_region_share)
region_weights = list(_region_share.values())

# Income distribution (based on Rwanda statistics)
_income_share = {"Low": 0.40, "Medium": 0.45, "High": 0.15}
income_levels = list(_income_share)
income_weights = list(_income_share.values())

# Rated power per appliance, in watts, as a (min, max) pair.
# The generator draws one integer per record with
# random.randint(*power_ranges[appliance]), so both bounds are inclusive.
# Every appliance that can appear in appliance_probability needs a key here.
power_ranges = {
    "Iron": (800, 1200),
    "Laptop": (40, 90),
    "Blender": (300, 600),
    "Fan": (40, 75),
    "Electric Kettle": (1000, 1500),
    "Heater": (1500, 2500),
    "CFL Lamp": (10, 20),
    "LED Lamp": (5, 15),
    "Rice Cooker": (500, 800),
    "Incandescent Lamp": (40, 100),
    "Washing Machine": (400, 800),
    "Tube Light": (20, 40),
    "AC": (1500, 3000),
    "TV": (50, 150),
    "Microwave": (800, 1200),
    "Refrigerator": (100, 200),
    "Radio": (5, 15),
    "Phone Charger": (5, 10),
    "Water Heater": (2000, 3000)
}

# Daily usage per appliance, in hours, as a (min, max) pair, keyed first by
# income level and then by appliance name.
# The generator draws a float with
# random.uniform(*usage_patterns[income_level][appliance]) and rounds it to
# two decimals. A (0, 0) entry therefore always yields 0 hours, i.e. the
# appliance contributes no consumption at that income level.
usage_patterns = {
    "Low": {
        "Iron": (0.5, 2.0), "Laptop": (1.0, 4.0), "Blender": (0.2, 0.5),
        "Fan": (2.0, 6.0), "Electric Kettle": (0.3, 1.0), "Heater": (0, 0),
        "CFL Lamp": (3.0, 6.0), "LED Lamp": (3.0, 6.0), "Rice Cooker": (0.5, 1.0),
        "Incandescent Lamp": (3.0, 6.0), "Washing Machine": (0.5, 1.5),
        "Tube Light": (3.0, 6.0), "AC": (0, 0), "TV": (2.0, 5.0),
        "Microwave": (0.2, 0.5), "Refrigerator": (8.0, 12.0), "Radio": (2.0, 5.0),
        "Phone Charger": (2.0, 4.0), "Water Heater": (0, 0)
    },
    "Medium": {
        "Iron": (0.5, 2.5), "Laptop": (2.0, 6.0), "Blender": (0.3, 0.8),
        "Fan": (3.0, 8.0), "Electric Kettle": (0.5, 1.5), "Heater": (1.0, 3.0),
        "CFL Lamp": (4.0, 8.0), "LED Lamp": (4.0, 8.0), "Rice Cooker": (0.5, 1.5),
        "Incandescent Lamp": (4.0, 8.0), "Washing Machine": (1.0, 2.0),
        "Tube Light": (4.0, 8.0), "AC": (2.0, 5.0), "TV": (3.0, 6.0),
        "Microwave": (0.3, 0.8), "Refrigerator": (10.0, 14.0), "Radio": (1.0, 3.0),
        "Phone Charger": (3.0, 6.0), "Water Heater": (1.0, 2.0)
    },
    "High": {
        "Iron": (1.0, 3.0), "Laptop": (3.0, 8.0), "Blender": (0.5, 1.0),
        "Fan": (4.0, 10.0), "Electric Kettle": (1.0, 2.0), "Heater": (2.0, 5.0),
        "CFL Lamp": (5.0, 10.0), "LED Lamp": (5.0, 10.0), "Rice Cooker": (1.0, 2.0),
        "Incandescent Lamp": (5.0, 10.0), "Washing Machine": (1.5, 3.0),
        "Tube Light": (5.0, 10.0), "AC": (4.0, 8.0), "TV": (4.0, 8.0),
        "Microwave": (0.5, 1.2), "Refrigerator": (12.0, 18.0), "Radio": (0.5, 2.0),
        "Phone Charger": (4.0, 8.0), "Water Heater": (2.0, 4.0)
    }
}

# Number of days per month each appliance is used, as a (min, max) pair
# (not all appliances are used daily). The generator draws an integer with
# random.randint(*usage_days_ranges[appliance]), so both bounds are inclusive;
# an entry such as (30, 30) pins the appliance to every-day use.
# Unlike usage hours and quantities, these ranges do not vary by income level.
usage_days_ranges = {
    "Iron": (4, 12), "Laptop": (20, 30), "Blender": (8, 20),
    "Fan": (15, 30), "Electric Kettle": (20, 30), "Heater": (5, 20),
    "CFL Lamp": (25, 30), "LED Lamp": (25, 30), "Rice Cooker": (15, 30),
    "Incandescent Lamp": (25, 30), "Washing Machine": (4, 12),
    "Tube Light": (25, 30), "AC": (10, 25), "TV": (25, 30),
    "Microwave": (15, 25), "Refrigerator": (30, 30), "Radio": (20, 30),
    "Phone Charger": (25, 30), "Water Heater": (20, 30)
}

# How many units of each appliance a household owns, as a (min, max) pair,
# keyed first by income level and then by appliance name.
# The generator draws an integer with
# random.randint(*quantity_ranges[income_level][appliance]) (both bounds
# inclusive). A draw of 0 means the household does not own the appliance and
# no record is emitted for it, so a lower bound of 0 makes ownership optional
# and (0, 0) rules the appliance out entirely for that income level.
quantity_ranges = {
    "Low": {
        "Iron": (1, 1), "Laptop": (1, 1), "Blender": (1, 1),
        "Fan": (1, 2), "Electric Kettle": (1, 1), "Heater": (0, 1),
        "CFL Lamp": (2, 4), "LED Lamp": (2, 5), "Rice Cooker": (0, 1),
        "Incandescent Lamp": (1, 3), "Washing Machine": (0, 1),
        "Tube Light": (1, 3), "AC": (0, 0), "TV": (1, 1),
        "Microwave": (0, 1), "Refrigerator": (0, 1), "Radio": (1, 1),
        "Phone Charger": (1, 3), "Water Heater": (0, 0)
    },
    "Medium": {
        "Iron": (1, 1), "Laptop": (1, 2), "Blender": (1, 1),
        "Fan": (1, 3), "Electric Kettle": (1, 1), "Heater": (0, 1),
        "CFL Lamp": (3, 5), "LED Lamp": (3, 6), "Rice Cooker": (0, 1),
        "Incandescent Lamp": (1, 2), "Washing Machine": (0, 1),
        "Tube Light": (2, 4), "AC": (0, 1), "TV": (1, 2),
        "Microwave": (0, 1), "Refrigerator": (1, 1), "Radio": (1, 1),
        "Phone Charger": (2, 4), "Water Heater": (0, 1)
    },
    "High": {
        "Iron": (1, 2), "Laptop": (1, 3), "Blender": (1, 2),
        "Fan": (2, 4), "Electric Kettle": (1, 2), "Heater": (0, 2),
        "CFL Lamp": (4, 6), "LED Lamp": (4, 8), "Rice Cooker": (0, 2),
        "Incandescent Lamp": (0, 1), "Washing Machine": (1, 1),
        "Tube Light": (2, 4), "AC": (0, 2), "TV": (1, 3),
        "Microwave": (0, 2), "Refrigerator": (1, 2), "Radio": (0, 1),
        "Phone Charger": (3, 6), "Water Heater": (0, 2)
    }
}

# Rwanda tariffs (accurate as of 2024)
def calculate_bill(kwh):
    """Return the tiered electricity bill (in RWF) for ``kwh`` of consumption.

    Pricing is progressive: the first 20 kWh are charged at 89 per kWh,
    consumption above 20 and up to 50 kWh at 310 per kWh, and anything
    above 50 kWh at 369 per kWh. Each rate applies only to the part of the
    consumption that falls inside its tier.
    """
    first_tier_total = 20 * 89    # cost of a completely used first tier
    second_tier_total = 30 * 310  # cost of a completely used second tier

    if kwh <= 20:
        return kwh * 89
    if kwh <= 50:
        return first_tier_total + (kwh - 20) * 310
    return first_tier_total + second_tier_total + (kwh - 50) * 369

def _tariff_bracket(kwh):
    """Return the tariff-tier label for a monthly consumption of ``kwh``.

    The 20 kWh and 50 kWh thresholds are the same ones calculate_bill uses,
    so the label always names the tier the bill was computed in.
    """
    if kwh <= 20:
        return "0-20 kWh"
    if kwh <= 50:
        return "21-50 kWh"
    return "50+ kWh"


# Generate realistic dataset: one row per (household, appliance) pair.
data = []
# household_id -> names of the appliances that produced a row for that household
household_appliances = {}

for i in range(1, NUM_RECORDS + 1):
    household_id = f"H-{i:04d}"

    # Generate household characteristics first
    region = random.choices(regions, weights=region_weights, k=1)[0]
    income_level = random.choices(income_levels, weights=income_weights, k=1)[0]
    household_size = random.randint(2, 8)

    # Target number of distinct appliances, driven by income level
    if income_level == "Low":
        target_count = random.randint(3, 6)
    elif income_level == "Medium":
        target_count = random.randint(5, 9)
    else:
        target_count = random.randint(7, 12)

    # Sample candidate appliances from the income-specific pool. The target is
    # capped at the pool size because sampling is without replacement.
    candidate_pool = appliance_probability[income_level]
    candidates = random.sample(
        candidate_pool, min(target_count, len(candidate_pool))
    )

    # Draw per-appliance values; candidates whose quantity comes out as 0 are
    # not owned by this household and are dropped.
    owned = []
    for appliance in candidates:
        power_watts = random.randint(*power_ranges[appliance])
        usage_hours = round(
            random.uniform(*usage_patterns[income_level][appliance]), 2
        )

        quantity = random.randint(*quantity_ranges[income_level][appliance])
        if quantity == 0:
            continue

        usage_days = random.randint(*usage_days_ranges[appliance])

        # Monthly energy: W * h/day * days * units, converted from Wh to kWh
        total_kwh = round(
            (power_watts * usage_hours * usage_days * quantity) / 1000, 2
        )

        # Implausibly large values are not dropped; they are replaced with a
        # random figure in the 50-300 kWh range.
        if total_kwh > 1000:
            total_kwh = round(random.uniform(50, 300), 2)

        owned.append(
            (appliance, power_watts, usage_hours, quantity, usage_days, total_kwh)
        )

    # Record what the household really owns, so that Appliances_Count and
    # household_appliances match the rows written for it (the sampled target
    # can exceed the pool size and zero-quantity candidates are skipped).
    household_appliances[household_id] = [entry[0] for entry in owned]
    appliances_count = len(owned)

    for appliance, power_watts, usage_hours, quantity, usage_days, total_kwh in owned:
        data.append([
            household_id, appliance, power_watts, usage_hours, quantity,
            region, income_level, appliances_count, usage_days, household_size,
            total_kwh, _tariff_bracket(total_kwh),
            round(calculate_bill(total_kwh), 2)
        ])

# Create DataFrame
columns = [
    "Household_ID", "Appliance", "Power_Watts", "Usage_Hours_Daily", "Quantity",
    "Region", "Income_Level", "Appliances_Count", "Usage_Days_Monthly", "Household_Size",
    "Total_kWh_Monthly", "Tariff_Bracket", "Estimated_Bill_Fr"
]

df = pd.DataFrame(data, columns=columns)

# Add some realistic noise to the data. Each column is perturbed on its own,
# so Total_kWh_Monthly is intentionally no longer the exact product of the
# power / hours / days / quantity columns. Column-wise Series.apply is used
# because each perturbation only needs the value of its own column.
df['Power_Watts'] = df['Power_Watts'].apply(
    lambda watts: max(5, watts + random.randint(-10, 10))
)
df['Usage_Hours_Daily'] = df['Usage_Hours_Daily'].apply(
    lambda hours: max(0.1, round(hours + random.uniform(-0.5, 0.5), 2))
)
df['Total_kWh_Monthly'] = df['Total_kWh_Monthly'].apply(
    lambda kwh: max(0.1, round(kwh * random.uniform(0.9, 1.1), 2))
)

# Recalculate bill AND tariff bracket from the updated kWh values so that
# both columns describe the same (noisy) consumption figure.
df['Estimated_Bill_Fr'] = df['Total_kWh_Monthly'].apply(calculate_bill).round(2)
df['Tariff_Bracket'] = df['Total_kWh_Monthly'].apply(_tariff_bracket)

# Save to CSV
df.to_csv("realistic_rwanda_energy_dataset.csv", index=False)

print("Realistic dataset created: realistic_rwanda_energy_dataset.csv")
print(f"Total records: {len(df)}")
print(f"Unique households: {df['Household_ID'].nunique()}")
print(f"Unique appliances: {df['Appliance'].nunique()}")

# Show summary statistics. All figures are computed over appliance rows, not
# over households, so the income/region shares are weighted by how many
# appliance rows each household contributes.
print("\n Summary Statistics:")
print(f"Average monthly consumption: {df['Total_kWh_Monthly'].mean():.2f} kWh")
print(f"Average monthly bill: {df['Estimated_Bill_Fr'].mean():.2f} RWF")
print(f"Income distribution:\n{df['Income_Level'].value_counts(normalize=True).round(2)}")
print(f"Regional distribution:\n{df['Region'].value_counts(normalize=True).round(2)}")

✅ Realistic dataset created: realistic_rwanda_energy_dataset.csv
📊 Total records: 66713
🏠 Unique households: 11000
💡 Unique appliances: 14

📈 Summary Statistics:
Average monthly consumption: 26.09 kWh
Average monthly bill: 6224.43 RWF
Income distribution:
Income_Level
Medium    0.49
Low       0.29
High      0.22
Name: proportion, dtype: float64
Regional distribution:
Region
Kigali       0.35
Muhanga      0.11
Musanze      0.11
Rubavu       0.11
Rusizi       0.11
Nyagatare    0.10
Huye         0.10
Name: proportion, dtype: float64
