In [None]:
# Import libraries
import pandas as pd
import numpy as np
from faker import Faker
from mimesis import Generic


: 

In [None]:
# Seed every random source used in this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates exactly the same dataset.
RANDOM_SEED = 42
Faker.seed(RANDOM_SEED)      # seeds the generator used by Faker providers
np.random.seed(RANDOM_SEED)  # seeds the global NumPy generator (np.random.choice / np.random.uniform)

# Initialize Faker and Mimesis
fake = Faker()
generic = Generic("en", seed=RANDOM_SEED)  # seeded so generic.random.* draws are repeatable

# Define the number of records
n_records = 1000

In [None]:
# Generate dataset
# Every column holds n_records values. Most numeric columns are plain uniform
# draws from mimesis, so the shared comprehension lives in one small helper.
def _uniform_column(low, high):
    """Return a list of n_records floats drawn via generic.random.uniform(low, high)."""
    return [generic.random.uniform(low, high) for _ in range(n_records)]


data = {
    # Unique identifiers
    "Power_Plant_ID": [fake.unique.uuid4() for _ in range(n_records)],
    # Plant location
    "Plant_Location": [fake.city() for _ in range(n_records)],
    # MW capacity
    "Operating_Capacity_MW": _uniform_column(50, 1000),
    # Fuel category, sampled with fixed probabilities (returned as a NumPy array)
    "Fuel_Type": np.random.choice(
        ["Coal", "Gas", "Nuclear", "Renewable"],
        size=n_records,
        p=[0.3, 0.3, 0.2, 0.2],
    ),
    # CO2 emissions
    "Emission_Level_CO2_tonnes": _uniform_column(1000, 50000),
    # Plant age in years
    "Operational_Years": [generic.random.randint(1, 60) for _ in range(n_records)],
    # Maintenance cost
    "Maintenance_Cost_MUSD": _uniform_column(0.5, 20),
    # Surrounding temperature
    "Temperature_C": _uniform_column(15, 45),
    # Humidity
    "Humidity_percent": _uniform_column(20, 90),
    # Wind speed
    "Wind_Speed_kmh": _uniform_column(0, 100),
    # Soil moisture
    "Soil_Moisture_percent": _uniform_column(5, 40),
    # Dam height
    "Dam_Height_m": _uniform_column(30, 300),
    # Reservoir capacity
    "Reservoir_Capacity_MCM": _uniform_column(10, 5000),
    # Sedimentation rate
    "Sedimentation_Rate_mpy": _uniform_column(0.1, 1),
    # Power plant efficiency
    "Efficiency_percent": _uniform_column(60, 95),
}

In [None]:
# Add missing values
# For each listed column, blank out 20 distinct, randomly chosen positions by
# overwriting the stored value with None.
columns_with_gaps = ("Operating_Capacity_MW", "Emission_Level_CO2_tonnes", "Efficiency_percent")
for column_name in columns_with_gaps:
    column_values = data[column_name]
    for position in np.random.choice(n_records, size=20, replace=False):
        column_values[position] = None

In [None]:
# Introduce outliers
# Only positions that still hold a number are eligible: the missing-value cell
# may already have set some emission readings to None, and `None * 10` raises
# TypeError. Sampling from the populated positions also guarantees that the
# requested number of outliers is actually produced.
emission_values = data["Emission_Level_CO2_tonnes"]
outlier_candidates = [
    position for position, value in enumerate(emission_values) if value is not None
]
outlier_indices = np.random.choice(
    outlier_candidates,
    size=min(10, len(outlier_candidates)),  # never request more than are available
    replace=False,
)
for idx in outlier_indices:
    emission_values[idx] = emission_values[idx] * 10  # Extreme outliers

In [None]:
# Introduce duplicates (copy some random rows and append them)
duplicate_indices = np.random.choice(n_records, size=50, replace=False)

# .copy() yields an independent frame, so assigning the ID column below cannot
# trigger pandas' SettingWithCopyWarning.
duplicated_data = pd.DataFrame(data).iloc[duplicate_indices].copy()
duplicated_data["Power_Plant_ID"] = [fake.unique.uuid4() for _ in range(len(duplicated_data))]  # Generate unique IDs

# Append the copied rows to every column. Calling data.update() with the copied
# columns would instead replace each full column with just the 50 copied values,
# discarding the original rows and breaking later index-based cells.
# Note: values that were None come back from the DataFrame as NaN in the copies.
for column_name in data:
    data[column_name] = list(data[column_name]) + duplicated_data[column_name].tolist()


In [None]:
# Add noise by slightly modifying numerical data
# Maps each noisy column to the half-width of its uniform noise interval.
noise_half_widths = {
    "Operating_Capacity_MW": 50,
    "Emission_Level_CO2_tonnes": 500,
}

# Restrict sampling to positions that exist in every noisy column, so an index
# can never run past the end of a column whose length differs from n_records.
noise_row_count = min([n_records] + [len(data[column]) for column in noise_half_widths])
noise_indices = np.random.choice(
    noise_row_count, size=min(30, noise_row_count), replace=False
)
for idx in noise_indices:
    for column, half_width in noise_half_widths.items():
        current_value = data[column][idx]
        if current_value is None:
            # Entry was blanked by the missing-value cell; `None + float`
            # would raise TypeError, so leave it missing.
            continue
        data[column][idx] = current_value + np.random.uniform(-half_width, half_width)  # Add small random noise


In [None]:
# Assemble the generated columns into a single table
df = pd.DataFrame(data=data)

# Write the table to disk as CSV (row index omitted) and report completion
df.to_csv(path_or_buf="power_plant_dataset.csv", index=False)
print("Dataset created, saved as 'power_plant_dataset.csv'")