In [1]:
pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.




In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [3]:

# Parameters for generating the dataset for 34 OEMs
num_factories = 34
machinery_per_factory = 12  # Approximate number of unique machinery types per factory

# Factory identifiers "VW_01" ... "VW_34" (zero-padded to two digits)
factory_ids = [f"VW_{i:02}" for i in range(1, num_factories + 1)]

# One location per factory, matched to factory_ids by position.
# The list must contain exactly num_factories entries: pairing the two lists
# with zip() silently drops every factory that has no location.
locations = [
    {"city": "Beijing", "country": "China"},
    {"city": "Shanghai", "country": "China"},
    {"city": "Pune", "country": "India"},
    {"city": "Chennai", "country": "India"},
    {"city": "Bangkok", "country": "Thailand"},
    {"city": "Jakarta", "country": "Indonesia"},
    {"city": "Hanoi", "country": "Vietnam"},
    {"city": "Manila", "country": "Philippines"},
    {"city": "Seoul", "country": "South Korea"},
    {"city": "Kuala Lumpur", "country": "Malaysia"},
    {"city": "Tokyo", "country": "Japan"},
    {"city": "Ho Chi Minh City", "country": "Vietnam"},
    {"city": "Yokohama", "country": "Japan"},
    {"city": "Taipei", "country": "Taiwan"},
    {"city": "Guangzhou", "country": "China"},
    {"city": "Mumbai", "country": "India"},
    {"city": "New Delhi", "country": "India"},
    {"city": "Surabaya", "country": "Indonesia"},
    {"city": "Fukuoka", "country": "Japan"},
    {"city": "Kyoto", "country": "Japan"},
    {"city": "Busan", "country": "South Korea"},
    {"city": "Kobe", "country": "Japan"},
    {"city": "Nagoya", "country": "Japan"},
    {"city": "Nanjing", "country": "China"},
    {"city": "Shenzhen", "country": "China"},
    {"city": "Wuhan", "country": "China"},
    {"city": "Kaohsiung", "country": "Taiwan"},
    {"city": "Ulsan", "country": "South Korea"},
    {"city": "Osaka", "country": "Japan"},
    {"city": "Bandung", "country": "Indonesia"},
    {"city": "Hiroshima", "country": "Japan"},
    # NOTE(review): Nagoya is listed a second time here - confirm that two
    # factories in the same city is intended.
    {"city": "Nagoya", "country": "Japan"},
    {"city": "Suzhou", "country": "China"},
    # 34th entry: the list previously had only 33 locations for 34 factories,
    # so the last factory (VW_34) never received any rows.
    {"city": "Chengdu", "country": "China"}
]

# Fail fast if the factory and location lists ever drift apart again
assert len(locations) == num_factories, (
    f"expected {num_factories} locations, found {len(locations)}"
)

# Extended machine types and base energy consumption rates
extended_machine_types = [
    'Painting Booth', 'Oven', 'Torque Tool', 'Injection Moulding Machine',
    'Air Compressor', 'AGV (Automated Guided Vehicle)', 'Testing Calibration Equipment',
    'Conveyor', 'Heater', 'Cooler', 'Press', 'Milling'
]
# Base energy value per machine type (units are not stated in this notebook)
extended_machine_features = {
    'Painting Booth': 5.5,
    'Oven': 6.5,
    'Torque Tool': 1.8,
    'Injection Moulding Machine': 4.5,
    'Air Compressor': 7.0,
    'AGV (Automated Guided Vehicle)': 2.0,
    'Testing Calibration Equipment': 3.2,
    'Conveyor': 1.5,
    'Heater': 3.5,
    'Cooler': 2.5,
    'Press': 4.0,
    'Milling': 3.0
}

# Every machine type needs a base rate, otherwise the lookup by type fails
missing_base_rates = set(extended_machine_types) - set(extended_machine_features)
assert not missing_base_rates, f"no base energy rate for: {sorted(missing_base_rates)}"

# Seasons
seasons = ["Winter", "Spring", "Summer", "Fall"]
# Season name -> (number 1-4, calendar months belonging to that season)
season_dates = {
    "Winter": (1, [1, 2, 12]),
    "Spring": (2, [3, 4, 5]),
    "Summer": (3, [6, 7, 8]),
    "Fall": (4, [9, 10, 11])
}


In [4]:
# Generating data
#
# Builds one row per machine per 4-hour interval for 365 days starting on
# 2023-01-01, then writes the rows to a CSV file. Each factory gets one machine
# of every type in extended_machine_types.

# Fixed seed so that Restart Kernel -> Run All reproduces the same dataset
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

data = []
base_date = datetime(2023, 1, 1)
interval_hours = 4
hours_in_day = 24 // interval_hours  # Number of 4-hour intervals per day (6), not hours
days_in_year = 365

# Month number -> season name. setdefault keeps the first season that lists a
# month, matching a first-match scan over season_dates.
month_to_season = {}
for season_name, (_, season_months) in season_dates.items():
    for month in season_months:
        month_to_season.setdefault(month, season_name)

if not locations:
    raise ValueError("locations must contain at least one entry")
if len(locations) < len(factory_ids):
    # zip(factory_ids, locations) would silently drop the factories without a
    # location; reuse locations from the start of the list instead and say so.
    print(
        f"Warning: {len(factory_ids)} factories but only {len(locations)} locations; "
        "locations are reused from the start of the list."
    )

for factory_index, factory_id in enumerate(factory_ids):
    location = locations[factory_index % len(locations)]
    factory_location = f"{location['city']}, {location['country']}"

    # Draw the 4-digit ID suffixes once per factory, without replacement, so
    # each machine keeps one stable ID for the whole year and no two machines
    # of a factory share an ID.
    machine_numbers = random.sample(range(1000, 10000), k=len(extended_machine_types))

    for machine_type, machine_number in zip(extended_machine_types, machine_numbers):
        machine_id = f"{factory_id}_M{machine_number}"
        base_energy = extended_machine_features[machine_type]

        # Generate a random buy date within the past 10 years
        buy_date = base_date - timedelta(days=random.randint(100, 3650))
        buying_date = buy_date.strftime('%Y-%m-%d')
        machine_age_years = (base_date - buy_date).days / 365.25

        # Efficiency drops by 0.02 per year of age and is floored at 0.7
        efficiency_score = round(max(0.7, 1.0 - machine_age_years * 0.02), 2)

        # The values below depend only on the machine, so compute them once
        # instead of once per generated row.
        downtime_chance = 0.02 + (machine_age_years * 0.005)  # Increase chance with age
        # NOTE(review): the rating grows with efficiency_score, i.e. a more
        # efficient machine is assigned a higher energy figure - confirm intent.
        energy_rating = base_energy * (1 + efficiency_score)

        # Generate data for each 4-hour interval throughout the year
        for day in range(days_in_year):
            date = base_date + timedelta(days=day)
            season = month_to_season.get(date.month, "Unknown")
            for hour_block in range(hours_in_day):
                timestamp = date + timedelta(hours=hour_block * interval_hours)

                # Determine operational status with higher chances of downtime for older machines
                # NOTE(review): 'Active' is also a possible outcome inside the
                # downtime branch, so the effective share of non-active rows is
                # two thirds of downtime_chance - confirm this is intended.
                if random.random() < downtime_chance:
                    operational_status = random.choice(['Active', 'Idle', 'Off'])
                else:
                    operational_status = 'Active'

                # Generate temperatures (uniform draws, independent of season)
                avg_machine_temp = round(random.uniform(25, 80), 2)
                avg_day_temp = round(random.uniform(-5, 40), 2)

                # Energy is only consumed while the machine is active
                energy_consumption = energy_rating if operational_status == 'Active' else 0

                data.append({
                    'Timestamp': timestamp,
                    'Factory_ID': factory_id,
                    'Factory_Location': factory_location,
                    'Machine_ID': machine_id,
                    'Machine_Type': machine_type,
                    'Machine_Temperature': avg_machine_temp,
                    'Temperature_Current': avg_day_temp,
                    'Buying_Date': buying_date,
                    'Efficiency_Score': efficiency_score,
                    'Energy_Rating': energy_rating,
                    'Energy_Consumption': energy_consumption,
                    'Season': season,
                    'Operational_Status': operational_status
                })

# Converting to DataFrame and saving
dataset = pd.DataFrame(data)

# Sanity checks: every factory is present and every machine has exactly one ID
assert dataset['Factory_ID'].nunique() == len(factory_ids)
assert dataset['Machine_ID'].nunique() == len(factory_ids) * len(extended_machine_types)

dataset.to_csv("Volkswagen_OEM_AsiaPacific_2023_SyntheticDataset_withAge.csv", index=False)

print("Dataset created and saved as 'Volkswagen_OEM_AsiaPacific_2023_SyntheticDataset_withAge.csv'")

Dataset created and saved as 'Volkswagen_OEM_AsiaPacific_2023_SyntheticDataset_withAge.csv'
