# Dummy Data Generator
Generates a dummy .csv dataset of ESG scores for the core ESG metrics of 4 companies in the tech industry, covering 2019-2024.  
The core ESG metrics were taken from company_esg_scores.csv (experimental querying done by David, Elise & JZ).  
The scores are artificial: they were generated from Apple's data and industry mean data (https://www.spglobal.com/esg/scores/results?cid=4004205).  
Code written with DeepSeek.

In [14]:
import pandas as pd
import numpy as np
import random

# Companies covered by the dataset and the reporting years (2019 through 2024 inclusive)
company = [
    "Apple",
    "Capgemini",
    "DataDog",
    "WiseTech Global",
]
years = [2019, 2020, 2021, 2022, 2023, 2024]

# Base scores (0-10 scale) for Apple and the industry means used for every other company
apple_scores = {
    "Waste reduction": 4.5,
    "Climate strategy": 7.6,
    "Labour practices": 5.1,
    "Human capital management": 3.3,
}
industry_means = {
    "Waste reduction": 4.0,
    "Climate strategy": 5.0,
    "Labour practices": 4.4,
    "Human capital management": 4.3,
}

# Metrics that never receive the random per-company variation
_NO_NOISE_METRICS = frozenset({
    "Certification",
    "Fatalities",
    "High-consequence injuries",
    "Recordable injuries",
    "Number of Cases of Work-related Ill Health Cases",
})


# Function to generate scores for a company
def generate_scores(company, year):
    """Return one year's worth of dummy ESG metric scores for a company.

    Apple follows a fixed trend built on ``apple_scores``; every other
    company follows a trend built on ``industry_means`` plus a uniform
    random offset in [-0.2, 0.2] on the metrics that are not listed in
    ``_NO_NOISE_METRICS``.

    Args:
        company: Company name. ``"Apple"`` (or the ticker ``"AAPL"``)
            selects the Apple trend; anything else uses the industry trend.
        year: Reporting year; trends are linear in ``year - 2019``.

    Returns:
        A dict mapping metric name to score. ``"Certification"`` is a random
        0/1 flag; every other metric is clamped to the 0-10 scale and then
        divided by 10, so it lies in [0, 1].
    """
    elapsed = year - 2019

    # The dataset labels the company "Apple"; "AAPL" is kept for callers that
    # pass the ticker (the original check matched only the ticker, so the
    # Apple trend was never used).
    if company in ("Apple", "AAPL"):
        # Apple's scores improve slightly each year
        base_scores = {
            "Greenhouse Gas Emission reduction": apple_scores["Climate strategy"] + elapsed * 0.2,
            "Energy Consumption reduction": apple_scores["Climate strategy"] - 0.6 + elapsed * 0.2,
            "Water consumption reduction": apple_scores["Climate strategy"] - 1.1 + elapsed * 0.2,
            "Waste reduction": apple_scores["Waste reduction"] + elapsed * 0.1,
            "Current Employees by Gender": 7.8 + elapsed * 0.2,
            "Turnover rate by Gender": 7.3 + elapsed * 0.1,
            "New Hires by Gender": 7.6 + elapsed * 0.2,
            "Current Employees by Age Groups": 8.0 + elapsed * 0.2,
            "New employee hires by age group": 7.7 + elapsed * 0.2,
            "Turnover rate": 5.8 + elapsed * 0.1,
            "Average Training Hours per Employee": 7.3 + elapsed * 0.1,
            "Fatalities": 0.0,
            "High-consequence injuries": 0.6 - elapsed * 0.1,
            "Recordable injuries": 1.3 - elapsed * 0.1,
            "Number of Cases of Work-related Ill Health Cases": 0.9 - elapsed * 0.1,
            "Board Independence": 8.8 + elapsed * 0.1,
            "Women on the Board": 8.3 + elapsed * 0.1,
            "Women in Management": 7.6 + elapsed * 0.1,
            "Anti-corruption disclosures": 7.8 + elapsed * 0.1,
            "Anti-Corruption Training for Employees": 7.3 + elapsed * 0.1,
            "Certification": random.randint(0, 1),
            "Alignment with frameworks and disclosure practices": 8.8 + elapsed * 0.1,
            "Assurance of sustainability report": 7.8 + elapsed * 0.1,
        }
    else:
        # Other companies' scores are based on industry means with some variation
        base_scores = {
            "Greenhouse Gas Emission reduction": industry_means["Climate strategy"] + elapsed * 0.2,
            "Energy Consumption reduction": industry_means["Climate strategy"] - 0.5 + elapsed * 0.2,
            "Water consumption reduction": industry_means["Climate strategy"] - 1.0 + elapsed * 0.2,
            "Waste reduction": industry_means["Waste reduction"] + elapsed * 0.1,
            "Current Employees by Gender": 6.0 + elapsed * 0.2,
            "Turnover rate by Gender": 5.5 + elapsed * 0.1,
            "New Hires by Gender": 6.0 + elapsed * 0.2,
            "Current Employees by Age Groups": 6.5 + elapsed * 0.2,
            "New employee hires by age group": 6.0 + elapsed * 0.2,
            "Turnover rate": 5.0 + elapsed * 0.1,
            "Average Training Hours per Employee": 5.5 + elapsed * 0.1,
            "Fatalities": 0.2 - elapsed * 0.05,
            "High-consequence injuries": 1.0 - elapsed * 0.1,
            "Recordable injuries": 2.0 - elapsed * 0.1,
            "Number of Cases of Work-related Ill Health Cases": 1.5 - elapsed * 0.1,
            "Board Independence": 7.0 + elapsed * 0.1,
            "Women on the Board": 6.0 + elapsed * 0.1,
            "Women in Management": 5.5 + elapsed * 0.1,
            "Anti-corruption disclosures": 6.5 + elapsed * 0.1,
            "Anti-Corruption Training for Employees": 6.0 + elapsed * 0.1,
            "Certification": random.randint(0, 1),
            "Alignment with frameworks and disclosure practices": 7.5 + elapsed * 0.1,
            "Assurance of sustainability report": 6.5 + elapsed * 0.1,
        }
        # Add some randomness to simulate variation between companies
        for key in base_scores:
            if key not in _NO_NOISE_METRICS:
                base_scores[key] += np.random.uniform(-0.2, 0.2)  # Add small random variation

    # Normalise every company (Apple included) to the 0-1 scale. Clamping
    # first keeps declining trends such as "Fatalities" from going negative
    # in later years. "Certification" is a 0/1 flag and is left untouched.
    for key in base_scores:
        if key != "Certification":
            base_scores[key] = min(max(base_scores[key], 0.0), 10.0) / 10

    return base_scores

# Build one record per (company, year) pair, iterating company-major so that
# each company's rows stay together and in chronological order
data = [
    {"company": comp, "year": year, **generate_scores(comp, year)}
    for comp in company
    for year in years
]

# Assemble the records into a table
df = pd.DataFrame(data)

# Write the table to disk without the index column, then report the file name
df.to_csv("esg_dataset_2019_2024.csv", index=False)
print("Dataset saved to 'esg_dataset_2019_2024.csv'")

Dataset saved to 'esg_dataset_2019_2024.csv'
