## Import relevant libraries

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Report the directory against which the relative CSV paths in this notebook are resolved.
print("The current working directory is: ", os.getcwd())

The current working directory is:  c:\Users\30697\Desktop\Jobs\Job Descriptions\ML_assignment_2025


## Import and merge the original datasets, then drop unnecessary variables

In [3]:
# CO2 emission factor of the electricity mix (target Y): one CSV file per calendar year.
CO2factor_2021, CO2factor_2022, CO2factor_2023, CO2factor_2024 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "electriciteitsmix-2021-uur-data.csv",
        "electriciteitsmix-2022-uur-data.csv",
        "electriciteitsmix-2023-uur-data.csv",
        "electriciteitsmix-2024-uur-data.csv",
    )
)

In [4]:
# Print (rows, columns) of every yearly CO2 frame, in chronological order.
print(*(yearly.shape for yearly in (CO2factor_2021, CO2factor_2022, CO2factor_2023, CO2factor_2024)))

(8760, 13) (8760, 13) (8760, 13) (8784, 13)


In [5]:
# Stack the four yearly CO2 frames row-wise; ignore_index renumbers the rows from 0.
CO2factor = pd.concat(
    objs=[CO2factor_2021, CO2factor_2022, CO2factor_2023, CO2factor_2024],
    ignore_index=True,
)

In [6]:
CO2factor.shape  # (rows, columns) of the concatenated CO2 frame

(35064, 13)

In [7]:
CO2factor.head()  # preview the first five rows of the concatenated CO2 frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
0,2020-12-31 23:00:00,2021-01-01 00:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,10899485,10899485,0.605003,3939727,0.361425
1,2021-01-01 00:00:00,2021-01-01 01:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,10180546,10180546,0.565084,3646349,0.35814
2,2021-01-01 01:00:00,2021-01-01 02:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,9166898,9166898,0.508808,3201484,0.34921
3,2021-01-01 02:00:00,2021-01-01 03:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,8063011,8063011,0.447527,2739129,0.339635
4,2021-01-01 03:00:00,2021-01-01 04:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,7712362,7712362,0.428055,2552202,0.330915


In [8]:
CO2factor.tail()  # preview the last five rows of the concatenated CO2 frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
35059,2024-12-31 18:00:00,2024-12-31 19:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,14775988,14775988,0.395878,1401628,0.094857
35060,2024-12-31 19:00:00,2024-12-31 20:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,14406607,14406607,0.385977,1242798,0.086264
35061,2024-12-31 20:00:00,2024-12-31 21:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,14291198,14291198,0.382881,1189433,0.083229
35062,2024-12-31 21:00:00,2024-12-31 22:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,14137435,14137435,0.378757,1155729,0.081749
35063,2024-12-31 22:00:00,2024-12-31 23:00:00,Nederland,ElectricityMix,Hour,UTC,Providing,Current,13976718,13976718,0.374447,1084655,0.077602


In [9]:
print(CO2factor.nunique())  # distinct values per column; a count of 1 marks a constant column



validfrom (UTC)                35064
validto (UTC)                  35064
point                              1
type                               1
granularity                        1
timezone                           1
activity                           1
classification                     1
capacity (kW)                  35016
volume (kWh)                   35016
percentage                     34838
emission (kg CO2)              34942
emissionfactor (kg CO2/kWh)    34683
dtype: int64


In [10]:
# Reduce the CO2 frame to the interval start and the emission factor; every other
# column is removed because it is not part of the final dataset built in this notebook.
CO2factor = CO2factor.drop(
    columns=[
        'validto (UTC)',
        'point',
        'type',
        'granularity',
        'timezone',
        'activity',
        'classification',
        'capacity (kW)',
        'volume (kWh)',
        'percentage',
        'emission (kg CO2)',
    ]
)

In [11]:
# Short, code-friendly column names: 'starting_time' is the key shared by all frames,
# 'CO2factor' holds the emission factor in kg CO2 per kWh.
CO2factor = CO2factor.rename(
    {'validfrom (UTC)': 'starting_time', 'emissionfactor (kg CO2/kWh)': 'CO2factor'},
    axis='columns',
)

In [12]:
CO2factor.duplicated().sum()  # number of rows that exactly repeat an earlier row

np.int64(0)

In [13]:
CO2factor.isna().sum()  # number of missing values per column

starting_time    0
CO2factor        0
dtype: int64

In [14]:
CO2factor.head()  # preview the cleaned CO2 frame (starting_time, CO2factor)

Unnamed: 0,starting_time,CO2factor
0,2020-12-31 23:00:00,0.361425
1,2021-01-01 00:00:00,0.35814
2,2021-01-01 01:00:00,0.34921
3,2021-01-01 02:00:00,0.339635
4,2021-01-01 03:00:00,0.330915


In [15]:
# Solar generation (feature X_1): one CSV file per calendar year.
solar_2021, solar_2022, solar_2023, solar_2024 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "zon-2021-uur-data.csv",
        "zon-2022-uur-data.csv",
        "zon-2023-uur-data.csv",
        "zon-2024-uur-data.csv",
    )
)

In [16]:
# Print (rows, columns) of every yearly solar frame, in chronological order.
print(*(yearly.shape for yearly in (solar_2021, solar_2022, solar_2023, solar_2024)))

(8760, 13) (8760, 13) (8760, 13) (8784, 13)


In [17]:
# Stack the four yearly solar frames row-wise; ignore_index renumbers the rows from 0.
solar = pd.concat(
    objs=[solar_2021, solar_2022, solar_2023, solar_2024],
    ignore_index=True,
)

In [18]:
solar.shape  # (rows, columns) of the concatenated solar frame

(35064, 13)

In [19]:
solar.head(5)  # preview the first five rows of the concatenated solar frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
0,2020-12-31 23:00:00,2021-01-01 00:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
1,2021-01-01 00:00:00,2021-01-01 01:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
2,2021-01-01 01:00:00,2021-01-01 02:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
3,2021-01-01 02:00:00,2021-01-01 03:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
4,2021-01-01 03:00:00,2021-01-01 04:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0


In [20]:
solar.tail()  # preview the last five rows of the concatenated solar frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
35059,2024-12-31 18:00:00,2024-12-31 19:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
35060,2024-12-31 19:00:00,2024-12-31 20:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
35061,2024-12-31 20:00:00,2024-12-31 21:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
35062,2024-12-31 21:00:00,2024-12-31 22:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0
35063,2024-12-31 22:00:00,2024-12-31 23:00:00,Nederland,Solar,Hour,UTC,Providing,Current,0,0,0.0,0,0


In [21]:
print(solar.nunique())  # distinct values per column; a count of 1 marks a constant column



validfrom (UTC)                35064
validto (UTC)                  35064
point                              1
type                               1
granularity                        1
timezone                           1
activity                           1
classification                     1
capacity (kW)                  19123
volume (kWh)                   19123
percentage                     19149
emission (kg CO2)                  1
emissionfactor (kg CO2/kWh)        1
dtype: int64


In [22]:
# Clean the solar dataset: keep only the interval start and the generated volume (kWh).
# The columns to remove are passed as a list, the list-like form that
# DataFrame.drop documents for `columns`.
solar = solar.drop(
    columns=[
        'validto (UTC)',
        'point',
        'type',
        'granularity',
        'timezone',
        'activity',
        'classification',
        'capacity (kW)',
        'emissionfactor (kg CO2/kWh)',
        'percentage',
        'emission (kg CO2)',
    ]
)

In [23]:
# Short, code-friendly column names: 'starting_time' is the key shared by all frames,
# 'solar' holds the solar generation volume in kWh.
solar = solar.rename(
    {'validfrom (UTC)': 'starting_time', 'volume (kWh)': 'solar'},
    axis='columns',
)

In [24]:
solar.duplicated().sum()  # number of rows that exactly repeat an earlier row

np.int64(0)

In [25]:
solar.isna().sum()  # number of missing values per column

starting_time    0
solar            0
dtype: int64

In [26]:
solar.head()  # preview the cleaned solar frame (starting_time, solar)

Unnamed: 0,starting_time,solar
0,2020-12-31 23:00:00,0
1,2021-01-01 00:00:00,0
2,2021-01-01 01:00:00,0
3,2021-01-01 02:00:00,0
4,2021-01-01 03:00:00,0


In [27]:
# Onshore wind generation (feature X_2): one CSV file per calendar year.
wind_onshore_2021, wind_onshore_2022, wind_onshore_2023, wind_onshore_2024 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "wind-2021-uur-data.csv",
        "wind-2022-uur-data.csv",
        "wind-2023-uur-data.csv",
        "wind-2024-uur-data.csv",
    )
)

In [28]:
# Print (rows, columns) of every yearly onshore-wind frame, in chronological order.
print(*(yearly.shape for yearly in (wind_onshore_2021, wind_onshore_2022, wind_onshore_2023, wind_onshore_2024)))

(8760, 13) (8760, 13) (8760, 13) (8784, 13)


In [29]:
# Stack the four yearly onshore-wind frames row-wise; ignore_index renumbers the rows from 0.
landwind = pd.concat(
    objs=[wind_onshore_2021, wind_onshore_2022, wind_onshore_2023, wind_onshore_2024],
    ignore_index=True,
)

In [30]:
landwind.shape  # (rows, columns) of the concatenated onshore-wind frame

(35064, 13)

In [31]:
landwind.head()  # preview the first five rows of the concatenated onshore-wind frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
0,2020-12-31 23:00:00,2021-01-01 00:00:00,Nederland,Wind,Hour,UTC,Providing,Current,82894,82894,0.023317,0,0
1,2021-01-01 00:00:00,2021-01-01 01:00:00,Nederland,Wind,Hour,UTC,Providing,Current,95906,95906,0.026977,0,0
2,2021-01-01 01:00:00,2021-01-01 02:00:00,Nederland,Wind,Hour,UTC,Providing,Current,122117,122117,0.034349,0,0
3,2021-01-01 02:00:00,2021-01-01 03:00:00,Nederland,Wind,Hour,UTC,Providing,Current,87353,87353,0.024571,0,0
4,2021-01-01 03:00:00,2021-01-01 04:00:00,Nederland,Wind,Hour,UTC,Providing,Current,59687,59687,0.016788,0,0


In [32]:
landwind.tail()  # preview the last five rows of the concatenated onshore-wind frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
35059,2024-12-31 18:00:00,2024-12-31 19:00:00,Nederland,Wind,Hour,UTC,Providing,Current,6375458,6375458,0.973078,0,0
35060,2024-12-31 19:00:00,2024-12-31 20:00:00,Nederland,Wind,Hour,UTC,Providing,Current,6448966,6448966,0.984291,0,0
35061,2024-12-31 20:00:00,2024-12-31 21:00:00,Nederland,Wind,Hour,UTC,Providing,Current,6472458,6472458,0.987871,0,0
35062,2024-12-31 21:00:00,2024-12-31 22:00:00,Nederland,Wind,Hour,UTC,Providing,Current,6486790,6486790,0.990052,0,0
35063,2024-12-31 22:00:00,2024-12-31 23:00:00,Nederland,Wind,Hour,UTC,Providing,Current,6499171,6499171,0.991935,0,0


In [33]:
print(landwind.nunique())  # distinct values per column; a count of 1 marks a constant column



validfrom (UTC)                35064
validto (UTC)                  35064
point                              1
type                               1
granularity                        1
timezone                           1
activity                           1
classification                     1
capacity (kW)                  34056
volume (kWh)                   34056
percentage                     34105
emission (kg CO2)                  1
emissionfactor (kg CO2/kWh)        1
dtype: int64


In [34]:
# Clean the onshore-wind dataset: keep only the interval start and the generated volume (kWh).
# The columns to remove are passed as a list, the list-like form that
# DataFrame.drop documents for `columns`.
landwind = landwind.drop(
    columns=[
        'validto (UTC)',
        'point',
        'type',
        'granularity',
        'timezone',
        'activity',
        'classification',
        'capacity (kW)',
        'emissionfactor (kg CO2/kWh)',
        'percentage',
        'emission (kg CO2)',
    ]
)

In [35]:
# Short, code-friendly column names: 'starting_time' is the key shared by all frames,
# 'landwind' holds the onshore wind generation volume in kWh.
landwind = landwind.rename(
    {'validfrom (UTC)': 'starting_time', 'volume (kWh)': 'landwind'},
    axis='columns',
)

In [36]:
landwind.duplicated().sum()  # number of rows that exactly repeat an earlier row

np.int64(0)

In [37]:
landwind.isna().sum()  # number of missing values per column

starting_time    0
landwind         0
dtype: int64

In [38]:
landwind.head()  # preview the cleaned onshore-wind frame (starting_time, landwind)

Unnamed: 0,starting_time,landwind
0,2020-12-31 23:00:00,82894
1,2021-01-01 00:00:00,95906
2,2021-01-01 01:00:00,122117
3,2021-01-01 02:00:00,87353
4,2021-01-01 03:00:00,59687


In [39]:
# Offshore wind generation (feature X_3): one CSV file per calendar year.
seawind_2021, seawind_2022, seawind_2023, seawind_2024 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "zeewind-2021-uur-data.csv",
        "zeewind-2022-uur-data.csv",
        "zeewind-2023-uur-data.csv",
        "zeewind-2024-uur-data.csv",
    )
)

In [40]:
# Print (rows, columns) of every yearly offshore-wind frame, in chronological order.
print(*(yearly.shape for yearly in (seawind_2021, seawind_2022, seawind_2023, seawind_2024)))

(8760, 13) (8760, 13) (8760, 13) (8784, 13)


In [41]:
# NOTE(review): this cell repeats the shape check from the previous cell; it can be
# deleted without affecting any later cell.
print(seawind_2021.shape, seawind_2022.shape, seawind_2023.shape, seawind_2024.shape)  # Check the shapes of the datasets

(8760, 13) (8760, 13) (8760, 13) (8784, 13)


In [42]:
# Stack the four yearly offshore-wind frames row-wise; ignore_index renumbers the rows from 0.
seawind = pd.concat(
    objs=[seawind_2021, seawind_2022, seawind_2023, seawind_2024],
    ignore_index=True,
)

In [43]:
seawind.shape  # (rows, columns) of the concatenated offshore-wind frame

(35064, 13)

In [44]:
seawind.head()  # preview the first five rows of the concatenated offshore-wind frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
0,2020-12-31 23:00:00,2021-01-01 00:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,256749,256749,0.116869,0,0
1,2021-01-01 00:00:00,2021-01-01 01:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,325500,325500,0.148163,0,0
2,2021-01-01 01:00:00,2021-01-01 02:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,332500,332500,0.151349,0,0
3,2021-01-01 02:00:00,2021-01-01 03:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,322749,322749,0.146911,0,0
4,2021-01-01 03:00:00,2021-01-01 04:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,351750,351750,0.160112,0,0


In [45]:
seawind.tail()  # preview the last five rows of the concatenated offshore-wind frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
35059,2024-12-31 18:00:00,2024-12-31 19:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,3711000,3711000,0.892497,0,0
35060,2024-12-31 19:00:00,2024-12-31 20:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,3718000,3718000,0.89418,0,0
35061,2024-12-31 20:00:00,2024-12-31 21:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,3744500,3744500,0.900553,0,0
35062,2024-12-31 21:00:00,2024-12-31 22:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,3686750,3686750,0.886664,0,0
35063,2024-12-31 22:00:00,2024-12-31 23:00:00,Nederland,WindOffshoreC,Hour,UTC,Providing,Current,3719250,3719250,0.894481,0,0


In [46]:
print(seawind.nunique())  # distinct values per column; a count of 1 marks a constant column



validfrom (UTC)                35064
validto (UTC)                  35064
point                              1
type                               1
granularity                        1
timezone                           1
activity                           1
classification                     1
capacity (kW)                  19861
volume (kWh)                   19861
percentage                     30932
emission (kg CO2)                  1
emissionfactor (kg CO2/kWh)        1
dtype: int64


In [47]:
# Clean the offshore-wind dataset: keep only the interval start and the generated volume (kWh).
# The columns to remove are passed as a list, the list-like form that
# DataFrame.drop documents for `columns`.
seawind = seawind.drop(
    columns=[
        'validto (UTC)',
        'point',
        'type',
        'granularity',
        'timezone',
        'activity',
        'classification',
        'capacity (kW)',
        'emissionfactor (kg CO2/kWh)',
        'percentage',
        'emission (kg CO2)',
    ]
)

In [48]:
# Short, code-friendly column names: 'starting_time' is the key shared by all frames,
# 'seawind' holds the offshore wind generation volume in kWh.
seawind = seawind.rename(
    {'validfrom (UTC)': 'starting_time', 'volume (kWh)': 'seawind'},
    axis='columns',
)

In [49]:
seawind.duplicated().sum()  # number of rows that exactly repeat an earlier row

np.int64(0)

In [50]:
seawind.isna().sum()  # number of missing values per column

starting_time    0
seawind          0
dtype: int64

In [51]:
seawind.head()  # preview the cleaned offshore-wind frame (starting_time, seawind)

Unnamed: 0,starting_time,seawind
0,2020-12-31 23:00:00,256749
1,2021-01-01 00:00:00,325500
2,2021-01-01 01:00:00,332500
3,2021-01-01 02:00:00,322749
4,2021-01-01 03:00:00,351750


In [52]:
# Biomass generation (feature X_4): one CSV file per calendar year.
biomass_2021, biomass_2022, biomass_2023, biomass_2024 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "biomassa-2021-uur-data.csv",
        "biomassa-2022-uur-data.csv",
        "biomassa-2023-uur-data.csv",
        "biomassa-2024-uur-data.csv",
    )
)

In [53]:
# Print (rows, columns) of every yearly biomass frame, in chronological order.
print(*(yearly.shape for yearly in (biomass_2021, biomass_2022, biomass_2023, biomass_2024)))

(8760, 13) (8760, 13) (8760, 13) (8784, 13)


In [54]:
# Stack the four yearly biomass frames row-wise; ignore_index renumbers the rows from 0.
biomass = pd.concat(
    objs=[biomass_2021, biomass_2022, biomass_2023, biomass_2024],
    ignore_index=True,
)

In [55]:
biomass.shape  # (rows, columns) of the concatenated biomass frame

(35064, 13)

In [56]:
biomass.head()  # preview the first five rows of the concatenated biomass frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
0,2020-12-31 23:00:00,2021-01-01 00:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,662593,662593,0.662594,0,0
1,2021-01-01 00:00:00,2021-01-01 01:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,675321,675321,0.675321,0,0
2,2021-01-01 01:00:00,2021-01-01 02:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,588699,588699,0.588699,0,0
3,2021-01-01 02:00:00,2021-01-01 03:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,487599,487599,0.487599,0,0
4,2021-01-01 03:00:00,2021-01-01 04:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,392943,392943,0.392943,0,0


In [57]:
biomass.tail()  # preview the last five rows of the concatenated biomass frame

Unnamed: 0,validfrom (UTC),validto (UTC),point,type,granularity,timezone,activity,classification,capacity (kW),volume (kWh),percentage,emission (kg CO2),emissionfactor (kg CO2/kWh)
35059,2024-12-31 18:00:00,2024-12-31 19:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,70087,70087,0.070088,0,0
35060,2024-12-31 19:00:00,2024-12-31 20:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,70205,70205,0.070205,0,0
35061,2024-12-31 20:00:00,2024-12-31 21:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,70264,70264,0.070264,0,0
35062,2024-12-31 21:00:00,2024-12-31 22:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,70263,70263,0.070264,0,0
35063,2024-12-31 22:00:00,2024-12-31 23:00:00,Nederland,BiomassPower,Hour,UTC,Providing,Current,70146,70146,0.070147,0,0


In [58]:
print(biomass.nunique())  # distinct values per column; a count of 1 marks a constant column



validfrom (UTC)                35064
validto (UTC)                  35064
point                              1
type                               1
granularity                        1
timezone                           1
activity                           1
classification                     1
capacity (kW)                  29626
volume (kWh)                   29626
percentage                     30076
emission (kg CO2)                  1
emissionfactor (kg CO2/kWh)        1
dtype: int64


In [59]:
# Clean the biomass dataset: keep only the interval start and the generated volume (kWh).
# The columns to remove are passed as a list, the list-like form that
# DataFrame.drop documents for `columns`.
biomass = biomass.drop(
    columns=[
        'validto (UTC)',
        'point',
        'type',
        'granularity',
        'timezone',
        'activity',
        'classification',
        'capacity (kW)',
        'emissionfactor (kg CO2/kWh)',
        'percentage',
        'emission (kg CO2)',
    ]
)

In [60]:
# Short, code-friendly column names: 'starting_time' is the key shared by all frames,
# 'biomass' holds the biomass generation volume in kWh.
biomass = biomass.rename(
    {'validfrom (UTC)': 'starting_time', 'volume (kWh)': 'biomass'},
    axis='columns',
)

In [61]:
biomass.duplicated().sum()  # number of rows that exactly repeat an earlier row

np.int64(0)

In [62]:
biomass.isna().sum()  # number of missing values per column

starting_time    0
biomass          0
dtype: int64

In [63]:
biomass.head()  # preview the cleaned biomass frame (starting_time, biomass)

Unnamed: 0,starting_time,biomass
0,2020-12-31 23:00:00,662593
1,2021-01-01 00:00:00,675321
2,2021-01-01 01:00:00,588699
3,2021-01-01 02:00:00,487599
4,2021-01-01 03:00:00,392943


In [64]:
# Join the target (CO2factor) with the four generation features on the shared
# 'starting_time' key.
#   how='outer'            keeps a timestamp that is missing from one of the sources,
#                          so a gap shows up as NaN instead of a silently dropped row.
#   validate='one_to_one'  raises pandas.errors.MergeError if a timestamp occurs more
#                          than once on either side, which would otherwise multiply rows.
energy = (
    CO2factor
    .merge(solar, on='starting_time', how='outer', validate='one_to_one')     # + solar
    .merge(landwind, on='starting_time', how='outer', validate='one_to_one')  # + onshore wind
    .merge(seawind, on='starting_time', how='outer', validate='one_to_one')   # + offshore wind
    .merge(biomass, on='starting_time', how='outer', validate='one_to_one')   # + biomass
)

In [65]:
energy.shape  # (rows, columns) of the merged frame: the key plus one column per source

(35064, 6)

In [66]:
energy.head(5)  # preview the first five rows of the merged frame

Unnamed: 0,starting_time,CO2factor,solar,landwind,seawind,biomass
0,2020-12-31 23:00:00,0.361425,0,82894,256749,662593
1,2021-01-01 00:00:00,0.35814,0,95906,325500,675321
2,2021-01-01 01:00:00,0.34921,0,122117,332500,588699
3,2021-01-01 02:00:00,0.339635,0,87353,322749,487599
4,2021-01-01 03:00:00,0.330915,0,59687,351750,392943


In [67]:
energy.tail()  # preview the last five rows of the merged frame

Unnamed: 0,starting_time,CO2factor,solar,landwind,seawind,biomass
35059,2024-12-31 18:00:00,0.094857,0,6375458,3711000,70087
35060,2024-12-31 19:00:00,0.086264,0,6448966,3718000,70205
35061,2024-12-31 20:00:00,0.083229,0,6472458,3744500,70264
35062,2024-12-31 21:00:00,0.081749,0,6486790,3686750,70263
35063,2024-12-31 22:00:00,0.077602,0,6499171,3719250,70146


In [68]:
energy.nunique()  # distinct values per column of the merged frame

starting_time    35064
CO2factor        34683
solar            19123
landwind         34056
seawind          19861
biomass          29626
dtype: int64

In [69]:
energy.dtypes  # data type of each column; 'starting_time' is still stored as strings (object) here

starting_time     object
CO2factor        float64
solar              int64
landwind           int64
seawind            int64
biomass            int64
dtype: object

In [70]:
energy.isna().sum()  # missing values per column; an outer merge fills unmatched timestamps with NaN

starting_time    0
CO2factor        0
solar            0
landwind         0
seawind          0
biomass          0
dtype: int64

In [71]:
# Parse the timestamp strings into datetime64 values. No timezone is attached:
# the values stay timezone-naive (the source column is labelled UTC).
energy['starting_time'] = energy['starting_time'].pipe(pd.to_datetime)

### Save the cleaned dataset

In [None]:
# Write the merged frame to an Excel workbook in the working directory;
# index=False leaves the integer row index out of the file.
energy.to_excel(excel_writer="energy_data.xlsx", index=False)
