In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Dataset source: https://www.kaggle.com/datasets/nicholasjhana/energy-consumption-generation-prices-and-weather


# Load the data

In [78]:
# Read both raw CSV files (energy first, weather second) into DataFrames
df_energy, df_weather = map(
    pd.read_csv,
    ('data/energy_dataset.csv', 'data/weather_features.csv'),
)

# Inspect the data

In [79]:
# Preview the first five rows of the energy data as plain text
print(df_energy.head(n=5))

                        time  generation biomass  \
0  2015-01-01 00:00:00+01:00               447.0   
1  2015-01-01 01:00:00+01:00               449.0   
2  2015-01-01 02:00:00+01:00               448.0   
3  2015-01-01 03:00:00+01:00               438.0   
4  2015-01-01 04:00:00+01:00               428.0   

   generation fossil brown coal/lignite  generation fossil coal-derived gas  \
0                                 329.0                                 0.0   
1                                 328.0                                 0.0   
2                                 323.0                                 0.0   
3                                 254.0                                 0.0   
4                                 187.0                                 0.0   

   generation fossil gas  generation fossil hard coal  generation fossil oil  \
0                 4844.0                       4821.0                  162.0   
1                 5196.0                       4755.

In [80]:
# Dimensions of the energy frame as a (rows, columns) tuple
df_energy.shape

(35064, 29)

In [81]:
# Summary of the energy frame: column names, non-null counts and dtypes
df_energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35064 entries, 0 to 35063
Data columns (total 29 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   time                                         35064 non-null  object 
 1   generation biomass                           35045 non-null  float64
 2   generation fossil brown coal/lignite         35046 non-null  float64
 3   generation fossil coal-derived gas           35046 non-null  float64
 4   generation fossil gas                        35046 non-null  float64
 5   generation fossil hard coal                  35046 non-null  float64
 6   generation fossil oil                        35045 non-null  float64
 7   generation fossil oil shale                  35046 non-null  float64
 8   generation fossil peat                       35046 non-null  float64
 9   generation geothermal                        35046 non-null  float64
 10

In [82]:
# Inspect the Python type of the first 'time' entry; a datetime is wanted, not a string
type(df_energy['time'].iloc[0])

str

- *'time'* is stored as a string and should be converted to a datetime type, since it holds timestamps.

# Missing Values 

In [83]:
# Number of missing values in each column
df_energy.isnull().sum(axis=0)

time                                               0
generation biomass                                19
generation fossil brown coal/lignite              18
generation fossil coal-derived gas                18
generation fossil gas                             18
generation fossil hard coal                       18
generation fossil oil                             19
generation fossil oil shale                       18
generation fossil peat                            18
generation geothermal                             18
generation hydro pumped storage aggregated     35064
generation hydro pumped storage consumption       19
generation hydro run-of-river and poundage        19
generation hydro water reservoir                  18
generation marine                                 19
generation nuclear                                17
generation other                                  18
generation other renewable                        18
generation solar                              

- Every record in *'generation hydro pumped storage aggregated'* and *'forecast wind offshore eday ahead'* is null. These columns carry no information, so both must be dropped entirely.

In [84]:
# For every row, count its empty columns, then tabulate how often each count occurs
print(df_energy.isnull().sum(axis='columns').value_counts())

2     35017
3        29
23       12
22        6
Name: count, dtype: int64


- Almost all rows (35,017 of 35,064) have exactly 2 null columns.

# Invalid Data

In [85]:
# Print the frequency of every distinct value, column by column,
# with a dashed rule after each column's table
for column in df_energy:
    print(df_energy[column].value_counts(), '-' * 20, sep='\n')

time
2015-01-01 00:00:00+01:00    1
2017-09-01 03:00:00+02:00    1
2017-08-31 21:00:00+02:00    1
2017-08-31 22:00:00+02:00    1
2017-08-31 23:00:00+02:00    1
                            ..
2016-05-01 19:00:00+02:00    1
2016-05-01 18:00:00+02:00    1
2016-05-01 17:00:00+02:00    1
2016-05-01 16:00:00+02:00    1
2018-12-31 23:00:00+01:00    1
Name: count, Length: 35064, dtype: int64
--------------------
generation biomass
361.0    321
362.0    318
351.0    310
358.0    305
359.0    305
        ... 
101.0      1
589.0      1
174.0      1
175.0      1
168.0      1
Name: count, Length: 423, dtype: int64
--------------------
generation fossil brown coal/lignite
0.0      10517
663.0      165
664.0      124
595.0      108
657.0      103
         ...  
144.0        1
39.0         1
87.0         1
41.0         1
35.0         1
Name: count, Length: 956, dtype: int64
--------------------
generation fossil coal-derived gas
0.0    35046
Name: count, dtype: int64
--------------------
generation fo

- The columns *'generation fossil coal-derived gas', 'generation fossil oil shale', 'generation fossil peat', 'generation geothermal', 'generation marine', 'generation wind offshore'* have zero as their only recorded value.

In [86]:
# How many entries are below zero in each numeric column
print((df_energy.select_dtypes(include=np.number) < 0).sum(axis=0))

generation biomass                             0
generation fossil brown coal/lignite           0
generation fossil coal-derived gas             0
generation fossil gas                          0
generation fossil hard coal                    0
generation fossil oil                          0
generation fossil oil shale                    0
generation fossil peat                         0
generation geothermal                          0
generation hydro pumped storage aggregated     0
generation hydro pumped storage consumption    0
generation hydro run-of-river and poundage     0
generation hydro water reservoir               0
generation marine                              0
generation nuclear                             0
generation other                               0
generation other renewable                     0
generation solar                               0
generation waste                               0
generation wind offshore                       0
generation wind onsh

- No numeric column contains negative values.

# Data Preparation

In [87]:
# Remove the columns in which every value is null
columns_to_drop = [
    'generation hydro pumped storage aggregated',
    'forecast wind offshore eday ahead',
]
# Rebind the name instead of mutating with inplace=True: in-place dropping
# brings no speed-up, prevents method chaining, and the later dropna step
# already uses plain reassignment.
df_energy = df_energy.drop(columns=columns_to_drop)

In [88]:
# Missing values per column, re-checked after removing the all-null columns
df_energy.isnull().sum(axis=0)

time                                            0
generation biomass                             19
generation fossil brown coal/lignite           18
generation fossil coal-derived gas             18
generation fossil gas                          18
generation fossil hard coal                    18
generation fossil oil                          19
generation fossil oil shale                    18
generation fossil peat                         18
generation geothermal                          18
generation hydro pumped storage consumption    19
generation hydro run-of-river and poundage     19
generation hydro water reservoir               18
generation marine                              19
generation nuclear                             17
generation other                               18
generation other renewable                     18
generation solar                               18
generation waste                               19
generation wind offshore                       18


In [89]:
# Per-row tally of empty columns, and how many rows share each tally
print(df_energy.isnull().sum(axis='columns').value_counts())

0     35017
1        29
21       12
20        6
Name: count, dtype: int64


In [90]:
# Remove the columns whose only recorded value is zero
columns_to_drop = [
    'generation fossil coal-derived gas',
    'generation fossil oil shale',
    'generation fossil peat',
    'generation geothermal',
    'generation marine',
    'generation wind offshore',
]
# Rebind the name instead of mutating with inplace=True: in-place dropping
# brings no speed-up, prevents method chaining, and the later dropna step
# already uses plain reassignment.
df_energy = df_energy.drop(columns=columns_to_drop)

In [91]:
# Per-row tally of empty columns, and how many rows share each tally
print(df_energy.isnull().sum(axis='columns').value_counts())

0     35018
1        28
15       12
14        6
Name: count, dtype: int64


In [92]:
# Discard every row that still has at least one missing value
df_energy = df_energy.dropna(axis=0, how='any')

In [94]:
# Per-row tally of empty columns; only a count of 0 should remain after dropna
print(df_energy.isnull().sum(axis='columns').value_counts())

0    35018
Name: count, dtype: int64


In [95]:
# Display the 'time' column before it is converted
df_energy.loc[:, 'time']

0        2015-01-01 00:00:00+01:00
1        2015-01-01 01:00:00+01:00
2        2015-01-01 02:00:00+01:00
3        2015-01-01 03:00:00+01:00
4        2015-01-01 04:00:00+01:00
                   ...            
35059    2018-12-31 19:00:00+01:00
35060    2018-12-31 20:00:00+01:00
35061    2018-12-31 21:00:00+01:00
35062    2018-12-31 22:00:00+01:00
35063    2018-12-31 23:00:00+01:00
Name: time, Length: 35018, dtype: object

In [96]:
# Convert the column from string to datetime.
# The strings carry two different UTC offsets (+01:00 and +02:00). Without
# utc=True pandas cannot build a single datetime64 column from mixed offsets
# (pandas 2.x falls back to an object column and emits a FutureWarning; later
# versions raise), so every timestamp is normalised to UTC while parsing.
# NOTE(review): if local wall-clock hours are needed downstream, follow up with
# .dt.tz_convert(<local zone>) - confirm the intended zone.
df_energy['time'] = pd.to_datetime(df_energy['time'], utc=True)

- All missing values, uninformative columns and invalid data types have now been handled.