# ETL: Extract, Transform, Load

Download a dataset from [Kaggle](https://kaggle.com/) to develop this project.

**Note**: The dataset must contain at least:

- 50 rows
- 2 categorical columns
- 2 numerical columns

## Load data

In [1]:
import pandas as pd

# Load the raw CSV export into a DataFrame; the bare last expression renders
# it as the cell output.
df_base = pd.read_csv(filepath_or_buffer='../data/p48_2015_2022_EN.csv')
df_base

Unnamed: 0,datetime_utc,Hydraulic,Adjustment P48,Balance Andorra,Balance Morocco,Balance France,Balance Portugal,Other Renewables,Non-Renewable Waste,Cogeneration,Wind,Fuel-Gas,Coal,Balearic Link,Pumping Consumption,Solar Thermal,Solar Photovoltaic,Combined Cycle,Pumping Turbine,Nuclear
0,2015-01-01 00:00:00+01:00,2621.90,1300.0,-36.0,-540.0,-1000.0,291.7,486.4,,1586.9,5517.400,421.6,5077.9,-128.0,-850.000,16.0,,3458.10,,7105.00
1,2015-01-01 01:00:00+01:00,2532.30,1000.0,-35.0,-600.0,-1100.0,-44.1,486.4,,1591.2,5034.400,422.3,5086.3,-102.0,-850.000,16.0,,3789.50,,7104.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70126,2022-12-31 22:00:00+01:00,4055.50,-200.0,-67.0,-740.0,-1715.8,3355.0,259.2,264.7,760.6,8871.225,,260.0,-153.0,-1846.700,,1.7,1725.00,689.6000,6457.75
70127,2022-12-31 23:00:00+01:00,3963.95,-200.0,-62.0,-750.0,-258.0,2755.0,257.2,264.9,761.6,7506.300,,260.0,-128.0,-2632.025,,1.6,2104.55,240.9125,6415.45


## Clean and preprocess data

In [2]:
# Parse the timestamp strings into a single tz-aware UTC dtype, promote them to
# the index, and express the index in Madrid local time. The index keeps the
# name `datetime_utc` even though its values are displayed with the local offset.
# NOTE(review): presumably the file mixes +01:00 and +02:00 offsets across DST,
# which is why `utc=True` is needed -- confirm against the raw CSV.
#
# The guard makes the cell safe to re-run: after the first execution
# `datetime_utc` is no longer a column, so repeating the steps would raise
# KeyError. On a fresh kernel the index is not a DatetimeIndex yet, so the
# transformation runs exactly once.
if not isinstance(df_base.index, pd.DatetimeIndex):
    df_base = (
        df_base
        # `assign` builds a new frame instead of mutating the one displayed above.
        .assign(datetime_utc=lambda frame: pd.to_datetime(frame['datetime_utc'], utc=True))
        .set_index('datetime_utc')
        .tz_convert('Europe/Madrid')
    )
df_base

Unnamed: 0_level_0,Hydraulic,Adjustment P48,Balance Andorra,Balance Morocco,Balance France,Balance Portugal,Other Renewables,Non-Renewable Waste,Cogeneration,Wind,Fuel-Gas,Coal,Balearic Link,Pumping Consumption,Solar Thermal,Solar Photovoltaic,Combined Cycle,Pumping Turbine,Nuclear
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 00:00:00+01:00,2621.90,1300.0,-36.0,-540.0,-1000.0,291.7,486.4,,1586.9,5517.400,421.6,5077.9,-128.0,-850.000,16.0,,3458.10,,7105.00
2015-01-01 01:00:00+01:00,2532.30,1000.0,-35.0,-600.0,-1100.0,-44.1,486.4,,1591.2,5034.400,422.3,5086.3,-102.0,-850.000,16.0,,3789.50,,7104.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 22:00:00+01:00,4055.50,-200.0,-67.0,-740.0,-1715.8,3355.0,259.2,264.7,760.6,8871.225,,260.0,-153.0,-1846.700,,1.7,1725.00,689.6000,6457.75
2022-12-31 23:00:00+01:00,3963.95,-200.0,-62.0,-750.0,-258.0,2755.0,257.2,264.9,761.6,7506.300,,260.0,-128.0,-2632.025,,1.6,2104.55,240.9125,6415.45


## Temporal columns

### Create

In [3]:
# Short alias for the frame's index; the following cell reads its datetime
# fields (`year`, `month`, ...) and reuses it as the index of `df_time`.
s = df_base.index

In [4]:
# Calendar fields read from the index `s`, one column per field, aligned on
# the same index as the source frame.
df_time = pd.DataFrame(
    {field: getattr(s, field) for field in ('year', 'month', 'day', 'hour', 'weekday')},
    index=s,
)

# pandas numbers weekdays Monday=0 ... Sunday=6, so values 5 and 6 are
# Saturday and Sunday; store the flag as 0/1 integers.
df_time['weekend'] = df_time['weekday'].ge(5).astype(int)
df_time

Unnamed: 0_level_0,year,month,day,hour,weekday,weekend
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01 00:00:00+01:00,2015,1,1,0,3,0
2015-01-01 01:00:00+01:00,2015,1,1,1,3,0
...,...,...,...,...,...,...
2022-12-31 22:00:00+01:00,2022,12,31,22,5,1
2022-12-31 23:00:00+01:00,2022,12,31,23,5,1


### Concatenate `DataFrames`

In [5]:
# Place the temporal columns to the left of the measurement columns; both
# frames share the same index, so rows line up one-to-one.
df = pd.concat(objs=[df_time, df_base], axis='columns')
df

Unnamed: 0_level_0,year,month,day,hour,weekday,weekend,Hydraulic,Adjustment P48,Balance Andorra,Balance Morocco,...,Wind,Fuel-Gas,Coal,Balearic Link,Pumping Consumption,Solar Thermal,Solar Photovoltaic,Combined Cycle,Pumping Turbine,Nuclear
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00+01:00,2015,1,1,0,3,0,2621.90,1300.0,-36.0,-540.0,...,5517.400,421.6,5077.9,-128.0,-850.000,16.0,,3458.10,,7105.00
2015-01-01 01:00:00+01:00,2015,1,1,1,3,0,2532.30,1000.0,-35.0,-600.0,...,5034.400,422.3,5086.3,-102.0,-850.000,16.0,,3789.50,,7104.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 22:00:00+01:00,2022,12,31,22,5,1,4055.50,-200.0,-67.0,-740.0,...,8871.225,,260.0,-153.0,-1846.700,,1.7,1725.00,689.6000,6457.75
2022-12-31 23:00:00+01:00,2022,12,31,23,5,1,3963.95,-200.0,-62.0,-750.0,...,7506.300,,260.0,-128.0,-2632.025,,1.6,2104.55,240.9125,6415.45


### Set temporal columns as `index`

In [6]:
# Move the six temporal columns into a MultiIndex, replacing the current index.
# NOTE(review): the previous timestamp index is discarded here, and the
# local-time fields may repeat during the autumn DST fall-back hour, so the
# resulting MultiIndex is possibly not unique -- confirm that consumers of the
# exported file do not rely on unique keys.
temporal_levels = ['year', 'month', 'day', 'hour', 'weekday', 'weekend']

# Guard so that re-running the cell is a no-op: once the columns live in the
# index, a second `set_index` on the same names would raise KeyError.
if not set(temporal_levels).issubset(df.index.names):
    df = df.set_index(temporal_levels)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Hydraulic,Adjustment P48,Balance Andorra,Balance Morocco,Balance France,Balance Portugal,Other Renewables,Non-Renewable Waste,Cogeneration,Wind,Fuel-Gas,Coal,Balearic Link,Pumping Consumption,Solar Thermal,Solar Photovoltaic,Combined Cycle,Pumping Turbine,Nuclear
year,month,day,hour,weekday,weekend,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2015,1,1,0,3,0,2621.90,1300.0,-36.0,-540.0,-1000.0,291.7,486.4,,1586.9,5517.400,421.6,5077.9,-128.0,-850.000,16.0,,3458.10,,7105.00
2015,1,1,1,3,0,2532.30,1000.0,-35.0,-600.0,-1100.0,-44.1,486.4,,1591.2,5034.400,422.3,5086.3,-102.0,-850.000,16.0,,3789.50,,7104.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,12,31,22,5,1,4055.50,-200.0,-67.0,-740.0,-1715.8,3355.0,259.2,264.7,760.6,8871.225,,260.0,-153.0,-1846.700,,1.7,1725.00,689.6000,6457.75
2022,12,31,23,5,1,3963.95,-200.0,-62.0,-750.0,-258.0,2755.0,257.2,264.9,761.6,7506.300,,260.0,-128.0,-2632.025,,1.6,2104.55,240.9125,6415.45


## Export to `parquet`

In [7]:
# Persist the frame, including its index, as a Parquet file.
df.to_parquet(path='../data/p48_2015_2022_EN.parquet')