#Get Solar Energy data

In [None]:
import pandas as pd
import requests
from datetime import datetime, date
from dateutil.relativedelta import relativedelta

class WeatherEnergy:
    """Download hourly weather for one city plus regional energy production.

    Weather is read from the Open-Meteo geocoding and archive endpoints, energy
    production from the two ODRE eco2mix regional exports. ``merged`` joins the
    weather columns with the solar production column on their timestamp.

    Args:
        limit: Sent as the ``limit`` query parameter of the ODRE exports.
        offset: Sent as the ``offset`` query parameter of the ODRE exports.
        refine: Region name, sent as ``refine=libelle_region:<refine>``.
        city1: Name of the city whose weather history is downloaded.
        years: Number of years before today at which the weather history starts.
    """

    GEOCODING_URL = 'https://geocoding-api.open-meteo.com/v1/search'
    ARCHIVE_URL = 'https://archive-api.open-meteo.com/v1/archive'

    # Historical export first (2013-2022), then the recent one (2022-today).
    ENERGY_URLS = (
        'https://odre.opendatasoft.com/api/v2/catalog/datasets/eco2mix-regional-cons-def/exports/json',
        'https://odre.opendatasoft.com/api/v2/catalog/datasets/eco2mix-regional-tr/exports/json',
    )

    # Production column kept by ``merged``.
    ENERGY_COLUMN = 'solaire'

    # Hourly variables requested from the weather archive.
    WEATHER_PARAMS = [
        'temperature_2m', 'relativehumidity_2m', 'dewpoint_2m',
        'apparent_temperature', 'pressure_msl', 'surface_pressure',
        'precipitation', 'rain', 'snowfall', 'cloudcover',
        'cloudcover_low', 'cloudcover_mid', 'cloudcover_high',
        'shortwave_radiation', 'direct_radiation', 'direct_normal_irradiance',
        'diffuse_radiation', 'windspeed_10m', 'windspeed_100m',
        'winddirection_10m', 'winddirection_100m', 'windgusts_10m',
        'et0_fao_evapotranspiration', 'weathercode', 'vapor_pressure_deficit',
        'soil_temperature_0_to_7cm', 'soil_temperature_7_to_28cm',
        'soil_temperature_28_to_100cm', 'soil_temperature_100_to_255cm',
        'soil_moisture_0_to_7cm', 'soil_moisture_7_to_28cm',
        'soil_moisture_28_to_100cm', 'soil_moisture_100_to_255cm',
    ]

    def __init__(self, limit, offset, refine, city1, years=10):
        self.city1 = city1
        self.years = years
        self.limit = limit
        self.offset = offset
        self.refine = refine

    @staticmethod
    def _get_json(url, params):
        """GET ``url`` with ``params`` and return the decoded JSON body.

        Raises:
            requests.HTTPError: If the server answers with an error status, so
                a failed request is reported as such instead of surfacing later
                as a missing key.
        """
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json()

    def _coordinates(self, city):
        """Return ``(latitude, longitude)`` of the first geocoding hit for ``city``.

        Raises:
            KeyError: If the geocoding service returns no result for ``city``.
        """
        results = self._get_json(self.GEOCODING_URL, {'name': city}).get('results')
        if not results:
            raise KeyError(f'no geocoding result for city {city!r}')
        return results[0]['latitude'], results[0]['longitude']

    def _date_range(self):
        """Return ``(start_date, end_date)`` as ``YYYY-MM-DD`` strings."""
        today = date.today()
        # Stop 8 days before today because the archive lags behind real time.
        # NOTE(review): the original comment said 9 days - confirm the lag.
        end_date = (today - relativedelta(days=8)).strftime('%Y-%m-%d')
        start_date = (today - relativedelta(years=self.years)).strftime('%Y-%m-%d')
        return start_date, end_date

    @staticmethod
    def _hourly_to_frame(payload, weather_params):
        """Turn the ``'hourly'`` section of an archive response into a frame.

        The result has one column per entry of ``weather_params`` and is indexed
        by the parsed ``time`` column.
        """
        frame = pd.DataFrame(payload['hourly'], columns=['time'] + list(weather_params))
        # The timestamps carry an hour component, so let pandas infer the
        # format: a date-only format string is rejected by pandas 2.x.
        frame['time'] = pd.to_datetime(frame['time'])
        return frame.set_index('time')

    def _fetch_city_weather(self, city, start_date, end_date):
        """Download the hourly weather of ``city`` between the two dates."""
        latitude, longitude = self._coordinates(city)
        payload = self._get_json(
            self.ARCHIVE_URL,
            {
                'latitude': latitude,
                'longitude': longitude,
                'start_date': start_date,
                'end_date': end_date,
                'hourly': self.WEATHER_PARAMS,
                'timezone': 'auto',
            },
        )
        return self._hourly_to_frame(payload, self.WEATHER_PARAMS)

    def get_weather(self):
        """Return the hourly weather history of ``city1``.

        The history runs from ``years`` years ago to 8 days before today.

        Returns:
            DataFrame indexed by ``time`` with one column per weather variable.

        Raises:
            requests.HTTPError: If one of the HTTP requests fails.
            KeyError: If the city cannot be geocoded.
        """
        start_date, end_date = self._date_range()
        return self._fetch_city_weather(self.city1, start_date, end_date)

    def get_energy_production(self):
        """Return the energy production records of the configured region.

        Both ODRE exports are downloaded with the same ``limit``, ``offset`` and
        region filter and stacked, keeping only the columns present in both.

        Returns:
            DataFrame sorted and indexed by ``time``, built from the ``date``
            and ``heure`` columns.

        Raises:
            requests.HTTPError: If one of the HTTP requests fails.
        """
        params = {
            'limit': self.limit,
            'offset': self.offset,
            'refine': f'libelle_region:{self.refine}',
        }
        frames = [pd.DataFrame(self._get_json(url, params)) for url in self.ENERGY_URLS]

        # join="inner" keeps the intersection of the columns. The row labels are
        # renumbered because they are replaced by the timestamp index below.
        energy_production_df = pd.concat(frames, sort=False, join="inner", ignore_index=True)

        # Build one timestamp from the separate date and hour columns so the
        # index is comparable with the weather index.
        timestamps = pd.to_datetime(
            energy_production_df['date'] + ' ' + energy_production_df['heure']
        )
        energy_production_df.insert(0, "time", timestamps)
        return energy_production_df.sort_values('time').set_index('time')

    def merged(self):
        """Join the weather history with the solar production on the timestamp.

        Returns:
            DataFrame holding every weather column plus ``solaire`` for the
            timestamps present in both sources.
        """
        weather_df = self.get_weather()
        energy_production_df = self.get_energy_production()
        return pd.merge(
            weather_df,
            energy_production_df[self.ENERGY_COLUMN],
            left_index=True,
            right_index=True,
        )

In [None]:
# Solar setup: energy for the 'Nouvelle-Aquitaine' region, weather taken at 'Cestas', 10 years back.
# limit=-1 presumably asks the export for every record - TODO confirm against the ODRE API.
data = WeatherEnergy(limit=-1, offset=0, refine='Nouvelle-Aquitaine', city1='Cestas', years=10)

In [None]:
# Download weather and production data (network access required) and join them on the timestamp.
solar_energy = data.merged()
# Display the merged frame.
solar_energy

Unnamed: 0_level_0,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,...,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,solaire
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-14 00:00:00,1.9,84,-0.6,-0.8,1015.2,1009.6,0.0,0.0,0.0,3,...,0.11,3.4,5.7,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 01:00:00,1.5,86,-0.6,-1.5,1015.3,1009.6,0.0,0.0,0.0,8,...,0.09,3.0,5.6,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 02:00:00,1.2,87,-0.7,-1.9,1015.3,1009.6,0.0,0.0,0.0,8,...,0.09,2.8,5.4,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 03:00:00,0.9,88,-0.9,-2.7,1015.2,1009.5,0.0,0.0,0.0,3,...,0.08,2.5,5.3,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 04:00:00,0.4,89,-1.2,-3.5,1014.7,1009.0,0.0,0.0,0.0,2,...,0.07,2.1,5.1,8.4,8.7,0.228,0.232,0.240,0.252,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-06 19:00:00,6.7,71,1.8,4.8,1010.9,1005.4,0.0,0.0,0.0,54,...,0.29,6.9,6.6,7.5,9.8,0.222,0.233,0.254,0.237,62.0
2023-03-06 20:00:00,6.4,73,1.9,3.7,1011.1,1005.6,0.0,0.0,0.0,56,...,0.26,6.4,6.6,7.5,9.8,0.222,0.233,0.254,0.237,0.0
2023-03-06 21:00:00,5.3,78,1.7,2.1,1011.0,1005.4,0.0,0.0,0.0,41,...,0.20,5.8,6.6,7.5,9.8,0.222,0.232,0.254,0.237,0.0
2023-03-06 22:00:00,4.4,81,1.5,1.3,1010.9,1005.3,0.0,0.0,0.0,23,...,0.16,5.3,6.5,7.5,9.8,0.222,0.232,0.254,0.237,0.0


##Drop weathercode column

In [None]:
# Remove the 'weathercode' column in place; re-running this cell raises because the column is gone.
solar_energy.drop('weathercode', axis=1, inplace=True)
solar_energy

Unnamed: 0_level_0,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,...,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,solaire
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-14 00:00:00,1.9,84,-0.6,-0.8,1015.2,1009.6,0.0,0.0,0.0,3,...,0.11,3.4,5.7,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 01:00:00,1.5,86,-0.6,-1.5,1015.3,1009.6,0.0,0.0,0.0,8,...,0.09,3.0,5.6,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 02:00:00,1.2,87,-0.7,-1.9,1015.3,1009.6,0.0,0.0,0.0,8,...,0.09,2.8,5.4,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 03:00:00,0.9,88,-0.9,-2.7,1015.2,1009.5,0.0,0.0,0.0,3,...,0.08,2.5,5.3,8.5,8.7,0.228,0.232,0.240,0.252,0.0
2013-03-14 04:00:00,0.4,89,-1.2,-3.5,1014.7,1009.0,0.0,0.0,0.0,2,...,0.07,2.1,5.1,8.4,8.7,0.228,0.232,0.240,0.252,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-06 19:00:00,6.7,71,1.8,4.8,1010.9,1005.4,0.0,0.0,0.0,54,...,0.29,6.9,6.6,7.5,9.8,0.222,0.233,0.254,0.237,62.0
2023-03-06 20:00:00,6.4,73,1.9,3.7,1011.1,1005.6,0.0,0.0,0.0,56,...,0.26,6.4,6.6,7.5,9.8,0.222,0.233,0.254,0.237,0.0
2023-03-06 21:00:00,5.3,78,1.7,2.1,1011.0,1005.4,0.0,0.0,0.0,41,...,0.20,5.8,6.6,7.5,9.8,0.222,0.232,0.254,0.237,0.0
2023-03-06 22:00:00,4.4,81,1.5,1.3,1010.9,1005.3,0.0,0.0,0.0,23,...,0.16,5.3,6.5,7.5,9.8,0.222,0.232,0.254,0.237,0.0


##Check for NaN and Outliers

In [None]:
import numpy as np
from scipy import stats

In [None]:
# Show every row that has a missing value in at least one column.
solar_energy[solar_energy.isna().any(axis=1)]

Unnamed: 0_level_0,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,...,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,solaire
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-30 03:00:00,14.5,86,12.2,13.1,1016.3,1010.9,0.0,0.0,0.0,54,...,0.22,16.4,19.9,20.1,20.5,0.154,0.173,0.098,0.191,
2022-10-30 04:00:00,14.4,87,12.2,11.9,1015.4,1010.0,0.0,0.0,0.0,58,...,0.22,16.2,19.7,20.1,20.5,0.154,0.173,0.098,0.191,
2022-10-30 05:00:00,14.4,86,12.1,12.3,1015.6,1010.2,0.0,0.0,0.0,52,...,0.23,16.1,19.5,20.1,20.5,0.154,0.173,0.098,0.191,
2022-10-30 06:00:00,14.6,86,12.2,13.2,1016.8,1011.4,0.2,0.2,0.0,68,...,0.24,16.1,19.3,20.1,20.5,0.154,0.173,0.098,0.191,
2022-10-30 07:00:00,14.2,87,12.1,12.6,1017.5,1012.1,0.0,0.0,0.0,49,...,0.21,15.7,19.1,20.1,20.5,0.154,0.173,0.098,0.191,
2022-10-30 08:00:00,15.0,92,13.7,12.7,1017.8,1012.4,0.0,0.0,0.0,52,...,0.13,15.8,18.9,20.1,20.5,0.155,0.173,0.098,0.191,
2022-10-30 09:00:00,15.8,91,14.2,15.5,1018.7,1013.3,0.1,0.1,0.0,34,...,0.17,16.4,18.7,20.0,20.5,0.155,0.173,0.098,0.191,
2022-10-30 10:00:00,17.4,87,15.3,18.4,1019.0,1013.6,0.0,0.0,0.0,37,...,0.25,18.0,18.7,20.0,20.5,0.155,0.172,0.098,0.191,
2022-10-30 11:00:00,19.3,80,15.8,19.7,1019.9,1014.5,0.1,0.1,0.0,71,...,0.44,19.3,18.8,20.0,20.5,0.155,0.172,0.098,0.191,
2022-10-30 12:00:00,19.8,75,15.2,20.2,1019.7,1014.4,0.1,0.1,0.0,30,...,0.58,20.1,18.9,20.0,20.5,0.155,0.172,0.098,0.191,


In [None]:
# Discard every row that contains a missing value.
solar_energy = solar_energy.dropna()

In [None]:
# Preview the rows in which every column lies within 3 standard deviations of its mean.
# The result is only displayed, not assigned, so solar_energy itself is unchanged.
solar_energy[(np.abs(stats.zscore(solar_energy)) < 3).all(axis=1)]

Unnamed: 0_level_0,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,...,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,solaire
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-13 00:00:00,1.5,80,-1.7,-2.2,1003.8,998.2,0.0,0.0,0.0,100,...,0.1,4.5,7.7,9.0,8.7,0.2,0.2,0.2,0.3,0.0
2013-03-13 01:00:00,1.4,77,-2.3,-2.5,1004.0,998.4,0.0,0.0,0.0,100,...,0.2,4.3,7.5,9.0,8.7,0.2,0.2,0.2,0.3,0.0
2013-03-13 02:00:00,1.2,74,-3.0,-3.1,1004.0,998.4,0.0,0.0,0.0,100,...,0.2,4.1,7.2,9.0,8.7,0.2,0.2,0.2,0.3,0.0
2013-03-13 03:00:00,1.0,70,-3.9,-3.4,1004.0,998.4,0.0,0.0,0.0,100,...,0.2,3.9,7.0,9.0,8.7,0.2,0.2,0.2,0.3,0.0
2013-03-13 04:00:00,0.6,69,-4.4,-4.0,1004.1,998.5,0.0,0.0,0.0,100,...,0.2,3.7,6.8,8.9,8.7,0.2,0.2,0.2,0.3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-05 19:00:00,8.0,66,1.9,5.1,1013.4,1007.9,0.0,0.0,0.0,0,...,0.4,8.6,8.2,7.5,9.9,0.2,0.2,0.3,0.2,64.0
2023-03-05 20:00:00,6.9,70,1.9,2.8,1013.5,1007.9,0.0,0.0,0.0,0,...,0.3,7.6,8.1,7.5,9.9,0.2,0.2,0.3,0.2,0.0
2023-03-05 21:00:00,5.7,76,1.8,0.7,1013.4,1007.8,0.0,0.0,0.0,3,...,0.2,6.8,8.0,7.5,9.9,0.2,0.2,0.3,0.2,0.0
2023-03-05 22:00:00,4.6,81,1.6,-0.8,1013.2,1007.6,0.0,0.0,0.0,1,...,0.2,6.1,7.9,7.5,9.9,0.2,0.2,0.3,0.2,0.0


In [None]:
# Preview the rows whose solar production lies within 3 standard deviations of
# its mean. The target is selected by label: position 32 only points at
# 'solaire' when the weathercode column has been dropped exactly once.
# The result is only displayed, not assigned.
solar_energy[np.abs(stats.zscore(solar_energy['solaire'])) < 3]

Unnamed: 0_level_0,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,...,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,solaire
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


##Scale the data

In [None]:
# Target: the solar production column.
y_solar = solar_energy['solaire']

In [None]:
# Features: every remaining column once the target is removed.
X_solar = solar_energy.drop('solaire', axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column with a single fit_transform call. The
# previous version first bound the fitted scaler itself to X_solar_scaled and
# then overwrote it with the transformed values.
std_scaler = StandardScaler()

# fit_transform returns a bare array, so restore the time index and column names.
X_solar_scaled = pd.DataFrame(
    std_scaler.fit_transform(X_solar),
    index=X_solar.index,
    columns=X_solar.columns,
)

In [None]:
# Put the unscaled target back next to the scaled features, matching rows on the time index.
solar_energy_scaled = pd.merge(X_solar_scaled, y_solar, left_index=True, right_index=True)

##Save to csv

In [None]:
# Persist the scaled dataset. The previous version wrote the unscaled
# solar_energy frame into the file named solar_energy_scaled.csv.
solar_energy_scaled.to_csv('/content/drive/MyDrive/LeWagon/solar_energy_scaled.csv')

#Get Wind Power data

In [None]:
import pandas as pd
import requests
from datetime import datetime, date
from dateutil.relativedelta import relativedelta

class WeatherEnergy:
    """Download averaged hourly weather for three cities plus regional energy data.

    Weather is read from the Open-Meteo geocoding and archive endpoints for each
    of the three cities and averaged column by column. Energy production comes
    from the two ODRE eco2mix regional exports. ``merged`` joins the averaged
    weather with the wind production column on their timestamp.

    Args:
        limit: Sent as the ``limit`` query parameter of the ODRE exports.
        offset: Sent as the ``offset`` query parameter of the ODRE exports.
        refine: Region name, sent as ``refine=libelle_region:<refine>``.
        city1: Name of the first city whose weather is downloaded.
        city2: Name of the second city.
        city3: Name of the third city.
        years: Number of years before today at which the weather history starts.
    """

    GEOCODING_URL = 'https://geocoding-api.open-meteo.com/v1/search'
    ARCHIVE_URL = 'https://archive-api.open-meteo.com/v1/archive'

    # Historical export first (2013-2022), then the recent one (2022-today).
    ENERGY_URLS = (
        'https://odre.opendatasoft.com/api/v2/catalog/datasets/eco2mix-regional-cons-def/exports/json',
        'https://odre.opendatasoft.com/api/v2/catalog/datasets/eco2mix-regional-tr/exports/json',
    )

    # Production column kept by ``merged``.
    ENERGY_COLUMN = 'eolien'

    # Hourly variables requested from the weather archive.
    WEATHER_PARAMS = [
        'temperature_2m', 'relativehumidity_2m', 'dewpoint_2m',
        'apparent_temperature', 'pressure_msl', 'surface_pressure',
        'precipitation', 'rain', 'snowfall', 'cloudcover',
        'cloudcover_low', 'cloudcover_mid', 'cloudcover_high',
        'shortwave_radiation', 'direct_radiation', 'direct_normal_irradiance',
        'diffuse_radiation', 'windspeed_10m', 'windspeed_100m',
        'winddirection_10m', 'winddirection_100m', 'windgusts_10m',
        'et0_fao_evapotranspiration', 'weathercode', 'vapor_pressure_deficit',
        'soil_temperature_0_to_7cm', 'soil_temperature_7_to_28cm',
        'soil_temperature_28_to_100cm', 'soil_temperature_100_to_255cm',
        'soil_moisture_0_to_7cm', 'soil_moisture_7_to_28cm',
        'soil_moisture_28_to_100cm', 'soil_moisture_100_to_255cm',
    ]

    def __init__(self, limit, offset, refine, city1, city2, city3, years=10):
        self.city1 = city1
        self.city2 = city2
        self.city3 = city3
        self.years = years
        self.limit = limit
        self.offset = offset
        self.refine = refine

    def _city_names(self):
        """Return the three configured city names in order."""
        return [self.city1, self.city2, self.city3]

    @staticmethod
    def _get_json(url, params):
        """GET ``url`` with ``params`` and return the decoded JSON body.

        Raises:
            requests.HTTPError: If the server answers with an error status, so
                a failed request is reported as such instead of surfacing later
                as a missing key.
        """
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json()

    def _coordinates(self, city):
        """Return ``(latitude, longitude)`` of the first geocoding hit for ``city``.

        Raises:
            KeyError: If the geocoding service returns no result for ``city``.
        """
        results = self._get_json(self.GEOCODING_URL, {'name': city}).get('results')
        if not results:
            raise KeyError(f'no geocoding result for city {city!r}')
        return results[0]['latitude'], results[0]['longitude']

    def _date_range(self):
        """Return ``(start_date, end_date)`` as ``YYYY-MM-DD`` strings."""
        today = date.today()
        # Stop 8 days before today because the archive lags behind real time.
        # NOTE(review): the original comment said 9 days - confirm the lag.
        end_date = (today - relativedelta(days=8)).strftime('%Y-%m-%d')
        start_date = (today - relativedelta(years=self.years)).strftime('%Y-%m-%d')
        return start_date, end_date

    @staticmethod
    def _hourly_to_frame(payload, weather_params):
        """Turn the ``'hourly'`` section of an archive response into a frame.

        The result has one column per entry of ``weather_params`` and is indexed
        by the parsed ``time`` column.
        """
        frame = pd.DataFrame(payload['hourly'], columns=['time'] + list(weather_params))
        # The timestamps carry an hour component, so let pandas infer the
        # format: a date-only format string is rejected by pandas 2.x.
        frame['time'] = pd.to_datetime(frame['time'])
        return frame.set_index('time')

    def _fetch_city_weather(self, city, start_date, end_date):
        """Download the hourly weather of ``city`` between the two dates."""
        latitude, longitude = self._coordinates(city)
        payload = self._get_json(
            self.ARCHIVE_URL,
            {
                'latitude': latitude,
                'longitude': longitude,
                'start_date': start_date,
                'end_date': end_date,
                'hourly': self.WEATHER_PARAMS,
                'timezone': 'auto',
            },
        )
        return self._hourly_to_frame(payload, self.WEATHER_PARAMS)

    @staticmethod
    def _average_frames(frames):
        """Return the element-wise mean of ``frames``, aligned on their index."""
        total = frames[0]
        for frame in frames[1:]:
            total = total + frame
        return total / len(frames)

    def get_weather(self):
        """Return the hourly weather averaged over the three configured cities.

        Each city is geocoded by its own name. The previous version looked up
        ``city1`` three times, so the "average" was the weather of one city.
        The history runs from ``years`` years ago to 8 days before today.

        Returns:
            DataFrame indexed by ``time`` with one averaged column per variable.

        Raises:
            requests.HTTPError: If one of the HTTP requests fails.
            KeyError: If a city cannot be geocoded.
        """
        start_date, end_date = self._date_range()
        frames = [
            self._fetch_city_weather(city, start_date, end_date)
            for city in self._city_names()
        ]

        # Global pandas display option kept from the original: floats are shown
        # with one decimal. It affects rendering only, not the stored values.
        pd.options.display.float_format = "{:,.1f}".format

        # NOTE(review): every column is averaged arithmetically, including the
        # wind directions in degrees and the weather code; a plain mean of such
        # values may not be meaningful - confirm before relying on them.
        return self._average_frames(frames)

    def get_energy_production(self):
        """Return the energy production records of the configured region.

        Both ODRE exports are downloaded with the same ``limit``, ``offset`` and
        region filter and stacked, keeping only the columns present in both.

        Returns:
            DataFrame sorted and indexed by ``time``, built from the ``date``
            and ``heure`` columns.

        Raises:
            requests.HTTPError: If one of the HTTP requests fails.
        """
        params = {
            'limit': self.limit,
            'offset': self.offset,
            'refine': f'libelle_region:{self.refine}',
        }
        frames = [pd.DataFrame(self._get_json(url, params)) for url in self.ENERGY_URLS]

        # join="inner" keeps the intersection of the columns. The row labels are
        # renumbered because they are replaced by the timestamp index below.
        energy_production_df = pd.concat(frames, sort=False, join="inner", ignore_index=True)

        # Build one timestamp from the separate date and hour columns so the
        # index is comparable with the weather index.
        timestamps = pd.to_datetime(
            energy_production_df['date'] + ' ' + energy_production_df['heure']
        )
        energy_production_df.insert(0, "time", timestamps)
        return energy_production_df.sort_values('time').set_index('time')

    def merged(self):
        """Join the averaged weather with the wind production on the timestamp.

        Returns:
            DataFrame holding every weather column plus ``eolien`` for the
            timestamps present in both sources.
        """
        weather_df = self.get_weather()
        energy_production_df = self.get_energy_production()
        return pd.merge(
            weather_df,
            energy_production_df[self.ENERGY_COLUMN],
            left_index=True,
            right_index=True,
        )

In [None]:
# Wind setup: energy for the 'Hauts-de-France' region, weather from three named cities, 10 years back.
# limit=-1 presumably asks the export for every record - TODO confirm against the ODRE API.
data1 = WeatherEnergy(limit=-1, offset=0, refine='Hauts-de-France', city1='Heudicourt', city2='Bucy-les-Pierrepont', city3='Riencourt', years=10)

In [None]:
# Download weather and production data (network access required) and join them on the timestamp.
wind_energy = data1.merged()
# Display the merged frame.
wind_energy

Unnamed: 0_level_0,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,...,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,eolien
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-14 00:00:00,-5.4,91.0,-6.7,-4.0,1012.4,998.3,0.0,0.0,0.0,22.0,...,0.0,1.4,2.1,4.7,5.7,0.4,0.4,0.4,0.4,133.0
2013-03-14 01:00:00,-5.8,92.0,-6.9,-4.1,1012.7,998.6,0.0,0.0,0.0,21.0,...,0.0,1.3,2.1,4.7,5.7,0.4,0.4,0.4,0.4,139.0
2013-03-14 02:00:00,-6.2,93.0,-7.1,-4.3,1013.1,999.0,0.0,0.0,0.0,23.0,...,0.0,1.1,2.1,4.7,5.7,0.4,0.4,0.4,0.4,125.0
2013-03-14 03:00:00,-6.3,94.0,-7.0,-4.5,1013.2,999.1,0.0,0.0,0.0,14.0,...,0.0,1.0,2.1,4.6,5.7,0.4,0.4,0.4,0.4,122.0
2013-03-14 04:00:00,-6.6,96.0,-7.1,-4.4,1013.2,999.1,0.0,0.0,0.0,10.0,...,0.0,0.9,2.0,4.6,5.7,0.4,0.4,0.4,0.4,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-06 19:00:00,5.7,68.0,0.3,2.5,1007.4,993.7,0.0,0.0,0.0,100.0,...,0.3,5.5,5.0,5.4,7.6,0.3,0.3,0.4,0.4,1797.0
2023-03-06 20:00:00,4.7,80.0,1.6,1.7,1007.3,993.6,0.0,0.0,0.0,100.0,...,0.2,4.9,5.0,5.4,7.6,0.3,0.3,0.4,0.4,1768.0
2023-03-06 21:00:00,4.3,82.0,1.4,1.4,1007.1,993.4,0.0,0.0,0.0,100.0,...,0.1,4.5,5.0,5.4,7.6,0.3,0.3,0.4,0.4,1974.0
2023-03-06 22:00:00,3.9,83.0,1.2,0.7,1006.8,993.1,0.0,0.0,0.0,62.0,...,0.1,4.2,5.0,5.4,7.6,0.3,0.3,0.4,0.4,1928.0


##Drop the weathercode column

In [None]:
# Remove the 'weathercode' column in place; re-running this cell raises because the column is gone.
wind_energy.drop('weathercode', axis=1, inplace=True)

##Check for NaN and Outliers

In [None]:
# Show every row that has a missing value in at least one column.
wind_energy[wind_energy.isna().any(axis=1)]

Unnamed: 0_level_0,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,...,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,eolien
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-30 03:00:00,15.1,87.0,12.9,13.3,1016.9,1003.4,0.0,0.0,0.0,63.0,...,0.2,14.2,16.3,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 04:00:00,13.0,94.0,12.1,13.1,1016.3,1002.8,0.0,0.0,0.0,65.0,...,0.1,14.0,16.2,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 05:00:00,12.1,97.0,11.6,11.7,1016.0,1002.5,0.0,0.0,0.0,82.0,...,0.1,13.6,16.0,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 06:00:00,11.7,97.0,11.3,11.7,1016.0,1002.5,0.0,0.0,0.0,79.0,...,0.0,13.3,15.8,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 07:00:00,11.8,97.0,11.4,11.1,1016.1,1002.6,0.0,0.0,0.0,81.0,...,0.0,13.0,15.6,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 08:00:00,13.0,94.0,12.0,11.6,1015.8,1002.3,0.4,0.4,0.0,84.0,...,0.1,13.4,15.5,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 09:00:00,13.7,94.0,12.7,12.3,1016.8,1003.3,0.1,0.1,0.0,96.0,...,0.1,13.9,15.4,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 10:00:00,15.2,88.0,13.3,13.1,1017.1,1003.7,0.0,0.0,0.0,100.0,...,0.2,14.9,15.4,16.0,16.5,0.3,0.3,0.3,0.3,
2022-10-30 11:00:00,15.8,83.0,13.0,14.5,1018.5,1005.1,0.2,0.2,0.0,30.0,...,0.3,15.3,15.4,16.0,16.5,0.4,0.3,0.3,0.3,
2022-10-30 12:00:00,16.2,80.0,12.7,14.3,1019.1,1005.7,0.1,0.1,0.0,36.0,...,0.4,15.7,15.4,16.0,16.5,0.4,0.3,0.3,0.3,


In [None]:
# Discard every row that contains a missing value.
wind_energy = wind_energy.dropna()

Convert wind direction 10m to vector

In [None]:
# Convert degrees to radians and store the values into wd_rad
wd_rad_10 = wind_energy.pop('winddirection_10m')*np.pi / 180

# Calculate the wind x and y components and store then in two new columns
# `Wx` and `Wy`

wv_10 = wind_energy.pop('windspeed_10m')
wind_energy['Wx_10'] = wv_10*np.cos(wd_rad_10)
wind_energy['Wy_10'] = wv_10*np.sin(wd_rad_10)


Convert wind direction 100m to vector

In [None]:
# Convert degrees to radians and store the values into wd_rad
wd_rad_100 = wind_energy.pop('winddirection_100m')*np.pi / 180

# Calculate the wind x and y components and store then in two new columns
# `Wx` and `Wy`

wv_100 = wind_energy.pop('windspeed_100m')
wind_energy['Wx_100'] = wv_10*np.cos(wd_rad_100)
wind_energy['Wy_100'] = wv_10*np.sin(wd_rad_100)


##Scale the data

In [None]:
# Target: the wind production column.
y_wind = wind_energy['eolien']

In [None]:
# Features: every remaining column once the target is removed.
X_wind = wind_energy.drop('eolien', axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column with a single fit_transform call. The
# previous version first bound the fitted scaler itself to X_wind_scaled and
# then overwrote it with the transformed values.
std_scaler = StandardScaler()

# fit_transform returns a bare array, so restore the time index and column names.
X_wind_scaled = pd.DataFrame(
    std_scaler.fit_transform(X_wind),
    index=X_wind.index,
    columns=X_wind.columns,
)

In [None]:
# Put the unscaled target back next to the scaled features, matching rows on the time index.
wind_energy_scaled = pd.merge(X_wind_scaled, y_wind, left_index=True, right_index=True)

##Save to csv

In [None]:
# Persist the scaled wind dataset to a relative path, i.e. the current working directory.
# NOTE(review): the solar dataset is saved under /content/drive/MyDrive/LeWagon/ - confirm which location is intended.
wind_energy_scaled.to_csv('wind_energy_scaled.csv')

In [None]:
# Reload the scaled wind dataset from the path below; no index column is given,
# so 'time' comes back as an ordinary column.
# NOTE(review): the save cell writes 'wind_energy_scaled.csv' to the working directory,
# not to this path - confirm the file was copied here.
wind_energy_scaled = pd.read_csv("/content/drive/MyDrive/LeWagon/wind_energy_scaled.csv")

In [None]:
# Data manipulation
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

# Data Visualiation
import matplotlib.pyplot as plt
import seaborn as sns

# System
import os

# Deep Learning
import tensorflow as tf

In [None]:
from typing import Dict, List, Tuple, Sequence

In [None]:
# Display the reloaded dataframe.
wind_energy_scaled

Unnamed: 0,time,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,eolien
0,2013-03-13 00:00:00,-2.421369,0.025938,-2.914182,-2.384180,-1.054767,-1.139810,-0.262839,-0.260893,-0.039484,-0.155325,-0.902583,0.703199,0.246731,-0.653208,-0.525716,-0.631120,-0.717029,0.793214,0.969348,-1.649471,-1.642269,0.621635,-0.609462,-0.647985,-1.651989,-1.732444,-1.446414,-1.715239,0.583833,0.745491,1.057895,1.276617,533.0
1,2013-03-13 01:00:00,-2.525290,0.025938,-3.027487,-2.421353,-1.044193,-1.129068,-0.262839,-0.260893,-0.039484,-0.861529,-0.980592,-0.101030,-0.291642,-0.653208,-0.525716,-0.631120,-0.717029,0.625955,0.871060,-1.639567,-1.622364,0.452294,-0.609462,-0.670159,-1.651989,-1.732444,-1.446414,-1.715239,0.570448,0.745491,1.057895,1.276617,451.0
2,2013-03-13 02:00:00,-2.629212,0.091235,-3.159676,-2.433744,-1.012471,-1.107584,-0.262839,-0.260893,-0.039484,-1.268954,-1.006595,-0.560590,-0.658715,-0.653208,-0.525716,-0.631120,-0.717029,0.200205,0.625341,-1.659375,-1.632316,0.229071,-0.609462,-0.670159,-1.651989,-1.732444,-1.446414,-1.715239,0.570448,0.745491,1.057895,1.276617,330.0
3,2013-03-13 03:00:00,-2.762825,0.091235,-3.291866,-2.557652,-0.970175,-1.064616,-0.262839,-0.260893,-0.039484,-1.486248,-1.006595,-0.847815,-0.854487,-0.653208,-0.525716,-0.631120,-0.717029,-0.164723,0.418936,-1.718801,-1.652222,-0.071125,-0.688026,-0.692333,-1.651989,-1.732444,-1.467968,-1.715239,0.570448,0.745491,1.057895,1.276617,267.0
4,2013-03-13 04:00:00,-2.896438,0.091235,-3.461824,-2.693952,-1.001897,-1.107584,-0.262839,-0.260893,-0.039484,-1.513409,-1.006595,-0.847815,-0.903430,-0.653208,-0.525716,-0.631120,-0.717029,-0.362392,0.261675,-1.778227,-1.701985,-0.379018,-0.688026,-0.692333,-1.651989,-1.732444,-1.467968,-1.715239,0.570448,0.745491,1.057895,1.276617,216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87454,2023-03-05 19:00:00,-0.936776,-0.169955,-1.176835,-0.847712,-0.240570,-0.269706,-0.262839,-0.260893,-0.039484,1.202759,1.385696,-0.445700,1.445834,-0.614765,-0.519146,-0.503557,-0.627254,-1.305124,-1.153671,0.816706,0.905618,-1.456646,-0.530898,-0.381895,-0.986979,-1.160014,-1.403307,-1.176334,-0.232618,-0.023749,0.636164,1.058620,218.0
87455,2023-03-05 20:00:00,-1.025851,0.352425,-1.025761,-0.934448,-0.261718,-0.301933,0.059654,0.063264,-0.039484,1.202759,1.541715,-0.388255,1.494777,-0.653208,-0.525716,-0.631120,-0.717029,-1.168276,-0.976753,0.965271,1.064861,-1.241120,-0.688026,-0.537114,-1.047435,-1.160014,-1.403307,-1.176334,-0.232618,-0.023749,0.636164,1.058620,209.0
87456,2023-03-05 21:00:00,-1.085235,0.548318,-1.006877,-0.934448,-0.293440,-0.323417,0.059654,0.063264,-0.039484,1.202759,1.593722,-0.273365,1.494777,-0.653208,-0.525716,-0.631120,-0.717029,-0.788142,-0.691718,1.133645,1.194246,-0.848556,-0.688026,-0.603637,-1.092776,-1.160014,-1.403307,-1.176334,-0.219234,-0.023749,0.636164,1.058620,166.0
87457,2023-03-05 22:00:00,-1.174311,0.678913,-1.063529,-1.021184,-0.356883,-0.398611,0.059654,0.063264,-0.039484,1.202759,1.593722,0.846812,1.470306,-0.653208,-0.525716,-0.631120,-0.717029,-0.423214,-0.495142,1.193071,1.234056,-0.486781,-0.688026,-0.647985,-1.123004,-1.177360,-1.403307,-1.176334,-0.205849,-0.023749,0.636164,1.058620,168.0


In [None]:
# Let's define the global variables of our dataset
# TARGET is the name of the column to predict; it is read by get_Xi_yi below.
TARGET = 'eolien'
# NOTE(review): N_TARGETS and N_FEATURES are not referenced in the cells shown
# here; presumably they are used later for the model's shapes — confirm that
# 32 matches the number of feature columns actually fed to the model.
N_TARGETS = 1
N_FEATURES = 32

In [None]:
# --------------------------------------------------- #
# Let's consider FOLDS with a length of 7 years       #
# (hourly rows, 365 days per year)                    #
# --------------------------------------------------- #

FOLD_LENGTH = 24*365 * 7 # every hour
                        # 7 years

# --------------------------------------------------- #
# Let's consider FOLDS starting every day             #
# --------------------------------------------------- #

FOLD_STRIDE = 24 # every 24h


# --------------------------------------------------- #
# Let's consider a train-test-split ratio of 0.66     #
# (roughly 2/3 of each fold goes to the train set)    #
# --------------------------------------------------- #

TRAIN_TEST_RATIO = 0.66

In [None]:
def get_folds(
    df: pd.DataFrame,
    fold_length: int,
    fold_stride: int) -> List[pd.DataFrame]:
    """Slide through a time-series dataframe and cut it into folds.

    The dataframe of shape (n_timesteps, n_features) is cut into folds
    - of equal `fold_length`
    - using `fold_stride` between the start of each fold

    Only complete folds are returned: a fold that would run past the last
    row of `df` is not created.

    Args:
        df (pd.DataFrame): Overall dataframe
        fold_length (int): How long each fold should be in rows
        fold_stride (int): How many timesteps to move forward between taking each fold

    Returns:
        List[pd.DataFrame]: A list where each fold is a dataframe within.
        The list is empty when `df` has fewer than `fold_length` rows.

    Raises:
        ValueError: If `fold_stride` is 0 (raised by `range`).
    """
    # $CHALLENGIFY_BEGIN

    # Measure the dataframe that was passed in, not a notebook-level global,
    # so the function works for any input.
    n_rows = len(df)

    # The last valid start is the one whose fold ends exactly on the last row,
    # hence the `+ 1` on the exclusive upper bound of the range.
    return [
        df.iloc[start:start + fold_length, :]
        for start in range(0, n_rows - fold_length + 1, fold_stride)
    ]

In [None]:
# Cut the whole dataset into folds using the constants defined above, then
# report how many folds were produced and the shape of the first one.
folds = get_folds(wind_energy_scaled, FOLD_LENGTH, FOLD_STRIDE)

print(f'The function generated {len(folds)} folds.')
print(f'Each fold has a shape equal to {folds[0].shape}.')

The function generated 1090 folds.
Each fold has a shape equal to (61320, 34).


In [None]:
# Take the first fold and display it.
fold = folds[0]
fold

Unnamed: 0,time,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,eolien
0,2013-03-13 00:00:00,-2.421369,0.025938,-2.914182,-2.384180,-1.054767,-1.139810,-0.262839,-0.260893,-0.039484,-0.155325,-0.902583,0.703199,0.246731,-0.653208,-0.525716,-0.63112,-0.717029,0.793214,0.969348,-1.649471,-1.642269,0.621635,-0.609462,-0.647985,-1.651989,-1.732444,-1.446414,-1.715239,0.583833,0.745491,1.057895,1.276617,533.0
1,2013-03-13 01:00:00,-2.525290,0.025938,-3.027487,-2.421353,-1.044193,-1.129068,-0.262839,-0.260893,-0.039484,-0.861529,-0.980592,-0.101030,-0.291642,-0.653208,-0.525716,-0.63112,-0.717029,0.625955,0.871060,-1.639567,-1.622364,0.452294,-0.609462,-0.670159,-1.651989,-1.732444,-1.446414,-1.715239,0.570448,0.745491,1.057895,1.276617,451.0
2,2013-03-13 02:00:00,-2.629212,0.091235,-3.159676,-2.433744,-1.012471,-1.107584,-0.262839,-0.260893,-0.039484,-1.268954,-1.006595,-0.560590,-0.658715,-0.653208,-0.525716,-0.63112,-0.717029,0.200205,0.625341,-1.659375,-1.632316,0.229071,-0.609462,-0.670159,-1.651989,-1.732444,-1.446414,-1.715239,0.570448,0.745491,1.057895,1.276617,330.0
3,2013-03-13 03:00:00,-2.762825,0.091235,-3.291866,-2.557652,-0.970175,-1.064616,-0.262839,-0.260893,-0.039484,-1.486248,-1.006595,-0.847815,-0.854487,-0.653208,-0.525716,-0.63112,-0.717029,-0.164723,0.418936,-1.718801,-1.652222,-0.071125,-0.688026,-0.692333,-1.651989,-1.732444,-1.467968,-1.715239,0.570448,0.745491,1.057895,1.276617,267.0
4,2013-03-13 04:00:00,-2.896438,0.091235,-3.461824,-2.693952,-1.001897,-1.107584,-0.262839,-0.260893,-0.039484,-1.513409,-1.006595,-0.847815,-0.903430,-0.653208,-0.525716,-0.63112,-0.717029,-0.362392,0.261675,-1.778227,-1.701985,-0.379018,-0.688026,-0.692333,-1.651989,-1.732444,-1.467968,-1.715239,0.570448,0.745491,1.057895,1.276617,216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61315,2020-03-10 19:00:00,0.146977,0.678913,0.598281,0.131168,-0.314588,-0.301933,-0.262839,-0.260893,-0.039484,1.202759,1.593722,0.301085,-0.805544,-0.633987,-0.525716,-0.63112,-0.665729,1.675124,1.637706,0.549289,0.557274,1.314396,-0.452334,-0.492766,-0.276628,-0.552892,-0.972233,-1.034517,1.105826,1.201819,1.552338,2.410196,3639.0
61316,2020-03-10 20:00:00,0.176669,0.678913,0.654933,0.106386,-0.251144,-0.248222,-0.262839,-0.260893,-0.039484,1.202759,1.593722,0.128750,-0.903430,-0.653208,-0.525716,-0.63112,-0.717029,1.614302,1.608220,0.509672,0.517463,1.368277,-0.609462,-0.492766,-0.261514,-0.552892,-0.972233,-1.034517,1.105826,1.201819,1.552338,2.410196,3428.0
61317,2020-03-10 21:00:00,0.146977,0.809508,0.654933,0.093995,-0.240570,-0.237480,0.059654,0.063264,-0.039484,1.202759,1.593722,-0.330810,-0.903430,-0.653208,-0.525716,-0.63112,-0.717029,1.675124,1.667193,0.499768,0.507511,1.391369,-0.609462,-0.537114,-0.261514,-0.535545,-0.972233,-1.034517,1.105826,1.201819,1.552338,2.410196,3257.0
61318,2020-03-10 22:00:00,0.117285,0.809508,0.617165,0.081604,-0.198274,-0.194512,-0.262839,-0.260893,-0.039484,0.958304,1.593722,-0.819093,-0.903430,-0.653208,-0.525716,-0.63112,-0.717029,1.720740,1.706508,0.499768,0.517463,1.452948,-0.609462,-0.559288,-0.261514,-0.518199,-0.972233,-1.062881,1.105826,1.188781,1.552338,2.410196,3334.0


In [None]:
# Number of rows in each input sequence X_i: 24 rows per day over 14 days.
INPUT_LENGTH = 24 * 14 # for 14 days

In [None]:
def train_test_split(fold:pd.DataFrame,
                     train_test_ratio: float,
                     input_length: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a fold chronologically into a train and a test dataframe.

    - fold_train contains all the timesteps until round(train_test_ratio * len(fold))
    - fold_test starts `input_length` rows before the end of the train set and
      runs to the end of the fold, so the two dataframes overlap by
      `input_length` rows

    Args:
        fold (pd.DataFrame): A fold of timesteps
        train_test_ratio (float): The ratio between train and test 0-1
        input_length (int): How long each X_i will be

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: A tuple of two dataframes (fold_train, fold_test)
    """
    # $CHALLENGIFY_BEGIN

    # TRAIN SET
    # ======================
    last_train_idx = round(train_test_ratio * len(fold))
    fold_train = fold.iloc[0:last_train_idx, :]

    # TEST SET
    # ======================
    # Clamp at 0: when input_length exceeds the train size the start would be
    # negative, and iloc would read it as an offset from the END of the fold,
    # silently dropping most of the test rows.
    first_test_idx = max(0, last_train_idx - input_length)
    fold_test = fold.iloc[first_test_idx:, :]

    return (fold_train, fold_test)

In [None]:
# Display the configured train/test split ratio.
TRAIN_TEST_RATIO

0.66

In [None]:
# Split the selected fold into its train and test parts.
(fold_train, fold_test) = train_test_split(fold, TRAIN_TEST_RATIO, INPUT_LENGTH)

In [None]:
def get_Xi_yi(
    fold:pd.DataFrame, 
    input_length:int, 
    output_length:int) -> Tuple[pd.DataFrame]:
    """Draw one random (X_i, y_i) sequence out of a fold.

    A start row is picked at random with `np.random.randint` among all the
    positions that leave room for `input_length + output_length` rows.
    X_i is the block of `input_length` rows beginning at that start, with
    every column of the fold. y_i is the block of `output_length` rows that
    immediately follows, restricted to the `TARGET` column.

    Args:
        fold (pd.DataFrame): A single fold
        input_length (int): How long each X_i should be
        output_length (int): How long each y_i should be

    Returns:
        Tuple[pd.DataFrame]: A tuple of two dataframes (X_i, y_i)
    """
    # $CHALLENGIFY_BEGIN
    # Total number of rows consumed by one (X_i, y_i) pair
    window = input_length + output_length

    # randint's upper bound is exclusive, so the `+ 1` keeps the very last
    # admissible start position reachable
    start = np.random.randint(0, len(fold) - window + 1)

    # Row where the inputs stop and the targets begin
    boundary = start + input_length

    X_i = fold.iloc[start:boundary]
    y_i = fold.iloc[boundary:boundary + output_length][[TARGET]]

    return (X_i, y_i)

In [None]:
# Draw one random (X_i, y_i) sequence from the train part and one from the
# test part of the fold.
# NOTE(review): OUTPUT_LENGTH is not assigned in any of the cells shown above,
# which presumably explains the NameError recorded below this cell — define it
# before running.
X_train_i, y_train_i = get_Xi_yi(fold_train, INPUT_LENGTH, OUTPUT_LENGTH)
X_test_i, y_test_i = get_Xi_yi(fold_test, INPUT_LENGTH, OUTPUT_LENGTH)

NameError: ignored