# Data Preparation

In [1]:
import os, re
from dotenv import dotenv_values

from sqlalchemy import create_engine, text

from datetime import date,datetime,timedelta
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

import requests

In [2]:
# Settings from .env file

settings = dotenv_values()

# SQL CONFIG

settings = dotenv_values() # Loads settings from .env file
ROOT='..' # relative path to the root of the project

db_uri = (
    f"mysql+pymysql://{settings['SQL_USER']}:{settings['SQL_PWD']}"
    f"@{settings['SQL_HOST']}/{settings['SQL_DB']}"
    f"?ssl_ca={os.path.join(ROOT,settings['SQL_SSL_CA'])}"
    f"&ssl_cert={os.path.join(ROOT,settings['SQL_SSL_CERT'])}"
    f"&ssl_key={os.path.join(ROOT,settings['SQL_SSL_KEY'])}"
    f"&ssl_check_hostname=false"
)

engine = create_engine(db_uri,echo=False, future=False)

In [24]:
# Test Station ID
station_id = 6000990

# list of the 60 days before today (for History data)
history_days = pd.date_range(date.today() - timedelta(60),periods=60)

# list of today + 9 following days (for Forecast data)
forecast_days = pd.date_range(date.today(),periods=10)
forecast_days

# all days
days = pd.date_range(date.today() - timedelta(60),periods=70)

In [68]:
# Weather History dataframe (weather history, from Cloud SQL)

query = """
SELECT day, station_id, temperature, precipitation, maxwind
FROM weather
WHERE station_id = {}
AND day BETWEEN '{}' AND '{}' ;
"""

# We can format here, injection safe, pd.read_sql_query() used
f_query = query.format(
    station_id,
    history_days[0].strftime('%Y-%m-%d'),
    history_days[-1].strftime('%Y-%m-%d')
)

history_weather = pd.read_sql_query(f_query,engine)
history_weather.day = pd.to_datetime(history_weather.day)


In [69]:
### TODO WEATHER FORECAST

In [65]:
# station dataframe (from Cloud SQL)

query = "SELECT * FROM stations ;"
stations = pd.read_sql_query(query,engine)
stations.set_index('station_id',inplace=True,drop=True)
stations.head()

Unnamed: 0_level_0,label,alt,river_id,river_label,mean_nitrate,lat,lon
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6000990,BELRUPT,292,U---0000,La Saône,4.513699,48.090273,6.101941
6000993,JONVELLE,230,U---0000,La Saône,9.433333,47.93638,5.923536
6000998,MONTHUREUX-SUR-SAONE 2,241,U---0000,La Saône,6.428571,48.018251,5.941492
6001000,CENDRECOURT,213,U---0000,La Saône,9.233333,47.840261,5.917378
6002500,PORT-SUR-SAONE,208,U---0000,La Saône,7.489474,47.691078,6.039292


In [66]:
# Data Constitution

# First, adding days
data = pd.DataFrame(days,columns=['day'])

# Adding mean_nitrate
data['mean_nitrate'] = stations.loc[station_id,'mean_nitrate']

# Adding (history) weather
data = pd.merge(data,history_weather,on='day',how='left')

In [67]:
data

Unnamed: 0,day,mean_nitrate,station_id,temperature,precipitation,maxwind
0,2021-12-11,4.513699,6000990.0,1.3,1.2,18.4
1,2021-12-12,4.513699,6000990.0,0.5,1.0,13.0
2,2021-12-13,4.513699,6000990.0,3.7,0.0,4.3
3,2021-12-14,4.513699,6000990.0,5.1,0.0,4.3
4,2021-12-15,4.513699,6000990.0,4.6,0.0,5.8
...,...,...,...,...,...,...
65,2022-02-14,4.513699,,,,
66,2022-02-15,4.513699,,,,
67,2022-02-16,4.513699,,,,
68,2022-02-17,4.513699,,,,
