In [74]:
import sweetviz as sv
import pandas as pd
import seaborn as sns
import dataprep.eda as eda
import matplotlib.pyplot as plt
from datetime import datetime

# Read data

In [122]:
# Feature tables: for each location folder (A, B, C) read the "estimated"
# file first and the "observed" file second, all with the pyarrow engine.
A_est, A_obs = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        'data/A/parquet/X_train_estimated.parquet',
        'data/A/parquet/X_train_observed.parquet',
    )
)
B_est, B_obs = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        'data/B/parquet/X_train_estimated.parquet',
        'data/B/parquet/X_train_observed.parquet',
    )
)
C_est, C_obs = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        'data/C/parquet/X_train_estimated.parquet',
        'data/C/parquet/X_train_observed.parquet',
    )
)

# Target tables: one train_targets file per location folder.
A_target, B_target, C_target = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        'data/A/parquet/train_targets.parquet',
        'data/B/parquet/train_targets.parquet',
        'data/C/parquet/train_targets.parquet',
    )
)

### Create training data

In [123]:
# Build one training frame per location: the observed rows come first and the
# estimated rows (with the estimated frame's first column left out) are
# appended below them along the row axis.
A_train, B_train, C_train = (
    pd.concat([observed, estimated.iloc[:, 1:]], axis=0)
    for observed, estimated in (
        (A_obs, A_est),
        (B_obs, B_est),
        (C_obs, C_est),
    )
)

#### Convert index to datetime

In [124]:
def convert_timestamp_to_datetime(timestamp):
    """Return the result of ``timestamp.to_pydatetime()``.

    Parameters
    ----------
    timestamp
        Object expected to expose a ``to_pydatetime()`` method
        (e.g. a pandas ``Timestamp``).

    Returns
    -------
    The value produced by ``timestamp.to_pydatetime()``, or ``None`` when an
    ``AttributeError`` is raised (for instance because the argument has no
    such method). In that case the error is printed as ``Error: <message>``
    instead of being propagated.
    """
    converted = None
    try:
        converted = timestamp.to_pydatetime()
    except AttributeError as err:
        # Best-effort conversion: report the problem and fall through to None.
        print(f"Error: {err}")
    return converted

In [125]:
# For every training frame: run each 'date_forecast' value through
# convert_timestamp_to_datetime, then promote that column to the index.
# Both steps modify the frame in place.
for train_frame in (A_train, B_train, C_train):
    train_frame['date_forecast'] = train_frame['date_forecast'].apply(
        convert_timestamp_to_datetime
    )
    train_frame.set_index('date_forecast', inplace=True)

In [126]:
# For every target frame: run each 'time' value through
# convert_timestamp_to_datetime, then promote that column to the index.
# Both steps modify the frame in place.
for target_frame in (A_target, B_target, C_target):
    target_frame['time'] = target_frame['time'].apply(
        convert_timestamp_to_datetime
    )
    target_frame.set_index('time', inplace=True)

## Exploratory Data Analysis (EDA) using Sweetviz and dataprep.eda
### Compare the observed and estimated datasets

In [None]:
# Build one Sweetviz comparison report per location, pairing the estimated
# frame (first argument) with the observed frame (second argument).
SV_A_report, SV_B_report, SV_C_report = (
    sv.compare([estimated, estimated_label], [observed, observed_label])
    for estimated, estimated_label, observed, observed_label in (
        (A_est, "A: X train estimated", A_obs, "A: X train observed"),
        (B_est, "B: X train estimated", B_obs, "B: X train observed"),
        (C_est, "C: X train estimated", C_obs, "C: X train observed"),
    )
)

## Dataprep EDA

### Plot correlation

In [None]:
# Generate one dataprep EDA report per input frame: the three estimated
# frames first, then the three observed frames.
(
    DP_A_report_est,
    DP_B_report_est,
    DP_C_report_est,
    DP_A_report_obs,
    DP_B_report_obs,
    DP_C_report_obs,
) = (
    eda.create_report(feature_frame)
    for feature_frame in (A_est, B_est, C_est, A_obs, B_obs, C_obs)
)

In [None]:
# Write each dataprep report to an HTML file named after its variable.
for dp_report, html_name in (
    (DP_A_report_est, 'DP_A_report_est.html'),
    (DP_B_report_est, 'DP_B_report_est.html'),
    (DP_C_report_est, 'DP_C_report_est.html'),
    (DP_A_report_obs, 'DP_A_report_obs.html'),
    (DP_B_report_obs, 'DP_B_report_obs.html'),
    (DP_C_report_obs, 'DP_C_report_obs.html'),
):
    dp_report.save(html_name)