# 0 - Construction de la base de données

## Importation des modules

In [1]:
# Modules de base
import os
import json
import pandas as pd
import sys
import yaml

# Ajout du chemin
sys.path.append('..')

# Importation des modules ad hoc
from dashboard_template_database.builders.schema import SchemaBuilder
from dashboard_template_database.builders.tables import DuckdbTablesBuilder
from dashboard_template_database.storage.loader import Loader

# Load the configuration file.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
with open("../config.yaml", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Load the parameters file; `labels` is later passed as `column_labels`
# to the schema builder.
with open("../parameters/labels.json", encoding="utf-8") as file:
    labels = json.load(file)


## Importation des données

In [2]:
# Instantiate the project loader.
loader = Loader()

# Location of the input file, relative to the parent directory.
input_path = os.path.join('../', config['INPUT_DATA'])

# Read the raw dataset.
df_origin = loader.load(filepath=input_path)

# Convert the 'date' column to pandas datetimes.
df_origin['date'] = pd.to_datetime(df_origin['date'])

# Preview the first rows.
df_origin.head()

Unnamed: 0,indicator,country,date,value,kind,horizon,week,model,training
0,Gross Domestic Product,France,1960-04-01,0.37571,observed,,,,
1,Gross Domestic Product,France,1960-07-01,0.748561,observed,,,,
2,Gross Domestic Product,France,1960-10-01,1.185218,observed,,,,
3,Gross Domestic Product,France,1961-01-01,1.608374,observed,,,,
4,Gross Domestic Product,France,1961-04-01,1.600329,observed,,,,


## Construction du schéma

### Initialisation de la classe

In [3]:
# Create the schema builder on the raw dataframe, using the categorical
# threshold read from the configuration file.
schema_builder = SchemaBuilder(
    df=df_origin,
    categorical_threshold=config['THRESHOLD'],
)

### Construction des méta-données

In [4]:
# Build the metadata table, passing the labels loaded from the parameters file.
df_metadata = schema_builder.create_metadata_table(
    column_labels=labels,
)

# Preview the first rows of the metadata table.
df_metadata.head()

2025-03-22 17:15:34,861 - INFO - Successfully extracted meta-data from column 'indicator'
2025-03-22 17:15:34,895 - INFO - The column 'indicator' is of type 'object' and the number of modalities 2 satisfies the categorical threshold criteria 200
2025-03-22 17:15:34,895 - INFO - Successfully extracted meta-data from column 'country'
2025-03-22 17:15:34,928 - INFO - The column 'country' is of type 'object' and the number of modalities 6 satisfies the categorical threshold criteria 200
2025-03-22 17:15:34,928 - INFO - Successfully extracted meta-data from column 'date'
2025-03-22 17:15:34,928 - INFO - Successfully extracted meta-data from column 'value'
2025-03-22 17:15:34,928 - INFO - Successfully extracted meta-data from column 'kind'
2025-03-22 17:15:34,960 - INFO - The column 'kind' is of type 'object' and the number of modalities 4 satisfies the categorical threshold criteria 200
2025-03-22 17:15:34,961 - INFO - Successfully extracted meta-data from column 'horizon'
2025-03-22 17:15:

Unnamed: 0,name,label,python_type,sql_type,is_categorical
0,country,Country,object,VARCHAR,True
1,date,Date,datetime64[ns],TIMESTAMP,False
2,horizon,Horizon,float64,DOUBLE,False
3,indicator,Indicator,object,VARCHAR,True
4,kind,Kind,object,VARCHAR,True


### Construction des tables de dimensions

In [5]:
# Build the dimension tables; the result is indexed by column name below.
dimension_tables = schema_builder.create_dimension_tables(
    column_labels=labels,
)

# Preview the dimension table of the 'indicator' column.
preview_column = 'indicator'
dimension_tables[preview_column].head()

2025-03-22 17:15:35,015 - INFO - Successfully extracted meta-data from column 'indicator'
2025-03-22 17:15:35,047 - INFO - The column 'indicator' is of type 'object' and the number of modalities 2 satisfies the categorical threshold criteria 200
2025-03-22 17:15:35,057 - INFO - Successfully extracted meta-data from column 'country'
2025-03-22 17:15:35,084 - INFO - The column 'country' is of type 'object' and the number of modalities 6 satisfies the categorical threshold criteria 200
2025-03-22 17:15:35,084 - INFO - Successfully extracted meta-data from column 'date'
2025-03-22 17:15:35,084 - INFO - Successfully extracted meta-data from column 'value'
2025-03-22 17:15:35,084 - INFO - Successfully extracted meta-data from column 'kind'
2025-03-22 17:15:35,113 - INFO - The column 'kind' is of type 'object' and the number of modalities 4 satisfies the categorical threshold criteria 200
2025-03-22 17:15:35,113 - INFO - Successfully extracted meta-data from column 'horizon'
2025-03-22 17:15:

Unnamed: 0,value,label
0,0,Gross Domestic Product
1,1,Private Consumption


### Construction de la table de faits

In [6]:
# Build the fact table from the schema builder.
df_fact = schema_builder.create_fact_table(
    column_labels=labels,
)

# Preview the first rows of the fact table.
df_fact.head()

  self.df_fact[column] = self.df_fact[column].replace(dict_label_value)
2025-03-22 17:15:35,534 - INFO - Successfully replace modalities by ids in column 'country'
  self.df_fact[column] = self.df_fact[column].replace(dict_label_value)
2025-03-22 17:15:35,661 - INFO - Successfully replace modalities by ids in column 'indicator'
  self.df_fact[column] = self.df_fact[column].replace(dict_label_value)
2025-03-22 17:15:35,823 - INFO - Successfully replace modalities by ids in column 'kind'
  self.df_fact[column] = self.df_fact[column].replace(dict_label_value)
2025-03-22 17:15:36,015 - INFO - Successfully replace modalities by ids in column 'model'
  self.df_fact[column] = self.df_fact[column].replace(dict_label_value)
2025-03-22 17:15:36,159 - INFO - Successfully replace modalities by ids in column 'training'
2025-03-22 17:15:36,159 - INFO - Successfully built fact table


Unnamed: 0,indicator,country,date,value,kind,horizon,week,model,training
0,0,0,1960-04-01,0.37571,0,,,0.0,0.0
1,0,0,1960-07-01,0.748561,0,,,0.0,0.0
2,0,0,1960-10-01,1.185218,0,,,0.0,0.0
3,0,0,1961-01-01,1.608374,0,,,0.0,0.0
4,0,0,1961-04-01,1.600329,0,,,0.0,0.0


### Création de l'ensemble des tables

In [7]:
# Build every table of the schema in a single call. This rebinds the
# df_metadata, dimension_tables and df_fact names assigned in earlier cells.
schema_tables = schema_builder.build(column_labels=labels)
df_metadata, dimension_tables, df_fact = schema_tables

2025-03-22 17:15:36,183 - INFO - Successfully extracted meta-data from column 'indicator'
2025-03-22 17:15:36,273 - INFO - The column 'indicator' is of type 'object' and the number of modalities 2 satisfies the categorical threshold criteria 200
2025-03-22 17:15:36,273 - INFO - Successfully extracted meta-data from column 'country'
2025-03-22 17:15:36,307 - INFO - The column 'country' is of type 'object' and the number of modalities 6 satisfies the categorical threshold criteria 200
2025-03-22 17:15:36,307 - INFO - Successfully extracted meta-data from column 'date'
2025-03-22 17:15:36,307 - INFO - Successfully extracted meta-data from column 'value'
2025-03-22 17:15:36,307 - INFO - Successfully extracted meta-data from column 'kind'
2025-03-22 17:15:36,342 - INFO - The column 'kind' is of type 'object' and the number of modalities 4 satisfies the categorical threshold criteria 200
2025-03-22 17:15:36,342 - INFO - Successfully extracted meta-data from column 'horizon'
2025-03-22 17:15:

## Construction de la base de données

### Initialisation du builder

In [8]:
# Location of the output database, relative to the parent directory.
output_path = os.path.join('../', config['OUTPUT_DATA'])

# Create the DuckDB tables builder on the raw dataframe.
builder = DuckdbTablesBuilder(
    df=df_origin,
    categorical_threshold=config['THRESHOLD'],
    path=output_path,
)

### Création du schéma

In [9]:
# Build the DuckDB schema through the tables builder.
# NOTE(review): presumably this creates the dim_* and fact tables shown by
# display_schema() — confirm against DuckdbTablesBuilder.
builder.build_duckdb_schema()

2025-03-22 17:15:38,239 - INFO - Successfully extracted meta-data from column 'indicator'
2025-03-22 17:15:38,304 - INFO - The column 'indicator' is of type 'object' and the number of modalities 2 satisfies the categorical threshold criteria 200
2025-03-22 17:15:38,305 - INFO - Successfully extracted meta-data from column 'country'
2025-03-22 17:15:38,340 - INFO - The column 'country' is of type 'object' and the number of modalities 6 satisfies the categorical threshold criteria 200
2025-03-22 17:15:38,342 - INFO - Successfully extracted meta-data from column 'date'
2025-03-22 17:15:38,343 - INFO - Successfully extracted meta-data from column 'value'
2025-03-22 17:15:38,344 - INFO - Successfully extracted meta-data from column 'kind'
2025-03-22 17:15:38,382 - INFO - The column 'kind' is of type 'object' and the number of modalities 4 satisfies the categorical threshold criteria 200
2025-03-22 17:15:38,383 - INFO - Successfully extracted meta-data from column 'horizon'
2025-03-22 17:15:

### Affichage du schéma

In [10]:
# Display the schema; the captured output shows each table's structure
# (column names and SQL types) emitted through the logger.
builder.display_schema()

2025-03-22 17:15:41,522 - INFO - 
 Created Tables:
2025-03-22 17:15:41,523 - INFO - 
 dim_country Structure:
2025-03-22 17:15:41,526 - INFO -   value: BIGINT
2025-03-22 17:15:41,527 - INFO -   label: VARCHAR
2025-03-22 17:15:41,528 - INFO - 
 dim_indicator Structure:
2025-03-22 17:15:41,530 - INFO -   value: BIGINT
2025-03-22 17:15:41,532 - INFO -   label: VARCHAR
2025-03-22 17:15:41,534 - INFO - 
 dim_kind Structure:
2025-03-22 17:15:41,536 - INFO -   value: BIGINT
2025-03-22 17:15:41,536 - INFO -   label: VARCHAR
2025-03-22 17:15:41,537 - INFO - 
 dim_model Structure:
2025-03-22 17:15:41,540 - INFO -   value: BIGINT
2025-03-22 17:15:41,542 - INFO -   label: VARCHAR
2025-03-22 17:15:41,542 - INFO - 
 dim_training Structure:
2025-03-22 17:15:41,546 - INFO -   value: BIGINT
2025-03-22 17:15:41,547 - INFO -   label: VARCHAR
2025-03-22 17:15:41,549 - INFO - 
 fact_table Structure:
2025-03-22 17:15:41,551 - INFO -   indicator: BIGINT
2025-03-22 17:15:41,554 - INFO -   country: BIGINT
2025-

### Exemple de requête

In [11]:
# Example query: read every row of the dim_model dimension table
# through the builder's connection.
query = "SELECT * FROM dim_model"
rows = builder.conn.execute(query).fetchall()

# Print the fetched rows.
print(rows)

[(1, 'ElasticNetCV'), (2, 'ExtraTrees'), (3, 'LassoCV'), (4, 'RandomForest'), (5, 'RidgeCV'), (6, 'XGBStandard'), (0, None)]
