In [None]:
import os
from urllib.parse import quote_plus

import dask.dataframe as dd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables (via python-dotenv)
load_dotenv()

# Read the database settings from the environment; only the port has a default
user = os.environ.get('DB_USER')
password = os.environ.get('DB_PASSWORD')
host = os.environ.get('DB_HOST')
port = os.environ.get('DB_PORT', '5432')
database = os.environ.get('DB_NAME')

# Fail fast with a clear message: an unset variable would otherwise be
# interpolated into the URL below as the literal text "None".
required_settings = {
    'DB_USER': user,
    'DB_PASSWORD': password,
    'DB_HOST': host,
    'DB_NAME': database,
}
missing_settings = [name for name, value in required_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        'Missing required environment variables: ' + ', '.join(missing_settings)
    )

# Build the connection string.
# The user and password are percent-encoded so that characters such as
# '@', ':', '/' or '#' inside the credentials cannot corrupt the URL.
connection_string = (
    f'postgresql+psycopg2://{quote_plus(user)}:{quote_plus(password)}'
    f'@{host}:{port}/{database}'
)

# Name of the table to load
table_name = 'ds_market'

# Load the table as a Dask DataFrame.
# Note: Dask requires an index column for partitioning; adjust it to suit your table.
df_dask = dd.read_sql_table(
    table_name=table_name,
    con=connection_string,  # The connection string is passed directly
    index_col='date',  # Replace 'date' with another suitable indexable column if needed
    npartitions=10
)

# Example EDA with Dask: preview the first rows
print(df_dask.head())

In [4]:
# Summary statistics of the dataframe: build the describe() result first,
# then compute it and print the outcome.
summary_stats = df_dask.describe()
print(summary_stats.compute())

              sales      yearweek    sell_price                 date
count  5.835424e+07  5.835424e+07  5.835424e+07             58354236
mean   1.126352e+00  2.013469e+05  4.333558e+00                  NaN
std    3.872972e+00  1.516413e+02  4.494516e+00                  NaN
min    0.000000e+00  2.011040e+05  0.000000e+00  2011-01-29 00:00:00
25%    0.000000e+00  2.012140e+05  2.400000e+00  2012-04-03 00:00:00
50%    0.000000e+00  2.013370e+05  4.050000e+00  2013-09-11 00:00:00
75%    1.000000e+00  2.015010e+05  7.180000e+00  2015-01-02 00:00:00
max    7.630000e+02  2.016160e+05  1.341500e+02  2016-04-24 00:00:00


In [5]:
# Show the dataframe's column names
column_names = df_dask.columns
print(column_names)

Index(['id', 'item', 'category', 'department', 'store', 'store_code', 'region',
       'sales', 'yearweek', 'event', 'sell_price', 'date'],
      dtype='object')


In [6]:
# Count missing values per column: flag nulls, sum them per column,
# then compute the result and print it.
null_counts = df_dask.isnull().sum()
print(null_counts.compute())

id            0
item          0
category      0
department    0
store         0
store_code    0
region        0
sales         0
yearweek      0
event         0
sell_price    0
date          0
dtype: int64


In [7]:
# Show the data type of every column
column_dtypes = df_dask.dtypes
print(column_dtypes)

id            string[pyarrow]
item          string[pyarrow]
category      string[pyarrow]
department    string[pyarrow]
store         string[pyarrow]
store_code    string[pyarrow]
region        string[pyarrow]
sales                   int64
yearweek                int64
event         string[pyarrow]
sell_price            float64
date           datetime64[ns]
dtype: object
