In [14]:
#| hide
from featsql.core import *

# featsql

> Create features with sql

## Install

```sh
pip install featsql
```

## Imports

In [15]:
import pandas as pd
from sqlalchemy import create_engine
# Lift the column cap so the wide feature tables below are displayed in full.
pd.options.display.max_columns = None

## Configurando a engine

In [19]:
# SQLite URL with a relative file path, so it is resolved against the
# directory the kernel is running in.
url_db = "sqlite:///../../data/mydatabase.db" 

# Engine handed to every pd.read_sql call in the cells below.
engine = create_engine(url_db)

## Visão inicial do público

Primeiro vamos observar o formato da tabela spine

In [21]:
# Load the whole spine table through the shared engine and preview it.
df_spine = pd.read_sql(
    sql="SELECT * FROM tb_spine",
    con=engine,
)
df_spine.head()

Unnamed: 0,ID,SAFRA_REF,Target
0,4,2023-02-01,0
1,5,2023-02-01,0
2,6,2023-02-01,0
3,7,2023-02-01,0
4,10,2023-02-01,0


## Visão inicial da tabela de variáveis

A tabela de variáveis contém 4 variáveis, duas numéricas e duas categóricas. Existem mais IDs únicos e datas disponíveis nessa tabela do que na tabela spine.

In [22]:
# Load the whole feature table through the shared engine and preview it.
df_data = pd.read_sql(
    sql="SELECT * FROM tb_feat",
    con=engine,
)
df_data.head()

Unnamed: 0,ID,SAFRA,FEAT_NUM1,FEAT_NUM2,FEAT_CAT1,FEAT_CAT2
0,1,2023-01-01,33,15,C,B
1,2,2023-01-01,-36,82,C,B
2,3,2023-01-01,89,33,C,B
3,4,2023-01-01,-34,-94,C,B
4,5,2023-01-01,99,26,B,B


## Criação de variáveis numéricas

A função `create_query_num()` cria um texto com a query para a criação de variáveis com as operações soma, mínimo, máximo e média das variáveis listadas em `feat_num_lista`, para cada uma das janelas de tempo listadas em `lista_janela`.

In [None]:
# Tables read by the generated query: the spine table and the feature table.
tb_publico = "tb_spine"
tb_feat = "tb_feat"

# Column names handed to the query builder.
id = "ID"  # NOTE(review): shadows the built-in id() for the rest of the notebook
safra_ref = "SAFRA_REF"
safra = "SAFRA"
target = "Target"

# Numeric columns to aggregate and the windows to build
# (the resulting columns are suffixed _1M, _2M and _3M).
feat_num_lista = ["FEAT_NUM1", "FEAT_NUM2"]
lista_janela = [1, 2, 3]

# Build the SQL text; it is executed in the next cell.
query_final_num = create_query_num(
    tb_publico,
    tb_feat,
    lista_janela,
    feat_num_lista,
    id,
    safra_ref,
    target,
    safra,
)

In [None]:
# Run the generated numeric-feature query and preview the result.
df_num = pd.read_sql(
    sql=query_final_num,
    con=engine,
)
df_num.head()

Unnamed: 0,ID,SAFRA_REF,Target,FEAT_NUM1_SUM_1M,FEAT_NUM1_MIN_1M,FEAT_NUM1_MAX_1M,FEAT_NUM1_AGV_1M,FEAT_NUM2_SUM_1M,FEAT_NUM2_MIN_1M,FEAT_NUM2_MAX_1M,FEAT_NUM2_AGV_1M,FEAT_NUM1_SUM_2M,FEAT_NUM1_MIN_2M,FEAT_NUM1_MAX_2M,FEAT_NUM1_AGV_2M,FEAT_NUM2_SUM_2M,FEAT_NUM2_MIN_2M,FEAT_NUM2_MAX_2M,FEAT_NUM2_AGV_2M,FEAT_NUM1_SUM_3M,FEAT_NUM1_MIN_3M,FEAT_NUM1_MAX_3M,FEAT_NUM1_AGV_3M,FEAT_NUM2_SUM_3M,FEAT_NUM2_MIN_3M,FEAT_NUM2_MAX_3M,FEAT_NUM2_AGV_3M
0,4,2023-02-01,0,-34,-34,-34,-34.0,-94,-94,-94,-94.0,-34,-34,-34,-34.0,-94,-94,-94,-94.0,-34,-34,-34,-34.0,-94,-94,-94,-94.0
1,5,2023-02-01,0,99,99,99,99.0,26,26,26,26.0,99,99,99,99.0,26,26,26,26.0,99,99,99,99.0,26,26,26,26.0
2,6,2023-02-01,0,18,18,18,18.0,-55,-55,-55,-55.0,18,18,18,18.0,-55,-55,-55,-55.0,18,18,18,18.0,-55,-55,-55,-55.0
3,7,2023-02-01,0,-44,-44,-44,-44.0,4,4,4,4.0,-44,-44,-44,-44.0,4,4,4,4.0,-44,-44,-44,-44.0,4,4,4,4.0
4,10,2023-02-01,0,67,67,67,67.0,85,85,85,85.0,67,67,67,67.0,85,85,85,85.0,67,67,67,67.0,85,85,85,85.0


In [None]:
# Show the SQL text returned by create_query_num so it can be inspected.
print(query_final_num)


    WITH 
    tb_public AS (
        SELECT 
            *
        FROM tb_spine
    ),
    
        -- Criação de variáveis de janela de 1M
        tb_janela_1M as(
            SELECT 
                tb_public.ID,
                tb_public.SAFRA_REF,
                tb_public.Target,
                
             -- Criação de variáveis numéricas a partir da coluna FEAT_NUM1 para a janela 1
            SUM(COALESCE(tb_feat.FEAT_NUM1,0)) AS FEAT_NUM1_SUM_1M,
            MIN(COALESCE(tb_feat.FEAT_NUM1,0)) AS FEAT_NUM1_MIN_1M,
            MAX(COALESCE(tb_feat.FEAT_NUM1,0)) AS FEAT_NUM1_MAX_1M,
            AVG(COALESCE(tb_feat.FEAT_NUM1,0)) AS FEAT_NUM1_AGV_1M,
            
             -- Criação de variáveis numéricas a partir da coluna FEAT_NUM2 para a janela 1
            SUM(COALESCE(tb_feat.FEAT_NUM2,0)) AS FEAT_NUM2_SUM_1M,
            MIN(COALESCE(tb_feat.FEAT_NUM2,0)) AS FEAT_NUM2_MIN_1M,
            MAX(COALESCE(tb_feat.FEAT_NUM2,0)) AS FEAT_NUM2_MAX_1M,
            AVG(COALES

## Criação de variáveis categóricas

A função `create_query_cat()` cria um texto com a query para a criação de variáveis com a moda de cada uma das variáveis listadas em `feat_num_lista` (que, neste exemplo, recebe as colunas categóricas), para cada uma das janelas de tempo fornecidas em `lista_janela`.

In [None]:
# Tables read by the generated query: the spine table and the feature table.
tb_publico = "tb_spine"
tb_feat = "tb_feat"

# Column names handed to the query builder.
id = "ID"  # NOTE(review): shadows the built-in id() for the rest of the notebook
safra_ref = "SAFRA_REF"
safra = "SAFRA"
target = "Target"

# Despite its name, feat_num_lista holds the categorical columns in this cell.
feat_num_lista = ["FEAT_CAT1", "FEAT_CAT2"]
lista_janela = [1, 2, 3]

# Build the SQL text; it is executed in the next cell.
query_final_cat = create_query_cat(
    tb_publico,
    tb_feat,
    lista_janela,
    feat_num_lista,
    id,
    safra_ref,
    target,
    safra,
)

In [None]:
# Run the generated categorical-feature query and preview the result.
df_cat = pd.read_sql(
    sql=query_final_cat,
    con=engine,
)
df_cat.head()

Unnamed: 0,ID,SAFRA_REF,Target,FEAT_CAT1_MODA_1M,FEAT_CAT2_MODA_1M,FEAT_CAT1_MODA_2M,FEAT_CAT2_MODA_2M,FEAT_CAT1_MODA_3M,FEAT_CAT2_MODA_3M
0,4,2023-02-01,0,C,B,C,B,C,B
1,5,2023-02-01,0,B,B,B,B,B,B
2,6,2023-02-01,0,B,A,B,A,B,A
3,7,2023-02-01,0,C,C,C,C,C,C
4,10,2023-02-01,0,A,A,A,A,A,A


In [None]:
# Show the SQL text returned by create_query_cat so it can be inspected.
print(query_final_cat)


    WITH 
    tb_public as (
        SELECT 
            ID,
            SAFRA_REF,
            Target
        FROM tb_spine
    ),
    
    tb_janela_FEAT_CAT1_1M as(
        SELECT
            tb_public.ID,
            tb_public.SAFRA_REF,
            tb_public.Target,
            tb_feat.FEAT_CAT1,
            COUNT(*) AS frequency_FEAT_CAT1
        FROM tb_public
        LEFT JOIN tb_feat
        ON tb_public.ID = tb_feat.ID
            AND (strftime('%Y-%m-%d', date(tb_feat.SAFRA, '+1 months')) >= tb_public.SAFRA_REF)
            AND (tb_feat.SAFRA < tb_public.SAFRA_REF)
        GROUP BY tb_public.ID, tb_public.SAFRA_REF, tb_feat.FEAT_CAT1
    ),

    tb_row_FEAT_CAT1_1M as (
        SELECT 
            *,    
            ROW_NUMBER() OVER (
                PARTITION BY 
                    ID,
                    SAFRA_REF        
                    ORDER BY frequency_FEAT_CAT1 DESC
            ) as row_num_FEAT_CAT1_1M
        FROM tb_janela_FEAT_CAT1_1M
    ),
    
    tb_mod