In [1]:
%load_ext autoreload
%autoreload 2


from netcbs_ import *
import numpy as np
import polars as pl

In [2]:
# Example sample: 10,000 consecutive person ids (100_000_000 .. 100_009_999),
# each tagged with the constant RINPERSOONS value "R".
df_sample = pl.DataFrame(
    {"RINPERSOON": range(100_000_000, 100_010_000)}
).with_columns(
    pl.lit("R").alias("RINPERSOONS")  # literal is broadcast to every row
)

# Table to aggregate over: 1,000,000 consecutive person ids
# (100_000_000 .. 100_999_999), each with a synthetic income drawn from a
# normal distribution with mean 30,000 and standard deviation 5,000.
# The generator is seeded explicitly so the incomes (and therefore the result
# displayed below) are identical on every Restart & Run All.
df_agg = pl.DataFrame(
    {
        "RINPERSOON":   range(100_000_000, 101_000_000),
        "RINPERSOONS":  ["R"]*1_000_000,
        "Income":       np.random.default_rng(42).normal(
            loc=30_000, scale=5_000, size=1_000_000
        ),
    }
)


## How to construct the query
# 1. Start with "Sample ->"
# 2. Then add the relationships between the tables, e.g., "Sample -> Schoolmates[all]".
# In square brackets you specify the relationship types to follow:
# write [all] for all types, or a comma-separated list of codes, e.g. [301,302] for parents and co-parents
# 3. You can add several tables: "Sample -> Schoolmates[all] -> Family[301]"
# 4. Finally, you must specify the column you want to aggregate: "Sample -> Schoolmates[all] -> Family[301] -> Income"

## Other parameters
# df_sample: the sample dataframe (with the people you want to have information on)
# df_agg: the dataframe with the information you want to aggregate. For example, the income of all people in the country
# year: the year of the data you want to use
# agg_func: the aggregation function you want to use. For example, pl.mean or pl.sum
# return_pandas: if True, the function returns a pandas dataframe. If False, it returns a polars dataframe
# lazy: if True, the operations are chained lazily and only computed at the end. If False, each operation is computed immediately

## Example
# Path followed: sample -> all of their schoolmates -> those schoolmates'
# Family relations of types 301, 302 and 303; the aggregated column is "Income".
query = "Sample -> Schoolmates[all] -> Family[301,302,303] -> Income"

# Mean income per sampled person using the 2021 data. The result is requested
# as a polars dataframe (return_pandas=False) with lazy evaluation (lazy=True).
df = transform(
    query,
    df_sample=df_sample,
    df_agg=df_agg,
    year=2021,
    agg_func=pl.mean,
    return_pandas=False,
    lazy=True,
)

# Bare last expression so the notebook renders the result table.
df

    

RINPERSOON,RINPERSOONS,Income
i64,str,f64
100000000,"""R""",
100000001,"""R""",
100000002,"""R""",39045.510787
100000003,"""R""",
100000004,"""R""",
…,…,…
100009995,"""R""",
100009996,"""R""",
100009997,"""R""",
100009998,"""R""",
