In [3]:
import random
import polars as pl
import pandas as pd
import pyreadstat #not a dependency of netCBS, but needed to read CBS files

import netcbs as net

In [4]:
# Show the relationship-type codes grouped by context, followed by the
# codebook that maps each code to its description.
for lookup in (net.context2types, net.codebook):
    print(lookup)

{'Family': {301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322}, 'Colleagues': {201}, 'Neighbors': {101, 102}, 'Schoolmates': {501, 502, 503, 504, 505, 506}, 'Housemates': {401, 402}}
{101: 'Neighbor - 10 closest addresses', 102: 'Neighborhood acquaintance - 20 random neighbors within 200 meters', 201: 'Colleague', 301: 'Parent', 302: 'Co-parent', 303: 'Grandparent', 304: 'Child', 305: 'Grandchild', 306: 'Full sibling', 307: 'Half sibling', 308: 'Unknown sibling', 309: 'Cousin', 310: 'Nephew/Niece', 311: 'Aunt/Uncle', 312: 'Partner - married', 313: 'Partner - not married', 314: 'Parent-in-law', 315: 'Child-in-law', 316: 'Sibling-in-law', 317: 'Stepparent', 318: 'Stepchild', 319: 'Stepsibling', 320: 'Married full cousin', 321: 'Married cousin', 322: 'Married aunt/uncle', 401: 'Housemate', 402: 'Housemate - institution', 501: 'Classmate primary education', 502: 'Classmate special education', 503: 'Classmate secondary education', 5

# 1. Case example using CBS data 
We may want to know the income of the families (parents and grandparents) of children in the Netherlands.

We need:
- The sample: e.g. filtering the `GBAPERSOONTAB` dataset to keep only children
- The income (variables `INPPERSPRIM` and `INPPERSBRUT`) of everybody in the Netherlands (dataset `INPATAB`)

The query: `"[INPPERSPRIM, INPPERSBRUT] -> Family[301,303] -> sample"`

The code:
```python
    df_result = net.transform(
        query,
        df_sample=df_sample,
        df_agg=df_agg,
        year=2020,
        cbsdata_path="G:/Bevolking",
        return_pandas=False,
        agg_funcs=("mean", "sum", "max", "count"),
        format_file="parquet", # faster loading, but requires CBS to keep the parquet files up to date; use "csv" otherwise
    )
```


The query:
- Starts from the sample and merges it with the family data (301 = parents, 303 = grandparents).
- Merges the resulting dataset with the income data (`df_agg`).
- Aggregates the income variables per person in the sample using the mean, the sum, the max and the count.


In [None]:
# Read sample: children born after 2010 (i.e. younger than 10 in 2020).
# Only the columns needed for this example are loaded.
df_sample, meta = pyreadstat.read_sav(
    "G:/Bevolking/GBAPERSOONTAB/2020/GBAPERSOON2020TABV3.sav",
    usecols=["RINPERSOON", "RINPERSOONS", "GBAGEBOORTEJAAR"],
)
# NOTE(review): assumes GBAGEBOORTEJAAR is read as a number; cast it first if it is stored as text.
df_sample = pl.DataFrame(df_sample).filter(pl.col("GBAGEBOORTEJAAR") > 2010)


# Read income for everybody in the Netherlands.
# The file is an SPSS .sav file, so it is read with read_sav (pyreadstat has no read_csv).
df_agg, meta = pyreadstat.read_sav(
    "G:/InkomenBestedingen/INPATAB/INPA2020TABV2.sav",
    usecols=["RINPERSOON", "RINPERSOONS", "INPPERSPRIM", "INPPERSBRUT"],
)
# Make sure RINPERSOONS is "R": the network files only have data for those rows.
df_agg = pl.DataFrame(df_agg).filter(pl.col("RINPERSOONS") == "R")

# Income variables -> parents (301) and grandparents (303) -> people in the sample
query = "[INPPERSPRIM, INPPERSBRUT] -> Family[301,303] -> sample"

df = net.transform(
    query,
    df_sample=df_sample,  # the sample
    df_agg=df_agg,  # the income data
    year=2020,  # the year of the family data
    # Aggregation functions are passed by name, as in the synthetic example below.
    agg_funcs=("mean", "sum", "max", "count"),
)


# 2. Synthetic data example
This example uses synthetic data (data that resembles CBS data) to show how to use the netCBS library. For each context (Family, Colleagues, Neighbors, Schoolmates, Housemates), we generated a "network file" containing 1,000,000 relationships (see section 3 below). Each relationship is drawn at random from the relationship types of that context (see `net.context2types`).

This example runs outside of the CBS Remote Access (RA) environment.

In [7]:
# Fix the seed so the synthetic data is reproducible.
random.seed(0)

n_sample = 10_000          # people in the sample
n_population = 1_000_000   # people in the "whole country" table
first_id = 100_000_000     # first synthetic RINPERSOON identifier

# Synthetic sample (the level of your analysis).
# The random columns are drawn in a fixed order (outcome, then age) so the
# seeded values stay the same from run to run.
sample_ids = [str(person_id) for person_id in range(first_id, first_id + n_sample)]
sample_outcome = [random.choice([0,1]) for _ in range(n_sample)]
sample_age = [random.normalvariate(30, 10) for _ in range(n_sample)]

df_sample = pl.DataFrame(
    {
        "RINPERSOON": sample_ids,
        "outcome": sample_outcome,
        "age": sample_age,
    }
)

# Data for the whole of the Netherlands on the measures you want to aggregate
# (drawn after the sample columns: income first, then number_children).
population_ids = [str(person_id) for person_id in range(first_id, first_id + n_population)]
population_income = [random.normalvariate(30000, 5000) for _ in range(n_population)]
population_children = [random.choice([0,1,2,3]) for _ in range(n_population)]

df_agg = pl.DataFrame(
    {
        "RINPERSOON": population_ids,
        "income": population_income,
        "number_children": population_children,
        "count": [1] * n_population,  # constant column, one per person
    }
)


### Run query

This is the most important part of the code. Here we aggregate the variables of interest (income, number of children, and a count) over the family members of the schoolmates of each person in the sample.

In [10]:
## Query: income, number of children and count of the family members
## of the schoolmates of the people in the sample

## How to construct the query
# 1. Start with the variables that you want to aggregate, e.g. "[Income, Age] ->"
# 2. Add the relationships between the tables, e.g. "[Income, Age] -> Family[301]".
#    The square brackets select the relationship types:
#    write [all] for all of them, or [301,302] for parents and co-parents
# 3. Several tables can be chained: "[Income, Age] -> Family[301] -> Schoolmates[all]"
# 4. Finally, the query must end with "-> Sample"

## Other parameters
# df_sample: the sample dataframe (the people you want to have information on)
# df_agg: the dataframe with the information to aggregate, e.g. the income of all people in the country
# year: the year of the data you want to use
# agg_funcs: the aggregation functions to apply, e.g. ("mean", "sum")
# return_pandas: if True, the function returns a pandas dataframe. If False, it returns a polars dataframe
# lazy: if True, the operations are concatenated lazily and computed at the end. If False, they are computed immediately
# cbsdata_path: the path to the CBS data. Usually this is "G:/Bevolking". In this example, we use synthetic data saved in "cbsdata/Bevolking".


## The transform function validates the query before running it

# Example
query = "[income, number_children, count] -> Family[all] -> Schoolmates[all] -> Sample"

df = net.transform(
    query,
    df_sample=df_sample,
    df_agg=df_agg,
    year=2021,
    cbsdata_path="cbsdata/Bevolking",  # path to the CBS data ("G:/Bevolking" at CBS); here it is local synthetic data
    agg_funcs=("mean", "sum", "max", "min", "count"),
    format_file="csv",
    return_pandas=False,
)

df

INFO:netcbs.netcbs:Processing context Schoolmates[all] (1/2)
INFO:netcbs.netcbs:Processing context Family[all] (2/2)
INFO:netcbs.netcbs:Processing final aggregation


RINPERSOON,outcome,age,mean_income,mean_number_children,mean_count,sum_income,sum_number_children,sum_count,max_income,max_number_children,max_count,min_income,min_number_children,min_count,count_income,count_number_children,count_count
str,i64,f64,f64,f64,f64,f64,"decimal[38,0]","decimal[38,0]",f64,i64,i64,f64,i64,i64,i64,i64,i64
"""100005871""",1,38.302852,29378.934443,1.0,1.0,58757.868886,2,2,32453.547067,2,1,26304.321819,0,1,2,2,2
"""100009844""",0,57.722023,32637.175537,1.833333,1.0,195823.053219,11,6,36920.956612,3,1,29061.989453,1,1,6,6,6
"""100006224""",0,47.085106,26871.525606,1.6,1.0,268715.256062,16,10,34806.296361,3,1,19386.880325,0,1,10,10,10
"""100005192""",0,25.427688,31885.264189,0.928571,1.0,446393.698645,13,14,39302.258266,3,1,23387.04539,0,1,14,14,14
"""100009907""",1,32.091691,29749.584311,1.6,1.0,297495.84311,16,10,37687.684728,3,1,24787.094065,0,1,10,10,10
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""100009252""",0,29.594255,,,,,,,,,,,,,,,
"""100009311""",0,45.687213,,,,,,,,,,,,,,,
"""100009364""",1,-0.743768,,,,,,,,,,,,,,,
"""100009619""",1,30.126526,,,,,,,,,,,,,,,


## 3. Create synthetic data (not needed at CBS!)

Don't run this code unless you want to create new synthetic data.
Let's create some synthetic data to explain how the code works

For each context (Family, Colleagues, Neighbors, Schoolmates, Housemates), we will generate a "network file" containing 1,000,000 relationships. Each relationship is drawn at random from the relationship types of that context (see `net.context2types`).


In [3]:
# Create one synthetic network file per context for 2021, each with
# 1,000,000 relationships, under the local "cbsdata/Bevolking" folder.
for context in ("Family", "Colleagues", "Neighbors", "Schoolmates", "Housemates"):
    net.create_synthetic_data(context, 2021, 1_000_000, outpath="cbsdata/Bevolking")
