In [30]:
import random
import polars as pl

import netcbs as net

In [31]:
# Show which relationship codes belong to each context, then what each code means
print(net.context2types, net.codebook, sep="\n")

{'Family': {301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322}, 'Colleagues': {201}, 'Neighbors': {101, 102}, 'Schoolmates': {501, 502, 503, 504, 505, 506}, 'Housemates': {401, 402}}
{101: 'Neighbor - 10 closest addresses', 102: 'Neighborhood acquaintance - 20 random neighbors within 200 meters', 201: 'Colleague', 301: 'Parent', 302: 'Co-parent', 303: 'Grandparent', 304: 'Child', 305: 'Grandchild', 306: 'Full sibling', 307: 'Half sibling', 308: 'Unknown sibling', 309: 'Full cousin', 310: 'Cousin', 311: 'Aunt/Uncle', 312: 'Partner - married', 313: 'Partner - not married', 314: 'Parent-in-law', 315: 'Child-in-law', 316: 'Sibling-in-law', 317: 'Stepparent', 318: 'Stepchild', 319: 'Stepsibling', 320: 'Married full cousin', 321: 'Married cousin', 322: 'Married aunt/uncle', 401: 'Housemate', 402: 'Housemate - institution', 501: 'Classmate primary education', 502: 'Classmate special education', 503: 'Classmate secondary education', 50

### Select the sample dataframe and the dataframe with the variable to aggregate

For this example we will be using synthetic data. For each context (Family, Colleagues, Neighbors, Schoolmates, Housemates), we generated a "network file" containing 1,000,000 relationships (see the section "Create synthetic data" below). Each relationship is taken at random from any of the relationship types of that context (see `net.context2types`).

We then create two files: one with the IDs (RINPERSOON) in the sample, one with the IDs (RINPERSOON) and the variable to aggregate. In the CBS RA you will use real data.

In [36]:
# Sizes and ID range of the synthetic data, defined once so the columns of
# each frame always have matching lengths.
FIRST_ID = 100_000_000     # first synthetic RINPERSOON; IDs are consecutive from here
N_SAMPLE = 10_000          # number of people in the sample
N_POPULATION = 1_000_000   # number of people in the table to aggregate

# Dedicated, seeded generator: re-running the notebook from a fresh kernel
# produces the same synthetic values, and the global `random` state is untouched.
rng = random.Random(42)

# Your sample (the level of your analysis)
df_sample = pl.DataFrame(
    {
        "RINPERSOON": [str(rin) for rin in range(FIRST_ID, FIRST_ID + N_SAMPLE)],
        "RINPERSOONS": ["R"] * N_SAMPLE,
        "outcome": [rng.choice([0, 1]) for _ in range(N_SAMPLE)],
        "age": [rng.normalvariate(30, 10) for _ in range(N_SAMPLE)],
    }
)

# Data for all the Netherlands on the measures you want to aggregate
# (kept as a LazyFrame, so nothing is computed until the query is collected)
df_agg = pl.LazyFrame(
    {
        "RINPERSOON": [str(rin) for rin in range(FIRST_ID, FIRST_ID + N_POPULATION)],
        "RINPERSOONS": ["R"] * N_POPULATION,
        "income": [rng.normalvariate(30000, 5000) for _ in range(N_POPULATION)],
        "number_children": [rng.choice([0, 1, 2, 3]) for _ in range(N_POPULATION)],
        # Constant 1 per person, so that summing it counts the people aggregated
        "count": [1] * N_POPULATION,
    }
)



### Run query

This is the most important part of the code. Here we aggregate the variables of interest over the network. In this case, we compute the mean, sum and maximum of `income`, `number_children` and `count` over the people related to each person in the sample.

In [37]:
## Query: aggregate income, number of children and count over the family members
## of the schoolmates of the people in the sample.
## (To keep only the parents of the schoolmates, use Family[301] instead of Family[all].)

## Building the query string
# 1. Open with the variables you want to aggregate, e.g. "[Income, Age] ->"
# 2. Append the relationship tables to traverse, e.g. "[Income, Age] -> Family[301]".
#    The square brackets select the relationship types:
#    [all] keeps every type, [301,302] keeps parents and co-parents
# 3. Several tables can be chained: "[Income, Age] -> Family[301] -> Schoolmates[all]"
# 4. The query must end with "-> Sample"

## Arguments passed to net.transform
# df_sample: the sample dataframe (the people you want to have information on)
# df_agg: the dataframe holding the information to aggregate, e.g. the income of everyone in the country
# year: the year of the data to use
# agg_funcs: list of aggregation functions to apply, e.g. pl.mean or pl.sum
# cbsdata_path: the path to the CBS data. Usually "G:/Bevolking"; here synthetic data saved in "cbsdata/Bevolking"
# lazy: if True, operations are chained lazily and computed at the end; if False, they are computed immediately
# return_pandas: if True, a pandas dataframe is returned; if False, a polars dataframe

## net.transform validates the query before running it

# Example
query = "[income, number_children, count] -> Family[all] -> Schoolmates[all] -> Sample"

df = net.transform(
    query,
    df_sample=df_sample,
    df_agg=df_agg,
    year=2021,
    agg_funcs=[pl.mean, pl.sum, pl.max],
    cbsdata_path="cbsdata/Bevolking",  # local synthetic data; at CBS this would be "G:/Bevolking"
    lazy=True,
    return_pandas=False,
)

df

INFO:netcbs.netcbs:Dropping duplicated entries (if any). Check this before submitting the query or set lazy==False
INFO:netcbs.netcbs:Dropping duplicated entries (if any). Check this before submitting the query or set lazy==False


RINPERSOON,RINPERSOONS,outcome,age,mean_income,mean_number_children,mean_count,sum_income,sum_number_children,sum_count,max_income,max_number_children,max_count
str,str,i64,f64,f64,f64,f64,f64,i64,i64,f64,i64,i64
"""100000000""","""R""",0,40.172373,28421.175621,1.875,1.0,227369.404967,15,8,33634.868331,3,1
"""100000001""","""R""",1,33.479395,26408.690763,1.5,1.0,158452.144579,9,6,31544.134458,2,1
"""100000002""","""R""",0,29.041263,24888.908492,0.5,1.0,99555.633966,2,4,28661.583601,2,1
"""100000003""","""R""",1,14.494025,36714.226866,2.0,1.0,146856.907463,8,4,42391.515627,3,1
"""100000004""","""R""",0,16.608723,35368.323366,2.2,1.0,176841.616832,11,5,40536.70032,3,1
…,…,…,…,…,…,…,…,…,…,…,…,…
"""100009995""","""R""",0,13.135631,29757.908388,1.6,1.0,148789.541939,8,5,35591.833033,3,1
"""100009996""","""R""",1,47.612453,31296.180819,1.25,1.0,250369.446551,10,8,38006.859021,3,1
"""100009997""","""R""",1,33.239322,19702.47802,2.333333,1.0,59107.43406,7,3,32228.863963,3,1
"""100009998""","""R""",1,33.342422,32885.125261,1.222222,1.0,295966.127349,11,9,40461.686903,3,1


## Create synthetic data (not needed at CBS!)

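Let's create some synthetic data to explain how the code works. Run this cell once, before the query above: the query reads the network files that this cell writes to `cbsdata/Bevolking`.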

For each context (Family, Colleagues, Neighbors, Schoolmates, Housemates), we will generate a "network file" containing 1,000,000 relationships. Each relationship is taken at random from any of the relationship types of that context (see `net.context2types`).


In [3]:
# Write one synthetic network file per context (year 2021, 1,000,000 relationships each)
for context in ("Family", "Colleagues", "Neighbors", "Schoolmates", "Housemates"):
    net.create_synthetic_data(context, 2021, 1_000_000, outpath="cbsdata/Bevolking")


## Reading data directly with polars

In [38]:
import polars as pl

# Preview the first rows of the synthetic family network file (semicolon-separated).
# RINPERSOON is forced to a string column.
# NOTE(review): RINPERSOONRELATIE is not overridden and is inferred as an integer —
# confirm whether it should be read as str as well.
family_network_csv = "cbsdata/Bevolking/FAMILIENETWERKTAB/FAMILIENETWERK2021TABV1.csv"
pl.read_csv(
    family_network_csv,
    separator=";",
    n_rows=10,
    dtypes={"RINPERSOON": str},
)

RINPERSOON,RINPERSOONS,RINPERSOONRELATIE,RINPERSOONSRELATIE,RELATIE
str,str,i64,str,i64
"""100439107""","""R""",100342679,"""R""",310
"""100086293""","""R""",100108614,"""R""",310
"""100150055""","""R""",100058381,"""R""",317
"""100453751""","""R""",100448572,"""R""",313
"""100325504""","""R""",100041245,"""R""",315
"""100211113""","""R""",100402661,"""R""",314
"""100312768""","""R""",100415389,"""R""",317
"""100302277""","""R""",100091753,"""R""",315
"""100024777""","""R""",100126339,"""R""",313
"""100387455""","""R""",100439892,"""R""",316
