In [61]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
import xarray as xr

from fake_data_for_learning.contingency_tables import calculate_contingency_table

## Data wrangling example(s) with [pandas](https://pandas.pydata.org/pandas-docs/stable/) and [xarray](https://docs.xarray.dev/en/stable/index.html)

In [62]:
# Prefer the CSV under ./data (relative to the working directory);
# fall back to the hosted copy when no local file is present.
datadir = Path.cwd() / 'data'
local_csv = datadir / 'default.csv'
path_or_url = (
    local_csv
    if local_csv.exists()
    else 'https://raw.githubusercontent.com/munichpavel/risk-ai-workshop/main/notebooks/data/default.csv'
)

In [63]:
# Read the default data from whichever location path_or_url points at
# (comma is already the default separator for read_csv).
df = pd.read_csv(path_or_url)
n_records = len(df)  # number of rows, reused as the denominator of the ratios below
print(f'Loaded data table of shape {df.shape}')
df.head()

Loaded data table of shape (10000, 4)


Unnamed: 0,gender,occupation,activity,default
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,0,0,1,1
4,1,1,0,0


### Basic slicing and dicing with pandas

See also https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html.

#### Filtering rows

In [64]:
# Create a boolean mask, and calculate subpopulation ratio
mask_female = df['gender'] == 0
# Series.sum counts the True entries in vectorised code; the builtin sum()
# would iterate over the rows one by one in Python.
subpopulation_ratio = mask_female.sum() / n_records
print(f'Female subpopulation ratio is {subpopulation_ratio}')

Female subpopulation ratio is 0.4857


In [65]:
# or using more sql-like functionality
grouped = df.groupby('gender')
grouped_size = grouped.size()  # number of records per gender value
# .loc makes the lookup by gender label 0 explicit; a bare [0] on a Series can
# be read as either a label or a position depending on the index type.
subpopulation_ratio = grouped_size.loc[0] / n_records
print(f'Female subpopulation ratio is {subpopulation_ratio}')

Female subpopulation ratio is 0.4857


## Data wrangling with xarray example(s)

In [66]:
# Admissible values of each (binary) column
default_values = [0, 1]
gender_values = [0, 1]
occupation_values = [0, 1]
activity_values = [0, 1]

# One ordered categorical dtype per column; the key order is also the column
# order handed to calculate_contingency_table below.
df_categories = {
    column: CategoricalDtype(categories=values, ordered=True)
    for column, values in (
        ('default', default_values),
        ('gender', gender_values),
        ('activity', activity_values),
        ('occupation', occupation_values),
    )
}

# Cast the listed columns of df to their categorical dtype (df is updated in place)
for column in df_categories:
    df[column] = df[column].astype(df_categories[column])

contingency = calculate_contingency_table(df[list(df_categories)])
contingency

In [67]:
# Select the gender=0 slice of the contingency table and total its entries,
# then divide by the number of records.
female_count = contingency.sel(gender=0).sum().values
subpopulation_ratio = female_count / n_records
print(f'Female population ratio with xarray: {subpopulation_ratio}')

Female population ratio with xarray: 0.4857


## Exercise: Data wrangling with pandas

Difficulty: (*)

Calculate the subpopulation ratio within the artificial [credit default data](https://github.com/munichpavel/risk-ai-workshop/blob/main/notebooks/data/default.csv) of 

* males among total population
* females of occupation 0 ("education") who default (default = 1) among total population
* males of occupation 0 ("education") who default (default=1) among total population

In [68]:
# Boolean mask of the male records (gender == 1)
mask_male2 = df["gender"] == 1
# Series.sum counts the True entries in vectorised code instead of looping
# over the rows with the builtin sum().
males_subpopulationratio = mask_male2.sum() / n_records
print(f'Males population ratio with pandas: {males_subpopulationratio}')


Males population ratio: 0.5143


In [69]:
# Records of females (gender 0) in occupation 0 ("education") who default (default 1)
females_ed_def = df[(df['gender'] == 0) & (df['occupation'] == 0) & (df['default'] == 1)]
# Every row left after the filter is one matching record, so the row count is
# the numerator; no further mask is needed.
females_subpopulationratio = len(females_ed_def) / n_records

# The label names the actual subpopulation so it cannot be mistaken for the overall female ratio
print(f'Females in education who default, population ratio with pandas: {females_subpopulationratio}')


Females population ratio: 0.1641


In [74]:
# Records of males (gender 1) in occupation 0 ("education") who default (default 1)
males_ed_def = df[(df['gender'] == 1) & (df['occupation'] == 0) & (df['default'] == 1)]
# The filter already guarantees gender == 1 for every remaining row, so the row
# count is the numerator (same approach as the female cell above).
# NOTE: this rebinds males_subpopulationratio, which the earlier cell used for the overall male ratio.
males_subpopulationratio = len(males_ed_def) / n_records

# The label names the actual subpopulation so it differs from the overall male ratio printed earlier
print(f'Males in education who default, population ratio with pandas: {males_subpopulationratio}')


Males population ratio with pandas: 0.0618


## Exercise: Data wrangling with xarray

Difficulty: (**)

Calculate the subpopulation ratio within the artificial [credit default data](https://github.com/munichpavel/risk-ai-workshop/blob/main/notebooks/data/default.csv) of 

* males among total population
* female education workers (occupation 0) among education workers
* female health workers (occupation 1) among health workers

In [71]:
# gender=1 selects the male slice of the contingency table; its total over the
# number of records is the male share of the population.
male_subpopulation_ratio = contingency.sel(gender=1).sum().values / n_records
# Label fixed: this cell reports the male ratio, not the female one
print(f'Male population ratio with xarray: {male_subpopulation_ratio}')

Female population ratio with xarray: 0.5143


In [76]:
# Restrict the table to occupation 0 once, then compare the gender=0 part of
# that slice against the whole slice.
education_slice = contingency.sel(occupation=0)
female_education_count = education_slice.sel(gender=0).sum().values
education_count = education_slice.sum().values

female_subpopulation_ratio = female_education_count / education_count
print(f'Female population ratio with xarray and occupation = 0: {female_subpopulation_ratio}')

Female population ratio with xarray and occupation = 0: 0.7353715326989269


In [78]:
# Same calculation for occupation 1: the gender=0 part of the slice over the whole slice
health_slice = contingency.sel(occupation=1)
female_subpopulation_ratio1 = (
    health_slice.sel(gender=0).sum().values
    / health_slice.sum().values
)
print(f'Female population ratio with xarray and occupation = 1: {female_subpopulation_ratio1}')

Female population ratio with xarray and occupation = 1: 0.24204702627939143
