In [1]:
#%%
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import torch
import pandas as pd
from functools import partial
import logging

from mono_multi.setup import (
    ACS_TASKS,
)

from mono_multi.utils import (
    load_data_if_needed,
)

TASKS = ACS_TASKS

  from pkg_resources import DistributionNotFound, get_distribution


## How much do features of the tasks overlap?

In [7]:

from folktables import ACSIncome, ACSEmployment, ACSMobility, ACSPublicCoverage, ACSTravelTime

In [25]:
from itertools import combinations  
tasks_dict = {"ACSIncome": ACSIncome, "ACSEmployment": ACSEmployment, "ACSMobility": ACSMobility, "ACSPublicCoverage":ACSPublicCoverage, "ACSTravelTime": ACSTravelTime}
for (t1,t2) in combinations(tasks_dict.keys(), 2):
    print(t1, t2) 
    common_features = set(tasks_dict[t1].features).intersection(set(tasks_dict[t2].features))
    num_different_features = min(len(set(tasks_dict[t1].features).difference(set(tasks_dict[t2].features))), len(set(tasks_dict[t2].features).intersection(set(tasks_dict[t1].features))))
    print(f"\tcommon features: {len(common_features)} vs difference at least: {num_different_features}")


features = [set(task.features) for task in tasks_dict.values()]
set.intersection(*features)

ACSIncome ACSEmployment
	common features: 6 vs difference at least: 4
ACSIncome ACSMobility
	common features: 8 vs difference at least: 2
ACSIncome ACSPublicCoverage
	common features: 5 vs difference at least: 5
ACSIncome ACSTravelTime
	common features: 7 vs difference at least: 3
ACSEmployment ACSMobility
	common features: 15 vs difference at least: 1
ACSEmployment ACSPublicCoverage
	common features: 15 vs difference at least: 1
ACSEmployment ACSTravelTime
	common features: 10 vs difference at least: 6
ACSMobility ACSPublicCoverage
	common features: 16 vs difference at least: 5
ACSMobility ACSTravelTime
	common features: 9 vs difference at least: 9
ACSPublicCoverage ACSTravelTime
	common features: 10 vs difference at least: 9


{'AGEP', 'MAR', 'RAC1P', 'SCHL', 'SEX'}

## Load Task Data

In [6]:
try:
    data_all
except NameError:
    print("'data_all' not yet defined")
    data_all = load_data_if_needed(data=None, tasks=TASKS)
else:
    print("Already defined, update if needed.")
    data_all = load_data_if_needed(data=data_all, tasks=TASKS)

'data_all' not yet defined
ACSIncome
ACSEmployment
ACSMobility
ACSTravelTime
ACSPublicCoverage
ACSHealthInsurance
ACSIncomePovertyRatio


In [29]:
for (t1,t2) in combinations(tasks_dict.keys(), 2):
    print(t1, t2) 
    common_individuals = set(data_all[t1][1].index).intersection(set(data_all[t2][1].index))
    print(f"\tcommon features: {len(common_individuals)}")

ACSIncome ACSEmployment
	common features: 16586
ACSIncome ACSMobility
	common features: 5080
ACSIncome ACSPublicCoverage
	common features: 6361
ACSIncome ACSTravelTime
	common features: 14810
ACSEmployment ACSMobility
	common features: 6222
ACSEmployment ACSPublicCoverage
	common features: 11289
ACSEmployment ACSTravelTime
	common features: 14754
ACSMobility ACSPublicCoverage
	common features: 4091
ACSMobility ACSTravelTime
	common features: 4268
ACSPublicCoverage ACSTravelTime
	common features: 5097


In [30]:
features = [set(data_all[t][1].index) for t in tasks_dict.keys()]
set.intersection(*features)

set()

In [59]:
from more_itertools import powerset
for subset in powerset(tasks_dict.keys()):
    if len(subset) > 1: 
        print(f"{len(subset)} - {' '.join(subset)}")
        individuals = [set(data_all[t][1].index) for t in subset]
        common = set.intersection(*individuals)
        print(f"{len(common)} individuals in common")
        for i,t in enumerate(subset): 
            print(f"\t {100*len(common)/len(individuals[i]):.2f}% of {t}")

2 - ACSIncome ACSEmployment
16586 individuals in common
	 9.96% of ACSIncome
	 5.13% of ACSEmployment
2 - ACSIncome ACSMobility
5080 individuals in common
	 3.05% of ACSIncome
	 8.18% of ACSMobility
2 - ACSIncome ACSPublicCoverage
6361 individuals in common
	 3.82% of ACSIncome
	 5.59% of ACSPublicCoverage
2 - ACSIncome ACSTravelTime
14810 individuals in common
	 8.90% of ACSIncome
	 10.10% of ACSTravelTime
2 - ACSEmployment ACSMobility
6222 individuals in common
	 1.92% of ACSEmployment
	 10.02% of ACSMobility
2 - ACSEmployment ACSPublicCoverage
11289 individuals in common
	 3.49% of ACSEmployment
	 9.92% of ACSPublicCoverage
2 - ACSEmployment ACSTravelTime
14754 individuals in common
	 4.56% of ACSEmployment
	 10.06% of ACSTravelTime
2 - ACSMobility ACSPublicCoverage
4091 individuals in common
	 6.59% of ACSMobility
	 3.59% of ACSPublicCoverage
2 - ACSMobility ACSTravelTime
4268 individuals in common
	 6.87% of ACSMobility
	 2.91% of ACSTravelTime
2 - ACSPublicCoverage ACSTravelTime


In [49]:
len(individuals[0])

323611

In [51]:
common

set()