In [1]:
import pickle as pkl

import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.decomposition import PCA

from folktables import ACSDataSource, ACSIncome

In [2]:
# Two-letter state codes, later passed one at a time to
# `ACSDataSource.get_data(states=[...])`. Order matters: pair keys in `results`
# are built by concatenating codes in this order.
state_list = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
]

In [78]:
# Pull the 2018 5-Year ACS person survey for two states and turn each into
# ACSIncome features/labels.
data_source = ACSDataSource(survey_year='2018', horizon='5-Year', survey='person')

# NOTE(review): these names are prefixed `ca_` but the state requested is "MD";
# confirm whether the variable names or the state code is the intended one.
ca_data = data_source.get_data(states=["MD"], download=True)
ca_features, ca_labels, _ = ACSIncome.df_to_pandas(ca_data)

mi_data = data_source.get_data(states=["MI"], download=True)
mi_features, mi_labels, _ = ACSIncome.df_to_pandas(mi_data)

In [90]:
# Short aliases for the two feature frames loaded above.
df1, df2 = ca_features, mi_features

In [92]:
# Display (mean1, std1, mean2, std2) for the two binarized frames, as returned by
# `dataset_distance`. The former `m1, m2,` prefix was removed: those names are only
# bound by a later cell, so on a fresh kernel this cell raised NameError, and the
# recorded output is the flat 4-tuple this call alone produces.
dataset_distance(binarize(df1), binarize(df2))

(0.006026212308073893,
 0.5916416060085228,
 -0.0040034265541733635,
 0.5942882164796857)

In [101]:
data_source = ACSDataSource(survey_year='2018', horizon='5-Year', survey='person')

# Load and binarize every state exactly once. The previous nested loop re-read
# and re-binarized the second state of every pair (over a thousand loads for 50
# states). Caching trades that repeated work for keeping all binarized frames in
# memory at the same time.
binarized = {}
for state in state_list:
    raw = data_source.get_data(states=[state], download=True)
    features, _, _ = ACSIncome.df_to_pandas(raw)
    binarized[state] = binarize(features)

# results maps "<state_i><state_j>" (codes concatenated, i before j in state_list)
# to [absolute difference of the projected means, ratio of projected stds >= 1].
results = {}
for i in range(len(state_list) - 1):
    d1 = binarized[state_list[i]]
    for j in range(i + 1, len(state_list)):
        d2 = binarized[state_list[j]]

        m1, s1, m2, s2 = dataset_distance(d1, d2)

        # Orient the std ratio so the larger value is on top. Standard deviations
        # are non-negative, so no abs() is needed around the ratio.
        ratio = s1 / s2
        results[state_list[i] + state_list[j]] = [abs(m1 - m2), ratio if ratio >= 1 else s2 / s1]

Downloading data for 2018 5-Year person survey for AL...
Downloading data for 2018 5-Year person survey for AK...
Downloading data for 2018 5-Year person survey for AZ...
Downloading data for 2018 5-Year person survey for AR...
Downloading data for 2018 5-Year person survey for CA...
Downloading data for 2018 5-Year person survey for CO...
Downloading data for 2018 5-Year person survey for CT...
Downloading data for 2018 5-Year person survey for DE...
Downloading data for 2018 5-Year person survey for FL...
Downloading data for 2018 5-Year person survey for GA...
Downloading data for 2018 5-Year person survey for HI...
Downloading data for 2018 5-Year person survey for ID...
Downloading data for 2018 5-Year person survey for IL...
Downloading data for 2018 5-Year person survey for IN...
Downloading data for 2018 5-Year person survey for IA...
Downloading data for 2018 5-Year person survey for KS...
Downloading data for 2018 5-Year person survey for KY...
Downloading data for 2018 5-Yea

In [102]:
# Display the pairwise results dict: each key is two state codes concatenated,
# each value is [abs(mean1 - mean2), std ratio oriented to be >= 1].
results

{'ALAK': [0.13359368440430652, 1.0234680398591833],
 'ALAZ': [0.026451389431127355, 1.0158834301517154],
 'ALAR': [0.020607089537565935, 1.034379901792378],
 'ALCA': [0.056596897097447044, 1.0950536938288793],
 'ALCO': [0.0175065266701876, 1.025364501210984],
 'ALCT': [0.008937682285182443, 1.011156055845769],
 'ALDE': [0.025050626290209183, 1.0173514552367993],
 'ALFL': [0.014906922187561969, 1.021306028030709],
 'ALGA': [0.0343340008393747, 1.0015891731800264],
 'ALHI': [0.15628031861297167, 1.053474113770027],
 'ALID': [0.05082726342560389, 1.043688250760341],
 'ALIL': [0.010834260971008463, 1.0049611309426216],
 'ALIN': [0.014217658866227608, 1.0126674805346307],
 'ALIA': [0.04247665537112408, 1.0190817304636122],
 'ALKS': [0.045015405166421496, 1.027289914017121],
 'ALKY': [2.238024792278307e-05, 1.0255657874184392],
 'ALLA': [0.05383734488911503, 1.0022101302325241],
 'ALME': [0.03969969009164674, 1.0271341624825867],
 'ALMD': [0.029710788743591128, 1.0052951166232633],
 'ALMA': 

In [103]:
# Build (mean-difference, pair-key) tuples so that max() ranks pairs by the mean
# difference first (ties fall back to comparing the key strings).
inverse = []
for key, value in results.items():
    inverse.append((value[0], key))

largest_shift, farthest_pair = max(inverse)
print(farthest_pair)

AKND


In [105]:
# Persist the pairwise results dict to the file "shifts" in the working directory.
# `pkl` is the alias bound by `import pickle as pkl` in the notebook's imports cell,
# so the import no longer sits mid-notebook.
with open("shifts", "wb") as output_file:
    pkl.dump(results, output_file)

In [21]:
def dataset_distance(data1, data2, random_state=0):
    """Summarize two datasets along their shared first principal component.

    The rows of ``data1`` and ``data2`` are stacked, a PCA is fitted on the
    stacked rows, and every row is projected onto the first principal
    component. The mean and standard deviation of that one-dimensional
    projection are then computed separately for the rows of each dataset.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Datasets to compare; both are handed to ``pd.concat`` and then to
        ``PCA.fit_transform``, so they are expected to share the same numeric
        columns.
    random_state : int or None, default 0
        Forwarded to ``PCA``. Depending on the input shape, scikit-learn may
        choose a randomized SVD solver; fixing the seed makes repeated runs
        return the same numbers. Pass ``None`` to leave the solver unseeded.

    Returns
    -------
    tuple
        ``(mean1, std1, mean2, std2)`` of the first-component projection, for
        ``data1`` and ``data2`` respectively.

    Raises
    ------
    ValueError
        If either dataset has no rows (its mean/std would otherwise be NaN).
    """
    n_first, n_second = len(data1), len(data2)
    if n_first == 0 or n_second == 0:
        raise ValueError("dataset_distance requires two non-empty datasets")

    stacked = pd.concat([data1, data2], axis=0)

    # Only the first principal component is used, so only one is requested.
    pca = PCA(n_components=1, random_state=random_state)
    projection = pca.fit_transform(stacked)[:, 0]

    # Rows keep their concat order: data1's rows first, then data2's.
    first_part = projection[:n_first]
    second_part = projection[n_first:]

    return np.mean(first_part), np.std(first_part), np.mean(second_part), np.std(second_part)

In [83]:
def binarize(df2):
    """Return a copy of ``df2`` with its raw columns replaced by two-level features.

    The input frame is not modified. Nine derived columns are appended in this
    order: ``age``, ``workclass``, ``education``, ``hours-per-week``,
    ``married``, ``occupation``, ``race``, ``sex``, ``birthplace``. The source
    columns they were derived from, plus ``RELP``, are then dropped.

    ``age`` and ``hours-per-week`` are categorical columns built with
    ``pd.cut`` (values outside the bin edges become NaN); the other seven are
    0/1 integer flags built with ``np.where``.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Must contain the columns AGEP, COW, SCHL, WKHP, MAR, OCCP, RAC1P, SEX,
        POBP and RELP.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    frame = df2.copy()

    def bucketed(column, edges):
        # pd.cut yields right-closed intervals; relabel them 0..k-1 in order.
        intervals = pd.cut(frame[column], bins=edges)
        return intervals.cat.rename_categories(range(len(intervals.cat.categories)))

    def flagged(column, codes, on_match, otherwise):
        # on_match where the column's value is one of `codes`, otherwise the other value.
        return np.where(frame[column].isin(codes), on_match, otherwise)

    # (0, 40] -> 0, (40, 100] -> 1
    frame['age'] = bucketed('AGEP', [0, 40, 100])

    # NOTE(review): COW codes 6-9 map to 0, everything else to 1; confirm this is
    # the intended grouping for "workclass".
    frame['workclass'] = flagged('COW', [9, 8, 7, 6], 0, 1)

    frame['education'] = flagged('SCHL', [18, 19, 20, 21, 22, 23, 24], 1, 0)

    # (0, 39] -> 0, (39, 1000] -> 1
    frame['hours-per-week'] = bucketed('WKHP', [0, 39, 1000])

    # NOTE(review): MAR == 1 yields 0 and every other code yields 1; confirm the
    # polarity of "married" is as intended.
    frame['married'] = flagged('MAR', [1], 0, 1)

    frame['occupation'] = flagged(
        'OCCP',
        [5710, 5740, 7010, 4700, 4710, 4750, 4760, 4800, 4810, 4820, 4840, 4850,
         4920, 4930, 4950, 4965, 9130, 3725, 3960, 9800, 9810, 9825, 9830],
        1, 0,
    )

    frame['race'] = flagged('RAC1P', [1], 1, 0)
    frame['sex'] = flagged('SEX', [2], 1, 0)

    # POBP codes 1..59 inclusive -> 1
    frame['birthplace'] = flagged('POBP', list(range(1, 60)), 1, 0)

    # Drop every raw source column; RELP is dropped without a derived feature.
    raw_columns = ['AGEP', 'COW', 'SCHL', 'WKHP', 'MAR', 'OCCP', 'RAC1P', 'SEX', 'POBP', 'RELP']
    return frame.drop(raw_columns, axis=1)