In [None]:
import numpy as np
import pandas as pd

In [None]:
# Input files:
#   - ground truth: population counts per demographic class
#     (columns used later: 'demographic category', 'number of individuals')
#   - sample: one row per person with their demographic attributes
#     (columns used later: 'person id', 'age', 'education', 'ethnicity')
ground_truth_path = 'demo_ground_truth_csv.csv'
sample_data_path = 'demographic_attributes.csv'

ground_truth_df = pd.read_csv(ground_truth_path)
sample_data_df = pd.read_csv(sample_data_path)

In [None]:
# About 5.5% of the sample rows have no value in the education column.
# A person-level weight is undefined when one of the person's
# demographic classes is unknown, so rows containing any missing
# value are removed from the sample before weighting.
sample_data_df = sample_data_df.dropna()

In [None]:
# Renumber the remaining rows 0..n-1 (discarding the old index) so that
# a row's index label doubles as its row position in the demographics
# matrix, M.
sample_data_df = sample_data_df.reset_index(drop=True)

In [None]:
# Collect the distinct classes observed in the sample for each of the
# three demographics.
# NOTE: the ground truth data also lists four age classes for people
# under 18 that never occur in the sample (demographic_attributes.csv).
# Because the class lists are taken from the sample, those four classes
# are left out when the ground truth population fractions are computed
# for weighting and unbiasing.
age_list, edu_list, eth_list = (
    sample_data_df[column].unique()
    for column in ('age', 'education', 'ethnicity')
)

In [None]:
# Map each demographic (a column name in the sample) to the array of
# classes observed for it; insertion order is age, education, ethnicity.
class_dict = dict(age=age_list, education=edu_list, ethnicity=eth_list)

In [None]:
# Build, for every demographic class observed in the sample:
#   x_ground[j] : the class's share of its demographic in the ground truth,
#                 normalised over only the classes that also occur in the sample
#   x_sample[j] : the class's share of the sample rows
#   M[i, j]     : 1 if sample row i belongs to class j, otherwise 0
# Columns follow the insertion order of class_dict (age, education,
# ethnicity) and, within each demographic, the order of its class list.
total_cats = len(age_list) + len(edu_list) + len(eth_list)
n_sample = len(sample_data_df)
x_ground = np.zeros(total_cats)
x_sample = np.zeros(total_cats)
M = np.zeros((n_sample, total_cats))
j = 0
for key, class_values in class_dict.items():  # loop through each demographic
    categories = list(class_values)
    # keep only the ground truth rows for classes of this demographic
    # that are present in the sample
    filtered_df = ground_truth_df[ground_truth_df['demographic category'].isin(categories)]
    total_people = filtered_df['number of individuals'].sum()
    # count sample rows per class once per demographic instead of once per class
    sample_counts = sample_data_df[key].value_counts()
    for category in categories:  # loop through each class of the demographic
        in_category = filtered_df['demographic category'] == category
        if not in_category.any():
            # Without a ground truth count the ratio for this class is
            # undefined; fail loudly rather than assign a silent zero weight.
            raise ValueError(
                'No ground truth rows found for {} class {!r}'.format(key, category)
            )
        # .sum() reduces the matching rows to a scalar, so this works whether
        # the class occupies one ground truth row or several (assigning the
        # Series itself only worked for exactly one row).
        class_people = filtered_df.loc[in_category, 'number of individuals'].sum()
        x_ground[j] = class_people / total_people
        x_sample[j] = sample_counts[category] / n_sample
        # Positional boolean column: True -> 1.0 for members of this class.
        # Does not depend on the DataFrame index labels being 0..n-1.
        M[:, j] = (sample_data_df[key] == category).values
        j += 1

In [None]:
# Per-class ratio of the ground truth fraction to the sample fraction.
#   X[j] > 1 : class j is under-represented in the sample (x_sample < x_ground)
#   X[j] < 1 : class j is over-represented in the sample (x_sample > x_ground)
inverse_sample_fraction = np.reciprocal(x_sample)
X = x_ground * inverse_sample_fraction

In [None]:
# TO CALCULATE WEIGHTS:
# Multiply the binary class-membership matrix M by the vector X of
# ground/sample ratios. Each person's weight is therefore the sum of the
# ground/sample ratios of the demographic classes they belong to.
# (X is 1-D, so no transpose is needed for the matrix-vector product.)
weights = np.dot(M, X)
# Rescale so the weights have a mean of one, for readability.
weights_normalized = weights / np.mean(weights)
person_ids = sample_data_df['person id'].values
# Build the output column by column so each keeps its own dtype; stacking
# the IDs and the float weights into one array would coerce both to a
# common dtype (e.g. integer IDs would be written out as 123.0).
weights_df = pd.DataFrame(
    {'person id': person_ids, 'weight': weights_normalized},
    columns=['person id', 'weight'],
)
# Write the requested dataset of IDs and weights to CSV.
weights_df.to_csv('STEPHEN_GILES_VA_DS_challenge_weights.csv', index=False)