Household Survey Converer

Data was obtained at 

https://www12.statcan.gc.ca/nhs-enm/2011/dp-pd/prof/details/download-telecharger/comprehensive/comp-csv-tab-nhs-enm.cfm?Lang=E

This code will convert that spreadsheet into useful data for Syntheco, and eventually into Neurohub as a standard bit of information.

In [124]:
import csv
import os
import random
import math
import pandas as pd
import numpy as np
import itertools

Utility Functions

random_round_int - converts a floating point number to an integer where a random up or down based on 50%
fill_counts_uniform - takes an array of integers and a total and fills in the -1s with a uniform distribution

In [47]:
def random_round_int(float_num):
    rand_prob = random.random()
    if rand_prob <= .50:
        return int(math.floor(float_num))
    else:
        return int(math.ceil(float_num))

In [48]:
def fill_counts_uniform(counts,total):
    denom = 0
    total_uni = total
    counts_new = []
    for x in counts:
        if x == -1:
            denom += 1
        else:
            total_uni = total_uni - x
    
    #print(denom)
    #print(total_uni)
    if denom == 0:
        return counts
    
    uni_value = float(total_uni)/float(denom)
    if uni_value < 0:
        uni_value = 0
    #print(uni_value)
    
    counts_new = []
    for x in counts:
        if x < 0:
            counts_new.append(random_round_int(uni_value))
        else:
            counts_new.append(x)
    
    return counts_new

Here are input variable definitions

In [49]:
data_location = "C:/Users/shawn/Programs/Syntheco_darcy/Data"
quebec_datatable = os.path.join(data_location,"99-004-XWE2011001-401-QUE.csv")
output_marginal_table = os.path.join(data_location,"HHInc_Marginal_QUE.csv")
output_marginal_key_file = os.path.join(data_location,"HHInc_Marginal_QUE_key.csv")

Now begins the main section of the tool.

In [76]:
data = pd.read_csv(quebec_datatable, encoding='cp1252',dtype={'Geo_Code':str,'Flag Total':str})

This now uses only pandas to do the filtering, TODO: add the filling in of values like before



In [160]:
def gather_counts_from_topic_metric(data_, desired_topic_, desired_metric_,tract_mask_,n_):
    is_topic = data_['Topic'] == desired_topic_
    is_tract = data_['Geo_Code'].str.startswith(str(tract_mask_))
    df = data_[is_topic & is_que]
    
    idx = df.index.get_indexer_for(df[df.Characteristic==desired_metric].index)
    idx = np.asarray(list(itertools.chain.from_iterable([list(range(x,x+n_)) for x in idx])))
    df = df.iloc[idx]
    
    # Convert the categories to an index paramter
    cats = df["Characteristic"].head(14).tolist()
    for c in cats:
        df = df.replace(c,cats.index(c))
    return {"Categories":cats,"Data":df}

In [164]:
df_data = gather_counts_from_topic_metric(data,
                                     "Income of households in 2010",
                                     "Household total income in 2010 of private households",
                                     "462",14)

with open("HHInc2011_marginals.csv","w",newline='') as f:
    df_data['Data'].pivot_table('Total',['Geo_Code'],'Characteristic').to_csv(f,quoting=csv.QUOTE_ALL)

with open("HHInc2011_metadata.csv","w",newline='') as f:
    csv_write = csv.writer(f)
    csv_write.writerow(["index","value"])
    for x in df_data['Categories']:
        csv_write.writerow([df_data['Categories'].index(x),x.strip()])

In [18]:
### Create Marginal Tables for household income

# map the income of households
desired_topic = "Income of households in 2010"
desired_metric = "Household total income in 2010 of private households"

category_map, counts = gather_counts_from_topic_metric(desired_topic,desired_metric,"462")

### Write Category Key File
with open(output_marginal_key_file,"w", newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    csv_write.writerow(["index","value"])
    for x in category_map:
        csv_write.writerow([category_map.index(x),x.strip()])

        
with open(output_marginal_table,"w",newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    ## Write the header
    csv_write.writerow(["CT","Total"]+[i for i in range(1,14)])
    for code,cs in counts.items():
        csv_write.writerow([code,cs['Total']] + cs['counts'])

doing 4620725.07
doing 4620725.08
doing 4620652.04
doing 4620887.05
doing 4620887.06
doing 4620688.03
doing 4620688.04
doing 4620001.00
doing 4620002.00
doing 4620003.00
doing 4620004.00
doing 4620005.00
doing 4620006.00
doing 4620007.00
doing 4620008.00
doing 4620009.00
doing 4620010.00
doing 4620011.00
doing 4620013.00
doing 4620015.00
doing 4620016.00
doing 4620017.00
doing 4620018.00
doing 4620019.00
doing 4620021.00
doing 4620022.00
doing 4620023.00
doing 4620024.00
doing 4620025.00
doing 4620026.00
doing 4620027.00
doing 4620028.00
doing 4620029.00
doing 4620030.00
doing 4620031.00
doing 4620032.00
doing 4620033.00
doing 4620034.00
doing 4620035.00
doing 4620036.00
doing 4620037.00
doing 4620038.00
doing 4620039.00
doing 4620041.00
doing 4620042.00
doing 4620043.00
doing 4620044.00
doing 4620045.00
doing 4620046.00
doing 4620047.00
doing 4620048.00
doing 4620049.00
doing 4620050.00
doing 4620051.00
doing 4620052.00
doing 4620053.00
doing 4620054.00
doing 4620056.00
doing 4620057.

doing 4620590.02
doing 4620591.01
doing 4620591.02
doing 4620603.01
doing 4620603.02
doing 4620604.01
doing 4620604.02
doing 4620605.01
doing 4620605.02
doing 4620610.01
doing 4620610.02
doing 4620610.03
doing 4620610.04
doing 4620611.01
doing 4620611.02
doing 4620617.01
doing 4620617.02
doing 4620630.01
doing 4620630.02
doing 4620632.01
doing 4620632.02
doing 4620638.01
doing 4620638.02
doing 4620638.03
doing 4620646.01
doing 4620646.02
doing 4620646.03
doing 4620650.01
doing 4620650.02
doing 4620650.03
doing 4620652.01
doing 4620700.01
doing 4620756.02
doing 4620757.00
doing 4620775.00
doing 4620825.01
doing 4620825.02
doing 4620825.03
doing 4620825.04
doing 4620827.02
doing 4620852.01
doing 4620852.02
doing 4620857.01
doing 4620857.02
doing 4620868.01
doing 4620868.02
doing 4620879.01
doing 4620879.02
doing 4620881.01
doing 4620881.02
doing 4620889.01
doing 4620889.02
doing 4620889.03
doing 4620684.10
doing 4620684.11
doing 4620888.01
doing 4620888.02
doing 4620888.03
doing 4620687.

KeyboardInterrupt: 