In [4]:
import numpy as np
import pandas as pd
import os, sys

In [5]:
# Identifier used to name the processed output folder and file.
dataset_name = 'ipums_census_small'

# File name of the raw CSV read from the input directory.
inp_fname = 'ipums_census_small.csv'

In [6]:
# Folder holding the raw input CSV (relative to the notebook's working directory).
input_dir = './data'

# Destination folder and file for the processed dataset; both are keyed by dataset_name.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(
    output_dir,
    f'{dataset_name}.csv',
)

# Read Data

In [7]:
# Column names for the dataset, in file order (61 entries).
# NOTE(review): `cols` is not referenced by the cells visible here — confirm whether
# it is meant to be passed to the reader or is used further down.
cols = [
    'year', 'gq', 'gqtypeg', 'farm', 'ownershg', 'value', 'rent', 'ftotinc',
    'nfams', 'ncouples', 'nmothers', 'nfathers',
    'momloc', 'stepmom', 'momrule', 'poploc', 'steppop', 'poprule',
    'sploc', 'sprule', 'famsize', 'nchild', 'nchlt5', 'famunit',
    'eldch', 'yngch', 'nsibs', 'relateg',
    'age', 'sex', 'raceg', 'marst', 'chborn', 'bplg',
    'school', 'educrec', 'schltype',
    'empstatg', 'labforce', 'occ1950', 'occscore', 'sei', 'ind1950', 'classwkg',
    'wkswork2', 'hrswork2', 'yrlastwk', 'workedyr',
    'inctot', 'incwage', 'incbus', 'incfarm', 'incss', 'incwelfr', 'incother',
    'poverty', 'migrat5g', 'migplac5', 'movedin', 'vetstat', 'tranwork',
]

In [8]:
# Load the raw CSV from the input folder and preview the first rows.
data = pd.read_csv(
    os.path.join(input_dir, inp_fname),
)
data.head()

Unnamed: 0,year,gq,gqtypeg,farm,ownershg,value,rent,ftotinc,nfams,ncouples,...,incfarm,incss,incwelfr,incother,poverty,migrat5g,migplac5,movedin,vetstat,tranwork
0,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Rented,999999,325,9505,1,1,...,0,0,0,0,128,Same_state_countydifferent_house,California,0,No_Service,?
1,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Rented,999999,287,8005,1,0,...,0,0,0,0,211,?,?,1,No_Service,?
2,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Owned_or_being_bought_(loan),85000,0,29635,1,1,...,999999,99999,99999,99999,304,?,?,0,?,?
3,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Owned_or_being_bought_(loan),999999,0,52895,1,1,...,0,0,0,0,501,Same_house,Same_house,0,No_Service,?
4,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Rented,999999,185,7610,2,0,...,0,0,0,0,119,?,?,0,No_Service,Auto


In [9]:
# (rows, columns) of the frame as loaded, before any cleaning.
data.shape

(7485, 61)

In [10]:
# Name of the row-identifier column (added later if the frame lacks it).
id_col = "id"

# Name of the column whose value distribution is inspected as the target.
target_col = "movedin"

# Prepare Data

In [11]:
# Turn the placeholder values 999999, 99999 and "?" into NaN.
# NOTE(review): the replacement is applied to every column of the frame, so a
# genuine value equal to one of these numbers would also become NaN — confirm.
data = (
    data
    .replace(999999, np.nan)
    .replace(99999, np.nan)
    .replace("?", np.nan)
)

In [12]:
# Preview the first rows again to see the effect of the placeholder replacement.
data.head()

Unnamed: 0,year,gq,gqtypeg,farm,ownershg,value,rent,ftotinc,nfams,ncouples,...,incfarm,incss,incwelfr,incother,poverty,migrat5g,migplac5,movedin,vetstat,tranwork
0,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Rented,,325,9505.0,1,1,...,0.0,0.0,0.0,0.0,128,Same_state_countydifferent_house,California,0,No_Service,
1,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Rented,,287,8005.0,1,0,...,0.0,0.0,0.0,0.0,211,,,1,No_Service,
2,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Owned_or_being_bought_(loan),85000.0,0,29635.0,1,1,...,,,,,304,,,0,,
3,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Owned_or_being_bought_(loan),,0,52895.0,1,1,...,0.0,0.0,0.0,0.0,501,Same_house,Same_house,0,No_Service,
4,98,HH_in_1970_definition_but_sampled_as_larger_unit,NA_(household),Non-Farm,Rented,,185,7610.0,2,0,...,0.0,0.0,0.0,0.0,119,,,0,No_Service,Auto


In [13]:
# Frequency of each distinct value in the target column.
data[target_col].value_counts()

0    4802
2     860
1     726
5     412
6     401
7     213
8      71
Name: movedin, dtype: int64

# Insert Id Column

In [14]:
# Add a positional id as the first column when the frame does not already have one.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Store ids as strings regardless of whether they were just created or pre-existing.
data[id_col] = data[id_col].astype(str)

   id  year                                                gq         gqtypeg  \
0   0    98  HH_in_1970_definition_but_sampled_as_larger_unit  NA_(household)   
1   1    98  HH_in_1970_definition_but_sampled_as_larger_unit  NA_(household)   
2   2    98  HH_in_1970_definition_but_sampled_as_larger_unit  NA_(household)   
3   3    98  HH_in_1970_definition_but_sampled_as_larger_unit  NA_(household)   
4   4    98  HH_in_1970_definition_but_sampled_as_larger_unit  NA_(household)   

       farm                      ownershg    value  rent  ftotinc  nfams  ...  \
0  Non-Farm                        Rented      NaN   325   9505.0      1  ...   
1  Non-Farm                        Rented      NaN   287   8005.0      1  ...   
2  Non-Farm  Owned_or_being_bought_(loan)  85000.0     0  29635.0      1  ...   
3  Non-Farm  Owned_or_being_bought_(loan)      NaN     0  52895.0      1  ...   
4  Non-Farm                        Rented      NaN   185   7610.0      2  ...   

   incfarm  incss  incwelf

# Save Main Data File

In [15]:
# Write the processed frame to outp_fname without the pandas index.
# to_csv does not create missing parent folders, so make sure the destination
# directory exists first; otherwise a fresh checkout fails with an OSError.
_out_dir = os.path.dirname(outp_fname)
if _out_dir:  # an empty dirname means the current directory; nothing to create
    os.makedirs(_out_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)