In [1]:
import pandas as pd

In [2]:
# Load the cleaned 2019 records, using the CSV's first column as the row index.
df = pd.read_csv('cleaned_info_2019.csv', index_col=0)
df.head()

Unnamed: 0,geoid,ethnicity,recency_date,age,children,married,num_tradelines,loc_type,renter,home_value,vacant
0,60750601001,WHITE,2018-08-01,35-39,1,0,0,SFDU,1,1151000,0
1,60750161004,WHITE,2018-09-01,35-39,0,0,0,SFDU,1,1272000,0
2,60750111002,UNKNOWN,2019-02-01,75+,0,1,1,MFDU,0,1193000,0
3,60750104003,WHITE,2019-02-01,50-54,1,1,0,MFDU,0,693000,0
4,60750301023,WHITE,2019-05-01,30-34,0,0,3,MFDU,0,1458000,0


In [3]:
# How many rows each geoid contributes, most frequent first.
df['geoid'].value_counts()

60750607001    7920
60750326022    4799
60750615001    4430
60750261004    4179
60750209001    3673
               ... 
60756007002      10
60756004023      10
60756009001       7
60756002002       5
60756004012       2
Name: geoid, Length: 587, dtype: int64

# One-Hot Encoding

In [4]:
from sklearn.feature_extraction import DictVectorizer

In [5]:
def one_hot(data, cols):
    """
    Return `data` with one-hot encoded versions of `cols` appended as columns.

    Parameters
    -----------
    data: a dataframe that may include non-numerical features
    cols: a list of column names to encode

    Returns
    -----------
    A dataframe with the same index as `data` that holds every original
    column (the encoded source columns are kept, not dropped) followed by
    the columns produced by DictVectorizer, named as the vectorizer reports
    them (e.g. 'age=25-29').

    """
    # From lab section 11-12 10/26/20
    # Build the list of row dicts once and reuse it for fitting and encoding.
    records = data[cols].to_dict(orient='records')
    v = DictVectorizer()
    cat_data = v.fit_transform(records).toarray()

    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); prefer the new name and fall back to the old.
    if hasattr(v, 'get_feature_names_out'):
        cd_names = list(v.get_feature_names_out())
    else:
        cd_names = list(v.get_feature_names())

    # Encoded rows are in the same order as `data`, so give them data's index.
    # Without this the new frame gets a 0..n-1 index and concat(axis=1) would
    # align on labels, misplacing rows whenever `data` is indexed differently.
    cat_data = pd.DataFrame(cat_data, columns=cd_names, index=data.index)
    return pd.concat([data, cat_data], axis=1)

In [6]:
# Append indicator columns for the three categorical fields. The source columns
# stay in the frame for now.
# NOTE: this rebinds `df`, so re-running the cell appends the indicator columns again.
df = one_hot(df, ['ethnicity', 'age', 'loc_type'])
df.head()

Unnamed: 0,geoid,ethnicity,recency_date,age,children,married,num_tradelines,loc_type,renter,home_value,...,ethnicity=LATINX,ethnicity=OTHER,ethnicity=UNKNOWN,ethnicity=WHITE,loc_type=MFDU,loc_type=Nursing Home,loc_type=Retirement Home,loc_type=SFDU,loc_type=Trailer,loc_type=Undefined
0,60750601001,WHITE,2018-08-01,35-39,1,0,0,SFDU,1,1151000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,60750161004,WHITE,2018-09-01,35-39,0,0,0,SFDU,1,1272000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,60750111002,UNKNOWN,2019-02-01,75+,0,1,1,MFDU,0,1193000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,60750104003,WHITE,2019-02-01,50-54,1,1,0,MFDU,0,693000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,60750301023,WHITE,2019-05-01,30-34,0,0,3,MFDU,0,1458000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# List every column to confirm the names of the new indicator columns.
df.columns

Index(['geoid', 'ethnicity', 'recency_date', 'age', 'children', 'married',
       'num_tradelines', 'loc_type', 'renter', 'home_value', 'vacant',
       'age=25-29', 'age=30-34', 'age=35-39', 'age=40-44', 'age=45-49',
       'age=50-54', 'age=55-59', 'age=60-64', 'age=65+', 'age=65-69',
       'age=70-74', 'age=75+', 'age=<25', 'ethnicity=ASIAN', 'ethnicity=BLACK',
       'ethnicity=LATINX', 'ethnicity=OTHER', 'ethnicity=UNKNOWN',
       'ethnicity=WHITE', 'loc_type=MFDU', 'loc_type=Nursing Home',
       'loc_type=Retirement Home', 'loc_type=SFDU', 'loc_type=Trailer',
       'loc_type=Undefined'],
      dtype='object')

In [8]:
# Remove the categorical columns that now have indicator versions, plus the
# fields that are not carried into the per-geoid aggregation.
df = df.drop(
    columns=[
        'ethnicity',
        'age',
        'home_value',
        'recency_date',
        'num_tradelines',
        'loc_type',
    ]
)

In [9]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,geoid,children,married,renter,vacant,age=25-29,age=30-34,age=35-39,age=40-44,age=45-49,...,ethnicity=LATINX,ethnicity=OTHER,ethnicity=UNKNOWN,ethnicity=WHITE,loc_type=MFDU,loc_type=Nursing Home,loc_type=Retirement Home,loc_type=SFDU,loc_type=Trailer,loc_type=Undefined
0,60750601001,1,0,1,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,60750161004,0,0,1,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,60750111002,0,1,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,60750104003,1,1,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,60750301023,0,0,0,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Collapse to one row per geoid by summing every remaining column, so each
# indicator column becomes a count of matching rows in that geoid.
grouped = df.groupby('geoid').sum()
grouped.head(2)

Unnamed: 0_level_0,children,married,renter,vacant,age=25-29,age=30-34,age=35-39,age=40-44,age=45-49,age=50-54,...,ethnicity=LATINX,ethnicity=OTHER,ethnicity=UNKNOWN,ethnicity=WHITE,loc_type=MFDU,loc_type=Nursing Home,loc_type=Retirement Home,loc_type=SFDU,loc_type=Trailer,loc_type=Undefined
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60750101001,44,21,748,0,110.0,142.0,105.0,78.0,59.0,43.0,...,53.0,2.0,50.0,474.0,643.0,0.0,0.0,120.0,0.0,0.0
60750101002,197,244,1379,0,163.0,233.0,231.0,161.0,163.0,205.0,...,140.0,4.0,120.0,1142.0,1328.0,0.0,60.0,507.0,0.0,0.0


In [11]:
# List the aggregated columns (geoid is now the index, not a column).
grouped.columns

Index(['children', 'married', 'renter', 'vacant', 'age=25-29', 'age=30-34',
       'age=35-39', 'age=40-44', 'age=45-49', 'age=50-54', 'age=55-59',
       'age=60-64', 'age=65+', 'age=65-69', 'age=70-74', 'age=75+', 'age=<25',
       'ethnicity=ASIAN', 'ethnicity=BLACK', 'ethnicity=LATINX',
       'ethnicity=OTHER', 'ethnicity=UNKNOWN', 'ethnicity=WHITE',
       'loc_type=MFDU', 'loc_type=Nursing Home', 'loc_type=Retirement Home',
       'loc_type=SFDU', 'loc_type=Trailer', 'loc_type=Undefined'],
      dtype='object')

In [12]:
# hh_100: the six ethnicity counts added together per geoid, expressed in hundreds.
ethnicity_cols = [
    'ethnicity=ASIAN',
    'ethnicity=BLACK',
    'ethnicity=LATINX',
    'ethnicity=OTHER',
    'ethnicity=UNKNOWN',
    'ethnicity=WHITE',
]
ethnicity_total = grouped[ethnicity_cols].sum(axis=1)
grouped['hh_100'] = ethnicity_total / 100

In [13]:
# These three indicator counts are not kept in the exported tables.
unused_cols = ['ethnicity=UNKNOWN', 'loc_type=Nursing Home', 'loc_type=Undefined']
grouped = grouped.drop(columns=unused_cols)
grouped.head()

Unnamed: 0_level_0,children,married,renter,vacant,age=25-29,age=30-34,age=35-39,age=40-44,age=45-49,age=50-54,...,ethnicity=ASIAN,ethnicity=BLACK,ethnicity=LATINX,ethnicity=OTHER,ethnicity=WHITE,loc_type=MFDU,loc_type=Retirement Home,loc_type=SFDU,loc_type=Trailer,hh_100
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60750101001,44,21,748,0,110.0,142.0,105.0,78.0,59.0,43.0,...,162.0,22.0,53.0,2.0,474.0,643.0,0.0,120.0,0.0,7.63
60750101002,197,244,1379,0,163.0,233.0,231.0,161.0,163.0,205.0,...,459.0,30.0,140.0,4.0,1142.0,1328.0,60.0,507.0,0.0,18.95
60750102001,102,188,588,1,97.0,116.0,127.0,120.0,76.0,100.0,...,106.0,21.0,57.0,3.0,844.0,622.0,0.0,502.0,0.0,11.24
60750102002,144,149,1252,2,182.0,284.0,199.0,147.0,137.0,122.0,...,143.0,15.0,79.0,0.0,1268.0,1214.0,0.0,432.0,0.0,16.46
60750102003,73,102,622,0,107.0,121.0,102.0,77.0,77.0,66.0,...,110.0,10.0,48.0,4.0,631.0,640.0,0.0,246.0,0.0,8.86


Drop Golden Gate Park (geoid 60759803001), inspected below, from the grouped data.

In [14]:
# Inspect the aggregated counts for the geoid that is about to be removed.
grouped.loc[60759803001]

children                     6.0
married                      0.0
renter                      68.0
vacant                       0.0
age=25-29                   12.0
age=30-34                    9.0
age=35-39                    7.0
age=40-44                   13.0
age=45-49                    6.0
age=50-54                    2.0
age=55-59                    6.0
age=60-64                    1.0
age=65+                      0.0
age=65-69                    4.0
age=70-74                    0.0
age=75+                      1.0
age=<25                      9.0
ethnicity=ASIAN              5.0
ethnicity=BLACK              1.0
ethnicity=LATINX             6.0
ethnicity=OTHER              0.0
ethnicity=WHITE             53.0
loc_type=MFDU               46.0
loc_type=Retirement Home     0.0
loc_type=SFDU               24.0
loc_type=Trailer             0.0
hh_100                       0.7
Name: 60759803001, dtype: float64

In [15]:
# Remove the row for geoid 60759803001 from the aggregated table.
grouped = grouped.drop(index=60759803001)

In [16]:
# Save the per-geoid counts before they are converted to rates.
# NOTE(review): assumes the clean_data/ directory already exists — confirm.
grouped.to_csv('clean_data/ig_counts_2019.csv')

# To Rates

1. Divide the relevant columns by hh_100, so each count becomes a rate per 100 records in the geoid
2. Set NaN to 0 (caused by dividing by 0)


In [17]:
# Look for geoids whose denominator is zero before dividing by it.
grouped.loc[grouped['hh_100'] == 0]

Unnamed: 0_level_0,children,married,renter,vacant,age=25-29,age=30-34,age=35-39,age=40-44,age=45-49,age=50-54,...,ethnicity=ASIAN,ethnicity=BLACK,ethnicity=LATINX,ethnicity=OTHER,ethnicity=WHITE,loc_type=MFDU,loc_type=Retirement Home,loc_type=SFDU,loc_type=Trailer,hh_100
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [18]:
# df[['B','C']] = df[['B','C']].div(df.A, axis=0)
grouped.iloc[:, :-1] = grouped.iloc[:, :-1].div(grouped.hh_100, axis=0).fillna(0)
grouped

Unnamed: 0_level_0,children,married,renter,vacant,age=25-29,age=30-34,age=35-39,age=40-44,age=45-49,age=50-54,...,ethnicity=ASIAN,ethnicity=BLACK,ethnicity=LATINX,ethnicity=OTHER,ethnicity=WHITE,loc_type=MFDU,loc_type=Retirement Home,loc_type=SFDU,loc_type=Trailer,hh_100
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60750101001,5.766710,2.752294,98.034076,0.000000,14.416776,18.610747,13.761468,10.222805,7.732634,5.635649,...,21.231979,2.883355,6.946265,0.262123,62.123198,84.272608,0.000000,15.727392,0.0,7.63
60750101002,10.395778,12.875989,72.770449,0.000000,8.601583,12.295515,12.189974,8.496042,8.601583,10.817942,...,24.221636,1.583113,7.387863,0.211082,60.263852,70.079156,3.166227,26.754617,0.0,18.95
60750102001,9.074733,16.725979,52.313167,0.088968,8.629893,10.320285,11.298932,10.676157,6.761566,8.896797,...,9.430605,1.868327,5.071174,0.266904,75.088968,55.338078,0.000000,44.661922,0.0,11.24
60750102002,8.748481,9.052248,76.063183,0.121507,11.057108,17.253949,12.089915,8.930741,8.323208,7.411908,...,8.687728,0.911300,4.799514,0.000000,77.035237,73.754557,0.000000,26.245443,0.0,16.46
60750102003,8.239278,11.512415,70.203160,0.000000,12.076749,13.656885,11.512415,8.690745,8.690745,7.449210,...,12.415350,1.128668,5.417607,0.451467,71.218962,72.234763,0.000000,27.765237,0.0,8.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60759802001,13.157895,24.561404,23.684211,0.000000,2.631579,9.649123,12.280702,11.403509,10.526316,11.403509,...,21.052632,0.877193,7.894737,0.000000,60.526316,26.315789,0.000000,73.684211,0.0,1.14
60759805011,22.307692,20.769231,31.346154,0.192308,7.500000,10.384615,7.115385,10.961538,10.961538,11.153846,...,22.500000,16.538462,17.307692,2.115385,34.038462,1.538462,7.692308,90.769231,0.0,5.20
60759806001,14.653784,13.526570,19.967794,0.161031,9.017713,13.848631,16.908213,11.916264,7.890499,9.017713,...,23.027375,48.953301,10.466989,0.966184,9.822866,38.808374,0.000000,61.191626,0.0,6.21
60759809001,13.573001,10.564930,84.225972,0.807043,9.684519,10.711665,9.757887,8.070433,8.804109,10.124725,...,9.757887,16.287601,14.159941,0.733676,51.577403,21.570066,0.000000,39.325018,0.0,13.63


In [19]:
# Save the per-geoid rates (hh_100 is written alongside them, undivided).
# NOTE(review): assumes the clean_data/ directory already exists — confirm.
grouped.to_csv('clean_data/ig_rates_2019.csv')