In [238]:
import numpy as np
import pandas as pd
from sklearn import linear_model

In [307]:
# Input 1: synthetic refugee employment outcomes.
refugee_data = pd.read_csv('./synth-refugees-10000.csv')

# Input 2: feature table for all cities (tab-separated), indexed by GeoID.
city_data = (
    pd.read_csv('./merged_city_data_normalized.tsv', sep='\t')
    .set_index('GeoID')
)

# Input 3: affiliate cities and their GeoIDs. Column labels are supplied via
# `names=`; presumably the file has no header row -- TODO confirm.
aff_cities = (
    pd.read_csv('./Affiliate-City-to-Id2.csv', names=['Place Name', 'GeoID'])
    .set_index('GeoID')
)

In [308]:
# Collapse repeated GeoIDs into one row each. GroupBy.first() keeps, per
# column, the first non-null value found within each GeoID group.
city_data = city_data.groupby(by=city_data.index).first()

# Keep only the columns whose name contains 'min_max_normalized', then
# replace any remaining missing values with 0.
feature_mask = city_data.columns.str.contains('min_max_normalized')
city_data = city_data.loc[:, feature_mask].fillna(0)

# Feature rows for affiliate cities only -- this is X for the regression.
affiliate_rows = city_data.index.isin(aff_cities.index)
aff_city_data = city_data.loc[affiliate_rows]

In [309]:
# Compute the refugee employment rate for every affiliate city -- this is y.
#
# rate(geoid) = (# refugees at geoid with employed == 1) / (# refugees at geoid)
# Affiliates with no refugees in `refugee_data` get a rate of 0.
#
# This is done in a single vectorized pass (the mean of a boolean flag is the
# fraction of True values) instead of rescanning `refugee_data` once per
# affiliate and growing the result frame row by row. The result is an
# explicitly float-typed column.
employed_flag = refugee_data['employed'].eq(1)
rate_by_geoid = employed_flag.groupby(refugee_data['geoid']).mean()

# One row per distinct affiliate GeoID, in order of first appearance in
# `aff_cities`.
aff_geoids = aff_cities.index.unique()
aff_employment_rates = (
    rate_by_geoid
    .reindex(aff_geoids)      # affiliates without refugees become NaN ...
    .fillna(0.0)              # ... and are assigned a rate of 0
    .astype(float)
    .rename_axis('GeoID')
    .to_frame('employment_rate')
)

In [310]:
# Train model
#
# scikit-learn pairs rows of X and y purely by position and ignores the pandas
# index. `aff_city_data` follows the row order of `city_data` and only holds
# affiliates present there, while `aff_employment_rates` follows the order of
# `aff_cities`. Select y by X's GeoID index so every feature row is paired
# with the employment rate of the same city (raises KeyError if a GeoID is
# missing rather than silently mis-pairing rows).
aligned_rates = aff_employment_rates.loc[aff_city_data.index].astype(float)

regr = linear_model.LinearRegression()
regr.fit(aff_city_data, aligned_rates)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [311]:
# Predict an employment rate for every city using the fitted model.
#
# Predict only on the columns the model was trained on. This cell appends
# 'Employment Rate Prediction' to `city_data`; without the column selection a
# re-run of this cell would pass that extra column to `predict` and fail.
feature_columns = aff_city_data.columns
all_city_pred = regr.predict(city_data[feature_columns])

# The model was fit on a single-column target, so the prediction may come
# back with shape (n, 1); flatten it to one value per city before storing.
city_data['Employment Rate Prediction'] = np.ravel(all_city_pred)

         Population min_max_normalized  Unemployment rate min_max_normalized  \
GeoID                                                                          
100820                        0.003106                              0.149660   
100988                        0.001793                              0.105442   
101132                        0.001017                              0.394558   
101708                        0.000341                              0.377551   
101852                        0.001882                              0.465986   
102116                        0.000257                              0.255102   
102956                        0.002190                              0.299320   
103004                        0.000464                              0.670068   
103076                        0.006483                              0.180272   
104660                        0.000349                              0.170068   
105980                        0.002419  

In [312]:
# Write the city table (index included) to a CSV file in the current
# working directory.
output_csv = 'merged_city_data_normalized_with_employment.csv'
city_data.to_csv(output_csv)