## MLPP HW1 - Diagnostic - Part 2

### Data Augmentation using ACS API

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests
import json
from urllib.request import urlopen
import warnings
warnings.simplefilter('ignore')

%matplotlib inline

In [2]:
def three_months(filename, date_col='Creation Date', start='09/30/2017', end='01/01/2018'):
    '''
    Read a pickled DataFrame and keep the rows dated strictly between
    start and end (both bounds are excluded)

    Inputs:
        filename: path handed to pd.read_pickle
        date_col: column parsed with the '%m/%d/%Y' format
        start: exclusive lower bound, as a '%m/%d/%Y' string
        end: exclusive upper bound, as a '%m/%d/%Y' string

    Returns the filtered DataFrame, which carries an extra datetime
    column named 'date' built from date_col
    '''
    date_format = '%m/%d/%Y'

    frame = pd.read_pickle(filename)
    lower_bound = datetime.strptime(start, date_format)
    upper_bound = datetime.strptime(end, date_format)

    # Parsed once into 'date' so both comparisons work on datetimes.
    frame['date'] = pd.to_datetime(frame[date_col], format=date_format)
    after_start = frame['date'] > lower_bound
    before_end = frame['date'] < upper_bound

    return frame[after_start & before_end]

In [3]:
def get_census_block(lat, long):
    '''
    Given a latitude and longitude, find the corresponding FIPS code

    Queries the FCC census block endpoint and returns the value stored
    under ['Block']['FIPS'] in the JSON reply.

    Inputs:
        lat: latitude, formatted into the URL with str()
        long: longitude, formatted into the URL with str()

    Returns the FIPS value from the reply, or None when the lookup fails
    for any reason (network error, bad JSON, unexpected reply shape); in
    that case the requested URL is printed so the row can be retried.
    '''
    FIPS_url = 'https://geo.fcc.gov/api/census/block/find?latitude={}&longitude={}&showall=true&format=json'.format(str(lat),str(long))
    try:
        # The context manager closes the connection even if read() fails.
        with urlopen(FIPS_url) as response:
            payload = response.read().decode('utf-8')
        return json.loads(payload)['Block']['FIPS']
    except Exception:
        # Best-effort lookup: report the failing URL and carry on. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so a long scrape can still be interrupted.
        print(FIPS_url)
        return None

In [4]:
def scrape_fips_blocks(df, lat='Latitude', lon='Longitude'):
    '''
    Look up the FIPS block number of every row and store it in df

    Inputs:
        df: DataFrame with one request per row
        lat: name of the latitude column
        lon: name of the longitude column

    Calls get_census_block once per row, writes the results to a new
    'FIPS_BLOCK_NUMBER' column on df itself, and returns that same df
    '''
    fips_codes = [
        get_census_block(record[lat], record[lon])
        for _, record in df.iterrows()
    ]
    # Re-using df.index keeps each code aligned with the row it came from.
    df['FIPS_BLOCK_NUMBER'] = pd.Series(fips_codes, index=df.index)
    return df

In [5]:
def _parse_acs_value(raw, cast):
    '''
    Convert one raw ACS cell with cast, or return NaN when the cell is None or '-'
    '''
    if raw is None or raw == '-':
        return np.nan
    return cast(raw)


def scrape(df, api_key=None):
    '''
    Function that retrieves the INCOME AND BENEFITS (IN 2015 INFLATION-ADJUSTED DOLLARS)

    For every row, the state, county and tract are sliced out of
    'FIPS_BLOCK_NUMBER' and the 2015 ACS 5-year profile endpoint is queried
    for that tract. Each distinct tract is requested only once per call.

    Inputs:
        df: DataFrame with a 'FIPS_BLOCK_NUMBER' column
        api_key: Census API key; when None the key historically embedded in
            this notebook is used

    Returns the same df with five added columns: 'avg_monthly_income',
    'avg_family_size', 'pct_white', 'pct_health_coverage' and
    'unemployment_rate'. A value is NaN when the row has no usable FIPS
    string, when the API answers 204 (no content), or when the API cell is
    '-' or null.
    '''
    if api_key is None:
        # NOTE(review): hardcoded credential, kept only so existing calls keep
        # working. Prefer passing api_key explicitly and rotate/remove this one.
        api_key = '02483eaf62001ddc247c04dc50cfa681d83bce62'

    # (output column, position in the API data row, converter).
    # NOTE(review): confirm that each requested DP0x variable really is the
    # measure its column name describes.
    fields = (
        ('avg_monthly_income', 1, int),
        ('avg_family_size', 2, float),
        ('pct_white', 3, float),
        ('pct_health_coverage', 4, float),
        ('unemployment_rate', 5, float),
    )
    no_data = (np.nan,) * len(fields)

    tract_cache = {}  # (state, county, tract) -> tuple of parsed values
    records = []      # exactly one tuple per row, so lengths match df.index
    for _, row in df.iterrows():
        fips = row['FIPS_BLOCK_NUMBER']
        # get_census_block yields None on a failed lookup; such rows cannot
        # be sliced into state/county/tract, so they get NaNs.
        if not isinstance(fips, str) or len(fips) < 11:
            records.append(no_data)
            continue

        state, county, tract = fips[0:2], fips[2:5], fips[5:11]
        tract_key = (state, county, tract)
        if tract_key not in tract_cache:
            query = 'get=NAME,DP03_0051E,DP02_0016E,DP05_0032E,DP03_0096PE,DP03_0005PE&for=tract:{}&in=state:{}+county:{}&'.format(tract, state, county)
            url = 'https://api.census.gov/data/2015/acs5/profile?' + query + 'key=' + api_key
            r = requests.get(url)
            if r.status_code == 204:  # 204 corresponds to no content.
                tract_cache[tract_key] = no_data
            else:
                # Row 0 of the payload is the header; row 1 holds the values.
                data_row = r.json()[1]
                tract_cache[tract_key] = tuple(
                    _parse_acs_value(data_row[position], cast)
                    for _, position, cast in fields
                )
        records.append(tract_cache[tract_key])

    for column, (name, _, _) in enumerate(fields):
        df[name] = pd.Series([record[column] for record in records], index=df.index)
    return df

In [6]:
# One-time data build, kept commented out: scrape_fips_blocks and scrape issue
# web requests while looping over the rows, so re-running it is slow.
# Uncomment to rebuild the frames from the raw pickles.
#buildings = three_months('vacant_buildings_17.pkl', 'DATE SERVICE REQUEST WAS RECEIVED')
#lights = three_months('alley_lights_17.pkl')

#buildings = scrape_fips_blocks(buildings)
#lights = scrape_fips_blocks(lights)

#buildings = scrape(buildings)
#lights = scrape(lights)

# Load previously saved frames instead of re-scraping.
# NOTE(review): presumably these pickles hold the output of the pipeline above;
# confirm how they were written, and only unpickle files from a trusted source.
buildings = pd.read_pickle('buildings_acs.pkl')
lights = pd.read_pickle('lights_acs.pkl')