https://data.cityofchicago.org/Community-Economic-Development/Business-Licenses/r5kz-chrr

# Feature: Number of Nearby Business Nonrenewals

Spatial: within 1 mile, within same community area, within same ward, within same census tract

Time: in same year, in previous year, in past 2 years total

## 1. Setup

In [1]:
# Setup autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Import libraries
import datetime
import itertools
import math
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.metrics.pairwise import haversine_distances

# Import pipeline library, hardcoded config file values
import pipeline_library as library
import pipeline_config as config

# Tweak display settings for tables
pd.options.display.max_columns = 999

In [3]:
# Code-done alert
from IPython.display import Audio
sound_file = 'applause2.wav'
# Audio(sound_file, autoplay=True)

## 2. Read data

In [4]:
# Location of the business licenses CSV, relative to this notebook
# (presumably an export of the dataset linked at the top -- confirm)
DATA_PATH = "../../data/Business_Licenses.csv"
# Columns that must be read as strings rather than having a dtype inferred
DTYPE_DICT = {
    'ZIP CODE': str,
    'BUSINESS ACTIVITY ID': str,
    'BUSINESS ACTIVITY': str,
}
# Columns parsed as dates while the file is read
DATE_COLS = ['LICENSE TERM START DATE', 'LICENSE TERM EXPIRATION DATE', 'DATE ISSUED']

# Load the full licenses table; the trailing expression displays (rows, columns)
df = pd.read_csv(DATA_PATH,
                 dtype=DTYPE_DICT,
                 parse_dates=DATE_COLS)
df.shape

(970564, 34)

### 2.1 Check that no account-site has two different addresses

Barring NAs

In [5]:
# Columns that together describe where an account-site is located
LOCATION_COLS = [
    'ADDRESS', 'ACCOUNT NUMBER', 'SITE NUMBER', 'ZIP CODE', 'WARD',
    'POLICE DISTRICT', 'LATITUDE', 'LONGITUDE', 'LOCATION',
]

# Number of distinct, fully populated location records per account-site,
# largest first. A top value of 1 means no account-site maps to two locations.
(
    df[LOCATION_COLS]
    .dropna()
    .drop_duplicates()
    .groupby(['ACCOUNT NUMBER', 'SITE NUMBER'])
    .size()
    .reset_index()
    .sort_values(by=0, ascending=False)
    .head()
)

Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,0
0,1,1,1
129585,321577,1,1
129576,321569,3,1
129577,321569,5,1
129578,321570,1,1


Every account-site has a unique address. We're ok!

## 3. Preprocess supporting data for feature construction

### 3.1 Extract location features for each account-site

In [6]:
# Columns kept for each account-site in the location table.
# NOTE: this rebinds LOCATION_COLS from the uniqueness check in section 2.1 and adds CITY and STATE.
LOCATION_COLS = ['ACCOUNT NUMBER', 'SITE NUMBER', 'ADDRESS', 'CITY', 'STATE', 'ZIP CODE', 'WARD', 'POLICE DISTRICT', 
                 'LATITUDE', 'LONGITUDE', 'LOCATION']
# A row is dropped from the location table when any of these columns is missing
NA_COLS = ['LATITUDE', 'LONGITUDE', 'LOCATION']

def get_unique_locations(input_df, location_cols, na_cols):
    '''
    Build a de-duplicated table of location records.

    Input: input_df - df containing every column named in location_cols,
            including ACCOUNT NUMBER and SITE NUMBER (used for ordering)
        location_cols - columns to keep in the output
        na_cols - columns that must be populated; a row missing any of them is dropped

    Output: new df holding the distinct location_cols rows, ordered by
        ACCOUNT NUMBER then SITE NUMBER. input_df itself is not modified.
    '''
    # Column selection already yields a new frame, so the caller's df is never touched
    locations = input_df[location_cols]
    locations = locations.dropna(subset=na_cols)
    locations = locations.drop_duplicates()

    return locations.sort_values(by=['ACCOUNT NUMBER', 'SITE NUMBER'])

addresses = get_unique_locations(df, LOCATION_COLS, NA_COLS)

addresses.head(10)

Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,ADDRESS,CITY,STATE,ZIP CODE,WARD,POLICE DISTRICT,LATITUDE,LONGITUDE,LOCATION
156224,1,1,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
26538,1,2,17 W ADAMS ST BSMT & 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
363441,2,2,11601 W TOUHY AVE T1 CO,CHICAGO,IL,60666,41.0,16.0,42.008536,-87.914428,"(42.008536400868735, -87.91442843927047)"
701027,4,1,1028 W DIVERSEY PKWY,CHICAGO,IL,60614,44.0,19.0,41.932727,-87.655042,"(41.93272677149699, -87.65504177558735)"
714842,6,1,3714 S HALSTED ST 1ST #,CHICAGO,IL,60609,11.0,9.0,41.827185,-87.64617,"(41.82718501563474, -87.64617045635079)"
778227,8,1,1200 W HUBBARD ST,CHICAGO,IL,60642,27.0,12.0,41.890097,-87.657441,"(41.89009738805929, -87.65744102745799)"
763702,9,1,10429 S EWING AVE,CHICAGO,IL,60617,10.0,4.0,41.705506,-87.535139,"(41.70550648249998, -87.53513945323924)"
10226,10,1,13200 S HOUSTON AVE,CHICAGO,IL,60633,10.0,4.0,41.65542,-87.54852,"(41.65541987670403, -87.5485201545961)"
771301,11,1,6 E DIVISION ST,CHICAGO,IL,60610,43.0,18.0,41.904056,-87.628434,"(41.904055847757704, -87.62843428649991)"
106208,12,1,1767 W WILSON AVE,CHICAGO,IL,60640,47.0,19.0,41.965067,-87.673582,"(41.965067484862175, -87.67358197551935)"


In [7]:
def export_df(input_df, filepath, cols_to_export=None):
    '''
    Write a dataframe, or a subset of its columns, to a CSV file without the index.

    Input: input_df - df to export (not modified)
        filepath - destination path passed to DataFrame.to_csv
        cols_to_export - column label or list-like of column labels to write.
            None or an empty selection exports every column.

    Output: None. The only effect is the file written at filepath.
    '''
    # `not cols_to_export` raises ValueError for array-likes such as a pandas
    # Index or numpy array, so test for "no selection" explicitly instead.
    if cols_to_export is None:
        export_all = True
    else:
        try:
            export_all = len(cols_to_export) == 0
        except TypeError:
            # A scalar label (e.g. an integer column name) has no length
            export_all = False

    # Column selection returns a new frame, so no defensive copy is needed
    df = input_df if export_all else input_df[cols_to_export]

    df.to_csv(filepath, index=False)
    return None

# Save as csv
FILEPATH = '../../data/location_data.csv'
export_df(addresses, FILEPATH)

### 3.2 Get nonrenewal event-level data

Now that we have address data for each account-site, we can merge it onto the business nonrenewals dataset and aggregate it by any categorical location feature in the data. 

Below, I demonstrate aggregating the number of nonrenewals by two location methods:
1. By some categorical location feature (e.g. zip code, census tract number)
2. By some distance measure (e.g. within 1 mile)

In [8]:
# Load the business-year nonrenewal flags and attach each account-site's location attributes.
# NOTE(review): `addresses` is de-duplicated on all of its location columns, not on
# (ACCOUNT NUMBER, SITE NUMBER) alone. If an account-site ever has two distinct records,
# this left merge repeats its business-year rows -- confirm the keys are unique.
fails = pd.merge(
    pd.read_csv('../../data/not_renewed_2yrs.csv'),
    addresses,
    how='left',
    on=['ACCOUNT NUMBER', 'SITE NUMBER'],
)
fails.head()

Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,YEAR,not_renewed_2yrs,ADDRESS,CITY,STATE,ZIP CODE,WARD,POLICE DISTRICT,LATITUDE,LONGITUDE,LOCATION
0,1,1,2002,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
1,1,1,2003,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
2,1,1,2004,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
3,1,1,2005,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
4,1,1,2006,1.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"


In [9]:
# Keep only the business-years in which a nonrenewal occurred.
# Location attributes are already present on `fails`, so no further join is needed here.
is_nonrenewal = fails['not_renewed_2yrs'] == 1
fail_events = fails.loc[is_nonrenewal].reset_index(drop=True)
del is_nonrenewal  # temporary mask only; keep the notebook namespace unchanged
fail_events.head(10)

Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,YEAR,not_renewed_2yrs,ADDRESS,CITY,STATE,ZIP CODE,WARD,POLICE DISTRICT,LATITUDE,LONGITUDE,LOCATION
0,1,1,2006,1.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
1,4,1,2003,1.0,1028 W DIVERSEY PKWY,CHICAGO,IL,60614,44.0,19.0,41.932727,-87.655042,"(41.93272677149699, -87.65504177558735)"
2,8,1,2005,1.0,1200 W HUBBARD ST,CHICAGO,IL,60642,27.0,12.0,41.890097,-87.657441,"(41.89009738805929, -87.65744102745799)"
3,9,1,2003,1.0,10429 S EWING AVE,CHICAGO,IL,60617,10.0,4.0,41.705506,-87.535139,"(41.70550648249998, -87.53513945323924)"
4,10,1,2010,1.0,13200 S HOUSTON AVE,CHICAGO,IL,60633,10.0,4.0,41.65542,-87.54852,"(41.65541987670403, -87.5485201545961)"
5,11,1,2003,1.0,6 E DIVISION ST,CHICAGO,IL,60610,43.0,18.0,41.904056,-87.628434,"(41.904055847757704, -87.62843428649991)"
6,12,1,2007,1.0,1767 W WILSON AVE,CHICAGO,IL,60640,47.0,19.0,41.965067,-87.673582,"(41.965067484862175, -87.67358197551935)"
7,13,1,2005,1.0,3655 N SHEFFIELD AVE 1,CHICAGO,IL,60613,44.0,19.0,41.948844,-87.654231,"(41.94884429698063, -87.65423079329328)"
8,13,4,2012,1.0,3600 N SHEFFIELD AVE,CHICAGO,IL,60613,44.0,19.0,41.947332,-87.65448,"(41.947331826783035, -87.65448031498194)"
9,13,6,2013,1.0,3600 N SHEFFIELD AVE,CHICAGO,IL,60613,44.0,19.0,41.947332,-87.65448,"(41.947331826783035, -87.65448031498194)"


In [10]:
# function to aggregate by location feature and year
def count_by_year(input_df, loc_col, year_col='YEAR', fillna=0):
    '''
    Input: df of event-level data. Each row is a nonrenewal in the given location and year.
        loc_col and year_col name the location and year columns. fillna is the value
        written for location-year combinations in which no event was recorded.
    Output: Count of events by specified location and year, in the columns
        [loc_col, year_col, 'count'], sorted by location then year. Year is balanced, so a
        count for every year appears for each unique value in loc_col. Rows whose location
        or year is missing are not counted. An input with nothing to count returns an
        empty df with the same three columns instead of raising.
    '''
    out_cols = [loc_col, year_col, 'count']

    if input_df.empty:
        return pd.DataFrame(columns=out_cols)

    # groupby leaves out rows whose location or year is missing
    counts = input_df.groupby([loc_col, year_col]).size().rename('count')
    if counts.empty:
        return pd.DataFrame(columns=out_cols)

    # Every observed location crossed with every observed year. Naming the levels
    # means reset_index restores the real column names directly.
    balanced_index = pd.MultiIndex.from_product(
        [counts.index.get_level_values(loc_col).unique(),
         counts.index.get_level_values(year_col).unique()],
        names=[loc_col, year_col],
    )

    return counts.reindex(balanced_index) \
        .reset_index() \
        .fillna(fillna) \
        .sort_values(by=[loc_col, year_col])

## 4. Construct features

### 4.1. Number of nonrenewals in the same zipcode, same year

In [11]:
# Balanced zip-by-year table of nonrenewal counts, with the generic
# 'count' column given a descriptive name
fails_by_zip_year = (
    count_by_year(fail_events, 'ZIP CODE')
    .rename(columns={'count': 'not_renewed_in_same_zip'})
)
fails_by_zip_year.head(30)

Unnamed: 0,ZIP CODE,YEAR,not_renewed_in_same_zip
3,6902,2002,0.0
4,6902,2003,0.0
5,6902,2004,0.0
2,6902,2005,0.0
6,6902,2006,0.0
7,6902,2007,0.0
8,6902,2008,0.0
9,6902,2009,0.0
1,6902,2010,0.0
10,6902,2011,0.0


In [12]:
# Give every business-year row the nonrenewal count for its zip code in that year.
# NOTE(review): the left merge leaves the count missing for any (ZIP CODE, YEAR) pair
# absent from fails_by_zip_year, e.g. a zip code with no nonrenewal events at all --
# confirm whether those rows should be treated as 0 or as unknown.
not_renewed_in_same_zip = (
    fails[['ACCOUNT NUMBER', 'SITE NUMBER', 'YEAR', 'ZIP CODE']]
    .merge(fails_by_zip_year, how='left', on=['ZIP CODE', 'YEAR'])
    .sort_values(by=['ACCOUNT NUMBER', 'SITE NUMBER', 'YEAR'])
)

not_renewed_in_same_zip.head(30)

Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,YEAR,ZIP CODE,not_renewed_in_same_zip
0,1,1,2002,60603,38.0
1,1,1,2003,60603,149.0
2,1,1,2004,60603,132.0
3,1,1,2005,60603,189.0
4,1,1,2006,60603,185.0
5,2,2,2002,60666,6.0
6,2,2,2003,60666,16.0
7,2,2,2004,60666,20.0
8,2,2,2005,60666,18.0
9,2,2,2006,60666,19.0


In [13]:
# Export to CSV
FILEPATH = '../../data/not_renewed_in_same_zip.csv'
export_df(not_renewed_in_same_zip, FILEPATH)

### 4.2. Number of nonrenewals within a distance radius, same year

Strategy:
1. Get the cartesian product of all failure event locations, giving pairs of lat/long points
2. Keep only the pairs that occur in the same year (or block by year up front, so that cross-year pairs are never formed)
3. Apply the Haversine formula to get the distance between the two points in each pair
4. Filter for pairs under a given distance (e.g. 1 mile)
5. Aggregate by business-year to get a count of failures in the same year within that distance.

In [106]:
# Reload the business-year nonrenewal flags with location attributes attached,
# so this section starts from a fresh `fails` frame
fails = (
    pd.read_csv('../../data/not_renewed_2yrs.csv')
    .merge(addresses, how='left', on=['ACCOUNT NUMBER', 'SITE NUMBER'])
)
fails.head()

Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,YEAR,not_renewed_2yrs,ADDRESS,CITY,STATE,ZIP CODE,WARD,POLICE DISTRICT,LATITUDE,LONGITUDE,LOCATION
0,1,1,2002,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
1,1,1,2003,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
2,1,1,2004,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
3,1,1,2005,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
4,1,1,2006,1.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"


In [113]:
def count_by_dist_radius(input_df, dist=1):
    '''
    Counts the number of business nonrenewals within a specified distance in km for each business-year.

    Input: input_df - df of business-year level data with binary feature for "not renewed in 2 years".
            Dataframe must have these cols: YEAR, LATITUDE, LONGITUDE, not_renewed_2yrs
            (LATITUDE and LONGITUDE in degrees).
        dist - radius in km. A nonrenewal counts when its distance is <= dist.

    Output: copy of input_df, with the index reset to 0..n-1 and the column
        `num_not_renewed_in_{dist}km` appended. input_df is not modified.
        - Only nonrenewals from the same YEAR are counted.
        - A row that is itself a nonrenewal is 0 km from itself, so it is included in its own count.
        - Rows with a missing LATITUDE or LONGITUDE get NaN and are never counted as a
          neighbour; the column is then float instead of integer.
        - A year without any nonrenewals gives a count of 0.
    '''
    earth_radius_km = 6371  # mean radius of the Earth in km

    # Counts are written back by row position, so rows that share the same
    # business-year keys are never multiplied by a merge.
    result = input_df.reset_index(drop=True)

    # haversine_distances expects [latitude, longitude] pairs in radians
    coords_rad = np.radians(result[['LATITUDE', 'LONGITUDE']].to_numpy(dtype=float))
    has_coords = ~np.isnan(coords_rad).any(axis=1)
    is_fail = (result['not_renewed_2yrs'] == 1).to_numpy()
    years = result['YEAR'].to_numpy()

    counts = np.full(len(result), np.nan)

    for year in pd.unique(years):
        in_year = (years == year) & has_coords
        if not in_year.any():
            continue

        fails_in_year = in_year & is_fail
        if not fails_in_year.any():
            # Nothing to measure against, and haversine_distances rejects empty input
            counts[in_year] = 0
            continue

        # Pairwise distance between all located businesses that year and all located
        # nonrenewals that year: one row per business, one column per nonrenewal.
        # Then count nonrenewals within the threshold distance (row-wise sum).
        dist_km = haversine_distances(coords_rad[in_year],
                                      coords_rad[fails_in_year]) * earth_radius_km
        counts[in_year] = (dist_km <= dist).sum(axis=1)

    # Keep an integer column whenever every row received a count
    if not np.isnan(counts).any():
        counts = counts.astype(int)

    result[f'num_not_renewed_in_{dist}km'] = counts

    return result

In [111]:
fails.head()

Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,YEAR,not_renewed_2yrs,ADDRESS,CITY,STATE,ZIP CODE,WARD,POLICE DISTRICT,LATITUDE,LONGITUDE,LOCATION
0,1,1,2002,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
1,1,1,2003,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
2,1,1,2004,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
3,1,1,2005,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"
4,1,1,2006,1.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)"


In [114]:
count_by_dist_radius(fails.head(1000), 1) 



Unnamed: 0,ACCOUNT NUMBER,SITE NUMBER,YEAR,not_renewed_2yrs,ADDRESS,CITY,STATE,ZIP CODE,WARD,POLICE DISTRICT,LATITUDE,LONGITUDE,LOCATION,num_not_renewed_in_1km
0,1,1,2002,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)",0
1,1,1,2003,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)",0
2,1,1,2004,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)",0
3,1,1,2005,0.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)",0
4,1,1,2006,1.0,17 W ADAMS ST # 1ST,CHICAGO,IL,60603,42.0,1.0,41.879342,-87.628412,"(41.879341938770445, -87.62841188861722)",1
5,2,2,2002,0.0,11601 W TOUHY AVE T1 CO,CHICAGO,IL,60666,41.0,16.0,42.008536,-87.914428,"(42.008536400868735, -87.91442843927047)",0
6,2,2,2003,0.0,11601 W TOUHY AVE T1 CO,CHICAGO,IL,60666,41.0,16.0,42.008536,-87.914428,"(42.008536400868735, -87.91442843927047)",0
7,2,2,2004,0.0,11601 W TOUHY AVE T1 CO,CHICAGO,IL,60666,41.0,16.0,42.008536,-87.914428,"(42.008536400868735, -87.91442843927047)",0
8,2,2,2005,0.0,11601 W TOUHY AVE T1 CO,CHICAGO,IL,60666,41.0,16.0,42.008536,-87.914428,"(42.008536400868735, -87.91442843927047)",0
9,2,2,2006,0.0,11601 W TOUHY AVE T1 CO,CHICAGO,IL,60666,41.0,16.0,42.008536,-87.914428,"(42.008536400868735, -87.91442843927047)",0


### 4.3 Number of businesses within a distance radius, same year

Since the `num_not_renewed_in_1km` metric measures the number of businesses that failed in that year within a certain distance, the same function can be modified to count the number of businesses within a certain distance that did *not* fail that year. That gives us `num_businesses_1km` (or some other distance radius).

In [None]:
Audio(sound_file, autoplay=True)