# Generating `not_renewed_2yrs`, step-by-step

A walkthrough of `reshape_and_create_label()` in `feature_generation.py`

In [93]:
import itertools
import numpy as np
import pandas as pd

# Tweak display settings for tables
pd.options.display.max_columns = 999

In [94]:
DATA_PATH = "../../data/Business_Licenses.csv"
DTYPE_DICT = {'ZIP CODE': str,}
DATE_COLS = ['LICENSE TERM EXPIRATION DATE', 'DATE ISSUED']

raw_df = pd.read_csv(DATA_PATH,
                 low_memory=False,
                 parse_dates=DATE_COLS)
raw_df.shape

(970564, 34)

In [95]:
# Aggregate by account-site and get min/max issue, expiry dates for licenses
df = raw_df.copy(deep=True) \
    .loc[df['DATE ISSUED'] <= pd.to_datetime('12/31/2018')] \
    .groupby(['ACCOUNT NUMBER', 'SITE NUMBER']) \
    .agg({'DATE ISSUED': ['min', 'max'],
          'LICENSE TERM EXPIRATION DATE': 'max'}) \
    .reset_index(col_level=1)

df.head()

KeyError: 'DATE ISSUED'

In [None]:
# Flatten column names into something usable
df.columns = df.columns.to_flat_index()
df = df.rename(columns={
    ('', 'ACCOUNT NUMBER'): "account",
    ('' , 'SITE NUMBER'): 'site',
    ('DATE ISSUED', 'min'): 'min_license_date',
    ('DATE ISSUED', 'max'): 'max_license_date',
    ('LICENSE TERM EXPIRATION DATE', 'max'): 'expiry'})

df.head()

In [None]:
# Extract min/max license dates into list of years_open
df['years_open'] = pd.Series(map(lambda x, y: [z for z in range(x, y+2)],
                                 df['min_license_date'].dt.year,
                                 df['max_license_date'].dt.year))

df.head()

In [None]:
# make account-site id var
# melt step below doesn't work well without merging these two cols
df['account_site'] = df['account'].astype('str') + "-" + df['site'].astype('str')
df = df[df.columns.tolist()[-1:] + df.columns.tolist()[:-1]]
df = df.drop(labels=['account', 'site'], axis=1)

df.head()

In [None]:
# Expand list of years_open into one row for each account-site-year
# https://mikulskibartosz.name/how-to-split-a-list-inside-a-dataframe-cell-into-rows-in-pandas-9849d8ff2401
df = df \
    .years_open \
    .apply(pd.Series) \
    .merge(df, left_index=True, right_index=True) \
    .drop(labels=['years_open'], axis=1) \
    .melt(id_vars=['account_site', 'min_license_date', 'max_license_date',
                   'expiry'],
          value_name='YEAR') \
    .drop(labels=['variable'], axis=1) \
    .dropna() \
    .sort_values(by=['account_site', 'YEAR'])

df.head()

In [None]:
# Split account_site back into ACCOUNT NUMBER, SITE NUMBER
year_df = df.copy(deep=True)
year_df['ACCOUNT NUMBER'], year_df['SITE NUMBER'] = year_df['account_site'].str.split('-', 1).str
year_df['ACCOUNT NUMBER'] = year_df['ACCOUNT NUMBER'].astype('int')
year_df['SITE NUMBER'] = year_df['SITE NUMBER'].astype('int')
year_df['YEAR'] = year_df['YEAR'].astype('int')
year_df = year_df[['ACCOUNT NUMBER', 'SITE NUMBER', 'account_site', 'YEAR',
         'min_license_date', 'max_license_date', 'expiry']] \
    .sort_values(by=['ACCOUNT NUMBER', 'SITE NUMBER'])

year_df.head(20)

In [None]:
# Assume buffer period is last 2 years of input data
threshold_year = raw_df['DATE ISSUED'].dt.year.max() - 1
buffer_df = raw_df.loc[raw_df['DATE ISSUED'].dt.year >= threshold_year]

# Get list of account-site numbers in buffer
buffer_ids = (buffer_df['ACCOUNT NUMBER'].astype('str') \
    + '-' + buffer_df['SITE NUMBER'].astype('str'))

Final output: Label indicating a business is not renewed within 2 years of a given business-year, up to the most recent year before the buffer. If we have data up to 12/31/2018, and are predicting on licenses issued up to 12/31/2016, then our business-year data should have rows for years up to 2016 (i.e. we cannot predict on 2017 onwards because we don't have a full 2 years of buffer data from that point).

Rules for generating `not_renewed_2yrs`:
1. If most recent expiry date is before the buffer, business was not renewed as of the year after the most recent license date for that business.
2. If the most recent expiry date is within the buffer:
    - If the most recent license issue date is before the buffer, the business was not renewed as of the year after the most recent license issue date.
    - If the most recent license issue date is within the buffer (i.e. license duration < 2yrs), the business was renewed within 2 yrs of all years up to the buffer threshold.

In [None]:
year_df['not_renewed_2yrs'] = np.where(
    year_df['expiry'].dt.year < threshold_year,
        np.where(year_df['YEAR'] >= year_df['max_license_date'].dt.year + 1, 1, 0),
    np.where(
        year_df['account_site'].isin(buffer_ids),
            0,
            np.where(year_df['YEAR'] >= year_df['max_license_date'].dt.year + 1, 1, 0)
    )
)

year_df.head(20)

In [None]:
# Drop unnecessary columns
year_df = year_df.drop(labels=['account_site', 'min_license_date', 'max_license_date', 
                     'expiry'], axis=1) \
    .loc[year_df['YEAR'] < threshold_year] \
    .reset_index(drop=True)

year_df.head(20)