
- **pct_bb_[year]** - The percentage of households in the county with access to broadband of any type. Derived from ACS table B28002: PRESENCE AND TYPES OF INTERNET SUBSCRIPTIONS IN HOUSEHOLD.
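- **cfips** - The county FIPS code, a unique identifier for each county: the last three digits identify the county and the leading digits identify the state.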
- **pct_college_[year]** - The percent of the population in the county over age 25 with a 4-year college degree. Derived from ACS table S1501: EDUCATIONAL ATTAINMENT.
- **pct_foreign_born_[year]** - The percent of the population in the county born outside of the United States. Derived from ACS table DP02: SELECTED SOCIAL CHARACTERISTICS IN THE UNITED STATES.
- **pct_it_workers_[year]** - The percent of the workforce in the county employed in information related industries. Derived from ACS table S2405: INDUSTRY BY OCCUPATION FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER.
- **median_hh_inc_[year]** - The median household income in the county. Derived from ACS table S1901: INCOME IN THE PAST 12 MONTHS (IN 2021 INFLATION-ADJUSTED DOLLARS).



# Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the three CSV tables used by the notebook from the local ./data folder:
# the census feature table, the training history, and the rows to forecast.
census_starter = pd.read_csv('./data/census_starter.csv')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')


# Disabled feature extraction for train: splits `row_id` on '_' into an id part
# and a date part, then slices the id part into everything but its last three
# characters (`id_state`) and its last three characters (`id_county`).
# train['id_unique'] = train.apply(lambda x: x['row_id'].split('_')[0],axis=1)
# train['id_state'] = train.apply(lambda x: x['row_id'].split('_')[0][:-3],axis=1)
# train['id_county'] = train.apply(lambda x: x['row_id'].split('_')[0][-3:],axis=1)
# train['id_date'] = train.apply(lambda x: x['row_id'].split('_')[1],axis=1)
    

# Same disabled extraction, applied to the test frame.
# test['id_unique'] = test.apply(lambda x: x['row_id'].split('_')[0],axis=1)
# test['id_state'] = test.apply(lambda x: x['row_id'].split('_')[0][:-3],axis=1)
# test['id_county'] = test.apply(lambda x: x['row_id'].split('_')[0][-3:],axis=1)
# test['id_date'] = test.apply(lambda x: x['row_id'].split('_')[1],axis=1)

# Metric

In [5]:
def SMAPE(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE), on a 0-200 scale.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0. The result is the mean
    over all elements.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values. They are compared position by position
        (any pandas index is ignored) and must have broadcast-compatible shapes.

    Returns
    -------
    float
        Mean SMAPE over all elements.
    """
    # Convert both inputs so the arithmetic is purely positional and never
    # depends on pandas index alignment.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Use absolute values on BOTH terms: with a bare `y_true`, a negative
    # actual could cancel the prediction and be scored as a perfect match.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial 0.0, avoiding 0/0 warnings.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)


# Baseline

In [6]:
# Parse the month column of both frames into timestamps.
for frame in (train, test):
    frame["first_day_of_month"] = pd.to_datetime(frame["first_day_of_month"])

# Number the training months 0, 1, 2, ... in chronological order.
train_periods = np.sort(train.first_day_of_month.unique())
train_period_index = {month: position for position, month in enumerate(train_periods)}
train['periods'] = train.first_day_of_month.map(train_period_index)

# Test months continue the numbering right after the last training month.
test_periods = np.sort(test.first_day_of_month.unique())
test_period_index = {
    month: len(train_periods) + position
    for position, month in enumerate(test_periods)
}
test['periods'] = test.first_day_of_month.map(test_period_index)

In [7]:
# Hold out the last K periods of the training history for validation.
K = 8
last_fit_period = train.periods.max() - K

fit_rows = train[train.periods <= last_fit_period]
holdout_rows = train[train.periods > last_fit_period]

# reset_index() keeps the former row labels as an extra `index` column.
train_train = fit_rows.reset_index()
train_validate = holdout_rows.reset_index()

train_train.shape, train_validate.shape, test.shape


((97185, 9), (25080, 9), (25080, 4))

In [8]:
## Validate the baseline: carry each county's last fitted value forward
train_train = (
    train_train
    .sort_values(by="first_day_of_month")
    .reset_index(drop=True)
)

# After the chronological sort, the final row of every cfips group is its latest month.
latest_rows = train_train.groupby("cfips").tail(1)
last_target = latest_rows[["cfips", "microbusiness_density"]]

# cfips -> last observed microbusiness_density
last_target_dict = {
    county: density
    for county, density in zip(last_target['cfips'].values,
                               last_target['microbusiness_density'].values)
}
train_validate['pred_microbusiness_density'] = train_validate['cfips'].map(last_target_dict)


In [9]:
# Score the baseline on the first held-out period only.
first_validation_period = train_validate['periods'].min()
condition = train_validate['periods'] == first_validation_period

first_period_rows = train_validate.loc[condition]
SMAPE(
    y_true=first_period_rows['microbusiness_density'],
    y_pred=first_period_rows['pred_microbusiness_density'],
)

1.5255092632267926

In [10]:
## Test
# Disabled submission step: repeats the last-value baseline using the whole
# training frame (last microbusiness_density per cfips mapped onto the test
# rows) and writes the `row_id` / `microbusiness_density` columns to
# submission.csv.
# train = train.sort_values(by="first_day_of_month").reset_index(drop=True)
# last_target = train.groupby("cfips").tail(1)[["cfips", "microbusiness_density"]]
# last_target_dict = dict(zip(last_target['cfips'].values , last_target['microbusiness_density'].values))
# test['microbusiness_density'] = test['cfips'].map(last_target_dict)
# test[["row_id", "microbusiness_density"]].to_csv("submission.csv", index=False)
# Display the test frame as the cell output.
test

Unnamed: 0,row_id,cfips,first_day_of_month,periods
0,1001_2022-11-01,1001,2022-11-01,39
1,1003_2022-11-01,1003,2022-11-01,39
2,1005_2022-11-01,1005,2022-11-01,39
3,1007_2022-11-01,1007,2022-11-01,39
4,1009_2022-11-01,1009,2022-11-01,39
...,...,...,...,...
25075,56037_2023-06-01,56037,2023-06-01,46
25076,56039_2023-06-01,56039,2023-06-01,46
25077,56041_2023-06-01,56041,2023-06-01,46
25078,56043_2023-06-01,56043,2023-06-01,46


# Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
def _read_monthly_panel(csv_path):
    """Read a CSV, parse `first_day_of_month`, and order rows by cfips then month."""
    panel = pd.read_csv(csv_path)
    panel["first_day_of_month"] = pd.to_datetime(panel["first_day_of_month"])
    ordered = panel.sort_values(['cfips','first_day_of_month'])
    return ordered.reset_index(drop=True)


# Reload both frames from disk so this section starts from the raw files.
train = _read_monthly_panel('./data/train.csv')
test = _read_monthly_panel('./data/test.csv')



In [14]:
# DISPLAY = 1
# # ERROR THRESHOLD AS PERCENTAGE = THRESHOLD / 78
# THRESHOLD = 12

# IDS = train.cfips.unique()
# x_train = np.arange(39).reshape((-1,1))
# x_test = np.arange(38,47).reshape((-1,1))

# for i in range(DISPLAY):
#     c = np.random.choice(IDS)
#     df = train.loc[train.cfips==c]
#     last = df.microbusiness_density.values[-1]

#     # FIT LINEAR REGRESSION
#     model = LinearRegression()
#     model.fit(x_train, df.microbusiness_density)
#     p = model.predict(x_train)

#     # COMPUTE TRAIN ERROR
#     err = p - df.microbusiness_density.values
#     rng = df.microbusiness_density.max() - df.microbusiness_density.min()

#     # DETERMINE IF TIME SERIES IS LINEAR OR NOT
#     s = 0
#     for k in range(39):
#         e = np.abs( err[k] )
#         r = e/(rng/2)
#         s += r

#     # INFER TEST DATA WITH LINEAR REGRESSION
#     p2 = model.predict(x_test)
#     shift =  last - p2[0]
#     # if s<THRESHOLD: 
#     preds2 = p2[1:]+shift
#     # else: 
#     preds1 = [last]*8

#     # PLOT STUFF
#     plt.figure(figsize=(20,5))
#     plt.plot(df.first_day_of_month,df.microbusiness_density,'-o',label='train data')
#     plt.plot(df.first_day_of_month,p,'--',label='linear regression')
#     plt.plot(test.first_day_of_month.values[:8],preds1,'-o',label='test pred', c='r')
#     plt.plot(test.first_day_of_month.values[:8],preds2,'--',label='test pred', c='b')
#     pre = ''; post=''
#     if s>THRESHOLD: 
#         pre='NO, we WILL NOT USE linear regression for\n'
#         post=' (We will predict last train value)'
#     else: 
#         pre='YES, we WILL USE linear regression for\n'
#     plt.title(f'{pre}CFIPS {c}{post}',size=18)
#     plt.xlabel('Date',size=16)
#     plt.ylabel('Microbusiness Density',size=16)
#     plt.legend()
#     plt.show()

#     plt.hist(err,bins=20,label='error')
#     plt.plot([-rng/2,-rng/2],[0,10],'--',color='black',label='range')
#     plt.plot([rng/2,rng/2],[0,10],'--',color='black')
#     plt.xlim((-rng * 0.75,rng * 0.75))
#     plt.legend()
#     plt.title(f'Linear Regression\nTrain Error vs. Train Range. (avg={100*s/78:2.1f}%)',size=18)
#     plt.show()
#     print('\n\n\n\n\n\n')
    

In [16]:
# NUMBER OF LINEAR-TREND COUNTIES TO PLOT
DISPLAY = 8
# s BELOW SUMS |TRAIN ERROR| / (RANGE/2) OVER ALL TRAIN MONTHS, SO
# s / (2 * N_TRAIN_MONTHS) IS THE MEAN ABSOLUTE ERROR AS A FRACTION OF THE RANGE.
# ERROR THRESHOLD AS PERCENTAGE = THRESHOLD / 78
THRESHOLD = 8
# MINIMUM VALUE OF THE LAST `active` ENTRY FOR A COUNTY TO USE THE LINEAR FORECAST
ACTIVE_THRESHOLD = 9_000
# ROWS PER COUNTY IN TRAIN AND NUMBER OF MONTHS TO FORECAST
N_TRAIN_MONTHS = 39
N_TEST_MONTHS = 8

IDS = train.cfips.unique()
x_train = np.arange(N_TRAIN_MONTHS).reshape((-1,1))
# STARTS AT THE LAST TRAIN MONTH SO THE FORECAST CAN BE SHIFTED TO PASS THROUGH IT
x_test = np.arange(N_TRAIN_MONTHS-1,N_TRAIN_MONTHS+N_TEST_MONTHS).reshape((-1,1))

preds = np.zeros((len(IDS),N_TEST_MONTHS))
last_preds = np.zeros((len(IDS),N_TEST_MONTHS))
lin_trend = 0

ct = 0


def _plot_linear_county(df, c, p, forecast, err, rng, s):
    """Plot one county's history, fitted line and forecast, then its train-error histogram."""
    plt.figure(figsize=(20,5))
    plt.plot(df.first_day_of_month,df.microbusiness_density,'-o',label='train data')
    plt.plot(df.first_day_of_month,p,'--',label='linear regression')
    plt.plot(test.first_day_of_month.values[:N_TEST_MONTHS],forecast,'-o',label='test pred')
    pre='YES, we WILL USE linear regression for\n'
    plt.title(f'{pre}CFIPS {c}',size=18)
    plt.xlabel('Date',size=16)
    plt.ylabel('Microbusiness Density',size=16)
    plt.legend()
    plt.show()

    plt.hist(err,bins=20,label='error')
    plt.plot([-rng/2,-rng/2],[0,10],'--',color='black',label='range')
    plt.plot([rng/2,rng/2],[0,10],'--',color='black')
    plt.xlim((-rng * 0.75,rng * 0.75))
    plt.legend()
    plt.title(f'Train Error vs. Train Range. (avg={100*s/(2*N_TRAIN_MONTHS):2.1f}%)',size=18)
    plt.show()
    print('\n\n\n\n\n\n')


# GROUP ONCE INSTEAD OF FILTERING THE WHOLE FRAME AGAIN FOR EVERY COUNTY
by_county = train.groupby('cfips', sort=False)

for i,c in enumerate(IDS):
    df = by_county.get_group(c)
    last = df.microbusiness_density.values[-1]
    active = df.active.values[-1]
    # FALLBACK FORECAST: REPEAT THE LAST OBSERVED VALUE
    last_preds[i,:] = last

    # FIT LINEAR REGRESSION
    model = LinearRegression()
    model.fit(x_train,df.microbusiness_density)
    p = model.predict(x_train)

    # COMPUTE TRAIN ERROR
    err = p - df.microbusiness_density.values
    rng = df.microbusiness_density.max() - df.microbusiness_density.min()

    # DETERMINE IF TIME SERIES IS LINEAR OR NOT
    # A FLAT SERIES (rng == 0) WOULD GIVE 0/0 = nan, AND nan > THRESHOLD IS False,
    # SO IT WOULD SLIP THROUGH THE CHECK BELOW. TREAT IT AS NOT LINEAR INSTEAD.
    if rng > 0:
        s = np.sum(np.abs(err) / (rng/2))
    else:
        s = np.inf

    # IF S<=8 THEN AVERAGE TRAIN ERROR IS LESS THAN ABOUT 10% OF RANGE OF TRAIN
    # AND WE ASSUME THIS COUNTY HAS A LINEAR TREND
    if s>THRESHOLD or active<ACTIVE_THRESHOLD:
        preds[i,:] = last
        continue

    # INFER TEST DATA WITH LINEAR REGRESSION
    p2 = model.predict(x_test)
    # p2[0] IS THE FIT AT THE LAST TRAIN MONTH; SHIFT SO THE LINE STARTS AT `last`
    shift =  last - p2[0]
    preds[i,:] = p2[1:]+shift

    # COUNT STUFF
    ct += 1
    lin_trend += 1
    # ONLY THE FIRST `DISPLAY` LINEAR COUNTIES ARE PLOTTED
    if ct>DISPLAY: continue

    # PLOT STUFF
    _plot_linear_county(df, c, p, preds[i,:], err, rng, s)

# PRINT HOW MANY LINEAR TIME SERIES WE FOUND
print(f'There are {lin_trend} counties with both a linear trend and large population.')