In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px

# When True, models are fitted on target / lags(1) and predictions are scaled
# back by lags(1) before scoring or submitting.
SMAPE_ENABLED = True
# Row offsets (per county) used to build the `lags(i)` features.
LAGS = list(range(1, 9))

In [None]:
def to_percent(X, y):
    """Express `y` as a ratio of the previous value held in ``X['lags(1)']``.

    Rows whose previous value is exactly 0 get a ratio of 0 instead of the
    inf/NaN the division would otherwise produce.
    """
    previous = X['lags(1)']
    ratio = y / previous
    return ratio.mask(previous == 0, 0)

def from_percent(X, y):
    """Scale the ratio `y` by the previous value in ``X['lags(1)']``.

    This undoes the division performed by `to_percent`.
    """
    previous = X['lags(1)']
    return y * previous

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 2).
    Elements where both values are exactly 0 contribute 0.
    """
    abs_error = np.abs(y_true - y_pred)
    mean_magnitude = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Skip the 0/0 case: those entries keep their initial value of 0.
    not_both_zero = (y_true != 0) | (y_pred != 0)

    per_item = np.zeros(len(y_true))
    per_item[not_both_zero] = abs_error[not_both_zero] / mean_magnitude[not_both_zero]

    return 100 * np.mean(per_item)

In [None]:
def lgb_objective(trial):
    """Optuna objective: SMAPE of a LightGBM regressor on the month with scale 38.

    Reads the module-level `df_all` and `target`. The model is fitted on rows
    whose `scale` is in 0..37 and scored on rows whose `scale` is 38. When
    `SMAPE_ENABLED` is set, the model is trained on target / lags(1) and its
    predictions are scaled back by lags(1) before scoring.
    """
    fixed_params = {
        'n_iter': 200,
        'verbosity': -1,
        'objective': 'l1',
        'random_state': 42,
        'extra_trees': True,
    }
    tuned_params = {
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 8, 1024),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 250),
    }
    model = lgb.LGBMRegressor(**fixed_params, **tuned_params)

    features = df_all.drop(columns=[target])
    labels = df_all[target]

    # Time-based split on the `scale` column.
    in_train = features['scale'].isin(list(range(38)))
    in_valid = features['scale'].isin([38])

    X_train, y_train = features[in_train], labels[in_train]
    X_valid, y_valid = features[in_valid], labels[in_valid]

    if SMAPE_ENABLED:
        y_train = to_percent(X_train, y_train)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    if SMAPE_ENABLED:
        y_pred = from_percent(X_valid, y_pred)

    return smape(y_valid, y_pred)

### Data reading

In [None]:
# Directory the competition CSV files are read from.
root = '../data/kaggle'
# Date column; parsed to datetime to derive year/month/scale, then dropped.
date_col = 'first_day_of_month'
# Columns cast to the pandas `category` dtype before modelling.
cat_cols = ['county', 'state']
# Column the model predicts.
target = 'microbusiness_density'
# Index column of the train / test / sample-submission CSV files.
idx = 'row_id'

In [None]:
def read_kaggle_csv(filename, index_col):
    """Read one CSV from the `root` data directory, indexed by `index_col`."""
    return pd.read_csv(os.path.join(root, filename), index_col=index_col)


df_train = read_kaggle_csv('train.csv', idx)
df_test = read_kaggle_csv('test.csv', idx)
df_subm = read_kaggle_csv('sample_submission.csv', idx)
df_census = read_kaggle_csv('census_starter.csv', 'cfips')

### Adding the missing state and county columns to the test set

In [None]:
# Build {'state': {cfips: state}, 'county': {cfips: county}} from the training data.
# De-duplicate on `cfips` itself: drop_duplicates() called after set_index('cfips')
# ignores the index and compares only (state, county), so two different cfips
# sharing the same state and county name would lose one of their entries.
state_dict = (
    df_train
    .drop_duplicates(subset='cfips')
    .set_index('cfips')[['state', 'county']]
    .to_dict()
)

# The test file has no state/county columns; look them up by cfips.
df_test['state'] = df_test['cfips'].map(state_dict['state'])
df_test['county'] = df_test['cfips'].map(state_dict['county'])

In [None]:
# Preview the test frame after the state/county columns were added.
df_test.head()

### Time features

In [None]:
# Stack train and test so time and lag features are built over one frame.
df_all = pd.concat([df_train, df_test], axis=0)

df_all[date_col] = pd.to_datetime(df_all[date_col])

df_all['year'] = df_all[date_col].dt.year
df_all['month'] = df_all[date_col].dt.month

# `scale` is a 0-based month counter. sort=True numbers the distinct dates in
# chronological order regardless of row order; a plain factorize() numbers them
# by first appearance, which is only chronological when the rows arrive sorted.
df_all['scale'] = df_all[date_col].factorize(sort=True)[0]

In [None]:
# df_all

### Lag features

In [None]:
# Per-county lags of the target: lags(k) holds the value k rows earlier within
# the same cfips.
target_by_county = df_all.groupby('cfips')[target]
for lag in LAGS:
    df_all[f'lags({lag})'] = target_by_county.shift(lag)

# `active` is overwritten with its own 8-row lag within each county.
df_all['active'] = df_all.groupby('cfips')['active'].shift(8)

In [None]:
# df_all.query("cfips == 1001")

In [None]:
# Preview the combined frame with the lag columns added.
df_all.head()

In [None]:
# Column dtypes and non-null counts of the combined frame.
df_all.info()

### Categorical features

In [None]:
# Drop the raw date column (year/month/scale were already derived from it) and
# cast the categorical columns to the `category` dtype.
df_all = (
    df_all
    .drop(columns=[date_col])
    .astype({col: 'category' for col in cat_cols})
)

In [None]:
# Preview after dropping the date column and casting the categorical columns.
df_all.head()

### Adding census data

In [None]:
# Attach the census columns to every row by matching the `cfips` column against
# the cfips index of `df_census`; the row_id index of `df_all` is preserved.
df_all = df_all.join(df_census, on='cfips')

In [None]:
# Preview with the census columns attached.
df_all.head()

### Dropping the first month of the training data

In [None]:
# Keep every row except those of the first month (scale == 0).
df_all = df_all.loc[df_all['scale'].ne(0)]

### Hyperparameter optimization

In [None]:
# Seed the sampler so the hyperparameter search is reproducible across kernel
# restarts. TPESampler is the sampler optuna uses by default for a
# single-objective study, so only the seeding changes here.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='minimize', study_name='Regressor', sampler=sampler)
study.optimize(lgb_objective, n_trials=30, show_progress_bar=True)

### Hyperparameter-optimized model

In [None]:
# Lowest objective value (validation SMAPE) found by the study.
study.best_value

In [None]:
# Tuned hyperparameters of the best trial.
study.best_params

In [None]:
# params = {
#     'n_iter': 200,
#     'verbosity': -1,
#     'objective': 'l1',
#     'random_state': 42,
#     'extra_trees': True,
#     'colsample_bytree': 0.8841279649367693,
#     'colsample_bynode': 0.10142964450634374,
#     'max_depth': 8,
#     'learning_rate': 0.013647749926797374,
#     'lambda_l1': 1.8386216853616875,
#     'lambda_l2': 7.557660410418351,
#     'num_leaves': 61,
#     'min_data_in_leaf': 213}

# model = lgb.LGBMRegressor(**params)


# {'colsample_bytree': 0.8705820109760347,
#  'colsample_bynode': 0.9382086757638848,
#  'max_depth': 9,
#  'learning_rate': 0.018267590517121894,
#  'lambda_l1': 4.679378881283281,
#  'lambda_l2': 6.791735883783194,
#  'num_leaves': 541,
#  'min_data_in_leaf': 243}

### Prediction

In [None]:
# Build the final regressor from the study's best trial. Without this, `model`
# is undefined at notebook level (it only exists inside `lgb_objective`) and
# the cell fails on a fresh kernel. The fixed settings mirror the ones used
# during tuning.
fixed_params = {
    'n_iter': 200,
    'verbosity': -1,
    'objective': 'l1',
    'random_state': 42,
    'extra_trees': True,
}
model = lgb.LGBMRegressor(**fixed_params, **study.best_params)

X, y = df_all.drop(columns=[target]), df_all[target]

# Rows without a target value are the ones to predict.
is_test = y.isnull()
X_test, y_test = X[is_test], y[is_test]

# NOTE(review): scale 38 (the validation month during tuning) is not part of
# this final fit -- confirm that leaving it out is intended.
train_times = list(range(38))
is_train = X['scale'].isin(train_times)

X_train = X[is_train]
y_train = y[is_train]

if SMAPE_ENABLED:
    y_train = to_percent(X_train, y_train)

model.fit(X_train, y_train)

# Wrap the raw predictions in a Series so `y_pred.index` exists even when
# SMAPE_ENABLED is False (predict() returns a plain ndarray).
y_pred = pd.Series(model.predict(X_test), index=X_test.index)

if SMAPE_ENABLED:
    y_pred = from_percent(X_test, y_pred)

df_subm.loc[y_pred.index, target] = y_pred

### Let's replace bad predictions with the last known value

In [None]:
# Inspect the rows of the month with scale == 39.
df_all[df_all['scale'] == 39]

In [None]:
# Rolling backtest: for each month 31..38, refit `model` on all earlier months
# and record the true value, the last-value baseline (lags(1)) and the model
# prediction for every row of that month.
X, y = df_all.drop(columns=[target]), df_all[target]
valid_times = list(range(31, 39))
record_cols = ['scale', 'month', 'state', 'county', 'cfips', 'y_true', 'y_base', 'y_pred']
results = []

for valid_time in valid_times:
    train_times = list(range(valid_time))

    is_train = X['scale'].isin(train_times)
    is_valid = X['scale'].isin([valid_time])

    X_train, y_train = X[is_train], y[is_train]
    X_valid, y_valid = X[is_valid], y[is_valid]

    if SMAPE_ENABLED:
        y_train = to_percent(X_train, y_train)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    if SMAPE_ENABLED:
        y_pred = from_percent(X_valid, y_pred)

    # assign() returns a new frame, so no columns are written onto the
    # boolean-indexed slice `X_valid` (which triggers SettingWithCopyWarning).
    fold = X_valid.assign(
        y_true=y_valid,
        y_base=X_valid['lags(1)'],
        y_pred=y_pred,
    )
    results.append(fold[record_cols])

df_record = pd.concat(results, axis=0)
df_record.head()

In [None]:
# Display the backtest records (one row per county and validation month).
df_record

In [None]:
# Per-county SMAPE of the last-value baseline and of the model over the backtest.
records_by_county = df_record.groupby('cfips')
base_err = records_by_county.apply(lambda grp: smape(grp.y_true, grp.y_base))
pred_err = records_by_county.apply(lambda grp: smape(grp.y_true, grp.y_pred))

# Counties where the model is worse than the baseline by more than 1e-3.
model_is_worse = pred_err > (base_err + 1e-3)
blacklist = base_err[model_is_worse].index

print(f'Avg SMAPE(model): {pred_err.mean():.3f}')
print(f'Avg SMAPE(base): {base_err.mean():.3f}')
print(len(blacklist))

In [None]:
# blacklist


In [None]:
# Use a dedicated name here: rebinding `idx` would overwrite the 'row_id'
# index-column constant that the data-loading and census cells rely on, so
# re-running those cells afterwards would break.
blacklist_idx = df_test[df_test['cfips'].isin(blacklist)].index

# For blacklisted counties, submit the last known value instead of the model output.
df_subm.loc[blacklist_idx, target] = df_all.loc[blacklist_idx, 'lags(1)']

### Fill NaN values

In [None]:
# Only the first month of the test data is predicted here; the other months
# could be predicted by feeding the predicted values back in as lags.
# Whatever is still NaN in the submission is set to 0.
df_subm = df_subm.fillna({target: 0})

### Submit

In [None]:
# Preview the submission frame before writing it out.
df_subm.head()

In [None]:
# Write the submission (row_id index included) to the working directory.
df_subm.to_csv('submission.csv')

### Analysis

In [None]:
# State-name -> code lookup table, downloaded from GitHub.
url = 'https://raw.githubusercontent.com/jackparmer/iso-3166-state-codes/master/codes.csv'
df_state = pd.read_csv(url).set_index('state')
# Strip surrounding whitespace from the state names so they can be matched.
df_state.index = df_state.index.str.strip()

In [None]:
def apply_func(x):
    """Return a Series [baseline SMAPE, model SMAPE] for one group of backtest rows.

    `x` must expose `y_true`, `y_base` and `y_pred`.
    """
    baseline_err = smape(x.y_true, x.y_base)
    model_err = smape(x.y_true, x.y_pred)
    return pd.Series([baseline_err, model_err])

#### SMAPE by timeline (scale or month)

In [None]:
# Baseline vs. model SMAPE for each value of `compare_by`.
compare_by = 'scale'
data = df_record.groupby(compare_by).apply(apply_func)

fig, ax = plt.subplots(figsize=(16, 4))
ax.plot(data, marker='o')
ax.set_xlabel(compare_by, fontsize=14)
ax.set_ylabel('SMAPE', fontsize=14)
ax.legend(['base', 'pred'])
plt.show()

#### Geographic analysis

In [None]:
compare_by = 'state'

# One row per state with the baseline ('base') and model ('pred') SMAPE.
data = (
    df_record
    .groupby(compare_by)
    .apply(apply_func)
    .rename(columns={0: 'base', 1: 'pred'})
    .reset_index()
)

# Replace state names with the codes from `df_state` for the choropleth maps.
data['state'] = data['state'].map(df_state['code'])

#### Prediction error map

In [None]:
# US map coloured by the model's SMAPE per state.
fig = px.choropleth(
    data_frame=data,
    locations='state',
    locationmode='USA-states',
    scope='usa',
    color='pred',
)
fig.show()

#### Base prediction (last lag as a prediction) error map

In [None]:
# US map coloured by the last-value baseline's SMAPE per state.
fig = px.choropleth(
    data_frame=data,
    locations='state',
    locationmode='USA-states',
    scope='usa',
    color='base',
)
fig.show()

#### Record count by state

In [None]:
# Number of backtest records per state, in a column named 'count'.
data = df_record.groupby('state').size().reset_index(name='count')

# Replace state names with the codes from `df_state` for the choropleth map.
data['state'] = data['state'].map(df_state['code'])

In [None]:
# US map coloured by the number of backtest records per state.
fig = px.choropleth(
    data_frame=data,
    locations='state',
    locationmode='USA-states',
    scope='usa',
    color='count',
)
fig.show()