# Data

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine.outliers import Winsorizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.3f}'.format
import warnings; warnings.filterwarnings('ignore')


In [72]:
# pd.DataFrame({'a':[np.nan,1,2,3,np.nan]}).ffill()

In [73]:
def SMAPE_1 (y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE), expressed in percent.

    For every pair the score is ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    pairs where both values are zero contribute 0. The result is the mean over
    all pairs and therefore lies in ``[0, 200]`` for finite inputs.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcast-compatible with ``y_true``. Inputs are
        compared positionally (any pandas index is ignored).

    Returns
    -------
    float
        Mean SMAPE over all elements (NaN inputs propagate to the result).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Use the absolute value of BOTH terms: without it a negative y_true can
    # cancel y_pred, giving a zero denominator and a score of 0 for a pair
    # that is maximally wrong.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the pre-filled 0.0 (both values are zero, so the error is zero).
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

In [74]:
# Read the raw CSV files and tag every row with its origin, so that the train
# and test frames can be concatenated and separated again later on.
train = pd.read_csv('./data/train.csv').assign(is_test=0)
test = pd.read_csv('./data/test.csv').assign(is_test=1)
sample_submission = pd.read_csv('./data/sample_submission.csv')

def _growth_target(df, lag_col):
    """Relative change of ``mdensity_t0`` versus ``lag_col``.

    Computes ``mdensity_t0 / lag_col`` clipped to ``[0, 99]`` minus 1, and
    returns 0 wherever the lagged density is exactly 0. Returns a NumPy array
    aligned positionally with ``df``.
    """
    ratio = (df['mdensity_t0'] / df[lag_col]).clip(0, 99) - 1
    return np.where(df[lag_col] == 0, 0, ratio)


# Stack train and test into one panel, derive calendar / identifier columns,
# then build per-county lags of the density and the growth targets on top.
data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .assign(
        # County FIPS code as a zero-padded 5-character string.
        cfips=lambda df: df['cfips'].astype(str).str.zfill(5),
        date=lambda df: pd.to_datetime(df["first_day_of_month"]),
        mdensity_t0=lambda df: df['microbusiness_density'],
    )
    # The lags below rely on rows being ordered by date within each county.
    .sort_values(['cfips', 'date'], ascending=True)
    .assign(
        # First two characters / remaining characters of the padded code
        # (vectorised string slicing instead of a per-row Python lambda).
        state_i=lambda df: df['cfips'].str[:2],
        county_i=lambda df: df['cfips'].str[2:],
        year=lambda df: df['date'].dt.year,
        month=lambda df: df['date'].dt.month,
        # 0-based position of the row inside its county's series.
        dcount=lambda df: df.groupby('cfips')['row_id'].cumcount(),

        mdensity_lag1=lambda df: df.groupby('cfips')['mdensity_t0'].shift(1),
        mdensity_lag2=lambda df: df.groupby('cfips')['mdensity_t0'].shift(2),
        mdensity_lag3=lambda df: df.groupby('cfips')['mdensity_t0'].shift(3),

        # target_k: growth of the current density relative to the k-step lag.
        target_1=lambda df: _growth_target(df, 'mdensity_lag1'),
        target_2=lambda df: _growth_target(df, 'mdensity_lag2'),
        target_3=lambda df: _growth_target(df, 'mdensity_lag3'),
    )
    [['cfips','date','dcount','county_i','state_i','month','year','is_test','active','mdensity_t0',
      'mdensity_lag1','mdensity_lag2','mdensity_lag3',
      'target_1','target_2', 'target_3'
      ]]
)
# .sort_index(ascending=True)

# Sanity checks on the assembled panel.
# Every cfips maps to exactly one county part and one state part.
assert (data.groupby('cfips')['county_i'].nunique() == 1).all()
assert (data.groupby('cfips')['state_i'].nunique() == 1).all()
# There are 3135 distinct (state, county) codes.
assert data['cfips'].nunique() == 3135
# dcount takes 47 distinct values overall: 39 among train rows, 8 among test rows.
assert data['dcount'].nunique() == 47
assert data.loc[data['is_test'] == 0, 'dcount'].nunique() == 39
assert data.loc[data['is_test'] == 1, 'dcount'].nunique() == 8

# The private leaderboard will cover 03-2023, 04-2023 and 05-2023.
# The public leaderboard currently covers only the first month (11-2022); it will probably be extended later to 12-2022, 01-2023 and 02-2023.


# Clean Outliers

In [75]:
# Replace missing targets with 0, then cap each target column independently
# with an IQR-based winsorizer (both tails, fold=5). The same Winsorizer
# instance is re-fitted for every column.
# NOTE(review): the capper is fitted on all rows of `data`, including rows
# whose target was NaN and has just been filled with 0 — confirm this is intended.
capper = Winsorizer(capping_method='iqr',tail='both', fold=5)
for target_col in ('target_1', 'target_2', 'target_3'):
    data[target_col] = data[target_col].fillna(0)
    data[target_col] = capper.fit_transform(data[[target_col]])


In [76]:
# # check
# data['target_ratio'] = data['target_ratio'].abs()
# data.groupby('dcount')['target_ratio'].sum().plot()

# data['target_ratio_capped_1'] = data['target_ratio_capped_1'].abs()
# data.groupby('dcount')['target_ratio_capped_1'].sum().plot()


In [77]:
# # check
# temp = data.groupby(['cfips']).agg({'target_ratio':['mean','median','std']})
# temp.columns = ['mean','median','std']
# temp['ratio'] = temp['std']/(temp['median']+1e-10)
# temp = temp.sort_values('std',ascending=False)

# LEVEL1,LEVEL2,LEVEL3 = 1,2,5
# capper = Winsorizer(capping_method='iqr',tail='both', fold=LEVEL1)
# data[f'target_ratio_clean_{LEVEL1}'] = capper.fit_transform(data[['target_ratio']])
# capper = Winsorizer(capping_method='iqr',tail='both', fold=LEVEL2)
# data[f'target_ratio_clean_{LEVEL2}'] = capper.fit_transform(data[['target_ratio']])
# capper = Winsorizer(capping_method='iqr',tail='both', fold=LEVEL3)
# data[f'target_ratio_clean_{LEVEL3}'] = capper.fit_transform(data[['target_ratio']])

# for i in range(0,20):
#     try:
#         plt.figure()
#         x = data[(data['cfips'] == temp.index[i]) & (data['is_test'] == 0)][['target_ratio',f'target_ratio_clean_{LEVEL1}',f'target_ratio_clean_{LEVEL2}',f'target_ratio_clean_{LEVEL3}']]
#         # plt.plot(x[['target_ratio']].values.reshape(-1, 1))
#         plt.plot(x[[f'target_ratio_clean_{LEVEL1}']].values.reshape(-1, 1),'--', label=f'{LEVEL1}')
#         plt.plot(x[[f'target_ratio_clean_{LEVEL2}']].values.reshape(-1, 1),'--', label=f'{LEVEL2}')
#         plt.plot(x[[f'target_ratio_clean_{LEVEL3}']].values.reshape(-1, 1),'--', label=f'{LEVEL3}')
#         plt.legend()
#     except Exception as e:
#         print(e)
#         print(i)

    


# Extra Data

In [78]:
# census_starter = pd.read_csv('./data/census_starter.csv')

# census_starter = census_starter.assign(
#     cfips = lambda x: x['cfips'].astype(str)
#     )\
#     .set_index(['cfips']).sort_index(ascending=True)

# colname_tuples = [('_'.join(e.split('_')[:-1]),e.split('_')[-1]) for e in census_starter.columns.tolist()]
# new_index = pd.MultiIndex.from_tuples(colname_tuples, names=['category','year_info'])
# census_starter = census_starter.set_axis(new_index, axis=1).stack(level=1)
# census_starter = census_starter.reset_index()
# census_starter['year_available'] = census_starter['year_info'].astype(int) + 2



# census_starter = pd.read_csv('./data/census_starter.csv')

# census_starter = census_starter.assign(
#     cfips = lambda x: x['cfips'].astype(str)
#     )\
#     .set_index(['cfips']).sort_index(ascending=True)

# new_index = pd.MultiIndex.from_tuples([('_'.join(e.split('_')[:-1]),e.split('_')[-1]) for e in census_starter.columns.tolist()], names=['category','year'])
# census_starter = census_starter.set_axis(new_index, axis=1).stack(level=1)
# mean_census = census_starter.groupby(level='year').mean()
# mean_census

# Cross validation

In [79]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits, print_cv_info, print_split_info

In [113]:
# Column groups and constants used by the cross-validation cell.
TARGETS = ['target_1', 'target_2', 'target_3']
LAG_DENSITY = ['mdensity_lag1', 'mdensity_lag2', 'mdensity_lag3',]

TEST_DATE = ['2022-11-01','2022-12-01','2023-01-01']
TEST_PERIOD = [39, 40, 41]

LEAKAGE = ['mdensity_t0']

TRAIN_SIZE = 3

# sample = data[data.cfips.isin(['01001'])] # sample = data[data.cfips.isin(['01001','56045'])]
sample = data.copy()
# Targets of test rows are unknown: blank them out.
sample.loc[sample['is_test'] == 1, TARGETS] = np.nan

# Index by (date, cfips) and keep only the rows from 2022-01 through 2023-01.
sample = sample.set_index(['date','cfips']).sort_index()
sample = sample['2022-01':'2023-01']
sample = sample[['dcount', 'year','county_i'] + LAG_DENSITY + TARGETS + LEAKAGE]

# Rows with dcount below 39 form the training part; the rest is the test part.
sample_train = sample[sample['dcount'] < 39]
sample_test = sample[sample['dcount'] >= 39]

# NOTE(review): only TARGETS are dropped here, so the LEAKAGE column
# ('mdensity_t0') is still part of train_X / test_X.
train_X = sample_train.drop(columns=TARGETS)
train_y = sample_train[TARGETS]
test_X = sample_test.drop(columns=TARGETS)
test_y = sample_test[TARGETS]
    

def visualize_splits(model_no, train_idx=None, val_idx=None):
    """Print the ``dcount`` periods covered by one CV split.

    Parameters
    ----------
    model_no : int
        Index of the model being evaluated; only echoed in the output.
    train_idx, val_idx : array-like of int, optional
        Positional row indices into ``train_X`` for the training and
        validation parts of the split. When omitted, the module-level
        ``train_index`` / ``val_index`` (the loop variables of the CV cell)
        are used, which matches the original call style.
    """
    if train_idx is None:
        train_idx = train_index
    if val_idx is None:
        val_idx = val_index

    # The split indices are positions, while train_X has a non-integer index,
    # so select explicitly with .iloc instead of relying on the deprecated
    # positional fallback of Series[...].
    periods = train_X['dcount']
    print(
        "model no",model_no,
        '\ngs train period:',np.unique(periods.iloc[train_idx]).tolist(),
        '\ngs validation_period:', np.unique(periods.iloc[val_idx]).tolist(),
        # '\ninference train period:', list(range(train_X['dcount'].max()-TRAIN_SIZE+1, train_X['dcount'].max()+1)),
        # '\ninference test period:', [train_X['dcount'].max()+1+model_no]
        )
        
from collections import defaultdict

# errors:     str(model index) -> list with one SMAPE value per CV fold
# test_preds: 'target_{model index}' -> predicted ratio for every row of test_X
errors = defaultdict(list)
test_preds = defaultdict(list)

for model_i in range(3):
    # Model i predicts the (i+1)-th target column and is converted back to a
    # density through the matching lag column.
    lag_col = f'mdensity_lag{model_i+1}'

    cv_args = {"test_size": 1, "n_splits": 3, "train_size": TRAIN_SIZE, 'gap_size': model_i}
    # plot_splits(sample, None, sample['dcount'], **cv_args)
    # print_split_info(sample, None, sample['dcount'], **cv_args)
    cv = GroupTimeSeriesSplit(**cv_args)
    # y_val = []
    for fold_i, (train_index, val_index) in enumerate(cv.split(train_X, train_y, train_X['dcount'])):
        # SPLIT DATA
        # visualize_splits(model_no = model_i)
        X_train, y_train = train_X.iloc[train_index], train_y.iloc[train_index, model_i]
        X_val, y_val = train_X.iloc[val_index], train_y.iloc[val_index, model_i]

        # MODEL
        model = DummyRegressor(strategy="constant", constant=0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        # Turn the predicted / observed ratios back into densities before scoring.
        y_pred_target = (y_pred+1) * X_val[lag_col]
        y_val_target = (y_val+1) * X_val[lag_col]

        # print(SMAPE_1(y_true=y_val, y_pred=y_pred))
        # print(SMAPE_1(y_true=y_val_target, y_pred=y_pred_target))
        errors[f'{model_i}'].append(SMAPE_1(y_true=y_val_target, y_pred=y_pred_target))

    # INFERENCE
    # Fit a fresh estimator on the full training frame instead of reusing the
    # object left over from the last CV fold.
    model = DummyRegressor(strategy="constant", constant=0)
    model.fit(train_X, train_y.iloc[:, model_i])
    # y_pred_test = model.predict(test_X[test_X.dcount == TEST_PERIOD[model_i]])
    test_preds[f'target_{model_i}']= model.predict(test_X)


df_test_preds = pd.DataFrame(test_preds, index=test_X.index)
# Columns are matched by position: prediction column i is scaled by LAG_DENSITY[i].
df_test_preds = (df_test_preds + 1) * test_X[LAG_DENSITY].values

# # check
# df_output = pd.DataFrame(np.concatenate([df_test_preds.loc[TEST_DATE[i],f'target_{i}'] for i in range(3)]), index=test_X.index, columns = ['mdensity_t0'])
# df_check = pd.concat((train_X[['mdensity_t0']], df_output), axis=0)
# random_id = np.random.choice(df_check.index.levels[1])
# df_check.loc[(slice(None),[random_id]),['mdensity_t0']].plot()
# error_analysis
df_errors = pd.DataFrame(errors)
# 'std' (string) replaces np.std: passing NumPy callables to agg is deprecated.
statistics_errors = df_errors.agg(['mean','std'],axis=0)
display(statistics_errors)
# The columns are labelled with the strings '0', '1', '2', so take the first
# one by position explicitly. NOTE(review): the local score therefore reflects
# only the mean error of the first model.
local_score = statistics_errors.loc['mean'].iloc[0].round(3)
local_score

Unnamed: 0,0,1,2
mean,1.059,1.751,2.499
std,0.153,0.199,0.395


1.059

In [114]:
# df_output = pd.DataFrame(np.concatenate([df_test_preds.loc[TEST_DATE[i],f'target_{i}'] for i in range(3)]), index=test_X.index, columns = ['microbusiness_density'])
# df_output = df_output.reset_index().assign(
#     row_id = lambda df: df.apply(lambda df: "{}_{}".format(int(df['cfips']),df['date'].date()), axis='columns'))[['row_id','microbusiness_density']]

# df_output

# submission = pd.concat((
#     df_output,
#     sample_submission[~sample_submission.row_id.isin(df_output.row_id)])
# )

# submission.to_csv(f"data/0203_median_local_{local_score}.csv",index=None)

In [88]:
# model.predict(X_val)

# Notes

In [89]:
from sklearn.base import BaseEstimator

class DummyModel(BaseEstimator):
    """Baseline estimator that ignores its training data and predicts zeros."""

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y):
        """Do nothing and return the estimator itself."""
        return self

    def predict(self, X):
        """Return a vector of zeros with one entry per row of ``X``."""
        n_rows = X.shape[0]
        return np.zeros(n_rows)

def check_score(X, y_true, y_pred, lag_col='mdensity_lag1'):
    """Score ratio predictions with SMAPE after converting them to densities.

    Both ratios are turned into densities via ``(ratio + 1) * X[lag_col]``
    and then compared with ``SMAPE_1``. The score is printed and returned.

    Parameters
    ----------
    X : DataFrame
        Feature frame containing the lagged density column ``lag_col``.
    y_true, y_pred : array-like
        Observed and predicted ratios.
    lag_col : str, default 'mdensity_lag1'
        Name of the lagged density column. The previous hard-coded name
        ('m_density_lag_1') does not match the lag columns created in this
        notebook ('mdensity_lag1', 'mdensity_lag2', 'mdensity_lag3').

    Returns
    -------
    float
        The SMAPE score.
    """
    m_density_target = (y_true + 1) * X[lag_col]
    m_density_pred = (y_pred + 1) * X[lag_col]
    error = SMAPE_1(m_density_target, m_density_pred)
    print('SMAPE SCORE',error)
    return error

# check_score(
#         X = train_X,
#         y_pred=DummyModel().fit(train_X, train_y).predict(train_X), 
#         y_true=train_y['target_1'])
