# Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine.outliers import Winsorizer

# Notebook-wide display settings: allow long/wide frames to print without
# truncation and render floats with three decimals.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
pd.set_option('display.float_format', '{:.3f}'.format)

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')


In [2]:
# pd.DataFrame({'a':[np.nan,1,2,3,np.nan]}).ffill()

In [3]:
def SMAPE_1(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE), on a 0-200 scale.

    Per element: 100 * |y_true - y_pred| / ((|y_true| + |y_pred|) / 2).
    Elements where both values are 0 (zero denominator) contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers (lists, arrays, Series)
        Actual and predicted values; they must broadcast against each other.
        Values are compared by position, any pandas index is ignored.

    Returns
    -------
    float
        Mean of the per-element errors. NaN inputs propagate to the result.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Both terms must be magnitudes: with a signed y_true the denominator can
    # hit zero or go negative for negative actuals and corrupt the score.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with, so no inf/nan is ever produced there.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [337]:
# Raw competition files, read relative to the notebook's working directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# Tag the origin of every row so the two sets can be told apart once stacked.
train['is_test'] = 0
test['is_test'] = 1

data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .assign(
        # cfips becomes a zero-padded 5-character string key.
        cfips=lambda frame: frame['cfips'].astype(str).str.zfill(5),
        date=lambda frame: pd.to_datetime(frame["first_day_of_month"]),
        m_density=lambda frame: frame['microbusiness_density'],
    )
    # Chronological order within each cfips is required by the shifts below.
    .sort_values(['cfips', 'date'], ascending=True)
    .assign(
        # First two characters / remaining characters of the padded cfips.
        state_i=lambda frame: frame['cfips'].str[:2],
        county_i=lambda frame: frame['cfips'].str[2:],
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        # 0-based position of the row inside its cfips series.
        dcount=lambda frame: frame.groupby('cfips')['row_id'].cumcount(),
    )
    # m_density_lag_k: density of the same cfips k rows earlier.
    .assign(**{
        f'm_density_lag_{lag}': (
            lambda frame, lag=lag: frame.groupby('cfips')['m_density'].shift(lag)
        )
        for lag in (1, 2, 3)
    })
    # target_k = m_density / m_density_lag_k - 1, with the ratio clipped to
    # [0, 99]; rows whose m_density is exactly 0 get a target of 0.
    .assign(**{
        f'target_{lag}': (
            lambda frame, lag=lag: np.where(
                frame['m_density'] == 0,
                0,
                (frame['m_density'] / frame[f'm_density_lag_{lag}']).clip(0, 99) - 1,
            )
        )
        for lag in (1, 2, 3)
    })
    [[
        'cfips', 'date', 'dcount', 'county_i', 'state_i', 'month', 'year', 'is_test', 'active', 'm_density',
        'm_density_lag_1', 'm_density_lag_2', 'm_density_lag_3', 'target_1', 'target_2', 'target_3',
    ]]
)

# Sanity checks on the stacked frame.
# Every cfips maps to exactly one county part and one state part.
assert data.groupby('cfips')['county_i'].nunique().eq(1).all()
assert data.groupby('cfips')['state_i'].nunique().eq(1).all()
# 3135 distinct (state, county) keys are expected.
assert data['cfips'].nunique() == 3135
# 47 distinct time positions overall: 39 in the train set and 8 in the test set.
assert data['dcount'].nunique() == 47
assert data.loc[data['is_test'] == 0, 'dcount'].nunique() == 39
assert data.loc[data['is_test'] == 1, 'dcount'].nunique() == 8

# The private leaderboard will include 03-2023, 04-2023 and 05-2023.
# The public leaderboard currently covers only the first month, 11-2022; it will
# probably be extended later to 12-2022, 01-2023 and 02-2023.


In [338]:
# data.groupby(['cfips'])['target'].agg(lambda x: sum(x.isna())).value_counts()

In [339]:
# data['lag_minus1_density']  = data.groupby('cfips')['m_density'].shift(-1)
# data
# data.sort_values('target',ascending=False).head()
# data['target'].quantile(.99)

In [340]:
# data[data["cfips"] == '17075']

# Clean Outliers

In [341]:
# Same treatment for each growth target: missing values become 0, then both
# tails are capped with an IQR rule (fold=5). The single `capper` instance is
# re-fitted on every column, so after the loop it holds the fit for target_3.
capper = Winsorizer(capping_method='iqr', tail='both', fold=5)
for target_col in ('target_1', 'target_2', 'target_3'):
    data[target_col] = data[target_col].fillna(0)
    data[target_col] = capper.fit_transform(data[[target_col]])


In [342]:
# # check
# data['target_ratio'] = data['target_ratio'].abs()
# data.groupby('dcount')['target_ratio'].sum().plot()

# data['target_ratio_capped_1'] = data['target_ratio_capped_1'].abs()
# data.groupby('dcount')['target_ratio_capped_1'].sum().plot()


In [343]:
# # check
# temp = data.groupby(['cfips']).agg({'target_ratio':['mean','median','std']})
# temp.columns = ['mean','median','std']
# temp['ratio'] = temp['std']/(temp['median']+1e-10)
# temp = temp.sort_values('std',ascending=False)

# LEVEL1,LEVEL2,LEVEL3 = 1,2,5
# capper = Winsorizer(capping_method='iqr',tail='both', fold=LEVEL1)
# data[f'target_ratio_clean_{LEVEL1}'] = capper.fit_transform(data[['target_ratio']])
# capper = Winsorizer(capping_method='iqr',tail='both', fold=LEVEL2)
# data[f'target_ratio_clean_{LEVEL2}'] = capper.fit_transform(data[['target_ratio']])
# capper = Winsorizer(capping_method='iqr',tail='both', fold=LEVEL3)
# data[f'target_ratio_clean_{LEVEL3}'] = capper.fit_transform(data[['target_ratio']])

# for i in range(0,20):
#     try:
#         plt.figure()
#         x = data[(data['cfips'] == temp.index[i]) & (data['is_test'] == 0)][['target_ratio',f'target_ratio_clean_{LEVEL1}',f'target_ratio_clean_{LEVEL2}',f'target_ratio_clean_{LEVEL3}']]
#         # plt.plot(x[['target_ratio']].values.reshape(-1, 1))
#         plt.plot(x[[f'target_ratio_clean_{LEVEL1}']].values.reshape(-1, 1),'--', label=f'{LEVEL1}')
#         plt.plot(x[[f'target_ratio_clean_{LEVEL2}']].values.reshape(-1, 1),'--', label=f'{LEVEL2}')
#         plt.plot(x[[f'target_ratio_clean_{LEVEL3}']].values.reshape(-1, 1),'--', label=f'{LEVEL3}')
#         plt.legend()
#     except Exception as e:
#         print(e)
#         print(i)

    


# Extra Data

In [344]:
# census_starter = pd.read_csv('./data/census_starter.csv')

# census_starter = census_starter.assign(
#     cfips = lambda x: x['cfips'].astype(str)
#     )\
#     .set_index(['cfips']).sort_index(ascending=True)

# colname_tuples = [('_'.join(e.split('_')[:-1]),e.split('_')[-1]) for e in census_starter.columns.tolist()]
# new_index = pd.MultiIndex.from_tuples(colname_tuples, names=['category','year_info'])
# census_starter = census_starter.set_axis(new_index, axis=1).stack(level=1)
# census_starter = census_starter.reset_index()
# census_starter['year_available'] = census_starter['year_info'].astype(int) + 2



# census_starter = pd.read_csv('./data/census_starter.csv')

# census_starter = census_starter.assign(
#     cfips = lambda x: x['cfips'].astype(str)
#     )\
#     .set_index(['cfips']).sort_index(ascending=True)

# new_index = pd.MultiIndex.from_tuples([('_'.join(e.split('_')[:-1]),e.split('_')[-1]) for e in census_starter.columns.tolist()], names=['category','year'])
# census_starter = census_starter.set_axis(new_index, axis=1).stack(level=1)
# mean_census = census_starter.groupby(level='year').mean()
# mean_census

# Cross validation

In [345]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits, print_cv_info, print_split_info

In [402]:
# One target / lag column per forecast horizon (1, 2 and 3 periods ahead).
TARGETS = [f'target_{horizon}' for horizon in (1, 2, 3)]
LAG_DENSITY = [f'm_density_lag_{horizon}' for horizon in (1, 2, 3)]
# Test months and their matching `dcount` positions.
TEST_DATE = ['2022-11-01', '2022-12-01', '2023-01-01']
TEST_PERIOD = [39, 40, 41]

# For a quick experiment, restrict `data` to a few counties before copying,
# e.g. data[data.cfips.isin(['01001'])] or data[data.cfips.isin(['01001','56045'])].
sample = data.copy()
# Targets on test rows are unknown; blank them out.
sample.loc[sample['is_test'].eq(1), TARGETS] = np.nan

# Index by (date, cfips), keep the 2022-01 .. 2023-01 window and only the
# columns used for modelling.
sample = (
    sample
    .set_index(['date', 'cfips'])
    .sort_index()
    .loc['2022-01':'2023-01']
)
sample = sample[
    ['dcount', 'year', 'county_i', 'm_density', 'm_density_lag_1', 'm_density_lag_2', 'm_density_lag_3']
    + TARGETS
]

# Everything before the first test period is training data.
sample_train = sample[sample['dcount'].lt(TEST_PERIOD[0])]
sample_test = sample[sample['dcount'].ge(TEST_PERIOD[0])]

train_X, train_y = sample_train.drop(columns=TARGETS), sample_train[TARGETS]
test_X, test_y = sample_test.drop(columns=TARGETS), sample_test[TARGETS]

In [414]:
def check_score(X, y_true, y_pred, lag_col='m_density_lag_1'):
    """
    Print and return the SMAPE between true and predicted densities.

    The growth ratios are turned back into densities with
    density = (ratio + 1) * X[lag_col] before scoring.

    Parameters
    ----------
    X : DataFrame
        Feature frame aligned row-by-row with `y_true` / `y_pred`; it must
        contain the column named by `lag_col`.
    y_true, y_pred : array-like
        True and predicted growth ratios (density / lagged density - 1).
    lag_col : str, default 'm_density_lag_1'
        Lagged-density column the ratios are relative to. The default only
        matches 1-step ratios; for a ratio against a deeper lag pass that
        lag's column (e.g. 'm_density_lag_2'), otherwise the rebuilt values
        are not the real densities.

    Returns
    -------
    float
        The SMAPE value, which is also printed.
    """
    base_density = X[lag_col]
    m_density_target = (y_true + 1) * base_density
    m_density_pred = (y_pred + 1) * base_density
    error = SMAPE_1(m_density_target, m_density_pred)
    print('SMAPE SCORE',error)
    return error

In [459]:

from collections import defaultdict

# errors[model_i] -> list of validation SMAPE values, one per CV fold.
errors = defaultdict(list)

# One Series of predicted growth ratios per horizon, indexed like the test
# rows it was predicted for (so alignment never depends on row order).
test_preds = []
for model_i in range(3):
    # model_i = 0, 1, 2 predicts target_1, target_2, target_3. gap_size grows
    # with the horizon; presumably it keeps that many periods between the
    # training window and the validation period (see mlxtend docs).
    cv_args = {"test_size": 1, "n_splits": 3, "train_size": 5, 'gap_size': model_i}
    # To inspect the folds: plot_splits(sample, None, sample['dcount'], **cv_args)
    cv = GroupTimeSeriesSplit(**cv_args)

    for train_index, val_index in cv.split(train_X, train_y, train_X['dcount']):
        # SPLIT DATA
        X_train, y_train = train_X.iloc[train_index], train_y.iloc[train_index, model_i]
        X_val, y_val = train_X.iloc[val_index], train_y.iloc[val_index, model_i]

        # MODEL
        model = DummyRegressor(strategy="median")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        # check_score rebuilds densities from the column named
        # 'm_density_lag_1', but this horizon's target is a ratio against
        # LAG_DENSITY[model_i]. Hand it that lag under the name it reads so
        # the rebuilt densities are the real ones (a no-op for model_i == 0).
        X_score = X_val.assign(m_density_lag_1=X_val[LAG_DENSITY[model_i]])
        errors[model_i].append(check_score(X_score, y_val, y_pred))

    # INFERENCE: refit a fresh model on the whole training window instead of
    # reusing whatever instance the last fold left behind.
    model = DummyRegressor(strategy="median")
    model.fit(train_X, train_y.iloc[:, model_i])

    period_X = test_X[test_X['dcount'] == TEST_PERIOD[model_i]]
    test_preds.append(
        pd.Series(model.predict(period_X), index=period_X.index, name='target')
    )


# Turn predicted ratios into densities. The ratio for TEST_PERIOD[model_i] is
# relative to the density model_i + 1 periods earlier, which for all three
# horizons is the last training period, hence the per-cfips `.last()`.
output = pd.concat(test_preds).to_frame('target').reset_index()
output = output.merge(train_X.groupby('cfips')['m_density'].last(), how='left', on='cfips')
output = output.assign(
    # row_id is "<cfips as integer, no leading zeros>_<YYYY-MM-DD>".
    row_id = lambda df: df['cfips'].astype(int).astype(str) + '_' + df['date'].dt.strftime('%Y-%m-%d'),
    microbusiness_density = lambda df: (df['target']+1) * df['m_density']
    )[['row_id','microbusiness_density']]


SMAPE SCORE 2.5315404042788443
SMAPE SCORE 1.3239561547581042
SMAPE SCORE 0.9259471304232395
SMAPE SCORE 1.9234055010180007
SMAPE SCORE 3.0996207404097373
SMAPE SCORE 1.6980757479600894
SMAPE SCORE 2.596288845324702
SMAPE SCORE 2.4016135913874876
SMAPE SCORE 2.8611749767166095


In [460]:
# Per-model summary of the fold scores, then the grand mean over every fold of
# every model.
for model_id, fold_errors in errors.items():
    fold_mean, fold_std = np.mean(fold_errors), np.std(fold_errors)
    print(f"mean error for model_{model_id}:", fold_mean)
    print(f"std error for model_{model_id}:", fold_std)
    print()
all_errors = list(errors.values())
print("overall error:", np.mean(all_errors))

mean error for model_0: 1.593814563153396
std error for model_0: 0.6826908088336036

mean error for model_1: 2.2403673297959426
std error for model_1: 0.6145083086728033

mean error for model_2: 2.6196924711429332
std error for model_2: 0.18834359381634624

overall error: 2.1512914546974233


In [395]:
# Rows the model did not predict are taken unchanged from the sample
# submission so the file still contains every expected row_id.
not_predicted = ~sample_submission['row_id'].isin(output['row_id'])
submission = pd.concat((output, sample_submission[not_predicted]))

# Written without the index column.
submission.to_csv("data/0126_mean_benchmark_submission.csv", index=False)