# Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine.outliers import Winsorizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.3f}'.format
import warnings; warnings.filterwarnings('ignore')


In [2]:
def SMAPE_1(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE) on a 0-200 scale.

    Parameters
    ----------
    y_true, y_pred : scalar or array-like of numbers
        Actual and predicted values. Both are converted to float arrays and
        compared position by position.

    Returns
    -------
    numpy.float64
        Mean of ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``. Positions
        where both values are 0 contribute 0. NaN inputs propagate to the
        result because a plain mean is used.
    """
    # Coerce both inputs so scalars, lists and Series all behave the same way.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the pre-filled zeros cover
    # the 0/0 case without triggering a runtime warning.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

In [194]:
# Load the raw competition files.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# Mark where each row came from so the stacked frame can be split again.
train['is_test'] = 0
test['is_test'] = 1


def _density_shift(df, periods):
    """Shift `mdensity_t0` by `periods` rows within each `cfips` group."""
    return df.groupby('cfips')['mdensity_t0'].shift(periods)


def _density_growth(df, lag_col):
    """Relative change of `mdensity_t0` against the column `lag_col`.

    The ratio is clipped to [0, 99] before 1 is subtracted, rows whose lag is
    exactly 0 get 0, and any remaining NaN is replaced by 0.
    """
    growth = (df['mdensity_t0'] / df[lag_col]).clip(0, 99) - 1
    return np.nan_to_num(np.where(df[lag_col] == 0, 0, growth))


data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .assign(
        # Zero-padded 5-character county code: 2 state chars + 3 county chars.
        cfips=lambda df: df['cfips'].astype(str).str.zfill(5),
        date=lambda df: pd.to_datetime(df["first_day_of_month"]),
        mdensity_t0=lambda df: df['microbusiness_density'],
    )
    # The shifts below rely on rows being in date order within each county.
    .sort_values(['cfips', 'date'], ascending=True)
    .assign(
        state_i=lambda df: df['cfips'].str[:2],
        county_i=lambda df: df['cfips'].str[2:],
        year=lambda df: df['date'].dt.year,
        month=lambda df: df['date'].dt.month,
        # Running row number inside each county (0 for its first date).
        dcount=lambda df: df.groupby('cfips')['row_id'].cumcount(),

        # Density of the 1, 2 and 3 previous rows of the same county.
        mdensity_lag1=lambda df: _density_shift(df, 1),
        mdensity_lag2=lambda df: _density_shift(df, 2),
        mdensity_lag3=lambda df: _density_shift(df, 3),

        # Density of the current row and of the next 1 and 2 rows; NaN -> 0.
        target_1=lambda df: np.nan_to_num(_density_shift(df, 0)),
        target_2=lambda df: np.nan_to_num(_density_shift(df, -1)),
        target_3=lambda df: np.nan_to_num(_density_shift(df, -2)),

        # Relative change of the current density versus each lag.
        target_ratio_1=lambda df: _density_growth(df, 'mdensity_lag1'),
        target_ratio_2=lambda df: _density_growth(df, 'mdensity_lag2'),
        target_ratio_3=lambda df: _density_growth(df, 'mdensity_lag3'),
    )
    [[
        'cfips', 'date', 'dcount', 'county_i', 'state_i', 'month', 'year',
        'is_test', 'active', 'mdensity_t0',
        'mdensity_lag1', 'mdensity_lag2', 'mdensity_lag3',
        'target_ratio_1', 'target_ratio_2', 'target_ratio_3',
        'target_1', 'target_2', 'target_3',
    ]]
)
    # .sort_index(ascending=True)

# Every county code must resolve to exactly one county part and one state part.
for id_column in ('county_i', 'state_i'):
    assert data.groupby('cfips')[id_column].nunique().eq(1).all()

# 3135 distinct county codes are expected.
assert data['cfips'].nunique() == 3135
# 47 distinct within-county row positions overall ...
assert data['dcount'].nunique() == 47
# ... of which 39 occur in the train rows ...
assert data.loc[data['is_test'] == 0, 'dcount'].nunique() == 39
# ... and 8 occur in the test rows.
assert data.loc[data['is_test'] == 1, 'dcount'].nunique() == 8

# The private leaderboard will cover 03-2023, 04-2023 and 05-2023.
# The public leaderboard currently covers only the first month, 11-2022; it will probably be extended later to 12-2022, 01-2023 and 02-2023.

# Cap extreme values on both tails using the IQR rule with a fold of 5.
# NOTE(review): the caps are fitted on every row of `data`, i.e. train and test
# together, and the targets were already passed through nan_to_num — confirm
# that fitting on the combined frame is intended.
capper = Winsorizer(capping_method='iqr', tail='both', fold=5)

# The same capper object is refitted for each column, one column at a time,
# ratios first and absolute targets second.
for capped_column in (
    'target_ratio_1', 'target_ratio_2', 'target_ratio_3',
    'target_1', 'target_2', 'target_3',
):
    data[capped_column] = capper.fit_transform(data[[capped_column]])


In [219]:
# Show the last 20 rows of county '01001' to eyeball the lag, ratio and target columns.
data[data['cfips'] == '01001'].tail(20)

Unnamed: 0,cfips,date,dcount,county_i,state_i,month,year,is_test,active,mdensity_t0,mdensity_lag1,mdensity_lag2,mdensity_lag3,target_ratio_1,target_ratio_2,target_ratio_3,target_1,target_2,target_3
27,1001,2021-11-01,27,1,1,11,2021,0,1350.0,3.201,3.203,3.187,3.22,-0.001,0.004,-0.006,3.201,3.286,3.297
28,1001,2021-12-01,28,1,1,12,2021,0,1386.0,3.286,3.201,3.203,3.187,0.027,0.026,0.031,3.286,3.297,3.334
29,1001,2022-01-01,29,1,1,1,2022,0,1401.0,3.297,3.286,3.201,3.203,0.003,0.03,0.029,3.297,3.334,3.337
30,1001,2022-02-01,30,1,1,2,2022,0,1417.0,3.334,3.297,3.286,3.201,0.011,0.015,0.042,3.334,3.337,3.372
31,1001,2022-03-01,31,1,1,3,2022,0,1418.0,3.337,3.334,3.297,3.286,0.001,0.012,0.015,3.337,3.372,3.313
32,1001,2022-04-01,32,1,1,4,2022,0,1433.0,3.372,3.337,3.334,3.297,0.011,0.011,0.023,3.372,3.313,3.346
33,1001,2022-05-01,33,1,1,5,2022,0,1408.0,3.313,3.372,3.337,3.334,-0.017,-0.007,-0.006,3.313,3.346,3.438
34,1001,2022-06-01,34,1,1,6,2022,0,1422.0,3.346,3.313,3.372,3.337,0.01,-0.008,0.003,3.346,3.438,3.424
35,1001,2022-07-01,35,1,1,7,2022,0,1461.0,3.438,3.346,3.313,3.372,0.027,0.038,0.02,3.438,3.424,3.443
36,1001,2022-08-01,36,1,1,8,2022,0,1455.0,3.424,3.438,3.346,3.313,-0.004,0.023,0.033,3.424,3.443,3.464


In [212]:
# Scratch arithmetic: relative change from 3.438 to 3.464, two density values that appear in the table above.
(3.464/3.438)-1

0.0075625363583478045

In [198]:
# data['target_1']==

# Sample Data

In [4]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits, print_cv_info, print_split_info

In [214]:
# --- Column groups -----------------------------------------------------------
TARGETS = ['target_1', 'target_2', 'target_3']
# NOTE(review): TARGETS_RATIO is defined but not used in this cell, while the
# modelling loop converts predictions with (y + 1) * lag as if y were a ratio —
# confirm which target set train_y is supposed to hold.
TARGETS_RATIO = ['target_ratio_1', 'target_ratio_2', 'target_ratio_3']
LAG_DENSITY = ['mdensity_lag1', 'mdensity_lag2', 'mdensity_lag3']
LEAKAGE = ['mdensity_t0']

# --- Time configuration ------------------------------------------------------
# Sorted distinct dates of the train rows.
TRAIN_DATE = np.sort(data.loc[data['is_test'] == 0, 'date'].unique())
TEST_DATE = ['2022-11-01', '2022-12-01', '2023-01-01']
TEST_PERIOD = [39, 40, 41]
TRAIN_SIZE = 3

# --- Working sample ----------------------------------------------------------
# For quick experiments the frame can be restricted to a few counties first,
# e.g. data[data.cfips.isin(['01001', '56045'])].
sample = data.copy()

# Blank out the targets of the test rows.
sample.loc[sample['is_test'] == 1, TARGETS] = np.nan

# Index by (date, cfips), keep the 2022-01 .. 2023-01 window and only the
# columns used downstream.
sample = (
    sample
    .set_index(['date', 'cfips'])
    .sort_index()
    ['2022-01':'2023-01']
    [['dcount', 'year', 'county_i'] + LAG_DENSITY + TARGETS + LEAKAGE]
)

# Row positions below 39 form the train part, 39 and above the test part.
sample_train = sample[sample['dcount'] < 39]
sample_test = sample[sample['dcount'] >= 39]

train_X = sample_train.drop(TARGETS, axis='columns')
train_y = sample_train[TARGETS]

test_X = sample_test.drop(TARGETS, axis='columns')
test_y = sample_test[TARGETS]


# Pipelining

In [215]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns named in `features`.

    Parameters
    ----------
    features : column label or list of column labels
        Passed unchanged to ``X[...]`` in `transform`.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.features]``."""
        selected = X[self.features]
        return selected

class BaseTransformer(BaseEstimator, TransformerMixin):
    """Stateless pass-through transformer intended as a base class.

    `features` is stored but not used by this class itself, and
    ``get_feature_names_out`` is not implemented.
    """

    def __init__(self, features=None):
        self.features = features

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X`, leaving the input object untouched."""
        return X.copy()


    
class SimpleFeatureEngineering(BaseTransformer):
    """Build relative-change features from consecutive density lags.

    Expects `X` to contain the columns ``mdensity_lag1``, ``mdensity_lag2`` and
    ``mdensity_lag3`` and returns a frame with exactly two columns:

    * ``mdensity_ratio_1_2`` = lag1 / lag2 - 1
    * ``mdensity_ratio_2_3`` = lag2 / lag3 - 1
    """

    @staticmethod
    def _relative_change(numerator, denominator):
        """Return ``numerator / denominator - 1`` with undefined values set to 0.

        A zero denominator yields +/-inf (or NaN for 0/0) in pandas. Both are
        mapped to 0, matching how the `target_ratio_*` columns treat a zero lag.
        A plain ``fillna(0)`` would leave the infinities in place.
        """
        change = numerator / denominator - 1
        return change.replace([np.inf, -np.inf], np.nan).fillna(0)

    def transform(self, X):
        """Return the two ratio features, indexed like `X`; `X` is not modified."""
        X_transformed = X.copy()
        X_transformed['mdensity_ratio_1_2'] = self._relative_change(
            X_transformed['mdensity_lag1'], X_transformed['mdensity_lag2']
        )
        X_transformed['mdensity_ratio_2_3'] = self._relative_change(
            X_transformed['mdensity_lag2'], X_transformed['mdensity_lag3']
        )
        new_features = ['mdensity_ratio_1_2', 'mdensity_ratio_2_3']
        return X_transformed[new_features]


In [216]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from feature_engine.outliers import Winsorizer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config, get_config
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint

# feature_engineering = Pipeline([
#         ('select', ColumnSelector(['mdensity_lag1','mdensity_lag2','mdensity_lag3'])),
#         ('transform', SimpleFeatureEngineering())
#         ])

# feature_engineering


# Ask scikit-learn transformers for pandas output.
set_config(transform_output="pandas")

# Engineered features: the three lag columns go through SimpleFeatureEngineering
# and every other input column is dropped.
cols = ['mdensity_lag1','mdensity_lag2','mdensity_lag3']
lag_feature_steps = Pipeline([('lag_targets', SimpleFeatureEngineering())])
new_features = ColumnTransformer(
    [('lag_features', lag_feature_steps, cols)],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Raw pass-through features: the selection is currently empty.
cols = []
raw_features = Pipeline([('select', ColumnSelector(cols))])

# Engineered and raw features side by side.
merge_features_numeric = FeatureUnion(
    [
        ('new_features', new_features),
        ('raw_features', raw_features),
    ]
)

# Merge, cap outliers on both tails (IQR rule, fold 3), then standardise.
final_features_numeric = Pipeline(
    [
        ('p', merge_features_numeric),
        ('remove_outliers', Winsorizer(capping_method='iqr', tail='both', fold=3)),
        ('standart_scaler', StandardScaler()),
    ]
)

# Baseline estimator. Alternatives tried earlier: LinearRegression(), and
# TransformedTargetRegressor(regressor=LinearRegression(), transformer=MinMaxScaler()).
# NOTE(review): `constant=0` presumably only matters with strategy='constant',
# and the grid search later sets the strategy — confirm it is still wanted.
model = DummyRegressor(constant=0)

model_pipeline = Pipeline(
    [
        ("transform", final_features_numeric),
        ("model", model),
    ]
)



In [189]:
from collections import defaultdict
# List-valued collectors for errors and test predictions.
# NOTE(review): no visible code fills either of them (the only mention of
# `test_preds` is in a commented-out line) — confirm they are still needed.
errors = defaultdict(list)
test_preds = defaultdict(list)



def SMAPE(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE) on a 0-200 scale, ignoring NaNs.

    Parameters
    ----------
    y_true, y_pred : scalar or array-like of numbers
        Actual and predicted values. Both are converted to float arrays and
        compared position by position (index labels are not used).

    Returns
    -------
    numpy.float64
        NaN-ignoring mean of ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``.
        Positions where both values are 0 contribute 0. Positions where either
        value is NaN are skipped.
    """
    # Coerce both inputs so scalars, lists, arrays and Series all behave alike.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the pre-filled zeros cover
    # the 0/0 case without a runtime warning. NaN denominators compare as
    # "!= 0", so those positions stay NaN and are skipped by nanmean.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.nanmean(diff)


# SMAPE is an error measure, so it is registered with greater_is_better=False.
# The scorer and the grid do not depend on the model index, so they are built
# once instead of once per iteration.
smape_scorer = make_scorer(SMAPE, greater_is_better=False)
param_grid = {"model__strategy": ['mean']}

# Rows of the last TRAIN_SIZE training dates, used for the printed checks.
# NOTE(review): GridSearchCV presumably refits the best estimator on all of
# train_X, which contains these rows, so the printed scores would be
# in-sample — confirm.
holdout_X = train_X.loc[TRAIN_DATE[-TRAIN_SIZE:]]

for model_i in range(3):
    # One model per target column; the CV gap equals the model index.
    cv_args = {"test_size": 1, "n_splits": 3, "train_size": TRAIN_SIZE, 'gap_size': model_i}
    cv = GroupTimeSeriesSplit(**cv_args)

    train_y_i = train_y.iloc[:, model_i]

    grid = GridSearchCV(model_pipeline, scoring=smape_scorer, param_grid=param_grid, cv=cv)
    grid.fit(train_X, train_y_i, groups=train_X['dcount'])

    y_pred = grid.best_estimator_.predict(holdout_X)
    y_val_i = train_y_i.loc[TRAIN_DATE[-TRAIN_SIZE:]]

    # Rescale with the matching lag column.
    # NOTE(review): (y + 1) * lag treats y as a relative change — confirm that
    # train_y holds ratio targets rather than absolute densities.
    lag_density = holdout_X[f'mdensity_lag{model_i+1}']
    y_pred_target = (y_pred+1) * lag_density
    y_val_target = (y_val_i+1) * lag_density

    # First line: SMAPE on the modelled quantity; second line: after rescaling.
    print(SMAPE(y_true=y_val_i, y_pred=y_pred))
    print(SMAPE(y_true=y_val_target, y_pred=y_pred_target))

    

152.1912078949058
1.109003414526155
136.07246720848954
1.7328342300564805
124.7100031585511
2.3395521962826726


In [182]:
# 169.0067448718508, 152.1912078949058
# 1.0730837144785978, 1.109003414526155
# 145.82845110893408, 136.07246720848954
# 1.7214972858534574, 1.7328342300564805
# 130.85211125280742, 124.7100031585511
# 2.348016937375499, 2.3395521962826726

'mean'

In [108]:
# Summary statistics of the training density column that the targets are derived from.
train['microbusiness_density'].describe()

count   122265.000
mean         3.818
std          4.991
min          0.000
25%          1.639
50%          2.587
75%          4.519
max        284.340
Name: microbusiness_density, dtype: float64