In [283]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine.outliers import Winsorizer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from feature_engine.outliers import Winsorizer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config, get_config
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint
from collections import defaultdict

# Ask scikit-learn to hand back pandas objects from transform / fit_transform.
set_config(transform_output="pandas")
from mlxtend.evaluate.time_series import (
    GroupTimeSeriesSplit,
    plot_splits,
    print_cv_info,
    print_split_info,
)

# Display settings for this notebook: up to 500 rows/columns, 1000-character
# wide frames, floats rendered with three decimals.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.3f}'.format)

# NOTE: this silences every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')


In [284]:
def SMAPE_1(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE), expressed in percent.

    Computes ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    Elements where both values are zero contribute an error of 0.

    Parameters
    ----------
    y_true : scalar or array-like of numbers
        Observed values.
    y_pred : scalar or array-like of numbers
        Predicted values; broadcast against ``y_true`` with numpy rules.

    Returns
    -------
    numpy.floating
        Mean SMAPE over all elements (NaN inputs propagate to the result).
    """
    # Coerce both sides so scalars, lists and pandas objects are treated alike;
    # atleast_1d keeps scalar input from producing an unindexable numpy scalar.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with (both values are zero there).
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

In [285]:
# Raw competition files. `revealed_test.csv` is appended to the training data
# below (presumably months of the original test period whose labels were
# published later - TODO confirm).
old_train = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/train.csv')
new_train = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting_new/revealed_test.csv')

old_test = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/test.csv')
sample_submission = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/sample_submission.csv')

# .assign() returns new frames, so the flag is never written into a filtered
# view of `old_test` (the original item assignment triggered a
# SettingWithCopyWarning that the global warnings filter was hiding).
train = pd.concat((old_train, new_train)).assign(is_test=0)
test = (
    old_test[~old_test['first_day_of_month'].isin(new_train['first_day_of_month'])]
    .assign(is_test=1)
)

data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .assign(
        # 5-character, zero-padded county code kept as a string.
        cfips=lambda df: df['cfips'].astype(str).str.zfill(5),
        date=lambda df: pd.to_datetime(df["first_day_of_month"]).dt.date,
        mdensity_t0=lambda df: df['microbusiness_density'],
    )
    # Lags and cumcount below rely on rows being in time order within a county.
    .sort_values(['cfips', 'date'], ascending=True)
    .assign(
        state_i=lambda df: df['cfips'].str[:2],
        county_i=lambda df: df['cfips'].str[2:],

        # Position of the month inside each county's series (0-based).
        dcount=lambda df: df.groupby('cfips')['row_id'].cumcount(),

        mdensity_lag1=lambda df: df.groupby('cfips')['mdensity_t0'].shift(1),
        mdensity_lag2=lambda df: df.groupby('cfips')['mdensity_t0'].shift(2),
        mdensity_lag3=lambda df: df.groupby('cfips')['mdensity_t0'].shift(3),

        # All three targets are the same series with NaN replaced by 0.
        target_0=lambda df: np.nan_to_num(df['mdensity_t0']),
        target_1=lambda df: np.nan_to_num(df['mdensity_t0']),
        target_2=lambda df: np.nan_to_num(df['mdensity_t0']),
    )
    .drop(['county', 'state'], axis='columns')
)

assert all(data.groupby('cfips')['county_i'].nunique() == 1)
assert all(data.groupby('cfips')['state_i'].nunique() == 1)
assert data['cfips'].nunique() == 3135 # there are 3135 (county, state) tuples
assert data['dcount'].nunique() == 47 # each county has a series of 47 months
assert data.query('is_test==0')['dcount'].nunique() == 41 # 41 of those months are in the train set
assert data.query('is_test==1')['dcount'].nunique() == 6  # 6 of those months are in the test set

# The private leaderboard will include 03-2023, 04-2023 and 05-2023.
# The public leaderboard started with the first month, 11-2022, and was
# expected to move on to 12-2022, 01-2023 and 02-2023.
# The leaderboard has since been updated to 01-2023.

# Optional outlier capping of the targets (disabled):
# capper = Winsorizer(capping_method='iqr',tail='both', fold=5)
# data['target_0'] = capper.fit_transform(data[['target_0']])
# data['target_1'] = capper.fit_transform(data[['target_1']])
# data['target_2'] = capper.fit_transform(data[['target_2']])

In [286]:
# Last rows of one county's series, as a quick look at the test-period rows.
data.loc[data['cfips'].eq('01001')].tail()

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density,active,is_test,date,mdensity_t0,state_i,county_i,dcount,mdensity_lag1,mdensity_lag2,mdensity_lag3,target_0,target_1,target_2
131670,1001_2023-02-01,1001,2023-02-01,,,1,2023-02-01,,1,1,42,,3.471,3.443,0.0,0.0,0.0
134805,1001_2023-03-01,1001,2023-03-01,,,1,2023-03-01,,1,1,43,,,3.471,0.0,0.0,0.0
137940,1001_2023-04-01,1001,2023-04-01,,,1,2023-04-01,,1,1,44,,,,0.0,0.0,0.0
141075,1001_2023-05-01,1001,2023-05-01,,,1,2023-05-01,,1,1,45,,,,0.0,0.0,0.0
144210,1001_2023-06-01,1001,2023-06-01,,,1,2023-06-01,,1,1,46,,,,0.0,0.0,0.0


In [287]:
# data[data['cfips'] == '01001'].tail(20)

# Sample Data

In [288]:
# PARAMETERS
n_SPLITS = 5             # passed as `n_splits` to the time-series splitter
n_TRAIN_TRAIN_SIZE = 3   # passed as `train_size`: months in each training window
# Number of most recent training months kept for modelling.
# NOTE(review): the literal 3 is presumably the number of forecast horizons - confirm.
n_TRAIN_PERIOD = n_TRAIN_TRAIN_SIZE + 3 + n_SPLITS - 1

# First three test months, as calendar dates and as per-county positions.
TEST_DATES = list(np.sort(data.loc[data['is_test'] == 1, 'date'].unique())[:3])
TEST_PERIOD = list(np.sort(data.loc[data['is_test'] == 1, 'dcount'].unique())[:3])

# Last `n_TRAIN_PERIOD` training months, in the same two representations.
TRAIN_PERIOD = list(np.sort(data.loc[data['is_test'] == 0, 'dcount'].unique())[-n_TRAIN_PERIOD:])
TRAIN_DATES = list(np.sort(data.loc[data['is_test'] == 0, 'date'].unique())[-n_TRAIN_PERIOD:])

# Column groups used when assembling the modelling frame.
LEAKAGE = ['mdensity_t0']
TARGETS = ['target_0', 'target_1', 'target_2']
LAG_DENSITY = ['mdensity_lag1', 'mdensity_lag2', 'mdensity_lag3']

In [289]:
# data[data['dcount'].isin(TEST_PERIOD)].head()
# sample = data[data.cfips.isin(['01001'])] # sample = data[data.cfips.isin(['01001','56045'])]
# Work on a copy so `data` itself is left untouched.
sample = data.copy()
# Targets of test rows were zero-filled upstream; blank them out again.
sample.loc[sample['is_test'] == 1, TARGETS] = np.nan

# Index by (date, cfips), keep only the modelling window and the model columns.
sample = (
    sample
    .set_index(['date', 'cfips'])
    .sort_index()
    .loc[TRAIN_DATES + TEST_DATES]
    [['dcount', 'county_i'] + LAG_DENSITY + TARGETS + LEAKAGE]
)

sample_train = sample[sample['dcount'].isin(TRAIN_PERIOD)]
sample_test = sample[sample['dcount'].isin(TEST_PERIOD)]

train_X, train_y = sample_train.drop(columns=TARGETS), sample_train[TARGETS]
test_X, test_y = sample_test.drop(columns=TARGETS), sample_test[TARGETS]


# Pipelining

In [290]:
from tools import ColumnSelector, LagModel, create_submission
from sklearn.ensemble import GradientBoostingRegressor

In [291]:
# Fit one pipeline per forecast horizon. Model `i` predicts TEST_DATES[i]
# from the lag feature(s) `mdensity_lag{i+1}`..., i.e. from the most recent
# value that is still known at that horizon.
dic_pipelines = {}
y_test_preds = []

# Plain dicts: every key is assigned exactly once below, so the list default
# of defaultdict was never used and would only mask a misspelled key later.
y_val_preds = {}
errors = {}

lag = 1
n_models = len(TARGETS)
list_cols_model = [
    [f'mdensity_lag{lag_i+model_i+1}' for lag_i in range(lag)]
    for model_i in range(n_models)
]

# Values that do not depend on the horizon are built once.
cv_args = {"test_size": 1, "n_splits": n_SPLITS, "train_size": n_TRAIN_TRAIN_SIZE, 'gap_size': 0}
smape_scorer = make_scorer(SMAPE_1, greater_is_better=False)
check_train_period = TRAIN_DATES[-1-n_TRAIN_TRAIN_SIZE: -1]   # window right before the hold-out month
validation_period = TRAIN_DATES[-1]                           # hold-out month for the local check
final_train_period = TRAIN_DATES[-n_TRAIN_TRAIN_SIZE:]        # most recent window, used for inference

for model_i in range(n_models):

    train_y_i = train_y.iloc[:, model_i]

    cv = GroupTimeSeriesSplit(**cv_args)

    # Feature pipeline: select this horizon's lag columns. The FeatureUnion and
    # outer Pipeline are kept as extension points for engineered features,
    # outlier capping or scaling.
    raw_features = Pipeline([('select', ColumnSelector(features=list_cols_model[model_i]))])
    merge_features_numeric = FeatureUnion([
        ('raw_features', raw_features)
    ])
    final_features_numeric = Pipeline([
        ('merge_features', merge_features_numeric),
    ])

    model = TransformedTargetRegressor(regressor=LagModel(), transformer=None)

    model_pipeline = Pipeline([
        ("transform", final_features_numeric),
        ("model", model)
    ])
    dic_pipelines[f'pipeline_model_{model_i}'] = model_pipeline

    # Empty grid: the search only cross-validates the single configuration.
    param_grid = {}
    grid = GridSearchCV(model_pipeline, scoring=smape_scorer, param_grid=param_grid, cv=cv)
    grid.fit(train_X, train_y_i, groups=train_X['dcount'])

    best_model = grid.best_estimator_

    ## CHECK: refit on the window before the hold-out month and score that month.
    best_model.fit(train_X.loc[check_train_period], train_y_i.loc[check_train_period])

    y_val_pred = best_model.predict(train_X.loc[validation_period])
    y_val_preds[f'target_{model_i}'] = y_val_pred
    y_val_i = train_y_i.loc[validation_period]
    errors[f'error_{model_i}'] = SMAPE_1(y_true=y_val_i, y_pred=y_val_pred)

    ## INFERENCE: refit on the most recent window and predict this horizon's month.
    best_model.fit(train_X.loc[final_train_period], train_y_i.loc[final_train_period])

    y_test_pred = best_model.predict(test_X.loc[TEST_DATES[model_i]])
    y_test_preds.append(y_test_pred)

# The per-horizon predictions are written back positionally, which is only
# correct if test_X is laid out as one block per test date in TEST_DATES order.
assert list(test_X.index.get_level_values('date').unique()) == TEST_DATES, \
    "test_X rows are not ordered by TEST_DATES; positional assignment would misalign predictions"
test_X['microbusiness_density'] = np.concatenate(y_test_preds)

# prepare data for error analysis: hold-out rows plus one prediction column per model
val_X = train_X.loc[validation_period]
y_val_preds = pd.DataFrame(y_val_preds, index=val_X.index)
val_X = pd.concat((val_X, y_val_preds), axis=1)

In [292]:
# SMAPE of each per-horizon model on the hold-out month (the last training month).
errors

defaultdict(list,
            {'error_0': 1.889206717018118,
             'error_1': 2.4787836068174856,
             'error_2': 2.9725080424835664})

In [293]:
# Scratch check: zero-pad an integer to a width of three characters.
x = 11
format(x, "03")

'011'

In [313]:
COLS = ['GEO_ID','NAME','S0101_C01_026E']


def _load_adult_population(csv_path, usecols=COLS):
    """Read one ACS S0101 extract and add a zero-padded 5-digit `cfips` column.

    The first data row is skipped, as in the original cell (presumably the
    descriptive label row of the ACS download - TODO confirm).
    """
    # .copy() so the column writes below never target a slice of the raw frame.
    census = pd.read_csv(csv_path, usecols=usecols).iloc[1:].copy()
    census['S0101_C01_026E'] = census['S0101_C01_026E'].astype('int')
    # The county code is whatever follows 'US' in GEO_ID.
    census['cfips'] = census['GEO_ID'].apply(lambda geo_id: f"{int(geo_id.split('US')[-1]):05}")
    return census


df2020 = _load_adult_population('./data/raw/census_data_1/ACSST5Y2020.S0101-Data.csv')
df2021 = _load_adult_population('./data/raw/census_data_1/ACSST5Y2021.S0101-Data.csv')

# cfips -> S0101_C01_026E (named "adult" here) for each census vintage.
adult2020 = df2020.set_index('cfips')['S0101_C01_026E'].to_dict()
adult2021 = df2021.set_index('cfips')['S0101_C01_026E'].to_dict()

sub = test_X.reset_index()
sub['adult2020'] = sub['cfips'].map(adult2020)
sub['adult2021'] = sub['cfips'].map(adult2021)

# Rescale the forecast from the 2020 population base to the 2021 base.
adjusted_density = sub['microbusiness_density'] * sub['adult2020'] / sub['adult2021']
# Counties missing from either census file (or with a zero 2021 population)
# keep their unadjusted forecast instead of turning into NaN/inf.
has_population = sub['adult2020'].notna() & sub['adult2021'].notna() & (sub['adult2021'] != 0)
sub['microbusiness_density'] = adjusted_density.where(has_population, sub['microbusiness_density'])
# sub = sub.drop(['adult2020','adult2021','cfips'],axis=1)
# sub.to_csv('submission.csv',index=False)
# sub.head()

In [316]:
# Restore the (date, cfips) index on the adjusted predictions.
new_test_X = sub.set_index(keys=['date', 'cfips'])

In [319]:
# Prepare submission file

model_name = 'adjusted_lag_1'
date_submission = '0103'
# Hold-out SMAPE of the first-horizon model, rounded to two decimals.
local_score = round(errors['error_0'], ndigits=2)

submission = create_submission(
    new_test_X,
    date_submission,
    model_name,
    local_score,
    sample_submission,
)

submission is created for date: 0103 model: adjusted_lag_1 with score: 1.89


Unnamed: 0,row_id,microbusiness_density
0,1001_2023-01-01,3.319
1,1003_2023-01-01,7.935
2,1005_2023-01-01,1.24
3,1007_2023-01-01,1.287
4,1009_2023-01-01,1.819


In [205]:
# sample_submission

In [311]:
# One county-month, used to check how the density is derived from `active`.
data[(data['cfips'] == "01001") & (data['first_day_of_month'] == "2022-06-01")]

# 42496  (the denominator used in the next cell)

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density,active,is_test,date,mdensity_t0,state_i,county_i,dcount,mdensity_lag1,mdensity_lag2,mdensity_lag3,target_0,target_1,target_2
34,1001_2022-06-01,1001,2022-06-01,3.346,1422.0,0,2022-06-01,3.346,1,1,34,3.313,3.372,3.337,3.346,3.346,3.346


In [312]:
# Sanity check: active (1422) / 42496 * 100 reproduces the microbusiness_density
# of 3.346 shown for cfips 01001 in 2022-06 above.
1422.000/42496*100

3.3461972891566263

In [251]:
# Scratch check against an expected density of 7.090.
# NOTE(review): this expression evaluates to about 22.008, not 7.090 - the inputs look stale.
100 * 9780/44438


22.00819118772222

In [None]:
# ERROR ANALYSIS

In [227]:

# Per-county SMAPE of each model's hold-out prediction (stored in `target_{i}`)
# against the realised density. This is SMAPE_1's formula applied element-wise,
# vectorised instead of a row-wise apply that builds two lists per row.
for model_i in range(3):
    actual = val_X['mdensity_t0']
    predicted = val_X[f'target_{model_i}']
    denominator = (actual.abs() + predicted.abs()) / 200.0
    row_error = (actual - predicted).abs() / denominator
    # Both values zero -> error 0, matching SMAPE_1's convention.
    val_X[f'error_{model_i}'] = row_error.where(denominator != 0, 0.0)

# x = val_X['error_0'].expanding().mean()
# x = x[x>4]
# x


In [228]:
# Hold-out rows with each model's prediction (target_*) and per-row SMAPE (error_*).
val_X

Unnamed: 0_level_0,dcount,county_i,mdensity_lag1,mdensity_lag2,mdensity_lag3,mdensity_t0,target_0,target_1,target_2,error_0,error_1,error_2
cfips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
01001,40,001,3.443,3.464,3.443,3.471,3.443,3.464,3.443,0.817,0.204,0.817
01003,40,003,8.258,8.360,8.342,8.251,8.258,8.360,8.342,0.085,1.314,1.098
01005,40,005,1.247,1.232,1.207,1.252,1.247,1.232,1.207,0.404,1.626,3.696
01007,40,007,1.276,1.287,1.315,1.287,1.276,1.287,1.315,0.877,0.000,2.160
01009,40,009,1.836,1.832,1.852,1.852,1.836,1.832,1.852,0.855,1.101,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...
56037,40,037,2.814,2.824,2.811,2.877,2.814,2.824,2.811,2.193,1.861,2.304
56039,40,039,26.372,26.273,26.091,26.315,26.372,26.273,26.091,0.218,0.159,0.856
56041,40,041,4.016,4.009,3.996,4.051,4.016,4.009,3.996,0.854,1.026,1.370
56043,40,043,3.143,3.127,3.209,3.060,3.143,3.127,3.209,2.667,2.139,4.749


In [222]:
# val_X

In [None]:
# Counties to inspect. `x` used to leak in from a scratch cell (where it is the
# integer 11), so it is rebuilt here with the recipe from the commented-out
# lines of the error-analysis cell: rows whose running mean of error_0 exceeds 4.
running_error = val_X['error_0'].expanding().mean()
x = running_error[running_error > 4]

for e in x.index:
    # One figure per county (the original opened a second, empty figure).
    fig, ax = plt.subplots()
    county_history = data.loc[data['cfips'] == e, 'mdensity_t0'].reset_index(drop=True)
    county_history.plot(ax=ax)
    # After reset_index the x position equals `dcount`, so the prediction is
    # drawn at the hold-out month itself rather than at a hard-coded 38.
    ax.scatter(val_X.loc[e, 'dcount'], val_X.loc[e, 'target_0'], color='r')
    ax.set(title=f'cfips {e}', xlabel='dcount', ylabel='mdensity_t0')
    
    

In [None]:
# x_sample['error_1']= x_sample[['mdensity_t0','pred']].apply(lambda x: SMAPE_1(x[[0]],x[[1]]),axis=1)
# x_sample['error_2']= x_sample[['mdensity_t0','mdensity_lag1']].apply(lambda x: SMAPE_1(x[[0]],x[[1]]),axis=1)a

In [None]:
# x_sample = x_sample.sort_values(['error_1'],ascending=False)


In [None]:
# errors = [SMAPE_1(y_val_i[[i]],y_pred[[i]]) for i,y_pred_i in enumerate(y_pred)]
# errors = [SMAPE_1(y_val_i[[i]],train_X.loc[TRAIN_DATE[-1:],'mdensity_lag1'].iloc[[i]]) for i,y_pred_i in enumerate(y_pred)]


In [None]:
# y_pred[np.argsort(errors)]
# errors= np.sort(errors)

# cum_errors = pd.Series(errors).expanding().mean()
# cum_errors.plot()

In [None]:
# Rows of the last training month, ordered from smallest to largest hold-out
# error of the first-horizon model.
# Fixes: `TRAIN_DATE` was never defined (the constant is TRAIN_DATES), and
# `errors` is a dict of three aggregate scores, not one error per row; the
# per-row errors live in val_X['error_0'], whose rows are these same rows.
train_sample = train_X.loc[TRAIN_DATES[-1:]]
train_sample.iloc[np.argsort(val_X['error_0'].to_numpy())]

In [None]:
# Two naive baselines:
#   1. predict the overall median of the last training month's targets;
#   2. predict 2022-10 with the values observed in 2022-09.
# Fixes: `TRAIN_DATE` was undefined (TRAIN_DATES); the constant prediction is
# now shaped like the target frame (one column per target); the index holds
# datetime.date objects, so the months are looked up with dates, not strings;
# both scores are displayed instead of silently discarding the first.
train_y_sample = train_y.loc[TRAIN_DATES[-1:]]
median_baseline = SMAPE_1(
    train_y_sample.values,
    np.full(train_y_sample.shape, np.median(train_y_sample.values)),
)

october = pd.Timestamp('2022-10-01').date()
september = pd.Timestamp('2022-09-01').date()
previous_month_baseline = SMAPE_1(train_y.loc[october].values, train_y.loc[september].values)

median_baseline, previous_month_baseline

In [None]:
# 169.0067448718508, 152.1912078949058
# 1.0730837144785978, 1.109003414526155
# 145.82845110893408, 136.07246720848954
# 1.7214972858534574, 1.7328342300564805
# 130.85211125280742, 124.7100031585511
# 2.348016937375499, 2.3395521962826726

In [None]:
# Drop every county that appears among the 50 lowest or 50 highest
# `mdensity_t0` rows of the training window, then rebuild X / y for the rest.
density_sorted = train_X.sort_values(['mdensity_t0'])  # sort once, slice both ends

black_list = []
black_list.extend(density_sorted[:50].reset_index()['cfips'].unique())
black_list.extend(density_sorted[-50:].reset_index()['cfips'].unique())

# sorted() gives a reproducible order; a bare list(set(...)) depends on string
# hashing and could change the row order of the selection between sessions.
keep = sorted(set(train_X.reset_index()['cfips'].unique()) - set(black_list))

train_X_sample = train_X.loc[(slice(None),keep),:].reset_index().set_index(['date','cfips'])
train_y_sample = train_y.loc[(slice(None),keep),:].reset_index().set_index(['date','cfips'])

In [None]:
# First rows of the full training features, for comparison with the filtered sample.
train_X.head(n=5)

In [None]:
# First rows of the training features after removing black-listed counties.
train_X_sample.head(n=5)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import HuberRegressor, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor

def smape(Y_predict, Y_test):
    """Row-wise SMAPE (in percent) based on Euclidean norms.

    For every row the error is
    ``200 * ||Y_predict - Y_test|| / (||Y_predict|| + ||Y_test||)``;
    the mean over rows is returned. Rows where both vectors have zero norm
    count as 0 error (the same convention as ``SMAPE_1``) instead of NaN.

    Parameters
    ----------
    Y_predict, Y_test : array-like
        Shape (n_samples, n_outputs). 1-D input is treated as n_samples rows
        with a single output. Values are compared positionally.

    Returns
    -------
    numpy.floating
        Mean row-wise SMAPE.
    """
    Y_predict = np.atleast_1d(np.asarray(Y_predict, dtype=float))
    Y_test = np.atleast_1d(np.asarray(Y_test, dtype=float))
    # A flat vector means "one output per sample", not "one sample".
    if Y_predict.ndim == 1:
        Y_predict = Y_predict.reshape(-1, 1)
    if Y_test.ndim == 1:
        Y_test = Y_test.reshape(-1, 1)

    # Norms are non-negative, so no extra abs() is needed.
    numerator = np.linalg.norm(Y_predict - Y_test, axis=1)
    denom = np.linalg.norm(Y_predict, axis=1) + np.linalg.norm(Y_test, axis=1)

    # Divide only where defined; untouched slots keep their initial 0.0.
    ratio = np.divide(numerator, denom, out=np.zeros_like(numerator), where=denom != 0)
    return np.mean(ratio * (100 * 2))
# Search over the offset `param` of a reciprocal target transform
# z = 1 / (param + y), fitting a gradient-boosting model on the lag features.
epsilon = 1e-6
param_search = np.arange(10, 200, 20)

lag_cols = ['mdensity_lag1', 'mdensity_lag2', 'mdensity_lag3']

# The `date` index level holds datetime.date objects, so the windows are built
# from dates; string labels cannot be compared with them when slicing.
fit_window = slice(pd.Timestamp('2022-05-01').date(), pd.Timestamp('2022-09-01').date())
insample_window = slice(pd.Timestamp('2022-02-01').date(), pd.Timestamp('2022-09-01').date())
holdout_date = pd.Timestamp('2022-10-01').date()

# One record per tried `param` (the list used to stay empty).
scores = []
for i in param_search:
    print(i)

    # definition of ztransformation; `param=i` binds the current value.

    def ztransform1(Y, param=i):
        return 1 / (param + Y)

    # inverse transformation, Y = inverseZ(Z)

    def inverseZ1(Z, param=i):
        return -param + 1 / Z

    model = TransformedTargetRegressor(
        # random_state pins the run so repeated executions give the same scores.
        GradientBoostingRegressor(loss='squared_error', n_estimators=50, max_depth=10, random_state=0),
        func=ztransform1,
        inverse_func=inverseZ1,
    )

    model.fit(train_X.loc[fit_window, lag_cols], train_y.loc[fit_window, ['target_0']])

    # SMAPE is symmetric, so passing the prediction first does not change the score.
    insample_smape = SMAPE_1(
        epsilon + model.predict(train_X.loc[insample_window, lag_cols]),
        train_y.loc[insample_window, ['target_0']].values,
    )
    holdout_smape = SMAPE_1(
        epsilon + model.predict(train_X.loc[holdout_date, lag_cols]),
        train_y.loc[holdout_date, ['target_0']].values,
    )
    print(insample_smape)
    print(holdout_smape)
    scores.append({'param': i, 'insample_smape': insample_smape, 'holdout_smape': holdout_smape})
    
# 160
# 1.3528178494715637
# 1.4203927419328115
# 190
# 1.3527584337361627
# 1.4161016190620148
