In [270]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine.outliers import Winsorizer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from feature_engine.outliers import Winsorizer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config, get_config
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint
from collections import defaultdict
import tools
from sklearn.ensemble import GradientBoostingRegressor

set_config(transform_output="pandas")
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits, print_cv_info, print_split_info


# Widen pandas' display limits so long/wide frames are shown without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Show floats with two decimals in rendered output (display only).
pd.options.display.float_format = '{:.2f}'.format
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas chained-assignment and scikit-learn convergence/deprecation
# warnings. Consider narrowing it to specific categories.
import warnings; warnings.filterwarnings('ignore')


In [309]:
# --- Load raw competition files -------------------------------------------
# `revealed_test.csv` is concatenated onto the training data below, and its
# months are removed from the original test file.
old_train = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/train.csv')
new_train = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting_new/revealed_test.csv')

old_test = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/test.csv')
sample_submission = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/sample_submission.csv')

# `.assign` returns a new frame, so the flag is never written onto a filtered
# slice of `old_test` (the original `test['is_test'] = 1` was a chained
# assignment on a boolean-indexed slice).
train = pd.concat((old_train, new_train)).assign(is_test=0)
test = (
    old_test[~old_test['first_day_of_month'].isin(new_train['first_day_of_month'])]
    .assign(is_test=1)
)

# --- Build the modelling frame --------------------------------------------
# Two `.assign` calls are used on purpose: the second one needs the frame to
# be sorted by (cfips, date) so that cumcount/shift run in chronological order.
data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .assign(
        cfips=lambda df: df['cfips'].astype(str).str.zfill(5),
        date=lambda df: pd.to_datetime(df['first_day_of_month']),
        mdensity_t0=lambda df: df['microbusiness_density'],
        active_t0=lambda df: df['active'],
    )
    .sort_values(['cfips', 'date'], ascending=True)
    .assign(
        # cfips = 2-digit state code + 3-digit county code.
        state_i=lambda df: df['cfips'].str[:2],
        county_i=lambda df: df['cfips'].str[2:],

        # `year` must be derived before `date` is downgraded to datetime.date
        # (keyword arguments of `.assign` are evaluated in order).
        year=lambda df: df['date'].dt.year,
        date=lambda df: df['date'].dt.date,

        # 0-based position of the month inside each county's history.
        dcount=lambda df: df.groupby('cfips')['row_id'].cumcount(),

        # Per-county lags of the observed `active` count.
        active_lag1=lambda df: df.groupby('cfips')['active_t0'].shift(1),
        active_lag2=lambda df: df.groupby('cfips')['active_t0'].shift(2),
        active_lag3=lambda df: df.groupby('cfips')['active_t0'].shift(3),

        # The three targets are deliberately identical: each horizon's model
        # predicts the same `active` value but is fed a different lag column
        # (see the pipelining cell). NaNs are replaced by 0 by nan_to_num.
        target_0=lambda df: np.nan_to_num(df['active']),
        target_1=lambda df: np.nan_to_num(df['active']),
        target_2=lambda df: np.nan_to_num(df['active']),
    )
    .drop(['county', 'state'], axis='columns')
)

# --- Sanity checks on the assembled panel ---------------------------------
assert all(data.groupby('cfips')['county_i'].nunique() == 1), "a cfips maps to several county codes"
assert all(data.groupby('cfips')['state_i'].nunique() == 1), "a cfips maps to several state codes"
assert data['cfips'].nunique() == 3135, "expected 3135 distinct counties"
assert data['dcount'].nunique() == 47, "expected 47 monthly positions per county"
assert data.query('is_test==0')['dcount'].nunique() == 41, "expected 41 training months"
assert data.query('is_test==1')['dcount'].nunique() == 6, "expected 6 test months"

#The private leaderboard will include 03-2023, 04-2023, 05-2023
#The public leaderboard includes the first month 11-2022. Probably it will be updated later as 12-2022,01-2023 and 02-2023
#The LB is updated as 01-2023


In [310]:
# adding census data
# Columns read from each ACS S0101 export; `S0101_C01_026E` is the column
# used as `population` below.
COLS = ['GEO_ID','NAME','S0101_C01_026E']


def load_census_year(year):
    """Read one ACS S0101 export and return a tidy (cfips, year, population) frame.

    Parameters
    ----------
    year : int
        Year embedded in the export's file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``cfips`` (5-character zero-padded string), ``year``
        (``year + 2``, i.e. the modelling year the figure is attached to) and
        ``population`` (int).
    """
    raw = pd.read_csv(f'./data/raw/census_data_1/ACSST5Y{year}.S0101-Data.csv', usecols=COLS)
    # The first data row is dropped -- presumably the human-readable label row
    # that these exports carry under the header; TODO confirm against the file.
    body = raw.iloc[1:]
    # Building a fresh frame avoids assigning columns onto an `iloc` slice.
    return pd.DataFrame({
        'cfips': body['GEO_ID'].apply(lambda geo_id: f"{int(geo_id.split('US')[-1]):05}"),
        'year': year + 2,
        'population': body['S0101_C01_026E'].astype('int'),
    })


# One frame per export year, stacked row-wise. `data_census` is only ever a
# DataFrame (it used to be a list first and a frame afterwards).
data_census = pd.concat(
    [load_census_year(year) for year in range(2017, 2022)],
    axis='rows',
)


In [311]:
# Attach the census population to every (county, year) row.
# * Dropping a pre-existing `population` column makes the cell safe to re-run
#   (otherwise a second run yields `population_x` / `population_y`).
# * `validate` makes pandas raise if the census table has duplicated
#   (cfips, year) keys, which would silently multiply rows of `data`.
data = (
    data
    .drop(columns=['population'], errors='ignore')
    .merge(data_census, on=['cfips','year'], how='left', validate='many_to_one')
)

In [312]:
# Preview the merged frame; `population` is the last column.
data.head()

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density,active,is_test,date,mdensity_t0,active_t0,state_i,county_i,year,dcount,active_lag1,active_lag2,active_lag3,target_0,target_1,target_2,population
0,1001_2019-08-01,1001,2019-08-01,3.01,1249.0,0,2019-08-01,3.01,1249.0,1,1,2019,0,,,,1249.0,1249.0,1249.0,41527
1,1001_2019-09-01,1001,2019-09-01,2.88,1198.0,0,2019-09-01,2.88,1198.0,1,1,2019,1,1249.0,,,1198.0,1198.0,1198.0,41527
2,1001_2019-10-01,1001,2019-10-01,3.06,1269.0,0,2019-10-01,3.06,1269.0,1,1,2019,2,1198.0,1249.0,,1269.0,1269.0,1269.0,41527
3,1001_2019-11-01,1001,2019-11-01,2.99,1243.0,0,2019-11-01,2.99,1243.0,1,1,2019,3,1269.0,1198.0,1249.0,1243.0,1243.0,1243.0,41527
4,1001_2019-12-01,1001,2019-12-01,2.99,1243.0,0,2019-12-01,2.99,1243.0,1,1,2019,4,1243.0,1269.0,1198.0,1243.0,1243.0,1243.0,41527


# Sample Data

In [313]:
# PARAMETERS
# Cross-validation layout: number of folds and months per training window.
n_SPLITS = 5
n_TRAIN_TRAIN_SIZE = 30
# Number of most recent training months kept for modelling.
n_TRAIN_PERIOD = n_TRAIN_TRAIN_SIZE + 3 + n_SPLITS - 1

# Split the panel once by the train/test flag.
observed_rows = data[data['is_test'] == 0]
unobserved_rows = data[data['is_test'] == 1]

# First three test months, as dates and as within-county month positions.
TEST_DATES = list(np.sort(unobserved_rows['date'].unique())[:3])
TEST_PERIOD = list(np.sort(unobserved_rows['dcount'].unique())[:3])

# Last `n_TRAIN_PERIOD` training months, in the same two representations.
TRAIN_PERIOD = list(np.sort(observed_rows['dcount'].unique())[-n_TRAIN_PERIOD:])
TRAIN_DATES = list(np.sort(observed_rows['date'].unique())[-n_TRAIN_PERIOD:])

# Column groups used when assembling the modelling sample.
LEAKAGE = ['mdensity_t0','active']
TARGETS = ['target_0', 'target_1', 'target_2']
FEATURES = ['population']
LAG_TARGET = ['active_lag1', 'active_lag2', 'active_lag3']

In [314]:
# To debug on a handful of counties, filter `data` on `cfips` before copying,
# e.g. data[data.cfips.isin(['01001', '56045'])].
sample = data.copy()

# Targets of test rows are unknown: blank them out.
sample.loc[sample['is_test'] == 1, TARGETS] = np.nan

# Index by (date, cfips), keep only the modelling window, then keep only the
# columns that are used downstream.
sample = sample.set_index(['date', 'cfips']).sort_index()
sample = sample.loc[TRAIN_DATES + TEST_DATES]
sample = sample[['dcount', 'county_i'] + LAG_TARGET + TARGETS + FEATURES + LEAKAGE]

# Row-wise split by month position.
sample_train = sample[sample['dcount'].isin(TRAIN_PERIOD)]
sample_test = sample[sample['dcount'].isin(TEST_PERIOD)]

# Separate predictors from targets.
train_X = sample_train.drop(columns=TARGETS)
train_y = sample_train[TARGETS]
test_X = sample_test.drop(columns=TARGETS)
test_y = sample_test[TARGETS]


In [315]:
# test_X

# Pipelining

In [316]:
# Direct multi-horizon forecast: one pipeline per horizon (model_i = 0, 1, 2).
# The pipeline for horizon h only sees `active_lag{h+1}`; presumably this is
# the most recent observed value for test month h, since `active` is blanked
# for test rows -- verify against the raw test file.
dic_pipelines = {}
y_test_preds  = []       # raw per-horizon test predictions, in TEST_DATES order
test_pred_series = []    # the same predictions carrying their (date, cfips) index

y_val_preds = defaultdict(list)
errors = defaultdict(list)

lag=1
list_cols_model = [[f'active_lag{lag_i+model_i+1}' for lag_i in range(lag)] for model_i in range(3)]

# ---- loop-invariant settings (hoisted out of the per-horizon loop) ----------
cv_args = {"test_size": 1, "n_splits": n_SPLITS, "train_size": n_TRAIN_TRAIN_SIZE, 'gap_size': 0}
smape_scorer = make_scorer(tools.SMAPE_1, greater_is_better=False)
param_grid = {}  # nothing to tune yet; GridSearchCV is used only for the CV scores

# Hold-out check: train on the 30 months before the last training month and
# validate on that last month.
check_train_period = TRAIN_DATES[-1-n_TRAIN_TRAIN_SIZE: -1]
validation_period = TRAIN_DATES[-1]
# Final fit for inference: the 30 most recent training months.
final_train_period = TRAIN_DATES[-n_TRAIN_TRAIN_SIZE:]

for model_i in range(3):

    train_y_i = train_y.iloc[:, model_i]

    # A fresh splitter per horizon, exactly as before (it may keep state).
    cv = GroupTimeSeriesSplit(**cv_args)

    # Feature branch: select the lag column(s) of this horizon.
    raw_features = Pipeline([('select', tools.ColumnSelector(features=list_cols_model[model_i]))])

    merge_features_numeric = FeatureUnion([
        ('raw_features', raw_features)
    ])

    final_features_numeric = Pipeline([
        ('merge_features', merge_features_numeric),
    ])

    # `tools.LagModel` is a project helper whose behaviour is not visible here.
    model = TransformedTargetRegressor(regressor=tools.LagModel(), transformer=None)

    model_pipeline = Pipeline([
        ("transform", final_features_numeric),
        ("model", model)
    ])

    dic_pipelines[f'pipeline_model_{model_i}'] = model_pipeline

    # Time-ordered CV; groups are the month positions so a month is never split.
    grid = GridSearchCV(model_pipeline, scoring=smape_scorer, param_grid=param_grid, cv=cv)
    grid.fit(train_X, train_y_i, groups=train_X['dcount'])

    ## CHECK: refit on the check window and score on the validation month.
    best_model = grid.best_estimator_

    best_model.fit(train_X.loc[check_train_period], train_y_i.loc[check_train_period])

    y_val_pred = best_model.predict(train_X.loc[validation_period])
    y_val_preds[f'target_{model_i}'] = y_val_pred
    y_val_i = train_y_i.loc[validation_period]
    errors[f'error_{model_i}'] = tools.SMAPE_1(y_true=y_val_i, y_pred=y_val_pred)

    ## INFERENCE: refit on the most recent window, predict this horizon's month.
    best_model.fit(train_X.loc[final_train_period], train_y_i.loc[final_train_period])

    test_date_i = TEST_DATES[model_i]
    y_test_pred = best_model.predict(test_X.loc[test_date_i])
    y_test_preds.append(y_test_pred)

    # Keep the (date, cfips) index next to the values so the write-back below
    # aligns on labels instead of relying on the row order of `test_X`.
    # (`.loc[[date]]` keeps both index levels; `.loc[date]` would drop `date`.)
    test_index_i = test_X.loc[[test_date_i]].index
    test_pred_series.append(pd.Series(np.ravel(y_test_pred), index=test_index_i))

# Label-aligned write-back of the three horizons' predictions.
test_active_pred = pd.concat(test_pred_series)
assert len(test_active_pred) == len(test_X), "predictions do not cover every test row"
test_X['active'] = test_active_pred

# prepare data for error analysis: validation-month predictors + predictions
val_X = train_X.loc[validation_period]
y_val_preds =  pd.DataFrame(y_val_preds, index=val_X.index)
val_X = pd.concat((val_X, y_val_preds), axis=1)

# Convert predicted counts to the competition's density unit.
test_X['microbusiness_density'] = 100*test_X['active']/test_X['population']

In [317]:
# Validation score of each horizon's model (`tools.SMAPE_1` on the last
# training month, TRAIN_DATES[-1]).
errors

defaultdict(list,
            {'error_0': 1.8892067573580058,
             'error_1': 2.4787835188289034,
             'error_2': 2.9725079862458226})

In [347]:
# # # Prepare submission file

# date_submission = '0203'
# local_score = round(errors['error_0'],2)
# model_name = 'activity_lag_1'

# submission = tools.create_submission(test_X,date_submission, model_name, local_score, sample_submission)

# ERROR ANALYSIS

In [None]:
# Rows of the validation month, ordered by their horizon-0 prediction error.
# Fixes: `TRAIN_DATE` was undefined (the constant is `TRAIN_DATES`), and
# `errors` holds one aggregate score per horizon, so it cannot rank rows.
train_sample = train_X.loc[TRAIN_DATES[-1:]]

# Per-row symmetric absolute percentage error between the observed `active`
# count and the horizon-0 prediction stored in `val_X`. `val_X` holds the same
# rows in the same order as `train_sample`, so positions line up.
# NOTE(review): this is the textbook formula; confirm it matches tools.SMAPE_1.
row_errors = (
    200 * (val_X['target_0'] - val_X['active']).abs()
    / (val_X['target_0'].abs() + val_X['active'].abs())
)

# Ascending order: best-predicted counties first, NaN (0/0) rows last.
train_sample.iloc[np.argsort(row_errors.to_numpy())]

In [None]:
# Two naive baselines for reference.
# Fixes: `TRAIN_DATE` -> `TRAIN_DATES`; bare `SMAPE_1` -> `tools.SMAPE_1`;
# the index holds datetime.date objects, so string keys such as '2022-10-01'
# do not match; a single target column is used so shapes agree (the three
# target columns are identical by construction).
train_y_sample = train_y.loc[TRAIN_DATES[-1:]]
last_month_actual = train_y_sample['target_0'].values

# Baseline 1: predict the cross-county median for every county.
median_prediction = np.ones(train_y_sample.shape[0]) * np.median(train_y_sample)
smape_median = tools.SMAPE_1(y_true=last_month_actual, y_pred=median_prediction)

# Baseline 2: predict October 2022 with the September 2022 value.
oct_2022 = pd.Timestamp('2022-10-01').date()
sep_2022 = pd.Timestamp('2022-09-01').date()
smape_last_value = tools.SMAPE_1(
    y_true=train_y.loc[oct_2022, 'target_0'].values,
    y_pred=train_y.loc[sep_2022, 'target_0'].values,
)

# Display both scores (previously only the last expression was shown).
smape_median, smape_last_value

In [None]:
# 169.0067448718508, 152.1912078949058
# 1.0730837144785978, 1.109003414526155
# 145.82845110893408, 136.07246720848954
# 1.7214972858534574, 1.7328342300564805
# 130.85211125280742, 124.7100031585511
# 2.348016937375499, 2.3395521962826726

In [None]:
# Exclude the counties behind the 50 lowest and 50 highest `mdensity_t0` rows.
rows_by_density = train_X.sort_values(['mdensity_t0'])

black_list = []
black_list.extend(rows_by_density[:50].reset_index()['cfips'].unique())
black_list.extend(rows_by_density[-50:].reset_index()['cfips'].unique())

# `sorted` makes the list reproducible; a bare set has arbitrary order.
keep = sorted(set(train_X.reset_index()['cfips'].unique()) - set(black_list))

# A boolean mask keeps the original (date, cfips) row order, whereas indexing
# with an unordered label list may reorder rows. `train_y` shares `train_X`'s
# index, so one mask serves both frames.
keep_mask = train_X.index.get_level_values('cfips').isin(keep)
train_X_sample = train_X[keep_mask]
train_y_sample = train_y[keep_mask]

In [None]:
# Full training predictors, for comparison with the filtered sample below.
train_X.head()

In [None]:
# Training predictors after removing the black-listed counties.
train_X_sample.head()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import HuberRegressor, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor

def smape(Y_predict, Y_test):
    """Mean row-wise symmetric absolute percentage error, in percent.

    For every row ``i`` the score is
    ``200 * ||p_i - t_i|| / (||p_i|| + ||t_i||)`` with Euclidean norms taken
    across columns; the function returns the mean over rows.

    Parameters
    ----------
    Y_predict, Y_test : array-like
        Predictions and ground truth of identical shape, either
        ``(n_rows, n_cols)`` or 1-D ``(n_rows,)``. 1-D input is treated as a
        single column, which reduces the norms to absolute values.

    Returns
    -------
    float
        Mean score in the range [0, 200]. Rows where both vectors are all
        zero contribute 0 instead of NaN (0/0).
    """
    Y_predict = np.asarray(Y_predict, dtype=float)
    Y_test = np.asarray(Y_test, dtype=float)
    # Promote vectors to one-column matrices so that `axis=1` is valid.
    if Y_predict.ndim == 1:
        Y_predict = Y_predict.reshape(-1, 1)
    if Y_test.ndim == 1:
        Y_test = Y_test.reshape(-1, 1)

    # Norms are non-negative, so no extra np.abs is needed.
    numer = np.linalg.norm(Y_predict - Y_test, axis=1)
    denom = np.linalg.norm(Y_predict, axis=1) + np.linalg.norm(Y_test, axis=1)

    # Divide only where the denominator is non-zero; other rows keep the 0
    # they were initialised with, so they cannot turn the mean into NaN.
    ratio = np.divide(numer, denom, out=np.zeros_like(numer), where=denom != 0)
    return np.mean(ratio * (100 * 2))
# Search over the offset of a reciprocal target transform z = 1 / (param + y)
# for a gradient-boosting model.
#
# NOTE(review): this cell referenced `mdensity_lag1..3`, string date keys and
# a bare `SMAPE_1`, none of which exist in the notebook's current state:
# `train_X` carries `active_lag1..3` (LAG_TARGET), its index holds
# datetime.date objects, and the metric lives in `tools`. It has been ported
# to those names; the scores recorded in the comments under this cell were
# obtained with the earlier feature set and will not be reproduced exactly.
epsilon = 1e-6
param_search = np.arange(10, 200, 20)


def rows_between(frame, start, end):
    """Return the rows of `frame` whose `date` index level lies in [start, end]."""
    dates = frame.index.get_level_values('date')
    return frame[(dates >= start) & (dates <= end)]


# Windows used by the experiment, as datetime.date objects.
fit_start = pd.Timestamp('2022-05-01').date()
insample_start = pd.Timestamp('2022-02-01').date()
window_end = pd.Timestamp('2022-09-01').date()
holdout_date = pd.Timestamp('2022-10-01').date()

# The target is passed as a 1-D series (previously a one-column frame), so
# predictions are 1-D as well.
fit_X = rows_between(train_X, fit_start, window_end)[LAG_TARGET]
fit_y = rows_between(train_y, fit_start, window_end)['target_0']
insample_X = rows_between(train_X, insample_start, window_end)[LAG_TARGET]
insample_y = rows_between(train_y, insample_start, window_end)['target_0']
holdout_X = train_X.loc[holdout_date, LAG_TARGET]
holdout_y = train_y.loc[holdout_date, 'target_0']

scores = []
for i in param_search:

    # definition of ztransformation; `param=i` freezes this iteration's value.
    def ztransform1(Y, param=i):
        return 1 / (param + Y)

    # inverse transformation, Y = inverseZ(Z)
    def inverseZ1(Z, param=i):
        return -param + 1 / Z

    model = TransformedTargetRegressor(
        # random_state added so repeated runs give the same trees.
        GradientBoostingRegressor(loss='squared_error', n_estimators=50, max_depth=10, random_state=0),
        func=ztransform1,
        inverse_func=inverseZ1,
    )
    model.fit(fit_X, fit_y)

    # `epsilon` keeps predictions strictly away from zero before scoring.
    scores.append({
        'param': int(i),
        'smape_in_sample': tools.SMAPE_1(y_true=insample_y.values, y_pred=epsilon + model.predict(insample_X)),
        'smape_holdout': tools.SMAPE_1(y_true=holdout_y.values, y_pred=epsilon + model.predict(holdout_X)),
    })

# One row per tried offset.
pd.DataFrame(scores)
# 160
# 1.3528178494715637
# 1.4203927419328115
# 190
# 1.3527584337361627
# 1.4161016190620148
