In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine.outliers import Winsorizer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from feature_engine.outliers import Winsorizer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config, get_config
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint
from collections import defaultdict
import tools
from sklearn.ensemble import GradientBoostingRegressor

# Ask scikit-learn transformers to return pandas DataFrames instead of arrays.
set_config(transform_output="pandas")
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits, print_cv_info, print_split_info


# Widen pandas' display limits so wide/long frames are shown less truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Show floats with two decimals in rendered output.
pd.options.display.float_format = '{:.2f}'.format
# NOTE(review): this silences every warning for the whole session, including
# pandas/scikit-learn deprecation and chained-assignment warnings.
import warnings; warnings.filterwarnings('ignore')


In [56]:
# --- Load the raw competition files -------------------------------------------
old_train = pd.read_csv('./data/kaggle/train.csv')
new_train = pd.read_csv('./data/kaggle/revealed_test.csv')

old_test = pd.read_csv('./data/kaggle/test.csv')
sample_submission = pd.read_csv('./data/kaggle/sample_submission.csv')

# revealed_test.csv is used as additional training data, so every month it
# contains is removed from the test side.
# `.assign` returns new frames. The previous `test['is_test'] = 1` wrote onto a
# boolean-mask slice of `old_test` (chained assignment), which only went
# unnoticed because warnings are globally suppressed.
train = pd.concat((old_train, new_train)).assign(is_test=0)
test = old_test[~old_test['first_day_of_month'].isin(new_train['first_day_of_month'])].assign(is_test=1)

# --- Build the modelling panel: one row per (cfips, month) --------------------
# Step 1 normalises keys/dates and keeps copies of the observed values (`*_t0`).
# Step 2 runs after sorting by (cfips, date) so that the per-county cumcount and
# shift() calls follow chronological order.
data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .assign(
        cfips=lambda df: df['cfips'].astype(str).str.zfill(5),
        date=lambda df: pd.to_datetime(df["first_day_of_month"]),
        mdensity_t0=lambda df: df['microbusiness_density'],
        active_t0=lambda df: df['active'],
    )
    .sort_values(['cfips', 'date'], ascending=True)
    .assign(
        # First two characters of the zero-padded cfips, and the remainder.
        state_i=lambda df: df['cfips'].str[:2],
        county_i=lambda df: df['cfips'].str[2:],

        # `year` must be derived before `date` is downgraded to datetime.date.
        year=lambda df: df['date'].dt.year,
        date=lambda df: df["date"].dt.date,
        # month = lambda df: df['date'].dt.month,

        # Running month counter within each county (0 = first month).
        dcount=lambda df: df.groupby('cfips')['row_id'].cumcount(),

        # `active` observed k months earlier in the same county.
        active_lag1=lambda df: df.groupby('cfips')['active_t0'].shift(1),
        active_lag2=lambda df: df.groupby('cfips')['active_t0'].shift(2),
        active_lag3=lambda df: df.groupby('cfips')['active_t0'].shift(3),
        active_lag4=lambda df: df.groupby('cfips')['active_t0'].shift(4),
        active_lag5=lambda df: df.groupby('cfips')['active_t0'].shift(5),
        active_lag6=lambda df: df.groupby('cfips')['active_t0'].shift(6),

        # Targets are ratios of the current value to the value 1/2/3 months back.
        # nan_to_num turns missing ratios (no lag yet, or unlabeled rows) into 0.
        # NOTE(review): a lag value of 0 makes the ratio +inf, which nan_to_num
        # replaces with the largest finite float (~1.8e308) - confirm such rows
        # are filtered or harmless before training on these targets.
        target_0=lambda df: np.nan_to_num(df['active'] / df['active_lag1']),
        target_1=lambda df: np.nan_to_num(df['active'] / df['active_lag2']),
        target_2=lambda df: np.nan_to_num(df['active'] / df['active_lag3']),
        # target_1 = lambda df: np.nan_to_num(df['active']),
        # target_2 = lambda df: np.nan_to_num(df['active']),
    )
    .drop(['county', 'state'], axis='columns')
)
# .sort_index(ascending=True)

# --- Sanity checks on the assembled panel -------------------------------------
assert data.groupby('cfips')['county_i'].nunique().eq(1).all()
assert data.groupby('cfips')['state_i'].nunique().eq(1).all()
assert data['cfips'].nunique() == 3135  # 3135 distinct counties (cfips codes)
assert data['dcount'].nunique() == 47  # 47 distinct month positions overall
assert data.query('is_test==0')['dcount'].nunique() == 41  # 41 month positions carry labels (train)
assert data.query('is_test==1')['dcount'].nunique() == 6   # 6 month positions are unlabeled (test)

# Leaderboard notes:
# - The private leaderboard will cover 03-2023, 04-2023 and 05-2023.
# - The public leaderboard first covered 11-2022 and was expected to roll forward
#   (12-2022, 01-2023, 02-2023).
# - The public leaderboard has since been updated to 01-2023.


In [58]:
# Eyeball the assembled panel (lags and targets per cfips/month); pandas truncates the display.
data

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density,active,is_test,date,mdensity_t0,active_t0,state_i,county_i,year,dcount,active_lag1,active_lag2,active_lag3,active_lag4,active_lag5,active_lag6,target_0,target_1,target_2
0,1001_2019-08-01,01001,2019-08-01,3.01,1249.00,0,2019-08-01,3.01,1249.00,01,001,2019,0,,,,,,,0.00,0.00,0.00
1,1001_2019-09-01,01001,2019-09-01,2.88,1198.00,0,2019-09-01,2.88,1198.00,01,001,2019,1,1249.00,,,,,,0.96,0.00,0.00
2,1001_2019-10-01,01001,2019-10-01,3.06,1269.00,0,2019-10-01,3.06,1269.00,01,001,2019,2,1198.00,1249.00,,,,,1.06,1.02,0.00
3,1001_2019-11-01,01001,2019-11-01,2.99,1243.00,0,2019-11-01,2.99,1243.00,01,001,2019,3,1269.00,1198.00,1249.00,,,,0.98,1.04,1.00
4,1001_2019-12-01,01001,2019-12-01,2.99,1243.00,0,2019-12-01,2.99,1243.00,01,001,2019,4,1243.00,1269.00,1198.00,1249.00,,,1.00,0.98,1.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134804,56045_2023-02-01,56045,2023-02-01,,,1,2023-02-01,,,56,045,2023,42,,101.00,100.00,100.00,100.00,100.00,0.00,0.00,0.00
137939,56045_2023-03-01,56045,2023-03-01,,,1,2023-03-01,,,56,045,2023,43,,,101.00,100.00,100.00,100.00,0.00,0.00,0.00
141074,56045_2023-04-01,56045,2023-04-01,,,1,2023-04-01,,,56,045,2023,44,,,,101.00,100.00,100.00,0.00,0.00,0.00
144209,56045_2023-05-01,56045,2023-05-01,,,1,2023-05-01,,,56,045,2023,45,,,,,101.00,100.00,0.00,0.00,0.00


In [59]:
# Census population: one S0101 CSV per survey year, stacked into a single frame.
COLS = ['GEO_ID','NAME','S0101_C01_026E']


def _read_census_year(acs_year):
    """Return a (cfips, year, population) frame for one census file.

    Reads ``./data/census/ACSST5Y{acs_year}.S0101-Data.csv``, skips its first
    data row (presumably a descriptive label row - TODO confirm), casts
    ``S0101_C01_026E`` to int as ``population``, derives a zero-padded 5-digit
    ``cfips`` from the part of ``GEO_ID`` after ``'US'``, and stamps the rows
    with ``acs_year + 2`` so each survey year joins the panel two years later.
    """
    raw = pd.read_csv(f'./data/census/ACSST5Y{acs_year}.S0101-Data.csv', usecols=COLS)
    return (
        raw.iloc[1:]
        .assign(
            population=lambda df: df['S0101_C01_026E'].astype('int'),
            cfips=lambda df: df['GEO_ID'].apply(lambda geo_id: f"{int(geo_id.split('US')[-1]):05}"),
            year=acs_year + 2,
        )
        .loc[:, ['cfips', 'year', 'population']]
    )


data_census = pd.concat(
    [_read_census_year(acs_year) for acs_year in range(2017, 2022)],
    axis='rows',
)


In [60]:
# Attach the census population to every (cfips, year) row of the panel.
# - Dropping an already-present `population` column keeps this cell idempotent:
#   re-running it used to produce `population_x` / `population_y` instead.
# - validate='many_to_one' raises pandas' MergeError if `data_census` ever holds
#   duplicate (cfips, year) keys, instead of silently duplicating panel rows.
data = (
    data
    .drop(columns='population', errors='ignore')
    .merge(data_census, on=['cfips', 'year'], how='left', validate='many_to_one')
)

In [61]:
# Quick look at the first rows to confirm the merge added `population`.
data.head()

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density,active,is_test,date,mdensity_t0,active_t0,state_i,county_i,year,dcount,active_lag1,active_lag2,active_lag3,active_lag4,active_lag5,active_lag6,target_0,target_1,target_2,population
0,1001_2019-08-01,1001,2019-08-01,3.01,1249.0,0,2019-08-01,3.01,1249.0,1,1,2019,0,,,,,,,0.0,0.0,0.0,41527
1,1001_2019-09-01,1001,2019-09-01,2.88,1198.0,0,2019-09-01,2.88,1198.0,1,1,2019,1,1249.0,,,,,,0.96,0.0,0.0,41527
2,1001_2019-10-01,1001,2019-10-01,3.06,1269.0,0,2019-10-01,3.06,1269.0,1,1,2019,2,1198.0,1249.0,,,,,1.06,1.02,0.0,41527
3,1001_2019-11-01,1001,2019-11-01,2.99,1243.0,0,2019-11-01,2.99,1243.0,1,1,2019,3,1269.0,1198.0,1249.0,,,,0.98,1.04,1.0,41527
4,1001_2019-12-01,1001,2019-12-01,2.99,1243.0,0,2019-12-01,2.99,1243.0,1,1,2019,4,1243.0,1269.0,1198.0,1249.0,,,1.0,0.98,1.04,41527


# Sample Data

In [62]:
# PARAMETERS
# Cross-validation layout: number of folds and months per training window.
n_SPLITS = 5
n_TRAIN_TRAIN_SIZE = 6
# Months of labelled history kept for modelling.
n_TRAIN_PERIOD = n_TRAIN_TRAIN_SIZE + 3 + n_SPLITS - 1


def _sorted_unique(frame, column):
    """Sorted distinct values of ``frame[column]``."""
    return np.sort(frame[column].unique())


_labelled_rows = data.query('is_test==0')
_unlabelled_rows = data.query('is_test==1')

# First three unlabeled months (as dates and as month counters).
TEST_DATES = list(_sorted_unique(_unlabelled_rows, 'date')[:3])
TEST_PERIOD = list(_sorted_unique(_unlabelled_rows, 'dcount')[:3])

# Most recent n_TRAIN_PERIOD labelled months (as month counters and as dates).
TRAIN_PERIOD = list(_sorted_unique(_labelled_rows, 'dcount')[-n_TRAIN_PERIOD:])
TRAIN_DATES = list(_sorted_unique(_labelled_rows, 'date')[-n_TRAIN_PERIOD:])

# Column groups used when slicing the panel.
LEAKAGE = ['mdensity_t0','active_t0']
TARGETS = ['target_0', 'target_1', 'target_2']
FEATURES = ['population']
LAG_TARGET = ['active_lag1', 'active_lag2', 'active_lag3','active_lag4','active_lag5','active_lag6']

In [63]:
# Debug helpers kept for reference:
#   data[data['dcount'].isin(TEST_PERIOD)].head()
#   sample = data[data.cfips.isin(['01001'])]   # or ['01001','56045']
sample = data.copy()

# Targets on unlabeled rows were zero-filled upstream; mark them missing instead.
_is_unlabelled = sample.is_test == 1
sample.loc[_is_unlabelled, TARGETS] = np.nan

# Index by (date, cfips), keep only the modelling window and the needed columns.
_window_dates = TRAIN_DATES + TEST_DATES
_kept_columns = ['dcount', 'county_i'] + LAG_TARGET + TARGETS + FEATURES + LEAKAGE
sample = sample.set_index(['date', 'cfips']).sort_index().loc[_window_dates]
sample = sample[_kept_columns]

# Split by month counter into labelled (train) and unlabeled (test) parts.
sample_train = sample.query("dcount in @TRAIN_PERIOD")
sample_test = sample.query("dcount in @TEST_PERIOD")

# Separate predictors from the three ratio targets.
train_X = sample_train.drop(columns=TARGETS)
train_y = sample_train[TARGETS]
test_X = sample_test.drop(columns=TARGETS)
test_y = sample_test[TARGETS]


In [64]:
# Inspect the training predictors, indexed by (date, cfips).
train_X

Unnamed: 0_level_0,Unnamed: 1_level_0,dcount,county_i,active_lag1,active_lag2,active_lag3,active_lag4,active_lag5,active_lag6,population,mdensity_t0,active_t0
date,cfips,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-12-01,01001,28,001,1350.00,1351.00,1344.00,1358.00,1354.00,1359.00,42175,3.29,1386.00
2021-12-01,01003,28,003,13162.00,13048.00,12998.00,13192.00,13301.00,13456.00,166595,7.93,13211.00
2021-12-01,01005,28,005,231.00,228.00,225.00,232.00,230.00,222.00,20054,1.15,231.00
2021-12-01,01007,28,007,220.00,212.00,212.00,216.00,221.00,221.00,17862,1.21,216.00
2021-12-01,01009,28,009,768.00,767.00,766.00,758.00,760.00,759.00,44292,1.75,776.00
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-01,56037,40,037,902.00,905.00,901.00,901.00,909.00,892.00,32049,2.88,922.00
2022-12-01,56039,40,039,5054.00,5035.00,5000.00,4999.00,4971.00,4916.00,19164,26.31,5043.00
2022-12-01,56041,40,041,583.00,582.00,580.00,577.00,578.00,567.00,14516,4.05,588.00
2022-12-01,56043,40,043,190.00,189.00,194.00,194.00,195.00,189.00,6045,3.06,185.00


# Pipelining

In [65]:
from lightgbm import LGBMRegressor

In [68]:
# Keyword arguments forwarded verbatim to LGBMRegressor(**lgb_params).
lgb_params = dict(
    # boosting schedule and objective
    n_iter=200,
    verbosity=-1,
    objective='l1',
    random_state=42,
    # randomisation / column subsampling
    extra_trees=True,
    colsample_bytree=0.88,
    colsample_bynode=0.93,
    # tree shape, step size and regularisation
    max_depth=9,
    learning_rate=0.015,
    lambda_l1=4.7,
    lambda_l2=6.7,
    num_leaves=541,
    min_data_in_leaf=243,
)


# lgb_params = {
#     'n_iter': 200,
#     'verbosity': 3,
#     'objective': 'l1',
#     'random_state': 42,
#     'extra_trees': True,
#     'colsample_bytree': 0.95,
#     'colsample_bynode': 0.95,
#     'max_depth': 100,
#     'learning_rate': 0.1,
#     'num_leaves': 10,
#     'min_data_in_leaf': 10
#     }


In [91]:
# Each of the three horizon models sees `lag` consecutive lag columns,
# starting one lag further back per horizon.
lag = 3
list_cols_model = [
    [f'active_lag{k}' for k in range(first_lag, first_lag + lag)]
    for first_lag in (1, 2, 3)
]
list_cols_model

[['active_lag1', 'active_lag2', 'active_lag3'],
 ['active_lag2', 'active_lag3', 'active_lag4'],
 ['active_lag3', 'active_lag4', 'active_lag5']]

In [92]:

def _make_horizon_pipeline(feature_cols):
    """Build the unfitted pipeline for one forecast horizon.

    The "transform" stage selects ``feature_cols`` through
    ``tools.ColumnSelector`` (wrapped in a single-member FeatureUnion so more
    feature branches can be added later). The "model" stage is an
    LGBMRegressor built from ``lgb_params`` inside a TransformedTargetRegressor
    with no target transformer configured.
    """
    raw_features = Pipeline([('select', tools.ColumnSelector(features=feature_cols))])
    merge_features_numeric = FeatureUnion([('raw_features', raw_features)])
    # Winsorizer outlier capping and StandardScaler steps are currently disabled here.
    final_features_numeric = Pipeline([('merge_features', merge_features_numeric)])
    model = TransformedTargetRegressor(regressor=LGBMRegressor(**lgb_params))
    return Pipeline([
        ("transform", final_features_numeric),
        ("model", model),
    ])


dic_pipelines = {}
y_test_preds = []

y_val_preds = defaultdict(list)
errors = defaultdict(list)

# Hold-out check: fit on the n_TRAIN_TRAIN_SIZE months before the last labelled
# month, score on that last month.
check_train_period = TRAIN_DATES[-1-n_TRAIN_TRAIN_SIZE: -1]
validation_period = TRAIN_DATES[-1]

for model_i in range(3):

    # Ratio target for this horizon (target_0 / target_1 / target_2).
    train_y_i = train_y.iloc[:, model_i]

    # Rolling-origin CV grouped by month counter.
    # NOTE(review): gap_size is 0 for every horizon; for the 2- and 3-step
    # targets the training folds then contain months that would not yet be
    # observed at forecast time - confirm this is intended.
    cv_args = {"test_size": 1, "n_splits": n_SPLITS, "train_size": n_TRAIN_TRAIN_SIZE, 'gap_size': 0}
    cv = GroupTimeSeriesSplit(**cv_args)

    pipeline_key = f'pipeline_model_{model_i}'
    dic_pipelines[pipeline_key] = _make_horizon_pipeline(list_cols_model[model_i])

    # Empty grid: the search only cross-validates the pipeline as configured.
    param_grid = {}
    smape_scorer = make_scorer(tools.SMAPE_1, greater_is_better=False)
    grid = GridSearchCV(dic_pipelines[pipeline_key], scoring=smape_scorer, param_grid=param_grid, cv=cv)
    grid.fit(train_X, train_y_i, groups=train_X['dcount'])

    # print(grid.cv_results_)
    # print(grid.best_estimator_)

    ## CHECK
    # Refit the selected estimator on the check window and score the last month.
    best_model = grid.best_estimator_
    best_model.fit(train_X.loc[check_train_period], train_y_i.loc[check_train_period])

    y_val_pred = best_model.predict(train_X.loc[validation_period])
    y_val_i = train_y_i.loc[validation_period]

    y_val_preds[f'target_{model_i}'] = y_val_pred
    errors[f'error_{model_i}'] = tools.SMAPE_1(y_true=y_val_i, y_pred=y_val_pred)

    # INFERENCE (disabled)
    # final_train_period = TRAIN_DATES[-n_TRAIN_TRAIN_SIZE:]
    # best_model.fit(train_X.loc[final_train_period], train_y_i.loc[final_train_period])
    # y_test_pred =  best_model.predict(test_X.loc[TEST_DATES[model_i]] )
    # y_test_preds.append(y_test_pred)

# test_X['active'] = np.concatenate((y_test_preds))

# Prepare data for error analysis: validation-month predictors side by side
# with the predicted ratios of all three horizons.
val_X = train_X.loc[validation_period]
y_val_preds = pd.DataFrame(y_val_preds, index=val_X.index)
val_X = pd.concat((val_X, y_val_preds), axis=1)

errors



defaultdict(list,
            {'error_0': 1.8537019626124678,
             'error_1': 2.4264903309547314,
             'error_2': 2.8509509551177343})

In [93]:
# Hold-out tools.SMAPE_1 score per horizon model (keys error_0 .. error_2).
errors

defaultdict(list,
            {'error_0': 1.8537019626124678,
             'error_1': 2.4264903309547314,
             'error_2': 2.8509509551177343})

In [94]:
# Raw training frame (train.csv + revealed_test.csv rows, plus is_test) before feature engineering.
train

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.01,1249,0
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88,1198,0
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.06,1269,0
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.99,1243,0
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.99,1243,0
...,...,...,...,...,...,...,...,...
6265,56041_2022-12-01,56041,Uinta County,Wyoming,2022-12-01,4.05,588,0
6266,56043_2022-11-01,56043,Washakie County,Wyoming,2022-11-01,3.14,190,0
6267,56043_2022-12-01,56043,Washakie County,Wyoming,2022-12-01,3.06,185,0
6268,56045_2022-11-01,56045,Weston County,Wyoming,2022-11-01,1.79,100,0


# ERROR ANALYSIS

In [75]:
# Turn each predicted ratio back into a predicted `active` count by multiplying
# with the lag it was defined against (target_k <-> active_lag{k+1}).
# The ratios are read from `y_val_preds` (the untouched model outputs) rather
# than from `val_X` itself, so re-running this cell no longer multiplies the
# already-converted values by the lag a second time.
for horizon, ratio_col in enumerate(TARGETS, start=1):
    val_X[ratio_col] = y_val_preds[ratio_col] * val_X[f'active_lag{horizon}']


In [76]:
# Per-county error for each horizon: tools.SMAPE_1 is called row by row with a
# one-element Series for the observed `active_t0` and one for the prediction.
# `.iloc` selects by position explicitly; the previous `x[[0]]` / `x[[1]]` relied
# on integer keys falling back to positions on a label-indexed Series, which
# pandas has deprecated.
for horizon_idx, prediction_col in enumerate(TARGETS):
    val_X[f'error_{horizon_idx}'] = val_X[['active_t0', prediction_col]].apply(
        lambda row: tools.SMAPE_1(row.iloc[[0]], row.iloc[[1]]),
        axis=1,
    )

In [77]:
# Rank the validation rows from largest to smallest one-step error.
# NOTE(review): this rebinds `errors`, which the training cell uses for the
# per-horizon score dict; from here on `errors` is a DataFrame.
errors = val_X.sort_values('error_0', ascending=False)
# errors['cum_error'] = errors['error_0'].expanding().mean()
# errors['cum_population'] = errors['population'].expanding().sum()
# val_X

In [87]:
# Index labels of the rows whose one-step error exceeds 5.
_is_large_error = errors['error_0'].gt(5)
black_list = errors.loc[_is_large_error].index


In [50]:
# Bucket rows into 5 equal-width bins of log1p(population), then summarise the
# one-step error inside each bucket.
_log_population = np.log1p(errors['population'])
errors['c_population'] = pd.cut(_log_population, bins=5)
errors['c_population'].value_counts()  # not the last expression, so not displayed
errors.groupby(['c_population'])['error_0'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
c_population,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"(4.51, 6.793]",40.0,25.16,38.39,0.48,4.51,9.19,18.09,140.61
"(6.793, 9.064]",752.0,5.7,8.46,0.02,1.53,3.64,6.98,140.61
"(9.064, 11.335]",1782.0,4.52,6.62,0.0,1.22,3.02,5.78,126.8
"(11.335, 13.606]",517.0,6.47,5.91,0.01,2.36,4.95,8.83,37.48
"(13.606, 15.878]",44.0,25.66,31.31,0.1,5.47,12.12,34.89,143.23


In [154]:
# Keep only the rows whose one-step error is above 1.
errors = errors.loc[errors['error_0'].gt(1)]

In [155]:
# plt.scatter( np.log1p(errors['population']), np.log1p(errors['error_0']))

# Submission

In [89]:
# NOTE(review): the saved output of this cell shows a `microbusiness_density`
# column, which only exists if the commented-out line below was executed in an
# earlier run; a fresh Restart & Run All will not reproduce that column.
# test_X['microbusiness_density'] = 100*test_X['active_t0']/test_X['population']
test_X

Unnamed: 0_level_0,Unnamed: 1_level_0,dcount,county_i,active_lag1,active_lag2,active_lag3,active_lag4,active_lag5,active_lag6,population,mdensity_t0,active_t0,microbusiness_density
date,cfips,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-01-01,01001,41,001,1475.00,1463.00,1472.00,1463.00,1455.00,1461.00,44438,,,
2023-01-01,01003,41,003,14133.00,14145.00,14320.00,14289.00,14545.00,14686.00,178105,,,
2023-01-01,01005,41,005,248.00,247.00,244.00,239.00,237.00,241.00,19995,,,
2023-01-01,01007,41,007,229.00,227.00,229.00,234.00,230.00,236.00,17800,,,
2023-01-01,01009,41,009,822.00,815.00,813.00,822.00,815.00,813.00,45201,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-01,56037,43,037,,,922.00,902.00,905.00,901.00,31514,,,
2023-03-01,56039,43,039,,,5043.00,5054.00,5035.00,5000.00,19169,,,
2023-03-01,56041,43,041,,,588.00,583.00,582.00,580.00,14641,,,
2023-03-01,56043,43,043,,,185.00,190.00,189.00,194.00,6000,,,


In [173]:
# # # Prepare submission file

# date_submission = '0303'
# local_score = round(errors['error_0'],2)
# model_name = 'regression_lag_1_4'

# submission = tools.create_submission(test_X,date_submission, model_name, local_score, sample_submission)