In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine.outliers import Winsorizer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from feature_engine.outliers import Winsorizer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config, get_config
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint
from collections import defaultdict
import tools
from sklearn.ensemble import GradientBoostingRegressor

# Ask scikit-learn transformers to hand back pandas objects instead of numpy arrays.
set_config(transform_output="pandas")
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits, print_cv_info, print_split_info


# Wide display limits so large frames can be inspected without truncation;
# floats are rendered with three decimals.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.3f}'.format
# NOTE(review): this silences *every* warning for the rest of the session
# (including pandas SettingWithCopyWarning and numpy divide-by-zero warnings).
import warnings; warnings.filterwarnings('ignore')


In [2]:
# new_train = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting_last/revealed_test.csv')
# new_train.head()


In [3]:
# Load the raw competition files.
old_train = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/train.csv')
new_train = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting_new/revealed_test.csv')

old_test = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/test.csv')
sample_submission = pd.read_csv('./data/raw/godaddy-microbusiness-density-forecasting/sample_submission.csv')

# The revealed rows are appended to the training data. `.assign` returns a new
# frame, so the `is_test` flag is never written onto a slice of `old_test`
# (the previous `test['is_test'] = 1` was a chained assignment on a filtered view).
train = pd.concat((old_train, new_train)).assign(is_test=0)

# Keep only the test months that are not already present in the revealed rows.
already_revealed = old_test['first_day_of_month'].isin(new_train['first_day_of_month'])
test = old_test[~already_revealed].assign(is_test=1)

def _shifted_active(periods):
    """Build an ``assign`` callable: each county's `active` shifted down by `periods` rows."""
    return lambda df: df.groupby('cfips')['active'].shift(periods)


def _active_growth(periods):
    """Build an ``assign`` callable: `active` divided by its value `periods` rows earlier, minus 1.

    ``np.nan_to_num(..., posinf=10)`` turns NaN ratios into 0 and +inf ratios
    (division by a zero count) into 10.
    """
    return lambda df: np.nan_to_num(df['active'] / _shifted_active(periods)(df) - 1, posinf=10)


def _assemble_panel(train_part, test_part):
    """Stack train and test rows into one county/month frame with lag and target columns."""
    stacked = pd.concat((train_part, test_part)).reset_index(drop=True)
    stacked = stacked.assign(
        cfips=stacked['cfips'].astype(str).str.zfill(5),
        date=pd.to_datetime(stacked['first_day_of_month']),
    )

    # The per-county shifts and the running counter depend on (county, date) row order.
    ordered = stacked.sort_values(['cfips', 'date'], ascending=True)

    panel = ordered.assign(
        # First two characters of the 5-character cfips, and the remaining three.
        state_i=ordered['cfips'].str[:2],
        county_i=ordered['cfips'].str[2:],
        year=ordered['date'].dt.year,
        # `date` is downgraded from Timestamp to plain `datetime.date` objects.
        date=ordered['date'].dt.date,
        # Position of each row within its county (0, 1, 2, ...).
        dcount=ordered.groupby('cfips')['row_id'].cumcount(),
        # active_lag_3 .. active_lag_10 (lags 1 and 2 are not created).
        **{f'active_lag_{k}': _shifted_active(k) for k in range(3, 11)},
        # target_h compares `active` with its value h + 3 rows earlier.
        **{f'target_{h}': _active_growth(h + 3) for h in range(3)},
    )
    return panel.drop(['county', 'state'], axis='columns')


data = _assemble_panel(train, test)

# Structural checks on the assembled panel.
# Every cfips maps to exactly one county code and one state code.
assert (data.groupby('cfips')['county_i'].nunique() == 1).all()
assert (data.groupby('cfips')['state_i'].nunique() == 1).all()
# 3135 distinct counties are expected.
assert data['cfips'].nunique() == 3135
# 47 distinct monthly positions overall: 41 from the train rows, 6 from the test rows.
assert data['dcount'].nunique() == 47
assert data.loc[data['is_test'] == 0, 'dcount'].nunique() == 41
assert data.loc[data['is_test'] == 1, 'dcount'].nunique() == 6

# The private leaderboard will include 03-2023, 04-2023 and 05-2023.
# The first public leaderboard includes the first month, 11-2022.
# The updated LB is as of 01-2023.

# Adding census data: one population figure per (cfips, year).
def _load_census_year(acs_year):
    """Read one ACS S0101 extract and return its `cfips`, `year`, `population` columns.

    The first data row is dropped before parsing (NOTE(review): presumably the
    descriptive label row of the ACS export - confirm). The stored `year` is
    `acs_year + 2`, which is the key later matched against the panel's `year`.
    """
    raw = pd.read_csv(
        f'./data/raw/census_data_1/ACSST5Y{acs_year}.S0101-Data.csv',
        usecols=['GEO_ID', 'NAME', 'S0101_C01_026E'],
    ).iloc[1:]
    return pd.DataFrame({
        # Digits after 'US' in GEO_ID, re-padded to a 5-character code.
        'cfips': raw['GEO_ID'].apply(lambda x: f"{int(x.split('US')[-1]):05}"),
        'year': acs_year + 2,
        'population': raw['S0101_C01_026E'].astype('int'),
    })


data_census = pd.concat([_load_census_year(acs_year) for acs_year in range(2017, 2022)], axis='rows')

# Left join: panel rows without a census match keep a NaN population.
data = data.merge(data_census, on=['cfips', 'year'], how='left')

In [4]:
# data.query('cfips=="46127"')['microbusiness_density'].plot()
# data.query('cfips=="46127"')['active'].plot()

# Sample Data

In [5]:
# PARAMETERS
n_SPLITS = 4              # passed as `n_splits` to the time-series CV
n_TRAIN_TRAIN_SIZE = 6    # passed as `train_size` to the time-series CV
n_TRAIN_PERIOD = n_TRAIN_TRAIN_SIZE + 3 + n_SPLITS - 1   # number of trailing train months kept


def _sorted_window(values, window):
    """Sort the distinct entries of `values` and return the `window` slice as a list."""
    return list(np.sort(values.unique())[window])


_is_test = data['is_test'] == 1
_is_train = data['is_test'] == 0

# Test horizon: the 4th-, 3rd- and 2nd-to-last test months (the very last one is excluded).
TEST_DATES = _sorted_window(data.loc[_is_test, 'date'], slice(-4, -1))
TEST_PERIOD = _sorted_window(data.loc[_is_test, 'dcount'], slice(-4, -1))

# Training horizon: the last `n_TRAIN_PERIOD` train months.
TRAIN_DATES = _sorted_window(data.loc[_is_train, 'date'], slice(-n_TRAIN_PERIOD, None))
TRAIN_PERIOD = _sorted_window(data.loc[_is_train, 'dcount'], slice(-n_TRAIN_PERIOD, None))

LEAKAGE = ['microbusiness_density', 'active']
TARGETS = ['target_0', 'target_1', 'target_2']
FEATURES = ['population']


In [6]:
from tqdm import tqdm 

LAGS = 8

sample = data.copy()
# Targets of test rows are blanked out.
sample.loc[sample['is_test'].eq(1), TARGETS] = np.nan

sample = sample.sort_values(['cfips', 'date'])
# Eight equal-width bins of log1p(population), stored as their string labels.
sample['bin_log_population'] = pd.cut(np.log1p(sample['population']), bins=8).astype(str)

# target_lag3 .. target_lag{LAGS}: the first target shifted within each county.
sample = sample.assign(**{
    f'target_lag{shift_by}': sample.groupby('cfips')[TARGETS[0]].shift(shift_by)
    for shift_by in range(3, LAGS + 1)
})

# Restrict to the modelling window, then order rows by month position.
in_window = sample['date'].isin(TRAIN_DATES + TEST_DATES)
sample = sample[in_window].sort_values('dcount')




In [7]:
# sample.query("cfips=='56045'")

# Individual Features


In [8]:
TARGET_LAG_LENGTH = 8

# Model m uses TARGET_LAG_LENGTH - 4 consecutive target lags starting at lag m + 3.
list_target_features = []
for model_i in range(3):
    first_lag = model_i + 3
    list_target_features.append(
        [f'target_lag{lag}' for lag in range(first_lag, first_lag + TARGET_LAG_LENGTH - 4)]
    )

# Time axes for the (currently disabled) linear-trend features shown below.
tmp_train = np.arange(TARGET_LAG_LENGTH - 4).reshape((-1, 1))
tmp_test = np.arange(TARGET_LAG_LENGTH - 3).reshape((-1, 1))

for model_i in tqdm(range(3)):
    # Disabled linear-trend features:
    # sample[f'target_lr_coef_model_{model_i}'] = sample[list_target_features[model_i]].fillna(0).apply(lambda x: LinearRegression().fit(tmp_train, x).coef_[0], axis='columns')
    # sample[f'target_lr_pred_model_{model_i}'] = sample[list_target_features[model_i]].fillna(0).apply(lambda x: LinearRegression().fit(tmp_train, x).predict(tmp_test)[-1], axis='columns')
    lag_block = sample[list_target_features[model_i]]
    row_summaries = {
        f'target_mean_lag_model_{model_i}': lag_block.mean(axis=1),
        f'target_std_lag_model_{model_i}': lag_block.std(axis=1),
        f'target_median_lag_model_{model_i}': lag_block.median(axis=1),
    }
    for summary_name, summary_values in row_summaries.items():
        sample[summary_name] = summary_values
    # The summaries join the model's feature list only after they were computed
    # from the raw lag columns.
    list_target_features[model_i].extend(row_summaries)
    

100%|██████████| 3/3 [00:00<00:00, 51.17it/s]


In [9]:
# sample.to_pickle('./data/sample_v1.p')

In [10]:
# sample = pd.read_pickle('./data/sample_v1.p')

In [11]:
# ran_id = np.random.randint(200)
# plt.plot(sample[list_cols_model[0]+['mean_lag_0']].iloc[ran_id] )
# plt.plot(sample[list_cols_model[0]+['lr_pred_0']].iloc[ran_id], '--' )
# plt.plot(sample[list_cols_model[0]+['median_lag_0']].iloc[ran_id], '--' )
# plt.plot(sample[list_cols_model[0]].iloc[ran_id], color='r' )




# Hierarchy features

In [12]:
# Group-level (hierarchy) features: for every grouping column, total the lagged
# `active` counts per (group, date) and summarise the growth of those totals.
hierarchy_categories = ['bin_log_population', 'county_i', 'state_i']
for VAR_HIERARCHY in hierarchy_categories:
    TARGET_LAG_LENGTH=8

    # Totals of active_lag_3 .. active_lag_{TARGET_LAG_LENGTH+2} in each (group, date) cell.
    df_hierarchy = sample.groupby([VAR_HIERARCHY,'date'])[[f'active_lag_{i}' for i in range(3,TARGET_LAG_LENGTH+3)]].sum()
    # Zero totals become NaN so the ratios below are NaN rather than a division by zero.
    df_hierarchy = df_hierarchy.replace(0, np.nan)

    # Growth of the group total between consecutive lags: lag k over lag k+1, minus 1.
    for i in range(2,TARGET_LAG_LENGTH):
        df_hierarchy[f'{VAR_HIERARCHY}_target_lag_{i+1}'] = (df_hierarchy[f'active_lag_{i+1}']/df_hierarchy[f'active_lag_{i+2}'])-1

    # Same lag windows as the per-county features: model m starts at lag m + 3.
    list_hierarchy_features = [[f'{VAR_HIERARCHY}_target_lag_{lag_i+model_i+1}' for lag_i in range(2, TARGET_LAG_LENGTH-2)] for model_i in range(3)]

    # Only needed by the disabled linear-trend features below.
    tmp_train = np.arange(TARGET_LAG_LENGTH-2-2).reshape((-1,1))
    tmp_test = np.arange(TARGET_LAG_LENGTH-2-2+1).reshape((-1,1))

    summary_columns = []
    for model_i in tqdm(range(3)):
        # df_hierarchy[f'hierarchy_{VAR_HIERARCHY}_lr_coef_model_{model_i}'] = df_hierarchy[list_hierarchy_features[model_i]].fillna(0).apply(lambda x: LinearRegression().fit(tmp_train, x).coef_[0], axis='columns')
        # df_hierarchy[f'hierarchy_{VAR_HIERARCHY}_lr_pred_model_{model_i}'] = df_hierarchy[list_hierarchy_features[model_i]].fillna(0).apply(lambda x: LinearRegression().fit(tmp_train, x).predict(tmp_test)[-1], axis='columns')
        window = df_hierarchy[list_hierarchy_features[model_i]]
        df_hierarchy[f'hierarchy_{VAR_HIERARCHY}_mean_lag_model_{model_i}'] = window.mean(axis=1)
        df_hierarchy[f'hierarchy_{VAR_HIERARCHY}_std_lag_model_{model_i}'] = window.std(axis=1)
        df_hierarchy[f'hierarchy_{VAR_HIERARCHY}_median_lag_model_{model_i}'] = window.median(axis=1)
        summary_columns.extend([
            # f'hierarchy_{VAR_HIERARCHY}_lr_coef_model_{model_i}',
            # f'hierarchy_{VAR_HIERARCHY}_lr_pred_model_{model_i}',
            f'hierarchy_{VAR_HIERARCHY}_mean_lag_model_{model_i}',
            f'hierarchy_{VAR_HIERARCHY}_std_lag_model_{model_i}',
            f'hierarchy_{VAR_HIERARCHY}_median_lag_model_{model_i}',
        ])

    # One left-merge per hierarchy instead of one per model: the (group, date) keys
    # are unique on the right side, so the rows and the column order are unchanged.
    sample = sample.merge(
        df_hierarchy[summary_columns].reset_index(),
        on=[VAR_HIERARCHY, 'date'],
        how='left',
    )

100%|██████████| 3/3 [00:00<00:00, 46.03it/s]
100%|██████████| 3/3 [00:00<00:00, 46.56it/s]
100%|██████████| 3/3 [00:00<00:00, 47.57it/s]


In [13]:
# For each model index, collect every column of `sample` whose name starts with
# the token 'hierarchy' and ends with that model index (tokens split on '_').
list_hierarchy_features = [
    [
        col
        for col in sample.columns
        if col.split('_')[0] == 'hierarchy' and col.split('_')[-1] == f'{model_i}'
    ]
    for model_i in range(3)
]
# hierarchy_features  

In [14]:
# ran_id = np.random.randint(len(df_hierarchy))
# plt.plot(df_hierarchy[list_cols_model[0]+[f'{VAR_HIERARCHY}_mean_lag_0']].iloc[ran_id] )
# plt.plot(df_hierarchy[list_cols_model[0]+[f'{VAR_HIERARCHY}_lr_pred_0']].iloc[ran_id], '--' )
# plt.plot(df_hierarchy[list_cols_model[0]+[f'{VAR_HIERARCHY}_median_lag_0']].iloc[ran_id], '--' )
# plt.plot(df_hierarchy[list_cols_model[0]].iloc[ran_id] )



# Other Features

In [15]:
# display(
# train_X.head(2),
# # train_X.tail(2),
# # test_X.head(2),
# # test_X.tail(2)
# )
# list_cols_model[-1]

In [16]:
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, QuantileTransformer, OneHotEncoder
from ray.tune.sklearn import TuneGridSearchCV
from ray.tune.sklearn import TuneSearchCV
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from category_encoders.target_encoder import TargetEncoder

from tools import BaseTransformer

class CategoricalFeatureEngineering(BaseTransformer):
    """Target-encode the categorical grouping columns of the sample frame.

    One `TargetEncoder` is fitted per source column (`state_i`, `county_i`,
    `bin_log_population`). `transform` returns only the three encoded columns,
    named with an `le_` prefix.

    Parameters
    ----------
    features : optional
        Stored on the instance but not read by `fit` or `transform`.
    """

    # Output column name -> source categorical column.
    _ENCODED_COLUMNS = {
        'le_state_i': 'state_i',
        'le_county_i': 'county_i',
        'le_bin_log_population': 'bin_log_population',
    }

    def __init__(self, features=None):
        self.features = features

    def fit(self, X, y=None):
        """Fit one target encoder per categorical column against `y` and return `self`."""
        # 'return_nan' is the documented option for "encode unseen categories as NaN";
        # the previous `handle_unknown=np.nan` is not one of the documented values.
        self.dict_transformers = {
            encoded_name: TargetEncoder(handle_unknown='return_nan').fit(X[source_name], y)
            for encoded_name, source_name in self._ENCODED_COLUMNS.items()
        }
        return self

    def transform(self, X):
        """Return a frame holding only the encoded columns, in mapping order."""
        X_transformed = X.copy()
        for encoded_name, source_name in self._ENCODED_COLUMNS.items():
            encoder = self.dict_transformers[encoded_name]
            X_transformed[encoded_name] = encoder.transform(X_transformed[source_name])

        return X_transformed[list(self._ENCODED_COLUMNS)]

# CategoricalFeatureEngineering().fit_transform(train_X, train_X['target_0'])

# Pipelining

In [17]:
# Split rows by month position into the training and the forecast window.
sample_train = sample[sample['dcount'].isin(TRAIN_PERIOD)]
sample_test = sample[sample['dcount'].isin(TEST_PERIOD)]

# The feature frames are the full row sets: the TARGETS columns are not dropped here.
train_X, test_X = sample_train, sample_test
train_y, test_y = sample_train[TARGETS], sample_test[TARGETS]


In [18]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR

In [29]:
%%time


# Local import: only this cell needs it.
from sklearn.base import clone

# Seed for RandomizedSearchCV so the sampled hyper-parameter candidates are
# the same on every run.
SEARCH_SEED = 1881

dict_pipelines = {}
y_test_preds  = []

y_val_preds = defaultdict(list)
errors = defaultdict(list)

# ---- configuration shared by every model (hoisted out of the loop) --------
cv_args = {"test_size": 1, "n_splits": n_SPLITS, "train_size": n_TRAIN_TRAIN_SIZE, 'gap_size': 0}

param_dists = {
    "model__regressor__num_leaves": [5,10,30,50],
    "model__regressor__max_depth": [3, 5,10,30,50],
    "model__regressor__learning_rate": [0.1, 0.05, 0.01],
    "model__regressor__n_estimators": [300, 500,1000,3000,5000],
    # "model__regressor__min_child_samples": [10,20,30],
    "model__regressor__subsample": [.8,.9,1],
    "model__regressor__colsample_bytree": [.8,.9,1],
    "model__regressor__reg_alpha": [.05,.1,.3,.5],
    "model__regressor__reg_lambda": [.1,.3,.5],
    # "model__regressor__random_state": [1881],
    "model__regressor__objective": ['mean_squared_error','mean_absolute_error'],
}

# Hold-out check: fit on the n_TRAIN_TRAIN_SIZE months preceding the last
# training month, then score on that last month.
check_train_period = TRAIN_PERIOD[-1-n_TRAIN_TRAIN_SIZE: -1]
check_validation_period = [TRAIN_PERIOD[-1]]


for model_i in range(3):

    train_y_i = train_y.iloc[:, model_i]

    cv = GroupTimeSeriesSplit(**cv_args)

    # NUMERICAL FEATURES
    target_features = Pipeline([('select', tools.ColumnSelector(features=list_target_features[model_i]))])
    hierarchy_features = Pipeline([('select', tools.ColumnSelector(features=list_hierarchy_features[model_i]))])
    # Not part of the union below (kept for experiments; it selects the target itself).
    leak_features = Pipeline([('select', tools.ColumnSelector(features=TARGETS[model_i]))])

    categorical_features = Pipeline(
        [
        ('categorical_features', CategoricalFeatureEngineering(TARGETS[model_i])),
        ])

    merge_features = FeatureUnion([
        ('target_features', target_features),
        ('hierarchy_features', hierarchy_features),
        # ('leak_features', leak_features),
        ('categorical_features', categorical_features),
    ])

    # Not part of the pipeline below (its step is commented out).
    general_features = Pipeline(
        [
        ('remove_outliers', Winsorizer()),
        ('standart_scalar', StandardScaler())
        ]
    )

    # The LinearRegression variant that used to precede this line was overwritten
    # immediately and never used, so only the effective model is built.
    model = TransformedTargetRegressor(regressor=lgb.LGBMRegressor())
    # model = TransformedTargetRegressor(regressor=RandomForestRegressor())

    model_pipeline = Pipeline([
        ("feature_engineering", merge_features),
        # ("general_features", general_features),
        ("model", model)
    ])

    grid = RandomizedSearchCV(
                model_pipeline,
                param_distributions=param_dists,
                scoring=make_scorer(tools.SMAPE_1, greater_is_better=False),
                cv=cv,
                # n_jobs=-1,
                n_iter=50,
                random_state=SEARCH_SEED)

    # Months (dcount) are the CV groups, so folds are split along time.
    grid.fit(train_X, train_y_i, groups=train_X['dcount'])

    dict_pipelines[f'pipeline_model_{model_i}'] = grid

    # print(grid.cv_results_)
    # print(grid.best_estimator_)

    # ## CHECK
    # Fit a *clone* of the winning pipeline: refitting `grid.best_estimator_`
    # itself would silently replace the fitted estimator stored in `dict_pipelines`.
    best_model = clone(grid.best_estimator_)
    check_train_index = train_X.query('dcount in @check_train_period').index
    best_model.fit(train_X.loc[check_train_index], train_y_i.loc[check_train_index])

    val_index = train_X.query('dcount in @check_validation_period').index
    # y_val_pred =  best_model.predict(train_X.loc[val_index])
    base_col = f'active_lag_{model_i+2+1}'
    print(base_col)
    # The model predicts a growth ratio; convert it back to an `active` count
    # using the lagged count the target was defined against.
    y_val_pred = (best_model.predict(train_X.loc[val_index]) + 1) * train_X.loc[val_index, base_col]
    y_val_preds[f'pred_target_{model_i}'] = y_val_pred
    errors[f'error_{model_i}'] = tools.SMAPE_1(y_true=train_X.loc[val_index, 'active'], y_pred=y_val_pred)

    # INFERENCE
    # final_train_period = TRAIN_PERIOD[-n_TRAIN_TRAIN_SIZE:]
    # final_train_index = train_X.query('dcount in @final_train_period').index
    # TEST_PERIOD_i = [TEST_PERIOD[model_i]]

    # best_model.fit(train_X.loc[final_train_index], train_y_i.loc[final_train_index])
    # y_test_preds.append(best_model.predict(test_X.query('dcount in @TEST_PERIOD_i')))

    # Only the first model (target_0) is trained; remove this `break` to fit all three.
    break
# # prepare test the output
# test_X['ratio_pred'] = np.concatenate((y_test_preds))

# # prepare validation for error analysis
val_X = train_X.query('dcount in @check_validation_period')
y_val_preds =  pd.DataFrame(y_val_preds, index=val_X.index)
val_X = pd.concat((val_X, y_val_preds), axis=1)


Traceback (most recent call last):
  File "/Users/samet/.pyenv/versions/3.9.15/envs/fistik/lib/python3.9/site-packages/joblib/parallel.py", line 862, in dispatch_one_batch
    tasks = self._ready_batches.get(block=False)
  File "/Users/samet/.pyenv/versions/3.9.15/lib/python3.9/queue.py", line 168, in get
    raise Empty
_queue.Empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/samet/.pyenv/versions/3.9.15/envs/fistik/lib/python3.9/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_frame.py", line 987, in trace_dispatch
    self.do_wait_suspend(thread, frame, event, arg)
  File "/Users/samet/.pyenv/versions/3.9.15/envs/fistik/lib/python3.9/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_frame.py", line 164, in do_wait_suspend
    self._args[0].do_wait_suspend(*args, **kwargs)
  File "/Users/samet/.pyenv/versions/3.9.15/envs/fistik/lib/python3.9/site-packages/debugpy/_vendored/pydevd/pydevd

KeyboardInterrupt: 

In [None]:
# Best hyper-parameters of the first model, followed by the hold-out errors.
display(
    dict_pipelines['pipeline_model_0'].best_params_,
    errors,
)
# condition = val_X['population']<np.quantile(val_X['population'],q=.3)
# (tools.SMAPE_1(val_X.loc[condition]['active'],val_X.loc[condition]['pred_target_0']),tools.SMAPE_1(val_X.loc[~condition]['active'],val_X.loc[~condition]['pred_target_0']))


{'model__regressor__objective': 'mean_absolute_error',
 'model__regressor__n_estimators': 3000,
 'model__regressor__learning_rate': 0.05}

defaultdict(list, {'error_0': 2.965482969484996})

In [None]:
# defaultdict(list, {'error_0': 2.8409474379596613})
# defaultdict(list,
#             {'error_0': 2.8770759725599087,
#              'error_1': 3.395943795656617,
#              'error_2': 4.835238471998951})

In [None]:
# Absolute gap between the predicted and the observed `active` count; show the worst rows.
val_X['dif'] = (val_X['pred_target_0'] - val_X['active']).abs()
val_X.sort_values(by='dif', ascending=False).head()

In [None]:
# sample.query('cfips == "56033"')

In [None]:
# Scratch arithmetic using the same (ratio + 1) * base-count form as the
# validation predictions, with ratio 0.344 and base count 238245.
# NOTE(review): presumably a manual check of one of the rows inspected above - confirm or remove.
(0.344+1)*238245.000

In [None]:
# Order the forecast rows by county, then by month.
test_X = test_X.sort_values(by=['cfips', 'first_day_of_month'], ascending=True)

In [None]:
# Naive benchmark: repeat one lagged `active` count for every forecast month of a county.
# The previous version selected `active_lag_1`, a column that is never created
# (only lags 3-10 exist), and relied on every county having exactly three
# forecast rows for the stacked values to line up.
# `active_lag_3` is the base the first-horizon target is defined against; taking
# the first non-null value per county (rows ordered by month) broadcasts that
# count to all of the county's forecast rows, aligned on the index.
test_X['benchmark'] = (
    test_X.sort_values('dcount')
          .groupby('cfips')['active_lag_3']
          .transform('first')
)

In [None]:
# Convert predicted growth ratios back into `active` counts, one forecast month at a time.
# `ratio_pred` must already exist on `test_X`; it is written by the inference
# step of the training cell, which is currently commented out.
for i,TEST_PERIOD_i in enumerate(TEST_PERIOD):
    test_index = test_X.query('dcount == @TEST_PERIOD_i').index
    # target_i is defined against `active` shifted by i + 3 rows, so the matching
    # base column is active_lag_{i+3} - the same mapping the validation check uses.
    # (`active_lag_{i+1}` pointed at columns that do not exist for i = 0, 1.)
    base_col = f'active_lag_{i+3}'
    test_X.loc[test_index, 'pred'] = (test_X.loc[test_index, 'ratio_pred'] + 1) * test_X.loc[test_index, base_col]


In [None]:
# Draw one county at random (unseeded) and plot its density over the train and forecast months.
random_id = np.random.choice(train_X['cfips'])

county_train = train_X[train_X['cfips'] == random_id]
county_test = test_X[test_X['cfips'] == random_id]

plt.plot(county_train['dcount'], county_train['microbusiness_density'])
plt.plot(county_test['dcount'], county_test['microbusiness_density'])
# plt.plot(test_X.query('cfips==@random_id')['dcount'],test_X.query('cfips==@random_id')['benchmark'], '--')

In [None]:
# Rows whose population lies below the 30% quantile take the benchmark value;
# every other row keeps the model prediction.
condition = test_X['population'] < np.quantile(test_X['population'], q=.3)
test_X['final_pred'] = test_X['pred'].mask(condition, test_X['benchmark'])

In [None]:
# # # Prepare submission file

# microbusiness_density = 100 * active / adult_population
# 100*data.head()['active'] / data.head()['population']
# test_X['microbusiness_density'] = 100 * test_X['active'] / test_X['population']

date_submission = '0903'
local_score = round(errors['error_0'],2)
model_name = 'ratio_regression_lag_1_2_with_.4_constant'

# Density is 100 * predicted active count / population; row ids are
# "<cfips as integer>_<date>".
df_output = test_X.assign(
    microbusiness_density=100 * test_X['final_pred'] / test_X['population'],
    row_id=test_X['cfips'].astype(int).astype(str) + '_' + test_X['date'].astype(str),
)[['row_id', 'microbusiness_density']]

# Rows of the sample submission that were not predicted are appended unchanged.
not_predicted = ~sample_submission.row_id.isin(df_output.row_id)
submission = pd.concat((df_output, sample_submission[not_predicted]))

submission.to_csv(f"data/{date_submission}_{model_name}_local_{local_score}.csv",index=None)


# ERROR ANALYSIS

In [None]:
# val_X['target_benchmark'] = val_X['active_lag1']
# val_X['error_0']= val_X[['active_t0','target_0']].apply(lambda x: tools.SMAPE_1(x[[0]],x[[1]]),axis=1)
# val_X['error_benchmark']= val_X[['active_t0','target_benchmark']].apply(lambda x: tools.SMAPE_1(x[[0]],x[[1]]),axis=1)
# val_X['error_1']= val_X[['active_t0','target_1']].apply(lambda x: tools.SMAPE_1(x[[0]],x[[1]]),axis=1)
# val_X['error_2']= val_X[['active_t0','target_2']].apply(lambda x: tools.SMAPE_1(x[[0]],x[[1]]),axis=1)



In [None]:
# df_errors = val_X.sort_values('error_0', ascending=False)
# df_errors['cum_error'] = df_errors['error_0'].expanding().mean()
# df_errors['cum_error_benchmark'] = df_errors['error_benchmark'].expanding().mean()
# # val_X

In [None]:
# df_errors['cum_error'].plot()
# df_errors['cum_error_benchmark'].plot()

In [None]:
# errors[errors['error_0']>20]


In [None]:
# errors['c_population']=  pd.cut(np.log1p(errors['population']),5)
# errors['c_population'].value_counts()
# errors.groupby(['c_population'])['error_0'].describe()

In [None]:
# errors = errors[errors['error_0']>1]

In [None]:
# plt.scatter( np.log1p(errors['population']), np.log1p(errors['error_0']))