In [35]:
import gc
import numpy as np
import pandas as pd


# Read the four CSV inputs from ./data.
census = pd.read_csv('./data/census_starter.csv')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sub = pd.read_csv('./data/sample_submission.csv')

# Flag the origin of every row before stacking the two frames.
train['istest'] = 0
test['istest'] = 1

# One combined frame, ordered by cfips and then row_id, with a fresh 0..n-1 index.
raw = (
    pd.concat((train, test))
    .sort_values(['cfips', 'row_id'])
    .reset_index(drop=True)
)

# Derived columns. `assign` evaluates the callables in order, so later entries
# (e.g. county_i) see the forward-filled county/state produced by earlier ones.
raw = raw.assign(
    first_day_of_month=lambda d: pd.to_datetime(d['first_day_of_month']),
    # Fill gaps in county/state from the previous row of the same cfips
    # (presumably the test rows carry no county/state -- confirm against the CSVs).
    county=lambda d: d.groupby('cfips')['county'].ffill(),
    state=lambda d: d.groupby('cfips')['state'].ffill(),
    year=lambda d: d['first_day_of_month'].dt.year,
    month=lambda d: d['first_day_of_month'].dt.month,
    # Running 0-based row counter inside each cfips group.
    dcount=lambda d: d.groupby(['cfips'])['row_id'].cumcount(),
    # Integer codes for the (county + state) string and for state alone.
    county_i=lambda d: (d['county'] + d['state']).factorize()[0],
    state_i=lambda d: d['state'].factorize()[0],
)

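Target transform: each value is mapped to the ratio of the next value to itself. The last row has no next value (shown as `--`), so its ratio is undefined.

    x1 -> x2/x1
    x2 -> x3/x2
    x3 -> x4/x3
    x4 -> x5/x4
    x5 -> --/x5
    --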


In [6]:
# First five densities, each replaced by the value from the following row
# (the fifth has no successor inside this slice, so it becomes NaN).
raw['microbusiness_density'].iloc[:5].shift(-1)

0    2.884870
1    3.055843
2    2.993233
3    2.993233
4         NaN
Name: microbusiness_density, dtype: float64

In [7]:
def build_features(raw, target='microbusiness_density', target_act='active_tmp', lags = 6):
    """Add per-cfips lag and rolling features to ``raw`` and list their names.

    New columns are written directly onto ``raw`` (the frame is modified in
    place and the same object is returned).

    Parameters
    ----------
    raw : DataFrame containing a ``cfips`` column plus the ``target`` and
        ``target_act`` columns.
    target : column that is shifted to create ``mbd_lag_<k>``.
    target_act : column that is differenced to create ``act_lag_<k>``.
    lags : exclusive upper bound for ``k``; features are built for
        ``k = 1 .. lags - 1``. The rolling features read ``mbd_lag_1``, so
        ``lags`` has to be at least 2.

    Returns
    -------
    (raw, feats) : the mutated frame and the list of added feature names, in
        creation order.
    """
    feature_names = []

    for step in range(1, lags):
        density_col = f'mbd_lag_{step}'
        active_col = f'act_lag_{step}'
        # Value of `target` `step` rows earlier within the same cfips.
        raw[density_col] = raw.groupby('cfips')[target].shift(step)
        # Change of `target_act` relative to `step` rows earlier within the same cfips.
        raw[active_col] = raw.groupby('cfips')[target_act].diff(step)
        feature_names.extend([density_col, active_col])

    # Rolling features are always computed on the 1-step lag.
    base_lag = 1
    base_col = f'mbd_lag_{base_lag}'
    for span in (2, 4, 6):
        rolled_col = f'mbd_rollmea{span}_{base_lag}'
        # NOTE: despite the "rollmea" name this is a rolling *sum*; windows with
        # at least one non-missing value still produce a result (min_periods=1).
        raw[rolled_col] = raw.groupby('cfips')[base_col].transform(
            lambda grp, width=span: grp.rolling(width, min_periods=1).sum()
        )
        feature_names.append(rolled_col)

    return raw, feature_names

In [39]:
# Work on a single county (cfips 1001) taken from the combined `raw` frame,
# keeping only the columns needed to inspect the lag / rolling logic.
sample = raw[raw['cfips'] == 1001].copy()

sample = sample[['cfips','dcount','first_day_of_month','microbusiness_density','active']].reset_index(drop=True)

target = 'microbusiness_density'
lag = 1
# Target value from `lag` rows earlier.
sample[f'lag_{lag}'] = sample[target].shift(lag)

window = 2
# Rolling sum of the lagged target over `window` rows; min_periods=1 lets a
# window with a single non-missing value still yield a result.
sample[f'roll_{window}_{lag}'] = sample[f'lag_{lag}'].rolling(window, min_periods=1).sum()
sample

Unnamed: 0,cfips,dcount,first_day_of_month,microbusiness_density,active,lag_1,roll_2_1
0,1001,0,2019-08-01,3.007682,1249.0,,
1,1001,1,2019-09-01,2.88487,1198.0,3.007682,3.007682
2,1001,2,2019-10-01,3.055843,1269.0,2.88487,5.892552
3,1001,3,2019-11-01,2.993233,1243.0,3.055843,5.940713
4,1001,4,2019-12-01,2.993233,1243.0,2.993233,6.049076
5,1001,5,2020-01-01,2.96909,1242.0,2.993233,5.986466
6,1001,6,2020-02-01,2.909326,1217.0,2.96909,5.962323
7,1001,7,2020-03-01,2.933231,1227.0,2.909326,5.878416
8,1001,8,2020-04-01,3.000167,1255.0,2.933231,5.842557
9,1001,9,2020-05-01,3.004948,1257.0,3.000167,5.933399


In [52]:
# Copy each cfips group's 'last' aggregate of `active` onto every row of the group.
sample['lastactive'] = sample.groupby('cfips')['active'].transform('last')

# Density per cfips taken from the rows where dcount equals 28, then looked up
# for every row through its cfips.
dt = (
    sample[sample['dcount'].eq(28)]
    .groupby('cfips')['microbusiness_density']
    .agg('last')
)
sample['lasttarget'] = sample['cfips'].map(dt)

# raw['lastactive'].clip(0, 8000).hist(bins=30)

In [61]:
# Distinct month-start values present in the test file.
test.first_day_of_month.unique()

array(['2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01',
       '2023-03-01', '2023-04-01', '2023-05-01', '2023-06-01'],
      dtype=object)

In [58]:
# Thresholds; in this cell they are referenced only by the commented-out
# training mask inside the loop.
ACT_THR = 1.8
ABS_THR = 1.00

# Placeholder prediction columns (all NaN) and a `k` column set to 1.0.
sample = sample.assign(ypred_last=np.nan, ypred=np.nan, k=1.)

# Accumulators; nothing is appended to them yet in this cell.
VAL, BEST_ROUNDS = [], []

# TS takes the values 29..37 and is compared against `dcount` in the draft masks.
for TS in range(29, 38):
    print(TS)
    # Draft train/validation masks, currently disabled:
    # train_indices = (sample.dcount  < TS) & (sample.dcount >= 1) & (sample.lastactive>ACT_THR)  & (sample.lasttarget>ABS_THR)
    # valid_indices = (sample.dcount == TS)
    # print(train_indices)

29
30
31
32
33
34
35
36
37
