## Preprocessing

In [1]:
# Preprocessing for kaggle API
!pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c godaddy-microbusiness-density-forecasting
! unzip godaddy-microbusiness-density-forecasting.zip

Saving kaggle.json to kaggle.json
Downloading godaddy-microbusiness-density-forecasting.zip to /content
  0% 0.00/1.74M [00:00<?, ?B/s]
100% 1.74M/1.74M [00:00<00:00, 136MB/s]
Archive:  godaddy-microbusiness-density-forecasting.zip
  inflating: census_starter.csv      
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [2]:
# Importing the libraries
import math
import copy
import time
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import torch.nn.functional as fun
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from collections import defaultdict
from dateutil.relativedelta import relativedelta 

In [3]:
# Declaraing and initializing the paths of our datasets
class paths:
    # train and test files have microbusiness density data
    TRAIN = "train.csv"
    TEST = "test.csv"

train = pd.read_csv(paths.TRAIN)
test = pd.read_csv(paths.TEST)

In [4]:
train

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.884870,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243
...,...,...,...,...,...,...,...
122260,56045_2022-06-01,56045,Weston County,Wyoming,2022-06-01,1.803249,101
122261,56045_2022-07-01,56045,Weston County,Wyoming,2022-07-01,1.803249,101
122262,56045_2022-08-01,56045,Weston County,Wyoming,2022-08-01,1.785395,100
122263,56045_2022-09-01,56045,Weston County,Wyoming,2022-09-01,1.785395,100


In [5]:
test

Unnamed: 0,row_id,cfips,first_day_of_month
0,1001_2022-11-01,1001,2022-11-01
1,1003_2022-11-01,1003,2022-11-01
2,1005_2022-11-01,1005,2022-11-01
3,1007_2022-11-01,1007,2022-11-01
4,1009_2022-11-01,1009,2022-11-01
...,...,...,...
25075,56037_2023-06-01,56037,2023-06-01
25076,56039_2023-06-01,56039,2023-06-01
25077,56041_2023-06-01,56041,2023-06-01
25078,56043_2023-06-01,56043,2023-06-01


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122265 entries, 0 to 122264
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   row_id                 122265 non-null  object 
 1   cfips                  122265 non-null  int64  
 2   county                 122265 non-null  object 
 3   state                  122265 non-null  object 
 4   first_day_of_month     122265 non-null  object 
 5   microbusiness_density  122265 non-null  float64
 6   active                 122265 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 6.5+ MB


In [7]:
#first_day_of_month is not Date
train['first_day_of_month'] = pd.to_datetime(train['first_day_of_month'], format = "%Y-%m-%d")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122265 entries, 0 to 122264
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   row_id                 122265 non-null  object        
 1   cfips                  122265 non-null  int64         
 2   county                 122265 non-null  object        
 3   state                  122265 non-null  object        
 4   first_day_of_month     122265 non-null  datetime64[ns]
 5   microbusiness_density  122265 non-null  float64       
 6   active                 122265 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 6.5+ MB


In [8]:
len(np.unique(train['first_day_of_month']))

39

In [9]:
train_df = train
train_df["first_day_of_month"] = pd.to_datetime(train_df["first_day_of_month"])
train_df = train_df.sort_values(['cfips','first_day_of_month']).reset_index(drop=True)
train_df

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.884870,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243
...,...,...,...,...,...,...,...
122260,56045_2022-06-01,56045,Weston County,Wyoming,2022-06-01,1.803249,101
122261,56045_2022-07-01,56045,Weston County,Wyoming,2022-07-01,1.803249,101
122262,56045_2022-08-01,56045,Weston County,Wyoming,2022-08-01,1.785395,100
122263,56045_2022-09-01,56045,Weston County,Wyoming,2022-09-01,1.785395,100


In [10]:
test_df = test
test_df["first_day_of_month"] = pd.to_datetime(test_df["first_day_of_month"])
test_df = test_df.sort_values(['cfips','first_day_of_month']).reset_index(drop=True)
test_df

Unnamed: 0,row_id,cfips,first_day_of_month
0,1001_2022-11-01,1001,2022-11-01
1,1001_2022-12-01,1001,2022-12-01
2,1001_2023-01-01,1001,2023-01-01
3,1001_2023-02-01,1001,2023-02-01
4,1001_2023-03-01,1001,2023-03-01
...,...,...,...
25075,56045_2023-02-01,56045,2023-02-01
25076,56045_2023-03-01,56045,2023-03-01
25077,56045_2023-04-01,56045,2023-04-01
25078,56045_2023-05-01,56045,2023-05-01


## LSTM

In [11]:
cutoff_date = '2022-10-01'
start_date = '2019-09-01'
train_mask = train_df['first_day_of_month'] < cutoff_date
val_mask = (train_df['first_day_of_month'] <= cutoff_date) & (train_df['first_day_of_month'] >= start_date)
test_mask = (train_df['first_day_of_month'] <= cutoff_date) & (train_df['first_day_of_month'] >= '2019-10-01')

df_train = train_df[train_mask]
df_val = train_df[val_mask]
df_test = train_df[test_mask]


In [12]:
class Config:
    batch_size = 16
    num_workers = 2
    timesize = 2
    hidden = 32
    num_layers = 2
    num_classes = 1
    eps = 0.01


def sigmoid(x):
    return 1 / (1 + np.exp(-x))

sigmoid_v = np.vectorize(sigmoid)

class TimeSeriesDataset(Dataset):
    def __init__(self, df, is_train=True):
        self.df = df
        self.cfips = df.cfips.unique()
        self.is_train = is_train

    def __len__(self):
        return len(self.cfips)

    def __getitem__(self, idx):
        ct = self.cfips[idx]
        df_ct = self.df[self.df.cfips == ct]
        df_ct_arr = df_ct.microbusiness_density

        ft = []
        label = []
        label_all = []
        if self.is_train:
            for i in range(len(df_ct) - Config.timesize):
                ft.append(df_ct_arr.iloc[i:i+Config.timesize].to_list())
                label_all.append(df_ct_arr.iloc[i + Config.timesize])
            label.append(df_ct_arr.iloc[len(df_ct) - 1])
        else:
            for i in range(len(df_ct) - Config.timesize):
                ft.append(df_ct_arr.iloc[i:i+Config.timesize].to_list())
                label_all.append(df_ct_arr.iloc[i + Config.timesize])

        ft = np.array(ft)
        me = ft.max(axis=1)
        std = ft.min(axis=1)
        ft = np.transpose(ft)
        ft = (ft - std)/(me - std + Config.eps)
        ft = np.transpose(ft)

        if self.is_train:
            label = np.array(label)
            label_scale = (label - std[-1])/(me[-1] - std[-1] + Config.eps)
            label_all = np.array(label_all)
            ft_scale = sigmoid_v(ft)
            label_scale = sigmoid_v(label_scale)
            return torch.FloatTensor(ft_scale), torch.FloatTensor(label_scale), torch.FloatTensor(label), torch.FloatTensor(std), torch.FloatTensor(me), torch.FloatTensor(label_all)
        else:
            label_all = np.array(label_all)
            ft_scale = sigmoid_v(ft)
            return torch.FloatTensor(ft_scale), torch.FloatTensor(std), torch.FloatTensor(me), torch.FloatTensor(label_all), ct


train_dataset = TimeSeriesDataset(df_train)
val_dataset = TimeSeriesDataset(df_val)
test_dataset = TimeSeriesDataset(df_test, is_train=False)

train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True,
                          num_workers=Config.num_workers, pin_memory=True, drop_last=True)

val_loader = DataLoader(val_dataset, batch_size=Config.batch_size, shuffle=True,
                        num_workers=Config.num_workers, pin_memory=True, drop_last=True)

test_loader = DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False,
                         num_workers=Config.num_workers, pin_memory=True, drop_last=False)

dataloaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}


In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class FcModel(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(FcModel, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
        out_lstm, (h_out, _) = self.lstm(x, (h_0, c_0))
        out_fc = self.fc(h_out)
        out = out_fc.mean(dim=0)
        return out

fc = FcModel(Config.num_classes, Config.timesize, Config.hidden, Config.num_layers).to(device)
print(fc)


FcModel(
  (lstm): LSTM(2, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [14]:
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')
    
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs-1}')
        print('-' * 10)

        since = time.time()

        for phase in ['train', 'val']:
            is_training = phase == 'train'
            dataloader = dataloaders[phase]
            num_time_slice = len(df_train) - Config.timesize if is_training else len(df_val) - Config.timesize
            metrics = defaultdict(float)
            epoch_samples = 0
            count = 0

            model.train(is_training)

            for inputs, labels, _, __, ___, ____ in dataloaders[phase]:
                count += 1
                inputs, labels = inputs.to(device), labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = loss_func(criterion, outputs, labels.squeeze(-1), metrics)
                    if is_training:
                        loss.backward()
                        optimizer.step()

                epoch_samples += inputs.size(0)
                if (count + 1) % 20 == 0:
                    print(f'Epoch {epoch+1}/{num_epochs} - Iter {count+1}/{len(dataloader)}')
                    print_metrics(metrics, epoch_samples, phase)
            
            print_metrics(metrics, epoch_samples, phase)

            epoch_loss = metrics['loss'] / epoch_samples
            acc = metrics['acc'] / epoch_samples

            if not is_training and epoch_loss < best_loss:
                print("saving best model")
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'baseline.pth')

            if is_training:
                scheduler.step()

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {acc:.4f}')

        time_elapsed = time.time() - since
        print(f'{time_elapsed//60:.0f}m {time_elapsed%60:.0f}s')
        
    print(f'Best val loss: {best_loss:.4f}')
    model.load_state_dict(best_model_wts)
    return model

In [15]:
def loss_func(criterion, pred, target, metrics):
    loss = criterion(pred, target)
    mae = nn.L1Loss()
    metr = mae(pred, target)
    metrics['loss'] += loss.item() * target.size(0)
    metrics['acc'] += metr.item() * target.size(0)
    return loss

def print_metrics(metrics, epoch_samples, phase):
    outputs = [f"{k}: {v/epoch_samples:.4f}" for k,v in metrics.items()]
    print(f"{phase}: {', '.join(outputs)}")



In [16]:
optimizer = optim.Adam(filter(lambda p: p.requires_grad, fc.parameters()), lr=5*1e-3)
criterion = nn.L1Loss(reduction='mean')
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.6)

model = train_model(fc, criterion, optimizer, scheduler, num_epochs=15)


Epoch 0/14
----------


  return F.l1_loss(input, target, reduction=self.reduction)


Epoch 1/15 - Iter 20/195
train: loss: 0.2484, acc: 0.2484
Epoch 1/15 - Iter 40/195
train: loss: 0.2100, acc: 0.2100
Epoch 1/15 - Iter 60/195
train: loss: 0.1953, acc: 0.1953
Epoch 1/15 - Iter 80/195
train: loss: 0.1906, acc: 0.1906
Epoch 1/15 - Iter 100/195
train: loss: 0.1844, acc: 0.1844
Epoch 1/15 - Iter 120/195
train: loss: 0.1831, acc: 0.1831
Epoch 1/15 - Iter 140/195
train: loss: 0.1816, acc: 0.1816
Epoch 1/15 - Iter 160/195
train: loss: 0.1791, acc: 0.1791
Epoch 1/15 - Iter 180/195
train: loss: 0.1779, acc: 0.1779
train: loss: 0.1762, acc: 0.1762
train Loss: 0.1762 Acc: 0.1762
Epoch 1/15 - Iter 20/195
val: loss: 0.1793, acc: 0.1793
Epoch 1/15 - Iter 40/195
val: loss: 0.1901, acc: 0.1901
Epoch 1/15 - Iter 60/195
val: loss: 0.1909, acc: 0.1909
Epoch 1/15 - Iter 80/195
val: loss: 0.1913, acc: 0.1913
Epoch 1/15 - Iter 100/195
val: loss: 0.1905, acc: 0.1905
Epoch 1/15 - Iter 120/195
val: loss: 0.1911, acc: 0.1911
Epoch 1/15 - Iter 140/195
val: loss: 0.1905, acc: 0.1905
Epoch 1/15 - I

In [17]:
df = pd.DataFrame(columns = ['row_id','predicted_microbusiness_density'])
datestart = pd.to_datetime('2022-11-01')
count = 0
for feature, _, __, label_all, ct in dataloaders['test']:
    count += 1
    for i in range(ct.shape[0]):
        ft = feature[i : i + 1].to(device)
        std = _[i:i+1, -1].to(device)
        me = __[i:i+1, - 1].to(device)
        for j in range(8):
            datecheck = (datestart + relativedelta(months = j)).strftime('%Y-%m-%d')
            pred_test = model(ft.to(device)).squeeze()
            pred_test = -torch.log(1/pred_test - 1)
            final_pred = torch.mul(pred_test , (me - std + Config.eps)) + std
            final_pred_item = final_pred.item()
            df = df.append({'row_id': str(ct[i].item()) + '_' + str(datecheck),'predicted_microbusiness_density':final_pred_item}, ignore_index=True)
            

            last_ft = ft[0][-1]
            last_ft_cat = torch.cat([last_ft, final_pred], dim = 0)[1:]
            ft = torch.cat([ft[0][1:], last_ft_cat.unsqueeze(0)], dim = 0).unsqueeze(0)

# Create the submission file
df.to_csv('lstm_submission.csv', index=False)
print(df.head())


            row_id  predicted_microbusiness_density
0  1001_2022-11-01                         3.437773
1  1001_2022-12-01                         3.444416
2  1001_2023-01-01                         3.440165
3  1001_2023-02-01                         3.441710
4  1001_2023-03-01                         3.441792


## Linear Regression with shifting

In [19]:
# Set the validation period to the most recent month
validation_period_months = 1
train_months = train.first_day_of_month.values[-39:-1*validation_period_months]
val_months = train.first_day_of_month.values[-1*validation_period_months:]
test_lin = train.loc[train.first_day_of_month.isin(val_months)]
train_lin = train.loc[train.first_day_of_month.isin(train_months)]
print("Training data shape", train_lin.shape)
train_lin.head()

Training data shape (119130, 7)


Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243


In [20]:
train_lin.cfips.value_counts(), test_lin.cfips.value_counts()

(1001     38
 39133    38
 39089    38
 39091    38
 39093    38
          ..
 21113    38
 21115    38
 21117    38
 21119    38
 56045    38
 Name: cfips, Length: 3135, dtype: int64, 1001     1
 39133    1
 39089    1
 39091    1
 39093    1
         ..
 21113    1
 21115    1
 21117    1
 21119    1
 56045    1
 Name: cfips, Length: 3135, dtype: int64)

In [21]:
DISPLAY = 8
THRESHOLD = 8
ACTIVE_THRESHOLD = 9_000
TRAIN_SIZE = len(train_lin)//3135
TEST_SIZE = len(test_lin)//3135

cfips = train.cfips.unique()
x_train = np.arange(TRAIN_SIZE).reshape((-1,1))
x_test = np.arange(TRAIN_SIZE-1,TRAIN_SIZE+TEST_SIZE).reshape((-1,1))

linear_preds = np.zeros((len(cfips),TEST_SIZE))
last_preds = np.zeros((len(cfips),TEST_SIZE))
lin_trend = 0

ct = 0
for i,c in enumerate(cfips):
    df = train_lin.loc[train_lin.cfips == c]
    last = df.microbusiness_density.values[-1]
    active = df.active.values[-1]
    last_preds[i, :] = [last] * TEST_SIZE
    
    # Fit and transform linear regression model
    model = LinearRegression()
    model.fit(x_train,df.microbusiness_density)
    pred = model.predict(x_train)
    err = pred - df.microbusiness_density.values
    range_data = df.microbusiness_density.max() - df.microbusiness_density.min()
    
    # Determine if time series is linear or not
    score = 0
    for k in range(TRAIN_SIZE):
        abs_err = np.abs( err[k] )
        rel_err = abs_err/(range_data/2)
        score += rel_err
    if (score > THRESHOLD) or (active < ACTIVE_THRESHOLD): 
        linear_preds[i, :] = [last] * TEST_SIZE
    else:    
        pred1 = model.predict(x_test)
        shift =  last - pred1[0]
        linear_preds[i,] = pred1[1:]+shift

    ct += 1
    if ct>=DISPLAY+1: continue

In [22]:
# calculation of smape metric
def smape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    
    sum_abs = np.abs(y_true) + np.abs(y_pred)
    idx = sum_abs == 0
    sum_abs[idx] = 1
    
    return np.mean(200 * np.abs(y_pred - y_true) / sum_abs)

test_lin['true'] = test_lin.microbusiness_density.copy()
test_lin['last'] = last_preds.reshape((-1))
test_lin['linear'] = linear_preds.reshape((-1))
    
m = smape(test_lin.true.values, test_lin['last'].values)
print('Last value model has SMAPE =',m)

m = smape(test_lin.true.values, test_lin['linear'].values)
print('Linear value model has SMAPE =',m)

Last value model has SMAPE = 1.1011190959563657
Linear value model has SMAPE = 1.0961507765972487


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_lin['true'] = test_lin.microbusiness_density.copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_lin['last'] = last_preds.reshape((-1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_lin['linear'] = linear_preds.reshape((-1))


In [None]:
# Add predictions to the 'test' dataframe
test_lin['predicted_microbusiness_density'] = linear_preds.reshape((-1))
submission_df = test_lin[['row_id', 'predicted_microbusiness_density']]
# Write the submission file to disk
submission_df.to_csv('lin_submission.csv', index=False)
print(f"Shape of submission file: {submission_df.shape}")
print(submission_df.head())
