In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
from joblib import Parallel, delayed
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Dense, LSTM, Dropout
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/train.csv')
test_data = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/test.csv')
census_data = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/census_starter.csv')

min_date = pd.to_datetime(min(train_data['first_day_of_month']))
max_date = pd.to_datetime(max(train_data['first_day_of_month']))
margin =  ( max_date.to_period('M')  ) \
                                   - ( min_date.to_period('M')  ) 

dict_val = {}
for leng, val in enumerate(train_data.cfips.unique()):
    dict_val[val] = leng
    


scaler = MinMaxScaler()

#data for broadband access
census_data.iloc[:,0:5]=scaler.fit_transform(census_data.iloc[:,0:5].to_numpy())

#population percent of 4yr degree
census_data.iloc[:,6:11]=scaler.fit_transform(census_data.iloc[:,6:11].to_numpy())

#percent of born outside of the US
census_data.iloc[:,11:16]=scaler.fit_transform(census_data.iloc[:,11:16].to_numpy())

#percent of IT workers
census_data.iloc[:,16:21]=scaler.fit_transform(census_data.iloc[:,16:21].to_numpy())

#income level
census_data.iloc[:,21:26]=scaler.fit_transform(census_data.iloc[:,21:26].to_numpy())

def census_to_train(row,idNum,dataset):
    row=row
    idNum = idNum
    year=pd.to_datetime(train_data['first_day_of_month'][row]).year - 2
    temp_cen_data = pd.DataFrame(census_data[census_data['cfips'] == idNum])
    
    dataset.loc[row,"pct_bb"] = float(temp_cen_data["pct_bb_" + str(year)])
    dataset.loc[row,"pct_college"] = float(temp_cen_data["pct_college_"+ str(year)])
    dataset.loc[row,"pct_foreign_born"] = float(temp_cen_data["pct_foreign_born_"+ str(year)])
    dataset.loc[row,"pct_it_workers"] = float(temp_cen_data["pct_it_workers_"+ str(year)])
    dataset.loc[row,"median_hh_inc"] = float(temp_cen_data["median_hh_inc_"+ str(year)])


def init_to_feature(dataset,LSTM=False,Test=False):
    #dataset['numMons'] = " "
    dates = np.array(dataset['first_day_of_month'])

    #numMons = ( (pd.to_datetime(dates).to_period('M')   ) \
    #                               - ( min_date.to_period('M')  ) )
    #for i in range(len(dataset)):
    #    dataset.loc[i,'numMons'] = numMons[i].n +1
    dataset['numMons'] = " "
    dataset['numMons'] = np.repeat(np.arange(1,margin.n+2,1), len(dict_val))

    
    
    
    dataset["countyID"] = " "
    for leng, val in enumerate(dataset['cfips']):
        dataset.loc[leng,"countyID"] = dict_val[dataset.loc[leng,'cfips']]
       
    dataset = dataset.sort_values(['countyID', 'numMons'],\
              ascending = [True, True])

    dataset["pct_bb"] = " "
    dataset["pct_college"] = " "
    dataset["pct_foreign_born"] = " "
    dataset["pct_it_workers"] = " "
    dataset["median_hh_inc"] = " "

    
    
    for row, idNum in enumerate(tqdm(dataset['cfips'])):
         census_to_train(row,idNum,dataset)
   
    
    
    
    if LSTM==True and Test==False:
        new_set=[]
        for i in np.arange(0, len(dataset),39):
            new_set.append(dataset[i:i+30])
        dataset = pd.concat(new_set)
        
    if LSTM==True and Test==True:
        new_set=[]
        for i in np.arange(0, len(dataset),39):
            new_set.append(dataset[i+25:i+39])
        dataset = pd.concat(new_set)
    
    
    features = np.array(dataset[['countyID','numMons','pct_bb','pct_college',\
                               'pct_foreign_born','pct_it_workers','median_hh_inc','microbusiness_density']])

    
    return features


In [3]:
#test_features = init_to_feature(test_data,LSTM=False)

train_features = init_to_feature(train_data[:],LSTM=True,Test=False)
test_features = init_to_feature(train_data[:],LSTM=True,Test=True)



100%|██████████| 122265/122265 [58:11<00:00, 35.02it/s]
100%|██████████| 122265/122265 [1:00:16<00:00, 33.81it/s]


In [4]:

train_target = np.array(train_features[:,7])
test_target = np.array(test_features[:,7])

train_features = train_features[:,:7]
test_features = test_features[:,:7]




In [5]:

def history_data(dataset1,dataset2, lookback=1):
    data_look=[]
    data_tar=[]
    for i in range(len(dataset1)-lookback):
        val1 = dataset1[i:(i+lookback),:] 
        val2 = dataset2[i+lookback] 
        data_look.append(val1)
        data_tar.append(val2)
    return np.array(data_look), np.array(data_tar)

In [6]:
def lookback_data(ids,featureset,targetset):
   
    dataset1 = featureset[featureset[:,0]==ids]
    dataset2 = targetset[featureset[:,0]==ids]
    rnnset, tar_rnn = history_data(dataset1,dataset2,5)

    rnnset = np.delete(rnnset,0, axis=2)
    rnnset = np.reshape(rnnset, (rnnset.shape[0],rnnset.shape[1],rnnset.shape[2]   ))
    rnnset = rnnset.astype(float)
    
    return rnnset, tar_rnn

In [7]:

predictions=list(np.zeros(3134))
trues=list(np.zeros(3134))
model = Sequential()
model.add(LSTM(units=200,return_sequences=True, input_shape=(5,6)))
model.add(Dropout(0.2))
model.add(LSTM(units=100,return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(units=50))
model.add(Dropout(0.1))
model.add(Dense(64))
model.add(Dropout(0.1))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')



def lstm_model(ids):
    
    train_rnn, traintar_rnn = lookback_data(ids,train_features,train_target )
    test_rnn, testtar_rnn = lookback_data(ids,test_features,test_target )
    
    model.fit(train_rnn,traintar_rnn,epochs=50,batch_size=8,verbose=0)
    
    predictions[ids] = model.predict(test_rnn)
    trues[ids] = testtar_rnn
    


2023-01-12 15:07:33.423870: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [8]:

for ids in tqdm(range(0,3134)):
    lstm_model(ids)
    
    

  0%|          | 0/3134 [00:00<?, ?it/s]2023-01-12 15:07:42.542240: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
100%|██████████| 3134/3134 [3:33:18<00:00,  4.08s/it]


In [9]:
counter=0
summ=0
for tr1,pred1 in zip(trues,predictions):
    for tr2, pred2 in zip(tr1,pred1):
        summ+= abs((-tr2+pred2)/((abs(tr2)+abs(pred2))/2) )
        counter+=1
print('SMAPE: ', summ/counter)

SMAPE:  [nan]
