In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

In [3]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    Elements where both values are zero contribute 0 instead of dividing by zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Lists, NumPy arrays and pandas Series are
        accepted. Values are compared by position, so a Series index is ignored.

    Returns
    -------
    float
        100 times the mean of the per-element terms; ``nan`` for empty input.
    """
    # Coerce up front so lists work and Series are not aligned on their index.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    num = np.abs(actual - forecast)
    dem = (np.abs(actual) + np.abs(forecast)) / 2

    if num.size == 0:
        return float('nan')

    # dem is zero exactly when both values are zero; those terms stay at 0.
    terms = np.divide(num, dem, out=np.zeros_like(num), where=dem != 0)

    return 100 * np.mean(terms)

In [4]:
# Load the competition files. Every train column except 'active' is read, and
# the month column is parsed as a datetime in both train and test.
train = pd.read_csv(
    'train.csv',
    usecols=lambda column_name: column_name != 'active',
    parse_dates=['first_day_of_month'],
)
test = pd.read_csv('test.csv', parse_dates=['first_day_of_month'])
sample = pd.read_csv('sample_submission.csv')

In [5]:
# Attach county and state names to the test rows by looking them up from train via cfips.
county_lookup = train[['cfips', 'county', 'state']].drop_duplicates()
test = pd.merge(test, county_lookup, how='left', on=['cfips'])

In [6]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122265 entries, 0 to 122264
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   row_id                 122265 non-null  object        
 1   cfips                  122265 non-null  int64         
 2   county                 122265 non-null  object        
 3   state                  122265 non-null  object        
 4   first_day_of_month     122265 non-null  datetime64[ns]
 5   microbusiness_density  122265 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 5.6+ MB


In [7]:
# Summarise the test frame (now including the merged county/state columns).
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25080 entries, 0 to 25079
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   row_id              25080 non-null  object        
 1   cfips               25080 non-null  int64         
 2   first_day_of_month  25080 non-null  datetime64[ns]
 3   county              25080 non-null  object        
 4   state               25080 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 1.1+ MB


In [8]:
# Derive calendar month and year columns from the month timestamp in both frames.
for frame in (train, test):
    frame['month'] = frame['first_day_of_month'].dt.month
    frame['year'] = frame['first_day_of_month'].dt.year

In [9]:
# Preview the first training rows, including the new month/year columns.
train.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,month,year
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,8,2019
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,9,2019
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,10,2019
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,11,2019
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,12,2019


In [10]:
# Preview the first test rows, including the new month/year columns.
test.head()

Unnamed: 0,row_id,cfips,first_day_of_month,county,state,month,year
0,1001_2022-11-01,1001,2022-11-01,Autauga County,Alabama,11,2022
1,1003_2022-11-01,1003,2022-11-01,Baldwin County,Alabama,11,2022
2,1005_2022-11-01,1005,2022-11-01,Barbour County,Alabama,11,2022
3,1007_2022-11-01,1007,2022-11-01,Bibb County,Alabama,11,2022
4,1009_2022-11-01,1009,2022-11-01,Blount County,Alabama,11,2022


In [11]:
# Load the BLS employment series keyed by cfips; the month column is parsed as a datetime.
bls = pd.read_csv('archive/bls-emp_by_cfips.csv', parse_dates=['first_day_of_month'])

In [12]:
# Shift every BLS date forward by one year.
# NOTE: not idempotent - re-running this cell moves the dates forward by another year.
bls['first_day_of_month'] = bls['first_day_of_month'] +pd.DateOffset(years= 1)

In [13]:
# Build a '<cfips>_<date>' key from the (already shifted) BLS date.
bls['row_id'] = bls['cfips'].astype(str).str.cat(
    bls['first_day_of_month'].astype(str), sep='_'
)

In [14]:
# Preview the BLS rows with the shifted date and the derived row_id.
bls.head()

Unnamed: 0,series_id,year,period,value,footnote_codes,cfips,first_day_of_month,measure_type,row_id
0,LAUCN010010000000003,1990,M01,6.5,,1001,1991-01-01,Unemployment rate,1001_1991-01-01
1,LAUCN010010000000003,1990,M02,6.5,,1001,1991-02-01,Unemployment rate,1001_1991-02-01
2,LAUCN010010000000003,1990,M03,5.7,,1001,1991-03-01,Unemployment rate,1001_1991-03-01
3,LAUCN010010000000003,1990,M04,6.6,,1001,1991-04-01,Unemployment rate,1001_1991-04-01
4,LAUCN010010000000003,1990,M05,6.0,,1001,1991-05-01,Unemployment rate,1001_1991-05-01


In [15]:
# Earliest (shifted) month present in the BLS data.
bls['first_day_of_month'].min()

Timestamp('1991-01-01 00:00:00')

In [16]:
# Split the BLS table into one frame per measure type.
unemp_rate = bls.loc[bls['measure_type'].eq('Unemployment rate')]
unemp = bls.loc[bls['measure_type'].eq('Unemployment')]
employment = bls.loc[bls['measure_type'].eq('Employment')]
lab_force = bls.loc[bls['measure_type'].eq('Labor force')]

In [17]:
# Keep only the join key and the value, naming the value after its measure.
unemp_rate = unemp_rate[['row_id', 'value']].rename(columns={'value': 'unemp_rate'})
unemp = unemp[['row_id', 'value']].rename(columns={'value': 'unemp'})
employment = employment[['row_id', 'value']].rename(columns={'value': 'employment'})
lab_force = lab_force[['row_id', 'value']].rename(columns={'value': 'lab_force'})


In [18]:
# train = train.merge(unemp_rate, 
#            on = 'row_id', how = 'left').merge(unemp, 
#             on = 'row_id', how = 'left').merge(employment,
#             on = 'row_id', how = 'left').merge(lab_force,
#             on = 'row_id', how = 'left')

In [19]:
# test = test.merge(unemp_rate, 
#             on = 'row_id', how = 'left').merge(unemp, 
#             on = 'row_id', how = 'left').merge(employment,
#             on = 'row_id', how = 'left').merge(lab_force,
#             on = 'row_id', how = 'left')

In [20]:
# Preview train again; the BLS merges above are commented out, so no BLS columns appear.
train.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,month,year
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,8,2019
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,9,2019
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,10,2019
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,11,2019
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,12,2019


In [21]:
# Load the state-level indicator tables, one CSV per indicator.
# The four *_state_gdp_* files are read with ';' as the separator; the others
# use the default comma.
accommodation = pd.read_csv('all_indicators-all_data/accommodation_and_food_services_state_gdp_raw_in_millions.csv', sep= ';')
# NOTE: this rebinds `employment`, which earlier held the BLS 'Employment' rows.
employment = pd.read_csv('all_indicators-all_data/federal_public_employment_raw_in_thousands.csv')
gdp = pd.read_csv('all_indicators-all_data/government_state_gdp_raw_in_millions.csv', sep= ';')
manufacturing_employment = pd.read_csv('all_indicators-all_data/manufacturing_employment_raw_in_thousands.csv')
private_employment = pd.read_csv('all_indicators-all_data/private_employment_raw_in_thousands.csv')
public_employment = pd.read_csv('all_indicators-all_data/public_employment_raw_in_thousands.csv')
retail_retail_trade_employment = pd.read_csv('all_indicators-all_data/retail_trade_employment_raw_in_thousands.csv')
retail_trade_state = pd.read_csv('all_indicators-all_data/retail_trade_state_gdp_raw_in_millions.csv', sep= ';')
state_and_local_public_education = pd.read_csv('all_indicators-all_data/state_and_local_public_education_employment_raw_in_thousands.csv')
state_and_local_public_employment = pd.read_csv('all_indicators-all_data/state_and_local_public_employment_raw_in_thousands.csv')
state_gdp = pd.read_csv('all_indicators-all_data/state_gdp_raw_in_millions.csv', sep= ';')
total_employment = pd.read_csv('all_indicators-all_data/total_employment_raw_in_thousands.csv')
unemployment_rate = pd.read_csv('all_indicators-all_data/unemployment_rate_raw.csv')
weekly_earnings = pd.read_csv('all_indicators-all_data/weekly_earnings_raw.csv')

In [22]:
# Rename the 'Geography' column to 'state' in every indicator table, in place,
# so they share the join key used by train/test.
for indicator_frame in (
    weekly_earnings,
    unemployment_rate,
    total_employment,
    state_gdp,
    state_and_local_public_employment,
    state_and_local_public_education,
    retail_trade_state,
    retail_retail_trade_employment,
    public_employment,
    private_employment,
    manufacturing_employment,
    gdp,
    employment,
    accommodation,
):
    indicator_frame.rename(columns={'Geography': 'state'}, inplace=True)

In [23]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
weekly_earnings_melt = (
    weekly_earnings
    .melt(id_vars='state', var_name='first_day_of_month', value_name='weekly_earnings')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [24]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
accommodation_melt = (
    accommodation
    .melt(id_vars='state', var_name='first_day_of_month', value_name='accommodation')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [25]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
employment_melt = (
    employment
    .melt(id_vars='state', var_name='first_day_of_month', value_name='employment')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [26]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
gdp_melt = (
    gdp
    .melt(id_vars='state', var_name='first_day_of_month', value_name='gdp')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [27]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
manufacturing_employment_melt = (
    manufacturing_employment
    .melt(id_vars='state', var_name='first_day_of_month', value_name='manufacturing_employment')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [28]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
private_employment_melt = (
    private_employment
    .melt(id_vars='state', var_name='first_day_of_month', value_name='private_employment')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [29]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
public_employment_melt = (
    public_employment
    .melt(id_vars='state', var_name='first_day_of_month', value_name='public_employment')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [30]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
retail_retail_trade_employment_melt = (
    retail_retail_trade_employment
    .melt(id_vars='state', var_name='first_day_of_month', value_name='retail_retail_trade_employment')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [31]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
retail_trade_state_melt = (
    retail_trade_state
    .melt(id_vars='state', var_name='first_day_of_month', value_name='retail_trade_state')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [32]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
state_and_local_public_education_melt = (
    state_and_local_public_education
    .melt(id_vars='state', var_name='first_day_of_month', value_name='state_and_local_public_education')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [33]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
state_and_local_public_employment_melt = (
    state_and_local_public_employment
    .melt(id_vars='state', var_name='first_day_of_month', value_name='state_and_local_public_employment')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [34]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
state_gdp_melt = (
    state_gdp
    .melt(id_vars='state', var_name='first_day_of_month', value_name='state_gdp')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [35]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
total_employment_melt = (
    total_employment
    .melt(id_vars='state', var_name='first_day_of_month', value_name='total_employment')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [36]:
# Reshape to long form: every column other than 'state' becomes a row whose
# former header is stored in 'first_day_of_month' and parsed as a datetime.
unemployment_rate_melt = (
    unemployment_rate
    .melt(id_vars='state', var_name='first_day_of_month', value_name='unemployment_rate')
    .assign(first_day_of_month=lambda long_frame: pd.to_datetime(long_frame['first_day_of_month']))
)

In [37]:
# Left-join the long-form indicator tables onto the accommodation table, one
# after another, matching on state and month.
# NOTE(review): state_and_local_public_education_melt and
# state_and_local_public_employment_melt are built earlier but are not joined
# here - confirm whether that omission is intended.
merge_keys = ['state', 'first_day_of_month']
data_merged = accommodation_melt
for indicator_long in (
    employment_melt,
    gdp_melt,
    manufacturing_employment_melt,
    private_employment_melt,
    public_employment_melt,
    retail_retail_trade_employment_melt,
    retail_trade_state_melt,
    state_gdp_melt,
    total_employment_melt,
    unemployment_rate_melt,
    weekly_earnings_melt,
):
    data_merged = data_merged.merge(indicator_long, on=merge_keys, how='left')

In [38]:
# Lag the indicators: add one year to every indicator date, so a train/test row
# dated D is later joined with the indicator values recorded one year before D.
# NOTE: not idempotent - re-running this cell shifts the dates by another year.
data_merged['first_day_of_month'] = data_merged['first_day_of_month'] +pd.DateOffset(years=1)

In [39]:
# Attach the lagged state-level indicators to every test row (left join on state and month).
test = pd.merge(
    test,
    data_merged,
    how='left',
    on=['state', 'first_day_of_month'],
)

In [40]:
# Attach the lagged state-level indicators to every train row (left join on state and month).
train = pd.merge(
    train,
    data_merged,
    how='left',
    on=['state', 'first_day_of_month'],
)

In [41]:
# Move the month column into the index of both frames, in place.
for frame in (train, test):
    frame.set_index('first_day_of_month', inplace=True)

In [42]:
# Remove the identifier/text columns from both frames, keeping the test
# row ids aside first because the submission file needs them.
identifier_columns = ['row_id', 'county', 'state']
train.drop(columns=identifier_columns, inplace=True)
row_id = test['row_id']
test.drop(columns=identifier_columns, inplace=True)

In [43]:
# Summarise train after the indicator merge and the column drops.
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 122265 entries, 2019-08-01 to 2022-10-01
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   cfips                           122265 non-null  int64  
 1   microbusiness_density           122265 non-null  float64
 2   month                           122265 non-null  int64  
 3   year                            122265 non-null  int64  
 4   accommodation                   122265 non-null  float64
 5   employment                      122265 non-null  float64
 6   gdp                             122265 non-null  float64
 7   manufacturing_employment        122265 non-null  float64
 8   private_employment              122265 non-null  float64
 9   public_employment               122265 non-null  float64
 10  retail_retail_trade_employment  122265 non-null  float64
 11  retail_trade_state              122265 non-null  float64
 12  

In [44]:
# Store the county code as a pandas categorical in both frames.
for frame in (train, test):
    frame['cfips'] = frame['cfips'].astype('category')


In [45]:
# Confirm that cfips is now stored with the category dtype.
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 122265 entries, 2019-08-01 to 2022-10-01
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype   
---  ------                          --------------   -----   
 0   cfips                           122265 non-null  category
 1   microbusiness_density           122265 non-null  float64 
 2   month                           122265 non-null  int64   
 3   year                            122265 non-null  int64   
 4   accommodation                   122265 non-null  float64 
 5   employment                      122265 non-null  float64 
 6   gdp                             122265 non-null  float64 
 7   manufacturing_employment        122265 non-null  float64 
 8   private_employment              122265 non-null  float64 
 9   public_employment               122265 non-null  float64 
 10  retail_retail_trade_employment  122265 non-null  float64 
 11  retail_trade_state              122265 non-null  

In [46]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.33, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('rd', LGBMRegressor())])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [47]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.33, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(2)),
#                      ('scaler', RobustScaler()),('rd', Ridge(alpha=42))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [48]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.33, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(3)),
#                      ('scaler', RobustScaler()),('rd', Ridge(alpha=42))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [49]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.33, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(3)),
#                      ('scaler', RobustScaler()), ('rd', RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100]))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [50]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.33, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(2)),('lr', LinearRegression())])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)



In [51]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.33, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(2)),('lr', KernelRidge())])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [52]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.33, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(2)),('lr', BayesianRidge())])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [53]:
# y_pred = []
# y_test_ = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.2, random_state=42, shuffle=False)
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(2)),('lr', RandomForestRegressor())])
#     pipe.fit(X, y)
#     prediction = pipe.predict(X_test[X_test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
#     y_test = list(y_test)
#     y_test_ += list(y_test)
# #submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# smape(np.array(y_test_) ,np.array(y_pred) )
# #pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

Submissions

In [54]:
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X.sort_index()
#     y.sort_index()
#     pipe = Pipeline([('impute', KNNImputer()),('lgbm', 
#             LGBMRegressor(learning_rate= 0.001,n_estimators=1000,objective='mean_squared_error'))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [74]:
# Number of test rows for a single county (cfips 1001).
len(test[test['cfips'] == 1001])


8

In [76]:
# Fit one small model per county and build the submission file.
#
# Predictions are written into the positions of that county's rows in `test`.
# `test` (and therefore `row_id`) is ordered month by month, not county by
# county, so appending each county's predictions to a list would pair them
# with the wrong row ids.
y_pred = np.full(len(test), np.nan)
for i in list(train['cfips'].unique()):
    np.random.seed(42)

    # Sort the county's rows once so features and target stay aligned.
    county_train = train[train['cfips'] == i].sort_index()
    X = county_train.drop('microbusiness_density', axis=1)
    y = county_train['microbusiness_density']

    in_county = (test['cfips'] == i).to_numpy()
    n_test_rows = int(in_county.sum())
    if n_test_rows == 0:
        # Nothing to predict for this county.
        continue

    # NOTE(review): the number of PCA components is tied to the number of test
    # rows for the county, as in the original code - confirm this is intended.
    pipe = Pipeline([('impute', KNNImputer()),
                     ('pca', PCA(n_components=n_test_rows)),
                     ('rd', Ridge(alpha=100))])
    pipe.fit(X, y)
    y_pred[in_county] = pipe.predict(test[in_county])

# Any position still NaN belongs to a county that never appeared in train.
if np.isnan(y_pred).any():
    raise ValueError('some test rows received no prediction (cfips missing from train)')

submission = pd.DataFrame({'row_id': row_id.values, 'microbusiness_density': y_pred})
submission.to_csv('submission.csv', index=False)

In [57]:
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(3)),
#                      ('scaler', RobustScaler()), ('rd', RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100]))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [58]:
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     pipe = Pipeline([('impute', KNNImputer()),('poly', PolynomialFeatures(2)),
#                      ('rd', RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100]))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [59]:
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     pipe = Pipeline([('impute', KNNImputer()),('log',FunctionTransformer(np.sqrt)),
#                      ('rd', Ridge(alpha=100))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [60]:
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X.sort_index(inplace= True)
#     y.sort_index(inplace= True)
#     pipe = Pipeline([('impute', KNNImputer()),
#                      ('rd', Ridge(alpha=100, random_state=42))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [61]:
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X.sort_index(inplace= True)
#     y.sort_index(inplace= True)
#     pipe = Pipeline([('impute', KNNImputer()),
#                      ('rd', RandomForestRegressor(random_state=42))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

In [62]:
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X.sort_index()
#     y.sort_index()
#     pipe = Pipeline([('xgb', xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
#                 max_depth = 5, alpha = 10, n_estimators = 10))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
   
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)

Remove outliers

In [63]:
# cfips = train[['cfips']]
# month = train[['month']]
# year = train[['year']]
# microbusiness_density = train[['microbusiness_density']]

In [64]:
# train.loc['2021-01-01':'2021-09-01']= np.nan

In [65]:
# train.drop(['cfips','month', 'year','microbusiness_density' ], inplace= True,axis=1)

In [66]:
# train['cfips'] = cfips
# train['month'] = month
# train['year'] = year
# train['microbusiness_density'] = microbusiness_density

In [67]:
 # abnormalities removed
# y_pred = []
# for i in list(train['cfips'].unique()):
#     np.random.seed(42)
#     X = train[train['cfips'] == i].drop('microbusiness_density', axis= 1)
#     y = train[train['cfips'] == i]['microbusiness_density']
#     X.sort_index()
#     y.sort_index()
#     pipe = Pipeline([('impute', KNNImputer()),('xgb', xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
#                 max_depth = 5, alpha = 10, n_estimators = 10))])
#     pipe.fit(X, y)
#     prediction = pipe.predict(test[test['cfips'] == i])
#     prediction = list(prediction)
#     y_pred += prediction
   
    
# submission = pd.DataFrame({'row_id' : row_id.values, 'microbusiness_density' : y_pred})
# #smape(np.array(y_test_) ,np.array(y_pred) )
# pd.DataFrame.to_csv(submission, 'submission.csv', index= False)