In [136]:
import os 
import numpy as np
import pandas as pd 
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import statsmodels.api as sm
from statsmodels.formula.api import ols
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from patsy import dmatrices
from sklearn.metrics import r2_score 
import math

In [2]:
# # change directories (up two levels, then into 'Data') - run only once per kernel session
# os.chdir(os.path.dirname(os.getcwd()))
# os.chdir(os.path.dirname(os.getcwd()))
# os.chdir('Data')

In [137]:
# read in data: the full table keeps FIPS as a string, left-padded with zeros
# to 5 characters, and is written back to the same file with the padded codes
df_full = (
    pd.read_csv('od_final_transformed.csv', dtype={'FIPS': str})
    .assign(FIPS=lambda frame: frame['FIPS'].astype(str).str.zfill(5))
)
df_full.to_csv('od_final_transformed.csv', index=False)

# second table, used below to fit the regression
# (presumably the rows with no missing values - confirm against the file)
df_na = pd.read_csv('od_nonull_final.csv')

# define RMSE function
def compute_rmse(predictions, yvalues):
    """Return the root-mean-squared error between predictions and observed values.

    The squared differences ``(yvalues - predictions) ** 2`` are summed,
    divided by ``len(yvalues)``, and the square root of that quotient is
    returned.
    """
    residuals = np.subtract(yvalues, predictions)
    mean_squared_error = np.sum(residuals ** 2) / len(yvalues)
    return np.sqrt(mean_squared_error)

In [144]:
# read the county adjacency data straight from its subfolder; building the path
# instead of calling os.chdir means a failed read cannot leave the kernel in
# the wrong working directory, and the cell can be re-run safely
adjacencies = pd.read_csv(
    os.path.join('Geographical_Data', 'county_adjacency.csv'),
    dtype={'fipscounty': str, 'fipsneighbor': str},
)

# remove rows where the adjacent county is the county itself
adjacencies = adjacencies[adjacencies['fipscounty'] != adjacencies['fipsneighbor']]

# rename positionally (the file is expected to have exactly these four columns,
# in this order); the neighbor's code becomes 'FIPS' so that it can be merged
# against the FIPS column of the overdose data
adjacencies.columns = ['countyname', 'fipscounty', 'neighborname', 'FIPS']

In [139]:
# fit our model: regress the log overdose rate on every other column of df_na
features = df_na.columns.drop('log_Overdose_Rate_per_100k')
plop = '+'.join(features)  # right-hand side of the formula: "col1+col2+..."
mod = ols(f'log_Overdose_Rate_per_100k ~ {plop}', data=df_na)
res = mod.fit()
print(res.summary())

                                OLS Regression Results                                
Dep. Variable:     log_Overdose_Rate_per_100k   R-squared:                       0.625
Model:                                    OLS   Adj. R-squared:                  0.625
Method:                         Least Squares   F-statistic:                     1669.
Date:                        Sun, 01 May 2022   Prob (F-statistic):               0.00
Time:                                17:58:55   Log-Likelihood:                -2774.1
No. Observations:                        8018   AIC:                             5566.
Df Residuals:                            8009   BIC:                             5629.
Df Model:                                   8                                         
Covariance Type:                    nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

In [140]:
# this is where we are missing overdose rates in the full data
missing_mask = df_full['log_Overdose_Rate_per_100k'].isnull()
missing_idx = np.where(missing_mask)[0]  # positional row numbers of the gaps

# make predictions using our ols model
predicts = res.predict(df_full)

# fill in what we can of the missing data. A single .loc write is used instead
# of chained assignment (df.col[idx] = ...), which can end up writing to a
# temporary object and leave df_full unchanged (always the case under pandas
# Copy-on-Write). Values are matched on the row index.
df_full.loc[missing_mask, 'log_Overdose_Rate_per_100k'] = predicts[missing_mask]

In [215]:
# function for computing spatial mean
def get_spatmean(df, adjacencies):
    """Fill missing ``log_Spatial_Mean`` values from neighboring counties.

    For every (Year, county) pair, the mean ``log_Overdose_Rate_per_100k`` of
    that county's neighbors in that year is computed. The mean is written into
    the rows of ``df`` whose ``log_Spatial_Mean`` is null; rows that already
    have a value are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Year``, ``FIPS`` and ``log_Overdose_Rate_per_100k``.
        ``log_Spatial_Mean`` is created (all NaN) when it is absent.
    adjacencies : pandas.DataFrame
        One row per (county, neighbor) pair, with the county's code in
        ``fipscounty`` and the neighbor's code in ``FIPS``. Other columns
        are ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, modified in place.
    """
    rate_col = 'log_Overdose_Rate_per_100k'
    mean_col = 'log_Spatial_Mean'

    if mean_col not in df.columns:
        df[mean_col] = np.nan

    # attach each neighbor's yearly rate to its adjacency row. An inner join is
    # enough: neighbors without any row in df would only produce rows with a
    # null Year, which groupby drops anyway.
    neighbor_rates = adjacencies[['fipscounty', 'FIPS']].merge(
        df[['Year', 'FIPS', rate_col]], on='FIPS', how='inner'
    )

    # average only the rate column; calling .mean() on the whole frame would
    # also try to average the string columns
    neighbor_means = (
        neighbor_rates
        .groupby(['Year', 'fipscounty'], as_index=False)[rate_col]
        .mean()
        .rename(columns={'fipscounty': 'FIPS', rate_col: '_neighbor_mean'})
    )

    # one lookup row per row of df, in df's order: the (Year, FIPS) keys of
    # neighbor_means are unique, so the left join cannot duplicate rows
    lookup = df[['Year', 'FIPS']].merge(neighbor_means, on=['Year', 'FIPS'], how='left')

    # use the nulls of the frame that was passed in (not a notebook global) and
    # write positionally through .loc so that any index on df is handled
    fill_mask = df[mean_col].isna().to_numpy()
    df.loc[fill_mask, mean_col] = lookup['_neighbor_mean'].to_numpy()[fill_mask]

    return df

In [220]:
# fill missing spatial means from neighboring counties.
# NOTE(review): get_spatmean returns the frame it was given, so df_full2 is the
# same object as df_full and every change below is visible through both names.
# Pass df_full.copy() instead if the two are meant to differ - confirm intent.
df_full2 = get_spatmean(df_full, adjacencies)

# this is where we are missing overdose rates in the full data
missing_mask = df_full2['log_Overdose_Rate_per_100k'].isnull()
missing_idx = np.where(missing_mask)[0]  # positional row numbers of the gaps

# make predictions using our ols model
predicts = res.predict(df_full2)

# fill in what we can of the missing data. A single .loc write is used instead
# of chained assignment (df.col[idx] = ...), which can end up writing to a
# temporary object and leave the frame unchanged (always the case under pandas
# Copy-on-Write). Values are matched on the row index.
df_full2.loc[missing_mask, 'log_Overdose_Rate_per_100k'] = predicts[missing_mask]

In [221]:
# write both estimate tables to CSV without the row index
for frame, out_name in ((df_full, 'test_estimate.csv'), (df_full2, 'test_estimate2.csv')):
    frame.to_csv(out_name, index=False)