# DS 2010 Final Project - Regression

## Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import statistics
import json
import statsmodels.api as sm

## Import Data

In [2]:
# set random seed
np.random.seed(2010) 

# import data 
data = pd.read_csv("Clean_Data/final_data.csv")
responses = ['Mix_Score', 'Depression_Score', 'Anxiety_Score']

## Multiple Linear Regression

In [3]:
def multiple_linear_regression(data, predictors, responses):
    for response in responses:
        X = data.loc[:,predictors] # feature columns
        y = data.loc[:,response] # label column

        ## Linear Regression ##
        regr = linear_model.LinearRegression()
        lm = regr.fit(X, y) # train on training data
        score = regr.score(X, y)

        # Create summary table
        X2 = sm.add_constant(X)
        est = sm.OLS(y, X2)
        est2 = est.fit()
        
        print(est2.summary())
        # print(regr.coef_)
        print(score)

### Run on all data

In [4]:
# Multiple Linear Regression
# All Data
all_data = data.copy()

# ignore_columns = ['Period', 'state','positiveTestsViral', 'positiveTestsViralIncrease']
# drop_by_columns = [item for item in list(new_england.columns) if item not in ignore_columns]
feature_data = all_data.drop(['death', 'positiveTestsViral', 'positiveTestsViralIncrease',
                              'positivityRate', 'positiveCasesViral'],
                               axis= 1)

predictors = feature_data.columns[5:]
multiple_linear_regression(feature_data, predictors, responses)

                            OLS Regression Results                            
Dep. Variable:              Mix_Score   R-squared:                       0.299
Model:                            OLS   Adj. R-squared:                  0.286
Method:                 Least Squares   F-statistic:                     23.80
Date:                Thu, 10 Dec 2020   Prob (F-statistic):           6.71e-55
Time:                        03:49:31   Log-Likelihood:                -2349.5
No. Observations:                 853   AIC:                             4731.
Df Residuals:                     837   BIC:                             4807.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

### All Data - Top 6 Predictors

In [5]:
# Multiple Linear Regression
# All Data - top 6 predictors

all_data = data.copy()
feature_data = all_data.drop(['death', 'positiveTestsViral', 'positiveTestsViralIncrease',
                              'positivityRate', 'positiveCasesViral'],
                               axis= 1)
predictors = ['deathIncrease', 'hospitalizedCurrently', 
              'Population Staying at Home',
              'Number of Trips 100-250', 'Number of Trips 250-500',
              'Number of Trips >=500']
multiple_linear_regression(feature_data, predictors, responses)

                            OLS Regression Results                            
Dep. Variable:              Mix_Score   R-squared:                       0.218
Model:                            OLS   Adj. R-squared:                  0.212
Method:                 Least Squares   F-statistic:                     39.23
Date:                Thu, 10 Dec 2020   Prob (F-statistic):           3.51e-42
Time:                        03:49:31   Log-Likelihood:                -2396.3
No. Observations:                 853   AIC:                             4807.
Df Residuals:                     846   BIC:                             4840.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

### All Data - Rows with positivityRate

In [6]:
# Multiple Linear Regression
# All Data - rows with positivity rate

all_data = data.copy()
feature_data = all_data.drop([],
                               axis= 1)
feature_data = feature_data.dropna()
predictors = feature_data.columns[5:]
multiple_linear_regression(feature_data, predictors, responses)

                            OLS Regression Results                            
Dep. Variable:              Mix_Score   R-squared:                       0.410
Model:                            OLS   Adj. R-squared:                  0.363
Method:                 Least Squares   F-statistic:                     8.726
Date:                Thu, 10 Dec 2020   Prob (F-statistic):           1.87e-19
Time:                        03:49:31   Log-Likelihood:                -679.59
No. Observations:                 272   AIC:                             1401.
Df Residuals:                     251   BIC:                             1477.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

### Run on New England data

In [7]:
# Multiple Linear Regression
# New England
new_england = data.copy().loc[data["State"].isin(['MA', 'ME', 'VT', 'NH', 'CT', "RI"])].reset_index(drop=True)

ignore_predictors = ['positiveTestsViral', 'positiveCasesViral',]
ignore_predictors = ['death','positivityRate', 'positiveTestsViralIncrease', 
                     'positiveTestsViral', 'positiveCasesViral']
predictors = [item for item in list(data.columns[5:]) if item not in ignore_predictors]
multiple_linear_regression(new_england, predictors, responses)

                            OLS Regression Results                            
Dep. Variable:              Mix_Score   R-squared:                       0.473
Model:                            OLS   Adj. R-squared:                  0.387
Method:                 Least Squares   F-statistic:                     5.504
Date:                Thu, 10 Dec 2020   Prob (F-statistic):           8.26e-08
Time:                        03:49:31   Log-Likelihood:                -262.72
No. Observations:                 108   AIC:                             557.4
Df Residuals:                      92   BIC:                             600.4
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

### New England - Top 3 Predictors

In [8]:
# Multiple Linear Regression
# New England - top 3 predictors
new_england = data.copy().loc[data["State"].isin(['MA', 'ME', 'VT', 'NH', 'CT', "RI"])].reset_index(drop=True)
new_england = new_england.dropna()

predictors = ['death', 'positivityRate', 'positiveCasesViral']
multiple_linear_regression(new_england, predictors, responses)

                            OLS Regression Results                            
Dep. Variable:              Mix_Score   R-squared:                       0.372
Model:                            OLS   Adj. R-squared:                  0.332
Method:                 Least Squares   F-statistic:                     9.292
Date:                Thu, 10 Dec 2020   Prob (F-statistic):           6.19e-05
Time:                        03:49:31   Log-Likelihood:                -132.75
No. Observations:                  51   AIC:                             273.5
Df Residuals:                      47   BIC:                             281.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 28.9772      2

## Single Linear Regression

In [9]:
def single_linear_regression(data, states, predictors, responses):
    final = {}
    for state in states:
        for predictor in predictors:
            for response in responses:
                # get state data
                state_data = data.copy().loc[data['State'] == state].reset_index(drop=True)
                X = state_data.loc[:,predictor] # feature columns
                y = state_data.loc[:,responses] # label column
                X_reshaped = X.to_numpy().reshape(-1, 1)

                ## Linear Regression ##
                regr = linear_model.LinearRegression()
                lm = regr.fit(X_reshaped, y) # train on training data
                score = regr.score(X_reshaped, y)
                
                # saves best r^2 and corresponding feature
                if not response in final:
                    final[response] = {}
                if state in final[response]:
                    if score > final[response][state][1]:
                        final[response][state] = [predictor, score]
                else:
                    final[response][state] = [predictor, score]

                if state_data.shape[0] == 0:
                    continue
    return final

In [10]:
# Single Linear Regression
states = data['State'].unique()
ignore_predictors = ['positivityRate','positiveTestsViral', 'positiveTestsViralIncrease']
predictors = [item for item in list(data.columns[5:]) if item not in ignore_predictors]
final_slr = single_linear_regression(data, states, predictors, responses)
with open("final_slr.json", "w") as outfile:  
        json.dump(final_slr, outfile) 

In [11]:
# final result dictionary
'''
final_slr: 
{
Mix_Score:{state:['bestFeature', R^2]},
Depression_Score:{state:['bestFeature', R^2]},
Anxiety_Score:{state:['bestFeature', R^2])
}
'''

final_slr

{'Mix_Score': {'AL': ['hospitalizedCurrently', 0.5405792069032893],
  'AK': ['positiveCasesIncrease', 0.1724416479921368],
  'AZ': ['positiveCasesIncrease', 0.5672884929211565],
  'AR': ['Number of Trips 10-25', 0.1742809206122338],
  'CA': ['positiveCasesIncrease', 0.5282891209378344],
  'CO': ['Number of Trips 1-3', 0.4418718440086821],
  'CT': ['Population Not Staying at Home', 0.22249181166150236],
  'GA': ['Number of Trips 10-25', 0.2136405842834612],
  'ID': ['Number of Trips 3-5', 0.5099082374408354],
  'IL': ['Population Not Staying at Home', 0.11137771621762083],
  'IN': ['Number of Trips 10-25', 0.3444404611035054],
  'IA': ['death', 0.35139504456437304],
  'KY': ['Number of Trips', 0.5797903799073492],
  'LA': ['positiveCasesIncrease', 0.26685844649785906],
  'ME': ['positiveCasesViral', 0.26635344004797773],
  'MD': ['positiveCasesIncrease', 0.23104503227851003],
  'MA': ['Population Not Staying at Home', 0.46830871366156385],
  'MN': ['Population Staying at Home', 0.253157