# DS 2010 Final Project - Regression

## Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import statistics
import json
import statsmodels.api as sm

## Import Data

In [2]:
# set random seed
np.random.seed(2010) 

# import data 
data = pd.read_csv("Clean_Data/final_data.csv")
responses = ['Mix_Score', 'Depression_Score', 'Anxiety_Score']

## Multiple Linear Regression

In [3]:
def multiple_linear_regression(data, predictors, responses):
    for response in responses:
        X = data.loc[:,predictors] # feature columns
        y = data.loc[:,response] # label column

        ## Linear Regression ##
        regr = linear_model.LinearRegression()
        lm = regr.fit(X, y) # train on training data
        score = regr.score(X, y)

        # Create summary table
        X2 = sm.add_constant(X)
        est = sm.OLS(y, X2)
        est2 = est.fit()
        
        print(est2.summary())
        # print(regr.coef_)
        print(score)

In [4]:
# Multiple Linear Regression
new_england = data.copy().loc[data["State"].isin(['MA', 'ME', 'VT', 'NH', 'CT', "RI"])].reset_index(drop=True)
new_england = new_england.dropna()
predictors = list(data.columns[5:]) # including: 'positivityRate','positiveTestsViral', 'positiveTestsViralIncrease'
multiple_linear_regression(new_england, predictors, responses)

                            OLS Regression Results                            
Dep. Variable:              Mix_Score   R-squared:                       0.740
Model:                            OLS   Adj. R-squared:                  0.566
Method:                 Least Squares   F-statistic:                     4.263
Date:                Tue, 08 Dec 2020   Prob (F-statistic):           0.000183
Time:                        16:25:59   Log-Likelihood:                -110.30
No. Observations:                  51   AIC:                             262.6
Df Residuals:                      30   BIC:                             303.2
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

## Single Linear Regression

In [5]:
def single_linear_regression(data, states, predictors, responses):
    final = {}
    for state in states:
        for predictor in predictors:
            for response in responses:
                # get state data
                state_data = data.copy().loc[data['State'] == state].reset_index(drop=True)
                X = state_data.loc[:,predictor] # feature columns
                y = state_data.loc[:,responses] # label column
                X_reshaped = X.to_numpy().reshape(-1, 1)

                ## Linear Regression ##
                regr = linear_model.LinearRegression()
                lm = regr.fit(X_reshaped, y) # train on training data
                score = regr.score(X_reshaped, y)
                
                # saves best r^2 and corresponding feature
                if not response in final:
                    final[response] = {}
                if state in final[response]:
                    if score > final[response][state][1]:
                        final[response][state] = [predictor, score]
                else:
                    final[response][state] = [predictor, score]

                if state_data.shape[0] == 0:
                    continue
    return final

In [6]:
# Single Linear Regression
states = data['State'].unique()
ignore_predictors = ['positivityRate','positiveTestsViral', 'positiveTestsViralIncrease']
predictors = [item for item in list(data.columns[5:]) if item not in ignore_predictors]
final_slr = single_linear_regression(data, states, predictors, responses)
with open("final_slr.json", "w") as outfile:  
        json.dump(final_slr, outfile) 

In [7]:
# final result dictionary
'''
final_slr: 
{
Mix_Score:{state:['bestFeature', R^2]},
Depression_Score:{state:['bestFeature', R^2]},
Anxiety_Score:{state:['bestFeature', R^2])
}
'''

final_slr

{'Mix_Score': {'AL': ['hospitalizedCurrently', 0.5405792069032893],
  'AK': ['positiveCasesIncrease', 0.1724416479921368],
  'AZ': ['positiveCasesIncrease', 0.5672884929211565],
  'AR': ['hospitalizedCurrently', 0.17201648406698042],
  'CA': ['positiveCasesIncrease', 0.5282891209378344],
  'CO': ['positiveCasesIncrease', 0.4101428872971214],
  'CT': ['Number of Trips 1-3', 0.07853477312763117],
  'GA': ['Number of Trips <1', 0.264401293993017],
  'ID': ['hospitalizedCurrently', 0.42225361782987764],
  'IL': ['Number of Trips <1', 0.22663542518020752],
  'IN': ['Number of Trips <1', 0.13328774890183073],
  'IA': ['death', 0.35139504456437304],
  'KY': ['death', 0.3810517716376515],
  'LA': ['positiveCasesIncrease', 0.26685844649785906],
  'ME': ['positiveCasesViral', 0.26635344004797773],
  'MD': ['positiveCasesIncrease', 0.23104503227851003],
  'MA': ['positiveCasesViral', 0.3626153916423871],
  'MN': ['Number of Trips <1', 0.14561749946512062],
  'MO': ['hospitalizedCurrently', 0.1547