In [3]:
import pandas as pd
from sklearn.neighbors.regression import KNeighborsRegressor,check_array, _get_weights
from sklearn.neighbors import RadiusNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error,r2_score


In [4]:
def writer(outfilename,string):
    outfilename.write(string+".\n")


def health_smoothing(df,health,cols,rad=10): #see R2-plotter for documentation
    X=df[cols]
    y=df[health]
    knn= KNeighborsRegressor(n_neighbors=rad).fit(X,y)
    Y=knn.predict(X)
    df[health+'-smooth']=Y
    return df

def add_missing_features_vals(D): #adds missing keys to dictionary D
    L=['commute', 'safety','density','pollution']
    for x in L:
        if x not in D.keys():
            D[x]=0
    return D


def weight_computer(df,outfile,L,report=True):
    #keys for DD are the health features. The values are ordered pairs (features, n_neighbors for KNN)
    DD=({'no-asthma':(['density'],L[0]),
        'sleep >7':(['density','commute'],L[1]),
        'no-obesity':(['commute', 'safety','density'],L[2]),
        'no-mental-health-prob':(['commute', 'safety','density','pollution'],L[3])}) 

    D1=({'no-asthma':'asthma','sleep >7':'sleep',
        'no-obesity':'obesity','no-mental-health-prob':'mental'}) #renaming dictionary
    
    I=[] #list of intercepts
    features=[] #list of dictionaries where each dictionary has the feature as a key and the coiefficient as the value
    linear_regressors={} #dictionary with keys health issue and values  linear regressor objects
    for x in ['no-obesity','no-asthma', 'no-mental-health-prob', 'sleep >7']:
        I.append(D1[x])
        vals={} #coiefficients for regression

        cols,rad=DD[x]

        df_smooth=health_smoothing(df,x,cols,rad) #make smoothed column for health issue x

        smoothed_col=x+"-smooth"
        X=df_smooth[cols]
        y=df_smooth[smoothed_col]

        linreg=LinearRegression().fit(X,y) #fit a linear regression
        linear_regressors[x]=linreg
        Y=linreg.predict(X) #predict Y values from regression
        
        if report:
            #write a report
            writer(outfile,"For "+x+" the features are "+str(DD[x][0])+" and the number of neighbors is "+str(DD[x][1]))
            print("For",x,"the features are ",str(DD[x][0]),"and the number of neighbors is",str(DD[x][1]))

            writer(outfile,"For "+x+" the coiefficients are "+str(linreg.coef_))
            print("For",x,"the coiefficients are",str(linreg.coef_))

            writer(outfile,"For "+x+" the intercept is "+str(linreg.intercept_))
            print("For",x,"the intercept is",str(linreg.intercept_))

            writer(outfile,"For "+x+" the R2 score is "+str(r2_score(Y,y)))
            print("For",x,"the R2 score is",str(r2_score(Y,y)))

            writer(outfile,"For "+x+" the MSE score is "+str(mean_squared_error(Y,y)))
            print("For",x,"the MSE score is",str(mean_squared_error(Y,y)))
            print('')
            writer(outfile,"\n")

        n=len(DD[x][0]) #number of features
        for i in range(n):
            vals[DD[x][0][i]]=linreg.coef_[i] #set key to be i-th feature and value to be i-th coiefficient

        vals=add_missing_features_vals(vals) #make key for other features with value 0

        vals['intercept']=linreg.intercept_ #make an entry for the intercept
        features.append(vals)

    return df,features,linear_regressors

In [5]:
df_main=pd.read_csv("data/normalized-health-and-environmental-train.csv")
model="model1"
run="final-param"
nums=[300,500,900,900] #the number of neighbors to use in smoothing of asthma,sleep,obesity,and mental health, respectively




name="reports/regression-parameters-"+model+"-"+run+".txt"#name of report txt file
outfile=open(name,'w') #open report file

df,F,linear_regressors=weight_computer(df_main,outfile,nums) #outputs dataframe with smoothed columns as well as a dictionary with coiefficients

outfile.close()
a=pd.DataFrame(F,index=['obesity','asthma', 'mental', 'sleep']) #makes a dataframe from the list of dictionaries

a.to_csv("data/weights.csv") 


For no-obesity the features are  ['commute', 'safety', 'density'] and the number of neighbors is 900
For no-obesity the coiefficients are [0.12825307 0.17677874 0.06647973]
For no-obesity the intercept is 53.7113359756564
For no-obesity the R2 score is 0.7014057086887067
For no-obesity the MSE score is 1.2885855533724746

For no-asthma the features are  ['density'] and the number of neighbors is 300
For no-asthma the coiefficients are [-0.02926282]
For no-asthma the intercept is 91.75831238747185
For no-asthma the R2 score is 0.8018766951177511
For no-asthma the MSE score is 0.01696554830127304

For no-mental-health-prob the features are  ['commute', 'safety', 'density', 'pollution'] and the number of neighbors is 900
For no-mental-health-prob the coiefficients are [ 0.03671632  0.04716226 -0.01723103  0.00702343]
For no-mental-health-prob the intercept is 84.0651998527956
For no-mental-health-prob the R2 score is 0.6146561442787238
For no-mental-health-prob the MSE score is 0.12438967

In [27]:
#we run 
df_main=pd.read_csv("data/normalized-health-and-environmental-train.csv")
df_env=pd.read_csv("data/normalized-environmental-with-income.csv")
model="model1"
run="final-param"
nums=[300,500,900,900] #the number of neighbors to use in smoothing of asthma,sleep,obesity,and mental health, respectively
outfile=''

a,b,linear_regressor=weight_computer(df_main,outfile,nums,report=False) #outputs dataframe with smoothed columns as well as a dictionary with coiefficients
#here we just want the linear_regressor dictionary for the next cell

DD=({'no-asthma':['density'],
        'sleep >7':['density','commute'],
        'no-obesity':['commute', 'safety','density']})    


In [28]:
for x in DD.keys():
    df_env['predicted-'+x]=linear_regressor[x].predict(df_env[DD[x]])
    X=df_env['predicted-'+x]
    Y=df_env['per-capita-income']
    print("The correlation coiefficient for",x," and per-capita-income is",np.corrcoef(X,Y)[0][1])
    
df_env['HealthScore']=0
for x in DD.keys():
    df_env['HealthScore']=.33*linear_regressor[x].predict(df_env[DD[x]])+df_env['HealthScore']

X=df_env['HealthScore']
Y=df_env['per-capita-income']
print("The correlation coiefficient for equal weight HealthScore and per-capita-income is",np.corrcoef(X,Y)[0][1])

The correlation coiefficient for no-asthma  and per-capita-income is -0.11667111407392303
The correlation coiefficient for sleep >7  and per-capita-income is -0.18049589621493875
The correlation coiefficient for no-obesity  and per-capita-income is 0.1309054563244509
The correlation coiefficient for equal weight HealthScore and per-capita-income is -0.025055433141764612
