In [9]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors.regression import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score


In [3]:
def health_smoothing(df,health,cols,rad=10):
    X=df[cols] #features
    y=df[health] #data

    knn= KNeighborsRegressor(n_neighbors=rad).fit(X,y) #fit KNN for smoothing
    
    Y=knn.predict(X) #smoothed column
    df[health+'-smooth']=Y #make new column in dataframe
    return df

def regression_details(X,y,linreg,nhb,cols,health):
      #write a report
    Y=linreg.predict(X)
    print("For",health,"the features are ",cols,"and the number of neighbors is",nhb)
    print("For",health,"the coiefficients are",str(linreg.coef_))
    print("For",health,"the intercept is",str(linreg.intercept_))
    print("For",health,"the R2 score is",str(r2_score(Y,y)))
    print('')
 
    

In [2]:
df_main=pd.read_csv("data/normalized-health-and-environmental-train.csv")
df_main.columns

Index(['TractFIPS', 'no-obesity', 'sleep >7', 'no-asthma',
       'no-mental-health-prob', 'commute', 'pollution', 'density', 'safety',
       'lat', 'long'],
      dtype='object')

In [18]:
#this dictionary is a very liberal inclusion of features
D=({'no-asthma':['density',"pollution"], 
    'sleep >7':['density','commute','pollution','safety'],
    'no-obesity':['commute', 'safety','density','pollution'],
    'no-mental-health-prob':['commute', 'safety','density','pollution']})

for health in ['no-obesity', 'sleep >7', 'no-asthma','no-mental-health-prob']:
    df_smooth=health_smoothing(df_main,health,D[health],500) #use 500 neighbors as default
    smoothed_col=health+"-smooth" #name of smoothed column
    
    X=df_smooth[D[health]]
    y=df_smooth[smoothed_col]
    linreg=LinearRegression().fit(X,y) #fit a linear model
    regression_details(X,y,linreg,500,D[health],health) #print out details of model

For no-obesity the features are  ['commute', 'safety', 'density', 'pollution'] and the number of neighbors is 500
For no-obesity the coiefficients are [ 0.10309197  0.16077573  0.04017918 -0.03005961]
For no-obesity the intercept is 57.49996858454858
For no-obesity the R2 score is 0.46236838767512045

For sleep >7 the features are  ['density', 'commute', 'pollution', 'safety'] and the number of neighbors is 500
For sleep >7 the coiefficients are [-0.04563411 -0.08754074  0.01920954  0.14112566]
For sleep >7 the intercept is 60.50992222453769
For sleep >7 the R2 score is 0.831807621408943

For no-asthma the features are  ['density', 'pollution'] and the number of neighbors is 500
For no-asthma the coiefficients are [-0.0214886   0.00382127]
For no-asthma the intercept is 91.24552109256437
For no-asthma the R2 score is 0.18948611268416948

For no-mental-health-prob the features are  ['commute', 'safety', 'density', 'pollution'] and the number of neighbors is 500
For no-mental-health-prob

Pollution seems to be a very weak indictor. I believe this is due to the pollution measure I am using, which appears to be normalized in some strang way.  I have thus decided to drop pollution.

Safety seems to be too strong an indictor for sleep and may be tracking with income. I will drop this feature for sleep.

In [19]:
#This dictionary gives the features that I reasonably expect to have a relationship with the quantity to be predicted.
#These choices are backed up by public health studies which show correlations.

D=({'no-asthma':['density'], 
    'sleep >7':['density','commute'],
    'no-obesity':['commute', 'safety','density'],
    'no-mental-health-prob':['commute', 'safety','density']})

for health in ['no-obesity', 'sleep >7', 'no-asthma','no-mental-health-prob']:
    df_smooth=health_smoothing(df_main,health,D[health],500) #use 500 neighbors as default
    smoothed_col=health+"-smooth" #name of smoothed column
    
    X=df_smooth[D[health]]
    y=df_smooth[smoothed_col]
    linreg=LinearRegression().fit(X,y) #fit a linear model
    regression_details(X,y,linreg,500,D[health],health) #print out details of model


For no-obesity the features are  ['commute', 'safety', 'density'] and the number of neighbors is 500
For no-obesity the coiefficients are [0.13356146 0.18214074 0.06562914]
For no-obesity the intercept is 53.323411561400285
For no-obesity the R2 score is 0.5608850018124318

For sleep >7 the features are  ['density', 'commute'] and the number of neighbors is 500
For sleep >7 the coiefficients are [-0.07217333 -0.15240359]
For sleep >7 the intercept is 71.45873028853912
For sleep >7 the R2 score is 0.8856411895284388

For no-asthma the features are  ['density'] and the number of neighbors is 500
For no-asthma the coiefficients are [-0.02795549]
For no-asthma the intercept is 91.69058471847006
For no-asthma the R2 score is 0.8415084017548302

For no-mental-health-prob the features are  ['commute', 'safety', 'density'] and the number of neighbors is 500
For no-mental-health-prob the coiefficients are [ 0.04787835  0.06580066 -0.03266286]
For no-mental-health-prob the intercept is 83.763946

Mental health does not seem to admit a linear relationship after smoothing. I will drop this health issue. 