In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
import time 


# Mount Google Drive so the project CSV can be read from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/MyDrive/Data 298A Project-Team 3 Salton Sea/Data/Health_data_processed/final_mapped_data_v1.csv')

# One-hot encode Age_Group, then drop the original categorical column together
# with Number_of_Asthma_ED_Visits so that neither reaches the feature matrix.
age_group_dummies = pd.get_dummies(data['Age_Group'], prefix='Age_Group')
data = pd.concat([data, age_group_dummies], axis=1)
data = data.drop(columns=['Age_Group', 'Number_of_Asthma_ED_Visits'])

# Friendlier names for the dummy columns. The mapping is deliberately not bound
# to the name `dict`: doing so at notebook level hides the builtin for every
# later cell in the same kernel.
age_group_column_names = {
    'Age_Group_0-17': 'Young_population',
    'Age_Group_18+': 'Adults_population',
    'Age_Group_All Ages': 'General',
}
data = data.rename(columns=age_group_column_names)



# Converting counties to integer values
def convert_to_int(word):
    """Return the integer code for a county name.

    'Imperial' maps to 0 and 'Riverside' maps to 1. Any other value raises
    KeyError. Note that this name is rebound further down to the city encoder.
    """
    county_names = ('Imperial', 'Riverside')
    county_codes = {name: code for code, name in enumerate(county_names)}
    return county_codes[word]

# Replace each county name with its integer code (unknown names raise KeyError).
data['county'] = data['county'].apply(convert_to_int)

# Converting cities to integer values
def convert_to_int(word):
    """Return the integer code (0-38) for a monitoring-site name in the City column.

    The code is the position of the name in the tuple below, so the order must
    not change. Names that are not listed raise KeyError. This definition
    replaces the county encoder that previously used the same name.
    """
    site_names = (
        'Bombay Beach',
        'Brawley-220 Main Street',
        'Buttercup',
        'Cahuilla',
        'Calexico-Ethel Street',
        'Calipatria - Mulberry',
        'Cathedral City',
        'El Centro Naval Air Facility #2',
        'El Centro-9th Street',
        'Fish Creek Mountains',
        'Imperial County Airport',
        'Indio #3',
        'Indio-Jackson Street',
        'Joshua Tree National Park',
        'Joshua Tree NP-Cottonwood #2',
        'La Quinta II',
        'Mecca-65705 Johnson Street',
        'Mecca-66275 Martinez Road',
        'Mecca-90-333 Avenue',
        'Meloland',
        'Naval Test Base',
        'Niland-English Road',
        'Oasis',
        'Palm Springs Regional Airport',
        'Palm Springs-Fire Station',
        'Palo Verde II',
        'PINYON',
        'Salton City',
        'Salton Sea East',
        'Salton Sea Park',
        'Seeley',
        'Sonny Bono',
        'Squaw Lake',
        'Thermal South',
        'Thermal-Jacqueline Cochran Regional Airport',
        'Torres-Martinez',
        'UC-Andrade',
        'Westmorland North',
        'Westmorland-W 1st Street',
    )
    site_codes = {name: code for code, name in enumerate(site_names)}
    return site_codes[word]

# `convert_to_int` now refers to the city encoder defined just above.
data['City'] = data['City'].apply(convert_to_int)

# Min-max scale every column of `data` (features and target alike) and rebuild
# the frame with the original column labels.
# NOTE(review): the scaler is fitted on the full table before the train/test
# split, so held-out rows influence the scaling — confirm this is acceptable.
min_max_scaler = preprocessing.MinMaxScaler()
x = data.values
x_scaled = min_max_scaler.fit(x).transform(x)
data = pd.DataFrame(data=x_scaled, columns=data.columns)



# Features are every column except the target. The target is kept as a 1-D
# array: handing scikit-learn a column vector of shape (n, 1) triggers a
# DataConversionWarning during fit, and the estimator flattens it anyway.
TARGET_COLUMN = 'Age_Adjusted_Rate_of_Asthma_ED_V'
X = data.drop(columns=[TARGET_COLUMN])
Y = data[TARGET_COLUMN].values



#Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)  # 80% training and 20% test



#X_test.to_csv('/content/drive/MyDrive/Data 298A Project-Team 3 Salton Sea/Data/Health_data_processed/Actual_test_data.csv',index=False)


# Base learners for the stacked ensemble, each with fixed hyper-parameters.
# NOTE(review): `criterion='mse'` and `normalize=True` were deprecated in
# scikit-learn 1.0 and removed in 1.2 — confirm the runtime's version.
random_forest = RandomForestRegressor(criterion='mse', max_depth=50, n_estimators=100, max_features=20, random_state=42)
support_vector = SVR(kernel='rbf', gamma='auto', C=100, epsilon=0.01)
elastic_net = ElasticNet(alpha=0.0001, l1_ratio=0.5, max_iter=1000, normalize=True)
gradient_boosting = GradientBoostingRegressor(n_estimators=500, max_features=20, random_state=42)

estimators = [
    ('rfr', random_forest),
    ('svr', support_vector),
    ('enr', elastic_net),
    ('gbr', gradient_boosting),
]

# A plain linear regression combines the base learners' predictions.
reg = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
reg.fit(X_train, y_train)


# Persist the fitted ensemble and the fitted scaler to the working directory.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)

# Score of the stacked model on the held-out 20% split (R^2 for regressors).
print(reg.score(X_test, y_test))

Mounted at /content/drive


  y = column_or_1d(y, warn=True)


0.9776585634294425


In [None]:
# Reload the persisted model and scaler to compare results.
# pickle.load can execute arbitrary code embedded in the file, so only load
# pickles produced by this notebook (or another trusted source).
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('scaler.pkl', 'rb') as scaler_file:
    min_max_scaler = pickle.load(scaler_file)

test_data = pd.read_csv('/content/drive/MyDrive/Data 298A Project-Team 3 Salton Sea/Data/Health_data_processed/Actual_test_data.csv')

# Converting counties to integer values
def convert_to_int(word):
    """Return the integer code for a county name.

    'Imperial' maps to 0 and 'Riverside' maps to 1. Any other value raises
    KeyError. Note that this name is rebound further down to the city encoder.
    """
    county_names = ('Imperial', 'Riverside')
    county_codes = {name: code for code, name in enumerate(county_names)}
    return county_codes[word]

# Replace each county name in the test set with its integer code.
test_data['county'] = test_data['county'].apply(convert_to_int)


# Converting cities to integer values
def convert_to_int(word):
    """Return the integer code (0-38) for a monitoring-site name in the City column.

    The code is the position of the name in the tuple below, so the order must
    not change. Names that are not listed raise KeyError. This definition
    replaces the county encoder that previously used the same name.
    """
    site_names = (
        'Bombay Beach',
        'Brawley-220 Main Street',
        'Buttercup',
        'Cahuilla',
        'Calexico-Ethel Street',
        'Calipatria - Mulberry',
        'Cathedral City',
        'El Centro Naval Air Facility #2',
        'El Centro-9th Street',
        'Fish Creek Mountains',
        'Imperial County Airport',
        'Indio #3',
        'Indio-Jackson Street',
        'Joshua Tree National Park',
        'Joshua Tree NP-Cottonwood #2',
        'La Quinta II',
        'Mecca-65705 Johnson Street',
        'Mecca-66275 Martinez Road',
        'Mecca-90-333 Avenue',
        'Meloland',
        'Naval Test Base',
        'Niland-English Road',
        'Oasis',
        'Palm Springs Regional Airport',
        'Palm Springs-Fire Station',
        'Palo Verde II',
        'PINYON',
        'Salton City',
        'Salton Sea East',
        'Salton Sea Park',
        'Seeley',
        'Sonny Bono',
        'Squaw Lake',
        'Thermal South',
        'Thermal-Jacqueline Cochran Regional Airport',
        'Torres-Martinez',
        'UC-Andrade',
        'Westmorland North',
        'Westmorland-W 1st Street',
    )
    site_codes = {name: code for code, name in enumerate(site_names)}
    return site_codes[word]

# `convert_to_int` now refers to the city encoder defined just above.
test_data['City'] = test_data['City'].apply(convert_to_int)

# Rescale the test features and rebuild the frame with the same column labels.
# NOTE(review): this re-fits the scaler that was just loaded from scaler.pkl on
# the test rows, so the training-time min/max are discarded. A transform-only
# call would need a scaler fitted on the feature columns alone (the pickled one
# was fitted with the target column included) — confirm the intended behaviour.
x_scaled = min_max_scaler.fit(test_data).transform(test_data)
test_data = pd.DataFrame(data=x_scaled, columns=test_data.columns)

start_time = time.time()

# Factor that maps a scaled prediction back to the original rate units.
# NOTE(review): presumably the reciprocal of the fitted scaler's `scale_` entry
# for the target column; it ignores any non-zero minimum offset — confirm.
# Defined once, outside any loop, because a later cell reuses it.
scale = 1 / 4.02252615e-03

# Score the whole table in a single predict call instead of one call per row;
# the elapsed time printed below therefore measures batch inference.
prediction = []
if len(test_data):
    scaled_predictions = model.predict(test_data.values)
    prediction = [round(value * scale, 2) for value in scaled_predictions]

print("--- %s seconds ---" % (time.time() - start_time))
#print(prediction)
# # print(min_max_scaler.scale_)

--- 1.1571004390716553 seconds ---


In [None]:
#print(min_max_scaler.scale_)

In [None]:
# Side-by-side table of held-out targets (rescaled with `scale`) and the model's
# predictions. Relies on `y_test`, `scale` and `prediction` from earlier cells.
df = pd.Series(y_test.ravel() * scale).to_frame('Actual')
df['Predicted'] = pd.Series(prediction)


In [None]:
df

Unnamed: 0,Actual,Predicted
0,11.3,12.61
1,10.0,14.68
2,175.2,161.58
3,17.3,26.64
4,58.0,54.82
...,...,...
124,203.6,206.75
125,68.5,64.41
126,17.1,13.05
127,32.9,28.16


In [None]:
# Read a fresh copy of the test CSV: `test_data` was loaded from the same file
# but has since been integer-encoded and rescaled, so it no longer holds the
# original county/city names and raw measurements.
test= pd.read_csv('/content/drive/MyDrive/Data 298A Project-Team 3 Salton Sea/Data/Health_data_processed/Actual_test_data.csv')

In [None]:
# Attach the Actual/Predicted columns to the raw test rows by row position
# (index-on-index left join, so every row of `test` is kept).
df_out = test.merge(df, how='left', left_index=True, right_index=True)

In [None]:
df_out

Unnamed: 0,Year,Zip_Code,county,CO Annual Mean,NO2_Annual Mean,SO2_Annual Mean,PM2.5_Wtd_Mean,PM10_Annual Mean,surface area,error,City,Temp_Maximum,Temp_1_Day_Average,Pressure_1_Day_Average_mph,Pressure_Maximum_mph,Wind_Maximum_mph,Wind_1_Day_Average_mph,Dewpoint_Maximum_degF,Dewpoint_1_Day_Average_degF,CO_Maximum,NO2_Maximum,SO2_Maximum,O3_1_hr_maximum,O3_8_hr_maximum,PM25_Maximum,PM25_1_Day_Average,PM10_Maximum,PM10_1_Day_Average,Rel_Humidity_Maximum,Rel_Humidity_1_Day_Average,Young_population,Adults_population,General,Actual,Predicted
0,2018,92274,Riverside,6.63,14,0.000000,14.2,52,873.179322,33.194746,Salton City,112.20,99.46,1024.53,1028.21,49.15,31.00,80.75,73.87,1.69100,0.052333,0.002000,0.102667,0.093000,206.066667,31.3400,2581.360000,592.46,98.230769,86.076923,0,0,1,11.3,12.61
1,2016,92254,Riverside,6.38,15,0.000000,14.1,58,888.358913,20.983478,Mecca-66275 Martinez Road,119.00,103.00,1017.52,1019.26,49.50,33.30,73.42,72.16,1.96300,0.041000,0.003000,0.097333,0.087667,327.300000,34.1750,1050.440000,289.72,100.142857,97.571429,0,0,1,10.0,14.68
2,2014,92243,Imperial,7.32,12,0.260618,17.6,62,905.658947,14.689474,El Centro-9th Street,117.00,103.00,1028.17,1031.39,48.52,34.88,79.81,75.07,1.60000,0.059000,0.003000,0.100000,0.080000,447.400000,27.5000,9717.583333,120.40,98.818182,91.300000,1,0,0,175.2,161.58
3,2016,92264,Riverside,6.38,15,0.000000,14.1,58,888.358913,20.983478,Palm Springs Regional Airport,122.00,106.00,1017.52,1019.26,46.00,27.20,75.00,71.00,1.96300,0.041000,0.003000,0.097333,0.087667,327.300000,34.1750,1050.440000,289.72,100.000000,94.000000,0,0,1,17.3,26.64
4,2018,92243,Imperial,6.82,12,0.260618,10.4,60,873.179322,33.194746,Naval Test Base,118.00,106.00,1030.21,1033.52,46.20,29.36,80.00,75.00,2.91700,0.055250,0.003250,0.087750,0.075500,363.500000,86.0000,3804.800000,756.40,100.000000,88.000000,0,0,1,58.0,54.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,2014,92227,Imperial,7.32,12,0.260618,17.6,62,905.658947,14.689474,Brawley-220 Main Street,117.00,101.00,1044.91,1268.89,48.52,34.88,79.81,75.07,2.91700,0.064667,0.005667,0.097000,0.081667,447.400000,24.3000,980.600000,471.80,98.818182,91.300000,1,0,0,203.6,206.75
125,2017,92243,Imperial,6.38,12,1.000000,12.3,56,880.373269,32.749038,Naval Test Base,119.00,105.00,1031.83,1034.53,50.72,32.12,80.00,74.00,3.62275,0.059250,0.004750,0.095500,0.078750,360.000000,70.5000,5951.800000,962.10,100.000000,88.000000,0,0,1,68.5,64.41
126,2013,92274,Riverside,8.22,16,0.000000,14.1,55,913.295862,16.341034,Salton City,113.53,97.84,1032.19,1034.56,55.50,36.90,77.00,73.57,2.43700,0.038667,0.002000,0.103000,0.092333,400.950000,23.9250,666.450000,182.40,98.769231,89.230769,0,1,0,17.1,13.05
127,2013,92254,Riverside,8.22,16,0.000000,14.1,55,913.295862,16.341034,Mecca-90-333 Avenue,92.00,77.00,1032.19,1034.56,55.50,36.90,77.00,73.57,2.43700,0.038667,0.002000,0.103000,0.092333,400.950000,23.9250,666.450000,182.40,93.000000,74.000000,0,0,1,32.9,28.16


In [None]:
# Write the merged test rows with their Actual/Predicted columns to Drive,
# omitting the row index.
df_out.to_csv('/content/drive/MyDrive/Data 298A Project-Team 3 Salton Sea/Data/Health_data_processed/final_actual_predictions.csv',index=False)

In [None]:
# Load the predictions table that also carries Latitude/Longitude columns.
# NOTE(review): this file name differs from the CSV written by this notebook
# ("..._withlat&long.csv"); presumably the coordinates were added outside the
# notebook — confirm where this file is produced.
df_latlong= pd.read_csv("/content/drive/MyDrive/Data 298A Project-Team 3 Salton Sea/Data/Health_data_processed/final_actual_predictions_withlat&long.csv", encoding='unicode_escape')
df_latlong

Unnamed: 0,Year,Zip_Code,county,CO Annual Mean,NO2_Annual Mean,SO2_Annual Mean,PM2.5_Wtd_Mean,PM10_Annual Mean,surface area,error,City,Latitude,Longitude,Temp_Maximum,Temp_1_Day_Average,Pressure_1_Day_Average_mph,Pressure_Maximum_mph,Wind_Maximum_mph,Wind_1_Day_Average_mph,Dewpoint_Maximum_degF,Dewpoint_1_Day_Average_degF,CO_Maximum,NO2_Maximum,SO2_Maximum,O3_1_hr_maximum,O3_8_hr_maximum,PM25_Maximum,PM25_1_Day_Average,PM10_Maximum,PM10_1_Day_Average,Rel_Humidity_Maximum,Rel_Humidity_1_Day_Average,Young_population,Adults_population,General,Actual,Predicted
0,2018,92274,Riverside,6.63,14,0.000000,14.2,52,873.179322,33.194746,Salton City,33.272753999999999,-115.900616,112.20,99.46,1024.53,1028.21,49.15,31.00,80.75,73.87,1.69100,0.052333,0.002000,0.102667,0.093000,206.066667,31.3400,2581.360000,592.46,98.230769,86.076923,0,0,1,11.3,12.61
1,2016,92254,Riverside,6.38,15,0.000000,14.1,58,888.358913,20.983478,Mecca-66275 Martinez Road,33.561250000000001,-116.153383,119.00,103.00,1017.52,1019.26,49.50,33.30,73.42,72.16,1.96300,0.041000,0.003000,0.097333,0.087667,327.300000,34.1750,1050.440000,289.72,100.142857,97.571429,0,0,1,10.0,14.68
2,2014,92243,Imperial,7.32,12,0.260618,17.6,62,905.658947,14.689474,El Centro-9th Street,32.792149999999999,-115.56299,117.00,103.00,1028.17,1031.39,48.52,34.88,79.81,75.07,1.60000,0.059000,0.003000,0.100000,0.080000,447.400000,27.5000,9717.583333,120.40,98.818182,91.300000,1,0,0,175.2,161.58
3,2016,92264,Riverside,6.38,15,0.000000,14.1,58,888.358913,20.983478,Palm Springs Regional Airport,33.816667000000002,-116.5,122.00,106.00,1017.52,1019.26,46.00,27.20,75.00,71.00,1.96300,0.041000,0.003000,0.097333,0.087667,327.300000,34.1750,1050.440000,289.72,100.000000,94.000000,0,0,1,17.3,26.64
4,2018,92243,Imperial,6.82,12,0.260618,10.4,60,873.179322,33.194746,Naval Test Base,33.169226000000002,-115.855927,118.00,106.00,1030.21,1033.52,46.20,29.36,80.00,75.00,2.91700,0.055250,0.003250,0.087750,0.075500,363.500000,86.0000,3804.800000,756.40,100.000000,88.000000,0,0,1,58.0,54.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,2014,92227,Imperial,7.32,12,0.260618,17.6,62,905.658947,14.689474,Brawley-220 Main Street,32.97831,-115.53904,117.00,101.00,1044.91,1268.89,48.52,34.88,79.81,75.07,2.91700,0.064667,0.005667,0.097000,0.081667,447.400000,24.3000,980.600000,471.80,98.818182,91.300000,1,0,0,203.6,206.75
125,2017,92243,Imperial,6.38,12,1.000000,12.3,56,880.373269,32.749038,Naval Test Base,33.169226000000002,-115.855927,119.00,105.00,1031.83,1034.53,50.72,32.12,80.00,74.00,3.62275,0.059250,0.004750,0.095500,0.078750,360.000000,70.5000,5951.800000,962.10,100.000000,88.000000,0,0,1,68.5,64.41
126,2013,92274,Riverside,8.22,16,0.000000,14.1,55,913.295862,16.341034,Salton City,33.272753999999999,-115.900616,113.53,97.84,1032.19,1034.56,55.50,36.90,77.00,73.57,2.43700,0.038667,0.002000,0.103000,0.092333,400.950000,23.9250,666.450000,182.40,98.769231,89.230769,0,1,0,17.1,13.05
127,2013,92254,Riverside,8.22,16,0.000000,14.1,55,913.295862,16.341034,Mecca-90-333 Avenue,33.590833000000003,-116.088333,92.00,77.00,1032.19,1034.56,55.50,36.90,77.00,73.57,2.43700,0.038667,0.002000,0.103000,0.092333,400.950000,23.9250,666.450000,182.40,93.000000,74.000000,0,0,1,32.9,28.16


In [None]:
df_latlong.dtypes

Year                             int64
Zip_Code                         int64
county                          object
CO Annual Mean                 float64
NO2_Annual Mean                  int64
SO2_Annual Mean                float64
PM2.5_Wtd_Mean                 float64
PM10_Annual Mean                 int64
surface area                   float64
error                          float64
City                            object
Latitude                        object
Longitude                       object
Temp_Maximum                   float64
Temp_1_Day_Average             float64
Pressure_1_Day_Average_mph     float64
Pressure_Maximum_mph           float64
Wind_Maximum_mph               float64
Wind_1_Day_Average_mph         float64
Dewpoint_Maximum_degF          float64
Dewpoint_1_Day_Average_degF    float64
CO_Maximum                     float64
NO2_Maximum                    float64
SO2_Maximum                    float64
O3_1_hr_maximum                float64
O3_8_hr_maximum          

In [None]:
# Convert every cell to its string form. `astype(str)` is used instead of
# `applymap(str)`, which is deprecated in recent pandas releases; the columns
# needed as numbers are cast back to float afterwards.
df_latlong = df_latlong.astype(str)

Year                           object
Zip_Code                       object
county                         object
CO Annual Mean                 object
NO2_Annual Mean                object
SO2_Annual Mean                object
PM2.5_Wtd_Mean                 object
PM10_Annual Mean               object
surface area                   object
error                          object
City                           object
Latitude                       object
Longitude                      object
Temp_Maximum                   object
Temp_1_Day_Average             object
Pressure_1_Day_Average_mph     object
Pressure_Maximum_mph           object
Wind_Maximum_mph               object
Wind_1_Day_Average_mph         object
Dewpoint_Maximum_degF          object
Dewpoint_1_Day_Average_degF    object
CO_Maximum                     object
NO2_Maximum                    object
SO2_Maximum                    object
O3_1_hr_maximum                object
O3_8_hr_maximum                object
PM25_Maximum

In [None]:
# The map needs numeric coordinates and a numeric marker size, so cast the
# three columns it uses back from strings to floats.
for numeric_column in ("Latitude", "Longitude", "Predicted"):
    df_latlong[numeric_column] = df_latlong[numeric_column].astype(float)

Year                            object
Zip_Code                        object
county                          object
CO Annual Mean                  object
NO2_Annual Mean                 object
SO2_Annual Mean                 object
PM2.5_Wtd_Mean                  object
PM10_Annual Mean                object
surface area                    object
error                           object
City                            object
Latitude                       float64
Longitude                      float64
Temp_Maximum                    object
Temp_1_Day_Average              object
Pressure_1_Day_Average_mph      object
Pressure_Maximum_mph            object
Wind_Maximum_mph                object
Wind_1_Day_Average_mph          object
Dewpoint_Maximum_degF           object
Dewpoint_1_Day_Average_degF     object
CO_Maximum                      object
NO2_Maximum                     object
SO2_Maximum                     object
O3_1_hr_maximum                 object
O3_8_hr_maximum          

In [None]:
# `px` is not imported in any of the visible cells, so bring it into scope here;
# without this the cell raises NameError on a fresh kernel.
import plotly.express as px

# One marker per row: position from Latitude/Longitude, colour by site name,
# marker size proportional to the predicted rate; hovering shows both values.
fig = px.scatter_mapbox(
    df_latlong,
    lat="Latitude",
    lon="Longitude",
    color="City",
    size="Predicted",
    hover_name="City",
    hover_data=["Actual", "Predicted"],
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=20,
    zoom=7,
    mapbox_style="open-street-map",
)
# Remove the default margins so the map fills the output area.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()