### Importing Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pathlib import Path
import pickle
import os


from Html_script import retrieve_html
from Plot_AQI import import_pm
from Extract_Combine import combined_data,Meta_Data

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU,PReLU,ELU
from keras.layers import Dropout

In [None]:
# Record the wall-clock start time (seconds since the epoch) for run-time reporting.
start_time=time.time()

### Importing Data

In [None]:
# Folder that holds the HTML pages read by combined_data(); created if missing.
HTML_Path = '/Data/HTML_Data'
Path(HTML_Path).mkdir(parents=True, exist_ok=True)
print('Created Folder {}'.format(HTML_Path))
# NOTE(review): retrieve_html() presumably downloads the raw HTML pages; it is
# left disabled here, so the folder is expected to be populated already — confirm.
#retrieve_html()

# Load the AQI table and the combined variable table for 2013-2018, then attach
# the AQI values to every variable row by date (left join keeps all variable rows).
Aqi_Data = import_pm()
Variable_Data = combined_data(2013, 2018, HTML_Path)
All_Data = pd.merge(Variable_Data, Aqi_Data, how='left', on='Date')

# Keep only the modelling columns. The explicit copy makes Final_Data an
# independent frame, so the column assignments below modify it reliably
# instead of writing into a slice of All_Data.
Final_Data = All_Data.loc[
    :, ['T', 'TM', 'Tm', 'H', 'PP', 'VV', 'V', 'VM', 'PM2.5']
].copy()

# Force every column to a numeric dtype; unparseable entries become NaN.
for column in Final_Data.columns:
    Final_Data[column] = pd.to_numeric(Final_Data[column], errors='coerce')
    

### Feature Engineering

#### Checking for Null Values

In [None]:
# Boolean mask of missing cells: drawn as a heatmap, then summed per column.
null_mask = Final_Data.isnull()
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')
null_mask.sum()


Columns PP, VM and PM2.5 have null values.
Let's analyse PP.

In [None]:
# Frequency of each distinct PP value (missing values are not counted by default).
Final_Data['PP'].value_counts()

Of the total 558 values, PP has 16 nulls and 388 records with a value of 0. So we replace the null values with 0, which is the mode. I am using the mode because I suspect the missing values are of the MCAR (missing completely at random) category.

In [None]:
# Fill missing PP values with the mode, i.e. the most frequent PP value.
# value_counts().max() is not used here because it yields how many times the
# mode occurs, not the mode itself. The result is assigned back to the column
# rather than relying on an in-place fill of a column selection.
pp_mode = Final_Data['PP'].mode().iloc[0]
Final_Data['PP'] = Final_Data['PP'].fillna(pp_mode)

Let's analyze VM.

In [None]:
# Frequency of each distinct VM value (missing values are not counted by default).
Final_Data['VM'].value_counts()

In [None]:
# Show the median of VM, computed over the non-missing values.
print(Final_Data['VM'].median())

Here, too, I suspect the missing values are of the MCAR category. The mode here accounts for only one third of the data, so I would rather use the median to replace the missing values.

In [None]:
# Fill missing VM values with the column median. The result is assigned back
# to the column rather than relying on an in-place fill of a column selection,
# which may operate on a temporary copy and leave Final_Data unchanged.
vm_median = Final_Data['VM'].median()
Final_Data['VM'] = Final_Data['VM'].fillna(vm_median)

PM2.5 will be our dependent column, so we will remove all rows that still contain null values to help the model make correct predictions.

In [None]:
# Drop every row that still contains a NaN in any column (dropna() with no
# arguments is not restricted to PM2.5).
Final_Data=Final_Data.dropna()


In [None]:
# Re-check for missing cells: heatmap of the mask plus per-column counts.
remaining_nulls = Final_Data.isnull()
sns.heatmap(remaining_nulls, yticklabels=False, cbar=False, cmap='viridis')
remaining_nulls.sum()


#### Splitting the predictors and response

In [None]:
# Positional split: the last column is the response, all others are predictors.
column_count = Final_Data.shape[1]
X = Final_Data.iloc[:, 0:column_count - 1]  # independent features
y = Final_Data.iloc[:, column_count - 1]    # dependent variable

### Feature Selection

#### Checking for Correlation
Correlation states how the features are related to each other or the target variable.

Correlation can be positive (an increase in the value of a feature increases the value of the target variable) or negative (an increase in the value of a feature decreases the value of the target variable).

A heatmap makes it easy to identify which features are most related to the target variable. We will plot a heatmap of the correlated features using the seaborn library.

In [None]:
# Grid of pairwise plots across all columns of the cleaned data.
sns.pairplot(Final_Data)

In [None]:
# Pairwise correlation matrix of all columns.
Final_Data.corr()

In [None]:
# Correlation matrix of every column in the dataset.
corrmat = Final_Data.corr()
top_corr_features = corrmat.index

# Annotated heatmap of that matrix on a large canvas. Indexing by
# top_corr_features selects the same columns the matrix was computed from.
plt.figure(figsize=(20, 20))
g = sns.heatmap(
    corrmat.loc[top_corr_features, top_corr_features],
    annot=True,
    cmap="RdYlGn",
)

### Model Development

#### Scaling and Train Test Split of Data

In [None]:
# Split first so the scaler never sees the held-out rows (30% test split,
# fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training predictors only, then apply the same
# transformation to the test predictors. The scaled arrays replace the raw
# ones so the model is trained and evaluated on standardized inputs.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Scaled version of the full predictor matrix, using the train-fitted scaler.
X_Scale = sc_X.transform(X)

#### ANN Regression

In [None]:

# Feed-forward regression network: one 128-unit input layer, three 256-unit
# hidden layers (all ReLU) and a single linear output unit.
NN_model = Sequential()

# The Input Layer: its width is tied to the number of predictor columns.
NN_model.add(Dense(128, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'))

# The Hidden Layers:
NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))

# The Output Layer: one linear unit for the continuous target.
NN_model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Compile the network: mean absolute error is both the loss and the reported metric.
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

# Fitting the ANN to the Training set, holding out 33% of it for validation.
# `epochs` is the current keyword for the number of passes over the data;
# the Keras 1 spelling `nb_epoch` is not accepted by current Keras releases.
model_history = NN_model.fit(X_train, y_train, validation_split=0.33, batch_size=10, epochs=100)

#### Model Evaluation

In [None]:
# Predict on the held-out predictors and plot the distribution of residuals
# (actual minus predicted). y_test is reshaped to a column to match the
# two-dimensional prediction array.
prediction = NN_model.predict(X_test)
actual_column = y_test.values.reshape(-1, 1)
sns.distplot(actual_column - prediction)

In [None]:

# Scatter of actual (x-axis) against predicted (y-axis) target values.
plt.scatter(y_test,prediction)

In [None]:
# Report test-set error: mean absolute error, mean squared error and its root.
mae = metrics.mean_absolute_error(y_test, prediction)
mse = metrics.mean_squared_error(y_test, prediction)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Persist the trained network (NN_model) to ANN.pkl. The context manager
# closes the file even if dumping fails.
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras version — confirm, or switch to NN_model.save(...) if it cannot.
with open('ANN.pkl', 'wb') as file:
    pickle.dump(NN_model, file)

In [None]:
# Report the elapsed wall-clock time since start_time was recorded.
stop_time = time.time()
elapsed_seconds = stop_time - start_time
print("Time taken {} seconds".format(elapsed_seconds))