## Artificial Neural Network Assignment

### Importing the modules

In [186]:
import pandas as pd
import numpy as np
np.set_printoptions(legacy="1.25")
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense

### Importing and preproccessing data

In [187]:
data = pd.read_csv("updated_air_quality_data.csv")
data.drop("Unique ID", axis=1, inplace=True) # Removing the Unique ID column as it is not needed
data.dropna(inplace=True) # Remove any NaN if there
data

Unnamed: 0,Name,Measure,Geo Type Name,Geo Place Name,Time Period,Start_Date,Data Value,Air Quality Category
0,Emissions,Density,UHF42,Queens,Other,2015-01-01,0.3,Good
1,Emissions,Density,UHF42,Unknown,Other,2015-01-01,1.2,Good
2,General Pollution,Miles,UHF42,Unknown,Annual Average,2011-12-01,8.6,Good
3,General Pollution,Miles,UHF42,Queens,Annual Average,2011-12-01,8.0,Good
4,General Pollution,Miles,UHF42,Queens,Summer,2022-06-01,6.1,Good
...,...,...,...,...,...,...,...,...
18011,General Pollution,Miles,CD,Unknown,Other,2019-01-01,12.9,Moderate
18012,General Pollution,Miles,CD,Unknown,Other,2010-01-01,14.7,Moderate
18013,General Pollution,Miles,UHF42,Unknown,Other,2010-01-01,43.4,Bad
18014,General Pollution,Miles,UHF42,Queens,Other,2010-01-01,65.8,Bad


In [188]:
data_enc = data.copy()
for col in data:
	if (col != 'Start_Date') and (col != 'Data Value'):
		data_enc[col] = data[col].astype("category").cat.codes
data_enc["Start_Date"] = data["Start_Date"].apply(lambda a: int(a.replace("-", "")[2:])/10000) # coverting data by simply removing the first 2 digits of the year and then combining to yyddmm and then making the number small
print(dict(zip(data["Air Quality Category"].astype("category").cat.categories, data["Air Quality Category"].astype("category").cat.codes.unique()))) # print the mapping for the air quality
data_enc

{'Bad': 1, 'Good': 2, 'Moderate': 0}


Unnamed: 0,Name,Measure,Geo Type Name,Geo Place Name,Time Period,Start_Date,Data Value,Air Quality Category
0,1,1,4,3,1,15.0101,0.3,1
1,1,1,4,5,1,15.0101,1.2,1
2,2,2,4,5,0,11.1201,8.6,1
3,2,2,4,3,0,11.1201,8.0,1
4,2,2,4,3,2,22.0601,6.1,1
...,...,...,...,...,...,...,...,...
18011,2,2,1,5,1,19.0101,12.9,2
18012,2,2,1,5,1,10.0101,14.7,2
18013,2,2,4,5,1,10.0101,43.4,0
18014,2,2,4,3,1,10.0101,65.8,0


### Splitting the data

In [189]:
train_val_data = data_enc.sample(frac=0.85) # training + validation data
test_data = data_enc.drop(train_val_data.index)

train_val_target = train_val_data["Air Quality Category"] # seperate the target column
train_val_data.drop("Air Quality Category", axis=1, inplace=True)
test_target = test_data["Air Quality Category"] # seperate the target column
test_data.drop("Air Quality Category", axis=1, inplace=True)

train_val_target = pd.get_dummies(train_val_target).values # converting to one hot encoding
test_target = pd.get_dummies(test_target).values # same^

### Creating the structure of the Neural net then compiling it

In [190]:
model = Sequential()
model.add(Dense(units=16, activation='relu', input_dim=len(train_val_data.columns)))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=4, activation='relu'))
model.add(Dense(units=3, activation='softmax')) # output layer. 3 units for each of good, bad, moderate (one-hot endcoded)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [191]:
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

### Fitting the model

In [192]:
model.fit(train_val_data, train_val_target, validation_split=(15/85), batch_size=10, epochs=20)

Epoch 1/20
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 785us/step - accuracy: 0.7439 - loss: 0.5946 - val_accuracy: 0.8724 - val_loss: 0.3590
Epoch 2/20
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663us/step - accuracy: 0.8941 - loss: 0.2996 - val_accuracy: 0.8676 - val_loss: 0.3113
Epoch 3/20
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663us/step - accuracy: 0.9102 - loss: 0.2339 - val_accuracy: 0.8687 - val_loss: 0.2971
Epoch 4/20
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 673us/step - accuracy: 0.9171 - loss: 0.2022 - val_accuracy: 0.8065 - val_loss: 0.4789
Epoch 5/20
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 668us/step - accuracy: 0.9244 - loss: 0.1829 - val_accuracy: 0.8701 - val_loss: 0.3191
Epoch 6/20
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663us/step - accuracy: 0.9293 - loss: 0.1732 - val_accuracy: 0.9256 - val_loss: 0.1774
Epoc

<keras.src.callbacks.history.History at 0x24b23f033d0>

### Making predictions and calculating accuracy

In [193]:
pred_data = model.predict(test_data)
correct_pred_index = np.argmax(pred_data, axis=1)
correct_test_index = np.argmax(test_target, axis=1)
accuracy = np.mean(correct_pred_index == correct_test_index) * 100
accuracy

[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765us/step


94.00444115470023

## Performance:

OF: Overfit
| Hidden Layers  | Activation  | Epochs  | Accuracy    |
| -------------- | ----------- | ------- | ----------- |
| 2 (8,8)        | sigmoid     | 20      | 90.52%      |
| 2 (8,8)        | relu + sig  | 10      | 88.71%      |
| 2 (8,8)        | relu + sig  | 20      | 94.97%      |
| 2 (8,8)        | relu + sig  | 25      | 92.26% (OF) |
| 2 (8,8)        | relu        | 20      | 29.34% (OF?)|
| 2 (16,16)      | relu + sig  | 11      | 92.78%      |
| 2 (16,16)      | relu + sig  | 20      | 88.78% (OF) |
| 2 (16,8)       | relu + sig  | 9       | 94.70%      |
| 3 (16,8,4)     | relu + sig  | 9       | 92.97%      |
| **3 (16,8,4)**     | **relu + sig**  | **20**      | **96.15%**      |
| 3 (16,8,4)     | relu + soft | 20      | 94.00%      |
| 3 (16,8,4)     | relu + sig  | 25      | 93.56% (OF) |
| 3 (12,6,3)     | relu + sig  | 23      | 95.67%      |
| 3 (12,6,3)     | relu + sig  | 28      | 93.19% (OF) |