In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at
# the local copy of city_day.csv before running on another machine.
DATA_PATH = 'C:/Users/asus/Downloads/Air-Quality-Index--AQI--main/Air-Quality-Index--AQI--main/city_day.csv'
data = pd.read_csv(DATA_PATH)

# Display basic information about the dataset to understand the structure.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() emits a stray "None" line after the summary.
data.info()
print(data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   City          29531 non-null  object 
 1   Date          29531 non-null  object 
 2   PM2.5         24933 non-null  float64
 3   PM10          18391 non-null  float64
 4   NO            25949 non-null  float64
 5   NO2           25946 non-null  float64
 6   NOx           25346 non-null  float64
 7   NH3           19203 non-null  float64
 8   CO            27472 non-null  float64
 9   SO2           25677 non-null  float64
 10  O3            25509 non-null  float64
 11  Benzene       23908 non-null  float64
 12  Toluene       21490 non-null  float64
 13  Xylene        11422 non-null  float64
 14  AQI           24850 non-null  float64
 15  AQI_Bucket    24850 non-null  object 
 16  Demographics  29531 non-null  object 
dtypes: float64(13), object(4)
memory usage: 3.8+ MB
None
        City    

In [4]:
# Keep only the rows that carry a demographic label (the prediction target)
has_demographics = data['Demographics'].notna()
data = data.loc[has_demographics]

In [5]:
# Model inputs are the twelve pollutant columns; the label to predict is the
# 'Demographics' column.
features = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene',
]
X = data.loc[:, features]
y = data.loc[:, 'Demographics']


In [6]:
# Preprocess the features without leaking test-set information.
# The imputer and the scaler are fitted on the training rows only and then
# applied to the test rows. Fitting them on the full dataset before splitting
# lets test-set statistics influence training and can make the reported test
# metrics look better than they are.

# Encode the categorical target variable (Demographics) as integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the raw (un-imputed, un-scaled) features into training and testing sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Impute missing feature values with the per-column mean of the training rows
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Standardize the features using statistics learned from the training rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Full feature matrix under the same train-fitted transforms; kept only so the
# X_scaled name remains defined for any later cell that refers to it.
X_scaled = scaler.transform(imputer.transform(X))


In [10]:
# Build the classifier: three ReLU hidden layers (64 -> 32 -> 16 units)
# followed by a softmax layer with one unit per encoded class.
n_features = X_train.shape[1]
n_classes = len(np.unique(y_encoded))

model = Sequential([
    Dense(64, input_shape=(n_features,), activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(n_classes, activation='softmax'),
])


In [11]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (the targets are integer class ids produced by LabelEncoder), and accuracy
# as the tracked metric.
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [12]:
# Fit the network for 50 epochs in mini-batches of 16, holding out 20% of the
# training data for validation; the returned History object is kept in `history`.
fit_options = dict(epochs=50, batch_size=16, validation_split=0.2, verbose=1)
history = model.fit(X_train, y_train, **fit_options)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [13]:
# Run the trained network on the held-out test features
y_pred = model.predict(X_test)
# Each row holds one score per class; take the index of the largest one
y_pred_classes = y_pred.argmax(axis=1)

# Report per-class precision/recall/F1 and the overall accuracy
report = classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_)
test_accuracy = accuracy_score(y_test, y_pred_classes)
print("Classification Report:\n", report)
print(f"Accuracy: {test_accuracy}")

Classification Report:
               precision    recall  f1-score   support

  industrial       0.53      0.59      0.56      1893
 residential       0.80      0.75      0.77      4014

    accuracy                           0.70      5907
   macro avg       0.66      0.67      0.67      5907
weighted avg       0.71      0.70      0.70      5907

Accuracy: 0.7003555104113763


# RESIDENTIAL

In [14]:
# Define the sample data point based on the provided details for Amaravati
sample_data = [
    54.73,  # PM2.5
    94.12,  # PM10
    3.49,   # NO
    12.79,  # NO2
    9.73,   # NOx
    22.79,  # NH3
    0.58,   # CO
    8.21,   # SO2
    30.21,  # O3
    0.08,   # Benzene
    2.23,   # Toluene
    0.15    # Xylene
]

# Wrap the sample in a one-row DataFrame whose columns match the training features
sample_df = pd.DataFrame([sample_data], columns=features)

# Apply the fitted imputer (it would fill missing values; this sample has none)
sample_imputed = imputer.transform(sample_df)

# Scale with the fitted scaler. The imputer output is passed on as a plain
# array because the scaler was fitted on an array: handing it a DataFrame makes
# scikit-learn warn that X has feature names the scaler was fitted without.
sample_scaled = scaler.transform(sample_imputed)

# Predict the per-class scores and map the most likely class back to its label
predicted_class = model.predict(sample_scaled)
predicted_class_label = label_encoder.inverse_transform(np.argmax(predicted_class, axis=1))

print(f'Predicted Demographic: {predicted_class_label[0]}')


Predicted Demographic: industrial




# INDUSTRIAL

In [15]:
# Define the sample data point based on the provided details for Amaravati on 20-02-2018
sample_data = [
    50.91,  # PM2.5
    99.84,  # PM10
    4.55,   # NO
    16.33,  # NO2
    12.39,  # NOx
    23.18,  # NH3
    0.64,   # CO
    10.34,  # SO2
    26.24,  # O3
    0.1,    # Benzene
    2.51,   # Toluene
    0.1     # Xylene
]

# Wrap the sample in a one-row DataFrame whose columns match the training features
sample_df = pd.DataFrame([sample_data], columns=features)

# Apply the fitted imputer (it would fill missing values; this sample has none)
sample_imputed = imputer.transform(sample_df)

# Scale with the fitted scaler. The imputer output is passed on as a plain
# array because the scaler was fitted on an array: handing it a DataFrame makes
# scikit-learn warn that X has feature names the scaler was fitted without.
sample_scaled = scaler.transform(sample_imputed)

# Predict the per-class scores and map the most likely class back to its label
predicted_class = model.predict(sample_scaled)
predicted_class_label = label_encoder.inverse_transform(np.argmax(predicted_class, axis=1))

print(f'Predicted Demographic: {predicted_class_label[0]}')


Predicted Demographic: industrial




# INDUSTRIAL

In [16]:
# Define the sample data point based on the provided details for Amaravati on 21-02-2018
sample_data = [
    38.5,   # PM2.5
    106.7,  # PM10
    4.5,    # NO
    16.82,  # NO2
    12.69,  # NOx
    19.54,  # NH3
    0.58,   # CO
    11.02,  # SO2
    26.62,  # O3
    0.1,    # Benzene
    2.68,   # Toluene
    0.11    # Xylene
]

# Wrap the sample in a one-row DataFrame whose columns match the training features
sample_df = pd.DataFrame([sample_data], columns=features)

# Apply the fitted imputer (it would fill missing values; this sample has none)
sample_imputed = imputer.transform(sample_df)

# Scale with the fitted scaler. The imputer output is passed on as a plain
# array because the scaler was fitted on an array: handing it a DataFrame makes
# scikit-learn warn that X has feature names the scaler was fitted without.
sample_scaled = scaler.transform(sample_imputed)

# Predict the per-class scores and map the most likely class back to its label
predicted_class = model.predict(sample_scaled)
predicted_class_label = label_encoder.inverse_transform(np.argmax(predicted_class, axis=1))

print(f'Predicted Demographic: {predicted_class_label[0]}')


Predicted Demographic: residential




# RESIDENTIAL

In [17]:
# Define the sample data point based on the provided details for Amaravati on 22-02-2018
sample_data = [
    32.21,  # PM2.5
    107.43, # PM10
    7.39,   # NO
    17.33,  # NO2
    15.32,  # NOx
    18.13,  # NH3
    0.6,    # CO
    11.81,  # SO2
    24.84,  # O3
    0.12,   # Benzene
    3.28,   # Toluene
    0.13    # Xylene
]

# Wrap the sample in a one-row DataFrame whose columns match the training features
sample_df = pd.DataFrame([sample_data], columns=features)

# Apply the fitted imputer (it would fill missing values; this sample has none)
sample_imputed = imputer.transform(sample_df)

# Scale with the fitted scaler. The imputer output is passed on as a plain
# array because the scaler was fitted on an array: handing it a DataFrame makes
# scikit-learn warn that X has feature names the scaler was fitted without.
sample_scaled = scaler.transform(sample_imputed)

# Predict the per-class scores and map the most likely class back to its label
predicted_class = model.predict(sample_scaled)
predicted_class_label = label_encoder.inverse_transform(np.argmax(predicted_class, axis=1))

print(f'Predicted Demographic: {predicted_class_label[0]}')




Predicted Demographic: residential


# RESIDENTIAL

In [18]:
# Define the sample data point based on the provided details for Amaravati on 23-02-2018
sample_data = [
    35.36,  # PM2.5
    107.25, # PM10
    6.13,   # NO
    17.75,  # NO2
    14.52,  # NOx
    16.57,  # NH3
    0.85,   # CO
    10.87,  # SO2
    23.99,  # O3
    0.09,   # Benzene
    2.92,   # Toluene
    0.2     # Xylene
]

# Wrap the sample in a one-row DataFrame whose columns match the training features
sample_df = pd.DataFrame([sample_data], columns=features)

# Apply the fitted imputer (it would fill missing values; this sample has none)
sample_imputed = imputer.transform(sample_df)

# Scale with the fitted scaler. The imputer output is passed on as a plain
# array because the scaler was fitted on an array: handing it a DataFrame makes
# scikit-learn warn that X has feature names the scaler was fitted without.
sample_scaled = scaler.transform(sample_imputed)

# Predict the per-class scores and map the most likely class back to its label
predicted_class = model.predict(sample_scaled)
predicted_class_label = label_encoder.inverse_transform(np.argmax(predicted_class, axis=1))

print(f'Predicted Demographic: {predicted_class_label[0]}')


Predicted Demographic: residential


