<a href="https://colab.research.google.com/github/sakib21103090/AI-Universe/blob/main/AQI_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Processing**

In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv('/content/Narsingdi data/NARSINGDI 21-23 Data.csv')

# Display the first few rows of the dataset
print("Dataset Preview:")
print(df.head())

# Check for missing values
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Separate numeric and non-numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns

# Impute missing values in numeric columns using the mean
imputer = SimpleImputer(strategy='mean')
df_numeric_imputed = pd.DataFrame(imputer.fit_transform(df[numeric_columns]), columns=numeric_columns)

# For non-numeric columns, we can use a different strategy (e.g., 'most_frequent')
imputer_non_numeric = SimpleImputer(strategy='most_frequent')
df_non_numeric_imputed = pd.DataFrame(imputer_non_numeric.fit_transform(df[non_numeric_columns]), columns=non_numeric_columns)

# Combine numeric and non-numeric columns back together
df_imputed = pd.concat([df_numeric_imputed, df_non_numeric_imputed], axis=1)

# Confirm that missing values are handled
print("\nAfter Imputation, Missing Values per Column:")
print(df_imputed.isnull().sum())

# Save the cleaned dataset to a new CSV file
df_imputed.to_csv('/content/Narsingdi data/Narsingdi 21-23Data_cleaned.csv', index=False)

print("\nCleaned dataset saved as 'Narsingdi 21-23Data_cleaned.csv'.")


Dataset Preview:
          Date     SO2     NO     NO2    NOX    CO     O3   PM10   PM25
0  1/1/2021 1:00   0.33   3.65  13.55  17.20  1.64  42.49    NaN   52.5
1  1/1/2021 2:00   0.34   5.22  12.73  17.95  1.29  32.12    NaN   58.4
2  1/1/2021 3:00   0.33   2.11   8.85  10.96  1.11  41.77    NaN   52.1
3  1/1/2021 4:00   0.38   6.69  10.26  16.95  1.11  36.44    NaN   63.0
4  1/1/2021 5:00   0.40  12.00  12.94  24.94  0.86  31.99    NaN   83.8

Missing Values per Column:
 Date         0
 SO2       2993
 NO        9089
 NO2       6502
 NOX       6453
 CO        4189
 O3        3129
 PM10      4174
 PM25      3198
dtype: int64

After Imputation, Missing Values per Column:
 SO2       0
 NO        0
 NO2       0
 NOX       0
 CO        0
 O3        0
 PM10      0
 PM25      0
 Date      0
dtype: int64

Cleaned dataset saved as 'Narsingdi 21-23Data_cleaned.csv'.


# **AQI Calculation**

In [10]:
import pandas as pd

# Define the function to calculate AQI for a given pollutant
def calculate_aqi(C, C_low, C_high, I_low, I_high):
    if C_low <= C <= C_high:
        aqi = ((I_high - I_low) / (C_high - C_low)) * (C - C_low) + I_low
        return round(aqi, 2)
    else:
        return None  # If C is outside the defined range

# Breakpoints for AQI calculation for different pollutants
aqi_breakpoints = {

    "SO2": [(0, 30, 0, 50), (31, 60, 51, 100),
            (61, 90, 101, 200), (91, 120, 201, 300),
            (121, 250, 301, 400), (251, 350, 401, 500)],

    "NO ": [(0, 50, 0, 50), (51, 100, 51, 100),
             (101, 250, 101, 200), (251, 350, 201, 300),
             (351, 430, 301, 400), (431, 500, 401, 500)],
    "NO2": [(0, 40, 0, 50), (41, 80, 51, 100),
             (81, 380, 101, 200), (381, 800, 201, 300),
            (801, 1600, 301, 400), (1601, 2100, 401, 500)],

    "NOX": [(0, 1, 0, 50), (2, 10, 51, 100),
            (11, 17, 101, 200), (18, 34, 201, 300),
           (35, 50, 301, 400), (51, 60, 401, 500)],

    "CO": [(0, 50, 0, 50), (51, 100, 51, 100),
           (101, 168, 101, 200), (169, 208, 201, 300),
           (209, 748, 301, 400), (749, 1000, 401, 500)],

    "O3": [(0, 50, 0, 50), (51, 100, 51, 100),
           (101, 168, 101, 200), (169, 208, 201, 300),
           (209, 748, 301, 400), (749, 1000, 401, 500)],

    "PM10": [(0, 40, 0, 50), (41, 80, 51, 100),
           (81, 180, 101, 200), (181, 280, 201, 300),
           (281, 400, 301, 400), (401, 500, 401, 500)],
    "PM25": [(0, 40, 0, 50), (41, 80, 51, 100),
           (81, 180, 101, 200), (181, 280, 201, 300),
           (281, 400, 301, 400), (401, 500, 401, 500)]
}




In [11]:

# Load the dataset from a CSV file and clean the column names
def load_data(file_path):
    data = pd.read_csv(file_path)
    # Strip any leading or trailing spaces from column names
    data.columns = data.columns.str.strip()
    print("Column names in dataset:", data.columns)  # Print column names for debugging
    return data

# Function to find the AQI breakpoints based on pollutant concentration
def find_breakpoints(pollutant, C):
    if pollutant in aqi_breakpoints:
        for bp in aqi_breakpoints[pollutant]:
            C_low, C_high, I_low, I_high = bp
            if C_low <= C <= C_high:
                return C_low, C_high, I_low, I_high
    return None, None, None, None

In [12]:
# Calculate AQI for each pollutant in the dataset
def calculate_aqi_for_dataset(data):
    pollutants = [ 'SO2','NO','NO2','NOX','CO','O3','PM25', 'PM10']  # Only pollutants with AQI breakpoints

    for pollutant in pollutants:
        aqi_column = pollutant + '_AQI'
        data[aqi_column] = None  # Create a new column for AQI for each pollutant

        for index, row in data.iterrows():
            if pollutant in row and not pd.isna(row[pollutant]):
                concentration = row[pollutant]
                C_low, C_high, I_low, I_high = find_breakpoints(pollutant, concentration)

                if C_low is not None:
                    aqi = calculate_aqi(concentration, C_low, C_high, I_low, I_high)
                    data.at[index, aqi_column] = aqi

    return data

In [13]:
# Function to calculate the highest AQI for each row
def calculate_highest_aqi(data):
    pollutants = [ 'SO2_AQI','NO_AQI','NO2_AQI','NOX_AQI','CO_AQI','O3_AQI','PM25_AQI', 'PM10_AQI']

    # Create a new column 'AQI' that holds the highest AQI value among the pollutants
    data['AQI'] = data[pollutants].max(axis=1)

    return data

In [14]:
# Main function to load the dataset, calculate the AQI, and save results
def main():
    # Replace with your actual file path
    file_path = '/content/Narsingdi data/Narsingdi 21-23Data_cleaned.csv'
    data = load_data(file_path)

    # Calculate AQI for each pollutant
    aqi_data = calculate_aqi_for_dataset(data)

    # Calculate the highest AQI and add to the 'AQI' column
    aqi_data = calculate_highest_aqi(aqi_data)

    # Reorder columns: pollutants on the left, AQI values on the right
    pollutants = ['SO2', 'NO', 'NO2', 'NOX', 'CO', 'O3', 'PM25', 'PM10']  # Corrected the typo by adding a comma
    aqi_columns = [pollutant + '_AQI' for pollutant in pollutants]
    columns_to_save = ['Date'] + pollutants + aqi_columns + ['AQI']  # 'Date', pollutants, AQI columns, and 'AQI'

    # Filter and reorder columns
    filtered_data = aqi_data[columns_to_save]

    # Save the updated dataset to a new CSV file
    filtered_data.to_csv('/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI.csv', index=False)
    print('CSV file saved successfully with pollutants, AQI values, and the highest AQI.')

if __name__ == "__main__":
    main()


Column names in dataset: Index(['SO2', 'NO', 'NO2', 'NOX', 'CO', 'O3', 'PM10', 'PM25', 'Date'], dtype='object')
CSV file saved successfully with pollutants, AQI values, and the highest AQI.


# **AQI Buckets**

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [16]:
df=pd.read_csv('/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI.csv')

In [17]:
# Function to assign AQI buckets based on AQI values
def assign_aqi_bucket(aqi):
    if aqi <= 50:
        return 'Good'
    elif aqi <= 100:
        return 'Moderate'
    elif aqi <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif aqi <= 200:
        return 'Unhealthy'
    elif aqi <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

# Apply the function to create a new column 'AQI_Bucket'
df['AQI_Bucket'] = df['AQI'].apply(assign_aqi_bucket)

# Step 2: Save the updated dataset to a new CSV file
df.to_csv('/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI_Bucket.csv', index=False)
print('CSV file saved successfully.')

# Display the first few rows to verify
print(df.head())


CSV file saved successfully.
            Date   SO2     NO    NO2    NOX    CO     O3  PM25        PM10  \
0  1/1/2021 1:00  0.33   3.65  13.55  17.20  1.64  42.49  52.5  110.676109   
1  1/1/2021 2:00  0.34   5.22  12.73  17.95  1.29  32.12  58.4  110.676109   
2  1/1/2021 3:00  0.33   2.11   8.85  10.96  1.11  41.77  52.1  110.676109   
3  1/1/2021 4:00  0.38   6.69  10.26  16.95  1.11  36.44  63.0  110.676109   
4  1/1/2021 5:00  0.40  12.00  12.94  24.94  0.86  31.99  83.8  110.676109   

   SO2_AQI  NO_AQI  NO2_AQI  NOX_AQI  CO_AQI  O3_AQI  PM25_AQI  PM10_AQI  \
0     0.55     NaN    16.94      NaN    1.64   42.49     65.45    130.68   
1     0.57     NaN    15.91      NaN    1.29   32.12     72.86    130.68   
2     0.55     NaN    11.06      NaN    1.11   41.77     64.95    130.68   
3     0.63     NaN    12.82   199.17    1.11   36.44     78.64    130.68   
4     0.67     NaN    16.18   243.94    0.86   31.99    103.80    130.68   

      AQI                      AQI_Bucket  
0

# **Logistic Regression Algorithm**

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [19]:
# Step 1: Load your dataset
file_path = '/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI_Bucket.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

In [20]:
# Step 2: Data preprocessing and feature selection
# Selecting relevant columns
X = df[['SO2', 'NO', 'NO2', 'NOX', 'CO', 'O3', 'PM25', 'PM10']]
y = df['AQI_Bucket']  # Assuming AQI_Bucket is categorical

In [21]:
# Step 3: Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:

# Step 4: Feature scaling (if necessary)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
# Step 5: Training the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Step 6: Predicting on the test set
y_pred = model.predict(X_test)

In [25]:
# Step 7: Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)  # Added classification_report

In [26]:
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print("Classification Report:\n", class_report)

# Step 8: Print feature importance
features = X.columns  # Assuming X is a DataFrame with named columns
feature_importance = pd.DataFrame({'feature': features, 'importance': abs(model.coef_[0])})
print("\nFeature Importance:")
print(feature_importance.sort_values('importance', ascending=False))


Accuracy: 0.6774193548387096
Confusion Matrix:
[[  96    0   11    0   28    0]
 [   0  160    0    4   14  196]
 [  13    3  728    0  158    6]
 [   0    8    9   14  499   32]
 [   0    6  127    8 1643    4]
 [   1   27    0   38  248  383]]
Classification Report:
                                 precision    recall  f1-score   support

                          Good       0.87      0.71      0.78       135
                     Hazardous       0.78      0.43      0.55       374
                      Moderate       0.83      0.80      0.82       908
                     Unhealthy       0.22      0.02      0.04       562
Unhealthy for Sensitive Groups       0.63      0.92      0.75      1788
                Very Unhealthy       0.62      0.55      0.58       697

                      accuracy                           0.68      4464
                     macro avg       0.66      0.57      0.59      4464
                  weighted avg       0.64      0.68      0.63      4464


Featur

#**Decision Tree Classifier**


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Load dataset
df = pd.read_csv('/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI_Bucket.csv')

# Step 2: Preprocessing
X = df[['SO2', 'NO', 'NO2', 'NOX', 'CO', 'O3', 'PM25', 'PM10']]
y = df['AQI_Bucket']

# Step 3: Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Decision Tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 5: Predictions
y_pred = model.predict(X_test)

# Step 6: Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9939516129032258
Confusion Matrix:
 [[ 130    0    5    0    0    0]
 [   0  371    0    1    2    0]
 [   3    1  903    0    1    0]
 [   0    0    1  558    3    0]
 [   0    0    0    1 1787    0]
 [   0    3    2    3    1  688]]
Classification Report:
                                 precision    recall  f1-score   support

                          Good       0.98      0.96      0.97       135
                     Hazardous       0.99      0.99      0.99       374
                      Moderate       0.99      0.99      0.99       908
                     Unhealthy       0.99      0.99      0.99       562
Unhealthy for Sensitive Groups       1.00      1.00      1.00      1788
                Very Unhealthy       1.00      0.99      0.99       697

                      accuracy                           0.99      4464
                     macro avg       0.99      0.99      0.99      4464
                  weighted avg       0.99      0.99      0.99      4464



# **Random Forest Classification**


In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
file_path = '/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI_Bucket.csv'
df = pd.read_csv(file_path)

# Prepare the features and target
X = df[['SO2', 'NO', 'NO2', 'NOX', 'CO', 'O3', 'PM25', 'PM10']]
y = df['AQI_Bucket']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Make predictions
rf_pred = rf_model.predict(X_test_scaled)

# Evaluate the model
print("Random Forest Results:")
print(f'Accuracy: {accuracy_score(y_test, rf_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, rf_pred)}')
print("Classification Report:\n", classification_report(y_test, rf_pred))

# Print feature importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values('importance', ascending=False))

Random Forest Results:
Accuracy: 0.9899193548387096
Confusion Matrix:
[[ 128    0    6    0    1    0]
 [   0  371    0    1    1    1]
 [   0    5  893    2    1    7]
 [   0    5    0  554    2    1]
 [   0    3    2    1 1779    3]
 [   0    3    0    0    0  694]]
Classification Report:
                                 precision    recall  f1-score   support

                          Good       1.00      0.95      0.97       135
                     Hazardous       0.96      0.99      0.98       374
                      Moderate       0.99      0.98      0.99       908
                     Unhealthy       0.99      0.99      0.99       562
Unhealthy for Sensitive Groups       1.00      0.99      1.00      1788
                Very Unhealthy       0.98      1.00      0.99       697

                      accuracy                           0.99      4464
                     macro avg       0.99      0.98      0.99      4464
                  weighted avg       0.99      0.99      

# **Support Vector Machine Classification**

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
file_path = '/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI_Bucket.csv'
df = pd.read_csv(file_path)

# Prepare the features and target
X = df[['SO2', 'NO', 'NO2', 'NOX', 'CO', 'O3', 'PM25', 'PM10']]
y = df['AQI_Bucket']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
svm_model = SVC(random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Make predictions
svm_pred = svm_model.predict(X_test_scaled)

# Evaluate the model
print("Support Vector Machine Results:")
print(f'Accuracy: {accuracy_score(y_test, svm_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, svm_pred)}')
print("Classification Report:\n", classification_report(y_test, svm_pred))

# Note: SVM doesn't have built-in feature importance
print("\nNote: SVM doesn't provide built-in feature importance.")

Support Vector Machine Results:
Accuracy: 0.8828405017921147
Confusion Matrix:
[[  97    1   11   25    1    0]
 [   0  344    0    0    1   29]
 [  10    7  805   25   58    3]
 [   0    7    0  407  108   40]
 [   0    3   48   43 1691    3]
 [   0   29    0   66    5  597]]
Classification Report:
                                 precision    recall  f1-score   support

                          Good       0.91      0.72      0.80       135
                     Hazardous       0.88      0.92      0.90       374
                      Moderate       0.93      0.89      0.91       908
                     Unhealthy       0.72      0.72      0.72       562
Unhealthy for Sensitive Groups       0.91      0.95      0.93      1788
                Very Unhealthy       0.89      0.86      0.87       697

                      accuracy                           0.88      4464
                     macro avg       0.87      0.84      0.85      4464
                  weighted avg       0.88      0

# **K-Nearest Neighbors Classification**

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
file_path = '/content/Narsingdi data/Narsingdi 21-23Data_cleaned_With_AQI_Bucket.csv'
df = pd.read_csv(file_path)

# Prepare the features and target
X = df[['SO2', 'NO', 'NO2', 'NOX', 'CO', 'O3', 'PM25', 'PM10']]
y = df['AQI_Bucket']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_scaled, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test_scaled)

# Evaluate the model
print("K-Nearest Neighbors Results:")
print(f'Accuracy: {accuracy_score(y_test, knn_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, knn_pred)}')
print("Classification Report:\n", classification_report(y_test, knn_pred))

# Note: KNN doesn't have built-in feature importance
print("\nNote: KNN doesn't provide built-in feature importance.")

K-Nearest Neighbors Results:
Accuracy: 0.8902329749103942
Confusion Matrix:
[[ 116    0   12    6    0    1]
 [   0  334    0    0    3   37]
 [  11    3  839   10   40    5]
 [   7    9    7  427   83   29]
 [   0    3   37   50 1693    5]
 [  10   47    8   59    8  565]]
Classification Report:
                                 precision    recall  f1-score   support

                          Good       0.81      0.86      0.83       135
                     Hazardous       0.84      0.89      0.87       374
                      Moderate       0.93      0.92      0.93       908
                     Unhealthy       0.77      0.76      0.77       562
Unhealthy for Sensitive Groups       0.93      0.95      0.94      1788
                Very Unhealthy       0.88      0.81      0.84       697

                      accuracy                           0.89      4464
                     macro avg       0.86      0.87      0.86      4464
                  weighted avg       0.89      0.89