In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# Load the dataset
AQIData2 = pd.read_csv("./AQI_with_Category.csv")  # Uncomment this to load data from file

print(AQIData2.head())  # Check the first few rows to verify
print(AQIData2.columns)  # Inspect column names

# Assuming AQIData2 is already loaded in the environment
# Encode the AQI categories into numeric values
label_encoder = LabelEncoder()
AQIData2['Category_encoded'] = label_encoder.fit_transform(AQIData2['Category'])
print(AQIData2)

   Ozone,μg/m3  Sulphur Dioxide,μg/m3  PM2.5,μg/m3  PM10,μg/m3  Ammonia,μg/m3  \
0    26.791304              82.430870   342.301739  405.037826      26.720435   
1    20.753333              65.158333   180.754167  226.001250      21.993750   
2    43.925000              41.485000   115.901667  143.952500      20.280000   
3    50.372500              41.515417   197.889167  234.861250      23.145000   
4    34.696250              77.785417   329.381667  404.049583      47.233750   

   Carbon Monoxide,mg/m3   NOx,μg/m3  AQI_calculated   Category  
0                   2.97  108.429130           471.0     Severe  
1                   2.31   89.862500           347.0  Very Poor  
2                   1.59   61.155833           286.0       Poor  
3                   2.46   81.926250           360.0  Very Poor  
4                   4.52  159.669167           461.0     Severe  
Index(['Ozone,μg/m3', 'Sulphur Dioxide,μg/m3', 'PM2.5,μg/m3', 'PM10,μg/m3',
       'Ammonia,μg/m3', 'Carbon Monoxide,

In [6]:
# Assuming AQIData2 is already loaded in the environment
# Encode the AQI categories into numeric values
label_encoder = LabelEncoder()
AQIData2['Category_encoded'] = label_encoder.fit_transform(AQIData2['Category'])

# Features and target
X = AQIData2[['Ozone,μg/m3', 'Sulphur Dioxide,μg/m3', 'PM2.5,μg/m3', 'PM10,μg/m3',
       'Ammonia,μg/m3', 'Carbon Monoxide,mg/m3', 'NOx,μg/m3']]
y = AQIData2['Category_encoded']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features for models sensitive to scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize classifiers
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(kernel='rbf'),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

# Train and evaluate all classifiers
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    if model_name in ["Logistic Regression", "SVM", "K-Nearest Neighbors"]:  # Use scaled data for these models
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:  # Use unscaled data for tree-based models
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    # print("Classification Report:")
    # print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    # print("Confusion Matrix:")
    # print(confusion_matrix(y_test, y_pred))


Training Logistic Regression...
Accuracy: 0.8418

Training Decision Tree...
Accuracy: 0.9557

Training Random Forest...
Accuracy: 0.9873

Training K-Nearest Neighbors...
Accuracy: 0.7785

Training SVM...
Accuracy: 0.8418

Training Gradient Boosting...
Accuracy: 0.9810

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9810
