In [1]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned dataset from the project's datasets directory.
# A forward slash is used as the separator: it is accepted on Windows, Linux and
# macOS alike, whereas a backslash inside a normal string literal starts an
# escape sequence and makes the path Windows-only.
df = pd.read_csv("datasets/clean_data.csv")

In [3]:
# Separate the regression target from the predictor columns.
# 'AQI' is the value to predict; 'City' and 'Date' are excluded from the
# feature matrix together with the target itself.
y = df['AQI']  # Target variable
X = df.drop(columns=['City', 'Date', 'AQI'])  # Features

In [4]:
# Tunable settings for this cell.
SEED = 42    # shared seed so the split and the scoring are both reproducible
K_BEST = 9   # number of features to keep

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Score each feature by its mutual information with the target and keep the
# K_BEST highest-scoring ones.
# mutual_info_regression adds random noise to continuous features, so its
# random_state is pinned here; without it the scores (and potentially the
# selected set) would differ on every run.
# The selector is fitted on the training split only and then applied unchanged
# to the test split, so no information from the test rows influences the choice.
selector = SelectKBest(partial(mutual_info_regression, random_state=SEED), k=K_BEST)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Column positions of the features the selector kept
selected_feature_indices = selector.get_support(indices=True)

# Translate those positions into column names of the frame the selector was fitted on
selected_feature_names = X_train.columns[selected_feature_indices]

# Print each selected feature with its mutual information score.
# Features are listed in their original column order, not ranked by score.
print(f"Top {K_BEST} Selected Features:")
for feature, score in zip(selected_feature_names, selector.scores_[selected_feature_indices]):
    print(f"{feature}: {score}")


Top 9 Selected Features:
PM2.5: 0.9434145273613321
PM10: 0.6495358145594574
NO: 0.3437910936701627
NO2: 0.3456981003469899
NOx: 0.25179444952080576
NH3: 0.21810070465443765
CO: 0.43237888141382186
SO2: 0.28583436033104714
O3: 0.2569481392119535
