In [1]:
import pandas as pd

# Load the AQI dataset and print a plain-text overview of its structure.
data = pd.read_csv('AQI.csv')

# Each entry pairs a heading with the object printed right after it.
# Entries are printed in the order listed: shape, column index, dtypes,
# the first rows, describe() statistics, and per-column missing-value counts.
overview_sections = [
    ("Shape of the dataset:", data.shape),
    ("\nColumns in the dataset:", data.columns),
    ("\nData types of columns:\n", data.dtypes),
    ("\nFirst few rows of the dataset:\n", data.head()),
    ("\nSummary statistics of numerical columns:\n", data.describe()),
    ("\nMissing values:\n", data.isna().sum()),
]

for heading, content in overview_sections:
    print(heading, content)


Shape of the dataset: (16695, 14)

Columns in the dataset: Index(['Country', 'City', 'AQI Value', 'AQI Category', 'CO AQI Value',
       'CO AQI Category', 'Ozone AQI Value', 'Ozone AQI Category',
       'NO2 AQI Value', 'NO2 AQI Category', 'PM2.5 AQI Value',
       'PM2.5 AQI Category', 'lat', 'lng'],
      dtype='object')

Data types of columns:
 Country                object
City                   object
AQI Value               int64
AQI Category           object
CO AQI Value            int64
CO AQI Category        object
Ozone AQI Value         int64
Ozone AQI Category     object
NO2 AQI Value           int64
NO2 AQI Category       object
PM2.5 AQI Value         int64
PM2.5 AQI Category     object
lat                   float64
lng                   float64
dtype: object

First few rows of the dataset:
               Country              City  AQI Value AQI Category  CO AQI Value  \
0  Russian Federation        Praskoveya         51     Moderate             1   
1              Brazi

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the AQI dataset and drop every row that has a missing value in any column.
data_raw = pd.read_csv('AQI.csv')
data_clean = data_raw.dropna()

# Integer-encode the two text columns. Each column gets its own encoder:
# a LabelEncoder only remembers the classes from its most recent fit, so
# sharing one instance would discard the Country mapping once City is fitted
# and make the Country codes impossible to invert later.
country_encoder = LabelEncoder()
city_encoder = LabelEncoder()

# assign() returns a new frame, so data_clean itself is left untouched and
# this cell can be re-run without re-encoding already-encoded columns.
data = data_clean.assign(
    Country=country_encoder.fit_transform(data_clean['Country']),
    City=city_encoder.fit_transform(data_clean['City']),
)

# Backward-compatible alias: previously `encoder` ended up fitted on City.
encoder = city_encoder

# Model inputs are the four pollutant AQI values plus the coordinates.
# The encoded Country/City columns are not part of the feature set.
features = ['CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value', 'lat', 'lng']
X = data[features]
y = data['AQI Category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y -- confirm that is acceptable
# if some AQI categories are rare.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features, then fit a logistic-regression classifier.
# Keeping both steps in one Pipeline means the scaler is fitted on the
# training rows only and is re-applied automatically at predict time.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression()),
])

pipeline.fit(X_train, y_train)

# Score the held-out rows and report accuracy as a percentage.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy * 100)

Accuracy: 93.90057944495273


In [3]:
import joblib

# Persist the fitted pipeline object to disk with joblib, then report the
# path that was written.
filename = 'aqi_model.pkl'
joblib.dump(value=pipeline, filename=filename)
print("Pipeline saved as:", filename)



Pipeline saved as: aqi_model.pkl
