In [18]:
# Mount Google Drive inside the Colab runtime at /content/drive/ so that files
# stored on Drive can be accessed through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [19]:
# Switch the working directory to the Drive root so that the relative file
# names used in later cells are resolved against it.
import os
os.chdir('/content/drive/My Drive/')

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# Read the city-level daily CSV from the current working directory and print
# the resulting frame.
csv_path = 'city_day.csv'
df = pd.read_csv(csv_path)
print(df)

                City        Date  PM2.5   PM10     NO    NO2    NOx    NH3  \
0          Ahmedabad  2015-01-01    NaN    NaN   0.92  18.22  17.15    NaN   
1          Ahmedabad  2015-01-02    NaN    NaN   0.97  15.69  16.46    NaN   
2          Ahmedabad  2015-01-03    NaN    NaN  17.40  19.30  29.70    NaN   
3          Ahmedabad  2015-01-04    NaN    NaN   1.70  18.48  17.97    NaN   
4          Ahmedabad  2015-01-05    NaN    NaN  22.10  21.42  37.76    NaN   
...              ...         ...    ...    ...    ...    ...    ...    ...   
29526  Visakhapatnam  2020-06-27  15.02  50.94   7.68  25.06  19.54  12.47   
29527  Visakhapatnam  2020-06-28  24.38  74.09   3.42  26.06  16.53  11.99   
29528  Visakhapatnam  2020-06-29  22.91  65.73   3.45  29.53  18.33  10.71   
29529  Visakhapatnam  2020-06-30  16.64  49.97   4.05  29.26  18.80  10.03   
29530  Visakhapatnam  2020-07-01  15.00  66.00   0.40  26.85  14.05   5.20   

          CO    SO2      O3  Benzene  Toluene  Xylene   AQI    

In [22]:
# Print the column dtypes and per-column non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [23]:
# Tally the missing entries per column before any cleaning is applied.
missing_per_column = df.isna().sum()
print("Missing Values Count:")
print(missing_per_column)

Missing Values Count:
City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64


In [24]:
# Rows without a PM2.5 value (the prediction target) are removed from df,
# after which the per-column null counts are printed again.
target_col = 'PM2.5'
df.dropna(subset=[target_col], inplace=True)
print(df.isna().sum())

City              0
Date              0
PM2.5             0
PM10           7301
NO              351
NO2             377
NOx            1470
NH3            6644
CO              370
SO2             505
O3              752
Benzene        3151
Toluene        5555
Xylene        15273
AQI             761
AQI_Bucket      761
dtype: int64


In [25]:
# Build the list of numeric feature columns that will be median-imputed:
# every numeric-dtype column of df except the target 'PM2.5'.
numeric_frame = df.select_dtypes(include=np.number)
numerical_cols = list(numeric_frame.columns)
numerical_cols.remove('PM2.5')
print(numerical_cols)

['PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI']


In [26]:
# Fill the remaining gaps in each numeric feature column with that column's
# own median.
#
# The result is assigned back onto df rather than calling
# df[col].fillna(..., inplace=True): that chained form operates on an
# intermediate object, raises a FutureWarning on current pandas, and will no
# longer modify df from pandas 3.0 onward.
column_medians = df[numerical_cols].median()
# fillna with a Series matches its index labels to column names, so each
# column receives its own median.
df[numerical_cols] = df[numerical_cols].fillna(column_medians)
print(df.isnull().sum())

City            0
Date            0
PM2.5           0
PM10            0
NO              0
NO2             0
NOx             0
NH3             0
CO              0
SO2             0
O3              0
Benzene         0
Toluene         0
Xylene          0
AQI             0
AQI_Bucket    761
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


In [27]:
# Parse the 'Date' column into datetime values and store it back on df, then
# show the updated dtypes.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

print("\nData types after converting 'Date' column:")
df.info()


Data types after converting 'Date' column:
<class 'pandas.core.frame.DataFrame'>
Index: 24933 entries, 27 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   City        24933 non-null  object        
 1   Date        24933 non-null  datetime64[ns]
 2   PM2.5       24933 non-null  float64       
 3   PM10        24933 non-null  float64       
 4   NO          24933 non-null  float64       
 5   NO2         24933 non-null  float64       
 6   NOx         24933 non-null  float64       
 7   NH3         24933 non-null  float64       
 8   CO          24933 non-null  float64       
 9   SO2         24933 non-null  float64       
 10  O3          24933 non-null  float64       
 11  Benzene     24933 non-null  float64       
 12  Toluene     24933 non-null  float64       
 13  Xylene      24933 non-null  float64       
 14  AQI         24933 non-null  float64       
 15  AQI_Bucket  24172 non-null  ob

In [28]:
# Derive calendar features (year, month, day of month) from the 'Date'
# column via its .dt accessor.
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day

In [29]:
# Display the 'City' column before it is one-hot encoded.
df['City']

Unnamed: 0,City
27,Ahmedabad
28,Ahmedabad
29,Ahmedabad
30,Ahmedabad
31,Ahmedabad
...,...
29526,Visakhapatnam
29527,Visakhapatnam
29528,Visakhapatnam
29529,Visakhapatnam


In [30]:
# Replace the categorical 'City' column with indicator columns. drop_first=True
# omits the indicator for the first category.
categorical_columns = ['City']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [31]:

# Remove the columns that are not needed or would leak information about the
# target, then preview the first rows of the engineered frame.
columns_to_drop = ['Date', 'AQI', 'AQI_Bucket']
df.drop(columns=columns_to_drop, inplace=True)

print("\nDataFrame after Feature Engineering:")
print(df.head())


DataFrame after Feature Engineering:
     PM2.5    PM10     NO    NO2    NOx    NH3     CO    SO2      O3  Benzene  \
27   73.24  95.595   5.72  21.11  25.84  16.53   5.72  36.52   62.42     0.03   
28   83.13  95.595   6.93  28.71  33.72  16.53   6.93  49.52   59.76     0.02   
29   79.84  95.595  13.85  28.68  41.08  16.53  13.85  48.49   97.07     0.04   
30   94.52  95.595  24.39  32.66  52.61  16.53  24.39  67.39  111.33     0.24   
31  135.99  95.595  43.48  42.08  84.57  16.53  43.48  75.23  102.70     0.40   

    ...  City_Jorapokhar  City_Kochi  City_Kolkata  City_Lucknow  City_Mumbai  \
27  ...            False       False         False         False        False   
28  ...            False       False         False         False        False   
29  ...            False       False         False         False        False   
30  ...            False       False         False         False        False   
31  ...            False       False         False         False      

In [32]:
# Separate the frame into the feature matrix X (all columns except the target)
# and the target vector y (the 'PM2.5' column).
target_name = 'PM2.5'
y = df[target_name]
X = df.drop(columns=[target_name])

In [33]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then transform that split.
# Note: the scaler is applied to every column of X_train.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [34]:
# Transform the test data using the SAME scaler that was fitted on the
# training split (transform only, no re-fitting here).
X_test_scaled = scaler.transform(X_test)

# Report the (rows, columns) shape of both scaled arrays.
print(f"\nTraining data shape: {X_train_scaled.shape}")
print(f"Testing data shape: {X_test_scaled.shape}")


Training data shape: (19946, 39)
Testing data shape: (4987, 39)


In [35]:
# Print the scaled training array as a quick visual check of the result.
print(X_train_scaled)

[[-0.72498815 -0.68265466 -0.73996923 ... -0.17776592 -0.2129156
  -0.22731347]
 [-0.20480822  2.65488744  3.67108744 ... -0.17776592 -0.2129156
  -0.22731347]
 [ 0.69935018  1.00378218  1.58090758 ... -0.17776592 -0.2129156
  -0.22731347]
 ...
 [-0.20480822  2.61007665  2.32249194 ... -0.17776592 -0.2129156
  -0.22731347]
 [-1.00478539 -0.24531859 -0.35179297 ... -0.17776592 -0.2129156
  -0.22731347]
 [-1.05324485 -0.4986719  -0.75224035 ... -0.17776592  4.69669666
  -0.22731347]]
