In [1]:
# ================================
# 📌 Week 1 Project - Environment Monitoring
# Dataset: city_day.csv (Air Quality Data)
# ================================

# Step 1: Import libraries
import pandas as pd
import numpy as np

# Step 2: Load dataset
file_path = "/content/city_day.csv"   # Adjust if path is different
df = pd.read_csv(file_path)

# Step 3: Initial Exploration
print("---- DATASET SHAPE ----")
print(df.shape)

print("\n---- FIRST 5 ROWS ----")
display(df.head())

print("\n---- INFO ----")
print(df.info())

print("\n---- DESCRIBE ----")
print(df.describe(include='all'))

print("\n---- MISSING VALUES ----")
print(df.isnull().sum())

# Step 4: Preprocessing

# ✅ Convert 'Date' column to datetime
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# ✅ Handle missing values
print("\nMissing values BEFORE cleaning:")
print(df.isnull().sum())

# Fill numeric columns with mean, categorical with mode
for col in df.columns:
    if df[col].dtype == 'O':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:  # numeric
        df[col].fillna(df[col].mean(), inplace=True)

print("\nMissing values AFTER cleaning:")
print(df.isnull().sum())

# ✅ Double check datatypes
print("\nUpdated Data Types:")
print(df.dtypes)

# ✅ Final dataset preview
print("\nFinal cleaned dataset preview:")
display(df.head())


---- DATASET SHAPE ----
(29531, 16)

---- FIRST 5 ROWS ----


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,



---- INFO ----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB
None

---- DESCRIBE ----
             City        Date         PM2.5          PM10  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,67.450578,118.127103,0.92,18.22,17.15,23.483476,0.92,27.64,133.36,0.0,0.02,0.0,166.463581,Moderate
1,Ahmedabad,2015-01-02,67.450578,118.127103,0.97,15.69,16.46,23.483476,0.97,24.55,34.06,3.68,5.5,3.77,166.463581,Moderate
2,Ahmedabad,2015-01-03,67.450578,118.127103,17.4,19.3,29.7,23.483476,17.4,29.07,30.7,6.8,16.4,2.25,166.463581,Moderate
3,Ahmedabad,2015-01-04,67.450578,118.127103,1.7,18.48,17.97,23.483476,1.7,18.59,36.08,4.43,10.14,1.0,166.463581,Moderate
4,Ahmedabad,2015-01-05,67.450578,118.127103,22.1,21.42,37.76,23.483476,22.1,39.33,39.31,7.01,18.89,2.78,166.463581,Moderate
