In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
data_path = '../data/processed/air_quality_cleaned.csv'
df = pd.read_csv(data_path)

In [17]:
city_cols = [col for col in df.columns if col.startswith("City_")]
df['City'] = df[city_cols].idxmax(axis=1).str.replace('City_', '')
df.head()

Unnamed: 0,PM2.5,PM10,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,...,date_2020-06-23,date_2020-06-24,date_2020-06-25,date_2020-06-26,date_2020-06-27,date_2020-06-28,date_2020-06-29,date_2020-06-30,date_2020-07-01,City
0,65.37,72.38,8.07,8.57,13.395,0.64,16.31,8.02,1.63,4.34,...,False,False,False,False,False,False,False,False,False,Aizawl
1,56.28,72.38,8.02,8.51,13.395,0.6,18.93,6.2,1.52,3.74,...,False,False,False,False,False,False,False,False,False,Aizawl
2,48.17,72.38,8.0,8.55,13.395,0.65,18.99,7.97,1.23,4.74,...,False,False,False,False,False,False,False,False,False,Aizawl
3,33.56,72.38,7.89,8.56,13.395,0.72,11.28,10.18,0.61,3.85,...,False,False,False,False,False,False,False,False,False,Aizawl
4,31.3,72.38,8.1,8.49,13.395,0.55,10.29,7.44,0.52,2.31,...,False,False,False,False,False,False,False,False,False,Aizawl


In [18]:
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nSummary Statistics:")
display(df.describe())

Dataset shape: (10926, 1967)
Columns: ['PM2.5', 'PM10', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'City_Aizawl', 'City_Amaravati', 'City_Amritsar', 'City_Chandigarh', 'City_Chennai', 'City_Coimbatore', 'City_Delhi', 'City_Ernakulam', 'City_Gurugram', 'City_Guwahati', 'City_Hyderabad', 'City_Jaipur', 'City_Jorapokhar', 'City_Kochi', 'City_Kolkata', 'City_Lucknow', 'City_Mumbai', 'City_Patna', 'City_Shillong', 'City_Talcher', 'City_Thiruvananthapuram', 'City_Visakhapatnam', 'date_2015-01-02', 'date_2015-01-03', 'date_2015-01-04', 'date_2015-01-05', 'date_2015-01-06', 'date_2015-01-07', 'date_2015-01-08', 'date_2015-01-09', 'date_2015-01-10', 'date_2015-01-11', 'date_2015-01-12', 'date_2015-01-13', 'date_2015-01-14', 'date_2015-01-15', 'date_2015-01-16', 'date_2015-01-17', 'date_2015-01-18', 'date_2015-01-19', 'date_2015-01-20', 'date_2015-01-21', 'date_2015-01-22', 'date_2015-01-23', 'date_2015-01-24', 'date_2015-01-25', 'date_2015-01-26', 'date_2015-

Unnamed: 0,PM2.5,PM10,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
count,10926.0,10926.0,10926.0,10926.0,10926.0,10926.0,10926.0,10926.0,10926.0,10926.0,10926.0,10926.0
mean,37.607246,65.211178,17.815038,19.268731,13.041588,0.741157,8.401679,30.069938,0.813746,2.244017,1.179577,93.502105
std,22.05568,42.203992,11.647746,13.767499,10.088482,0.503991,4.652358,16.158271,1.156725,2.708498,1.246722,40.445444
min,0.04,0.42,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,14.0
25%,20.5225,31.58,9.3825,8.8525,5.02,0.39,5.25,17.82,0.0,0.0,0.18,65.0
50%,34.73,54.59,14.74,16.505,11.75,0.68,7.36,28.8,0.22,1.11,0.46,85.0
75%,50.3275,86.7175,24.14,26.63,17.9,1.0375,10.78,39.7275,1.23,3.83,2.14,116.0
max,147.88,234.51,63.07,65.41,47.54,2.37,23.16,77.47,5.19,10.87,6.26,221.0


In [19]:
plt.figure(figsize=(10,5))
sns.histplot(df['PM2.5'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of PM2.5')
plt.xlabel('PM2.5')
plt.ylabel('Frequency')
plt.show()

  plt.show()


In [20]:
plt.figure(figsize=(12,6))
sns.boxplot(x='City', y='PM2.5', data=df)
plt.xticks(rotation=45)
plt.title('PM2.5 Levels Across Cities')
plt.show()

  plt.show()


In [22]:
numeric_cols = df.select_dtypes(include='number').columns  # Only numeric columns
plt.figure(figsize=(12,8))
sns.heatmap(df[numeric_cols].corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Pollutants')
plt.show()

  plt.show()


In [23]:
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    # Example: visualize first city
    city_choice = df['City'].unique()[0]
    df_city = df[df['City'] == city_choice]
    
    plt.figure(figsize=(14,6))
    sns.lineplot(x='Date', y='PM2.5', data=df_city)
    plt.title(f'PM2.5 Trend over Time in {city_choice}')
    plt.xlabel('Date')
    plt.ylabel('PM2.5')
    plt.show()

In [24]:
if 'AQI' in df.columns:
    plt.figure(figsize=(8,5))
    sns.countplot(x='AQI', data=df, palette='viridis', 
                  order=df['AQI'].value_counts().index)
    plt.title('Count of AQI Categories')
    plt.xlabel('AQI Category')
    plt.ylabel('Count')
    plt.show()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='AQI', data=df, palette='viridis',
  plt.show()
