Data Visualization using matplotlib
Problem Statement: Analyzing Air Quality Index (AQI) Trends in a City 
Dataset: "City_Air_Quality.csv"
Description: The dataset contains information about air quality measurements in a specific 
city over a period of time. It includes attributes such as date, time, pollutant levels (e.g., PM2.5, 
PM10, CO), and the Air Quality Index (AQI) values. The goal is to use the matplotlib library 
to create visualizations that effectively represent the AQI trends and patterns for different 
pollutants in the city. 
Tasks to Perform:
1. Import the "City_Air_Quality.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Identify the relevant variables for visualizing AQI trends, such as date, pollutant levels, 
and AQI values.
4. Create line plots or time series plots to visualize the overall AQI trend over time.
5. Plot individual pollutant levels (e.g., PM2.5, PM10, CO) on separate line plots to 
visualize their trends over time.
6. Use bar plots or stacked bar plots to compare the AQI values across different dates or 
time periods.
7. Create box plots or violin plots to analyze the distribution of AQI values for different 
pollutant categories.
8. Use scatter plots or bubble charts to explore the relationship between AQI values and 
pollutant levels.
9. Customize the visualizations by adding labels, titles, legends, and appropriate color 
schemes.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
from collections import Counter
# Read the air-quality measurements from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="PNQ_AQI.csv")


In [None]:
# Convert the 'Date' column to datetime values so rows can be ordered
# chronologically and later resampled by calendar period.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates

# Sort in place by date and renumber the index 0..n-1.
data.sort_values(by='Date', inplace=True, ignore_index=True)

In [None]:
# The columns at positions 1 and 2 hold readings as text. For each of them:
#   * add a '<column> BDL' indicator (1 when the text contains 'BDL', else 0);
#   * replace the reading with an integer parsed from its last three
#     characters, using 0 when those characters contain 'NA'.
# The column list is snapshotted first because new columns are added in the loop.
for column in list(data.columns[1:3]):
    raw_text = data[column]
    data[f'{column} BDL'] = raw_text.map(lambda text: int('BDL' in text))

    # NOTE(review): only the last three characters are parsed, so a reading
    # whose numeric part is longer than three characters would be truncated,
    # and a tail with no digits raises IndexError -- confirm against the data.
    tails = raw_text.apply(lambda text: text[-3:])
    data[column] = tails.apply(
        lambda tail: 0 if 'NA' in tail else int(re.findall(r'\d+', tail)[0])
    )

In [None]:
# Names of the columns at positions 1-4, which are screened for outliers.
outlier_features = data.columns[1:5].tolist()
def detect_outliers(df, n, features):
    """Return index labels of rows that are outliers in more than ``n`` features.

    For every column named in ``features`` the 25th and 75th percentiles are
    computed (NaNs ignored). A value is flagged when it lies more than
    1.5 * IQR below the first quartile or above the third quartile.

    Args:
        df: DataFrame holding the columns to screen.
        n: A row is reported only when it is flagged in strictly more than
            ``n`` of the screened columns.
        features: Iterable of column names to screen.

    Returns:
        A list of index labels, ordered by the first time each label was flagged.
    """
    flag_counts = Counter()

    for feature in features:
        series = df[feature]
        lower_quartile = np.nanpercentile(series, 25)
        upper_quartile = np.nanpercentile(series, 75)
        fence = 1.5 * (upper_quartile - lower_quartile)

        too_low = series < lower_quartile - fence
        too_high = series > upper_quartile + fence

        # Tally one hit per flagged row for this feature.
        flag_counts.update(series[too_low | too_high].index)

    return [label for label, hits in flag_counts.items() if hits > n]

# Index labels of rows flagged as outliers in more than one screened feature.
Outliers_to_drop = detect_outliers(df=data, n=1, features=outlier_features)

# Display the flagged rows for inspection.
data.loc[Outliers_to_drop]

In [None]:
# Remove the flagged outlier rows from the frame in place.
data.drop(index=Outliers_to_drop, inplace=True)


In [None]:
# Map the location codes / long names found in 'Location' to short names.
rep={'MPCB-KR':'Karve Road','MPCB-SWGT':'Swargate','MPCB-BSRI':'Bhosari',\
     'MPCB-NS':'Nal Stop','MPCB-PMPR':'Pimpri','Pimpri Chinchwad':'Chinchwad'}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: with pandas Copy-on-Write the selected Series is a separate
# object, so an in-place call on it would leave `data` unchanged.
data['Location'] = data['Location'].replace(rep)

In [None]:
# Rows without an AQI value cannot be plotted or compared, so drop them.
data.dropna(axis=0, subset=['AQI'], inplace=True)
# Discard the 'CO2 µg/m3' column entirely.
data.drop(['CO2 µg/m3'], axis=1, inplace=True)
# Fill each remaining gap with the next valid value further down the same
# column. DataFrame.bfill is the direct equivalent of the deprecated
# fillna(method='bfill').
data.bfill(axis=0, inplace=True)

In [None]:
# Put 'AQI' first and keep every other column in its existing order.
remaining_columns = [name for name in data.columns if name != 'AQI']
data = data[['AQI', *remaining_columns]]

# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Mean AQI per month, grouped on the 'Date' column.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME'; it is kept here so the cell still runs on older versions.
monthly_aqi = data.resample(rule='M', on='Date')['AQI'].mean()

# Line plot of the monthly average AQI against time.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    monthly_aqi.index,
    monthly_aqi.values,
    color='b',
    marker='o',
    linestyle='-',
    label='AQI (Monthly Average)',
)
ax.set_title('Monthly AQI Trend Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average AQI Value')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Strip the literal ' µg/m3' unit suffix from every column name.
unit_suffix = ' µg/m3'
data.columns = data.columns.str.replace(unit_suffix, '', regex=False)

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Draw one line plot per pollutant, each on its own figure, against 'Date'.
# Expects `data` to have 'Date', 'SO2', 'Nox', 'RSPM' and 'SPM' columns.
#
# Each entry is (column name in `data`, name shown in title/labels, line colour).
# The display name differs from the column name only for 'Nox', shown as 'NOx'.
pollutant_plots = [
    ('SO2', 'SO2', 'b'),
    ('Nox', 'NOx', 'g'),
    ('RSPM', 'RSPM', 'r'),
    ('SPM', 'SPM', 'y'),
]

for column, display_name, line_color in pollutant_plots:
    plt.figure(figsize=(12, 6))
    plt.plot(data['Date'], data[column], label=display_name, color=line_color)
    plt.title(f'{display_name} Trend Over Time')
    plt.xlabel('Date')
    plt.ylabel(f'{display_name} Level')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar plot of the AQI value recorded on each date.
# Expects `data` to have 'Date' and 'AQI' columns.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(data['Date'], data['AQI'], color='skyblue')
ax.set_title('AQI Comparison Across Dates')
ax.set_xlabel('Date')
ax.set_ylabel('AQI Value')
# Tilt the date labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
ax.grid(True)
plt.show()


In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Violin plot comparing the value distributions of AQI and each pollutant.
# Expects `data` to have 'AQI', 'SO2', 'Nox', 'RSPM' and 'SPM' columns.
plotted_columns = ['AQI', 'SO2', 'Nox', 'RSPM', 'SPM']

# Reshape to long form: one row per (column name, value) pair, stored in the
# default 'variable' and 'value' columns produced by melt().
long_form = data[plotted_columns].melt()

plt.figure(figsize=(10, 6))
sns.violinplot(data=long_form, x='variable', y='value', inner='quart')
plt.title('Distribution of AQI Values for Different Pollutant Categories')
plt.xlabel('Pollutant Categories')
plt.ylabel('Value')
plt.xticks(rotation=45)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Plot AQI against each pollutant level, one figure per pollutant.
# Expects `data` to have 'AQI', 'SO2', 'Nox', 'RSPM' and 'SPM' columns.
#
# Each entry is (column name, display name, chart kind, marker colour, bubble?).
# Bubble charts size every marker by its AQI value; plain scatter plots keep
# matplotlib's default marker size.
scatter_specs = [
    ('SO2', 'SO2', 'Scatter Plot', 'blue', False),
    ('Nox', 'NOx', 'Scatter Plot', 'green', False),
    ('RSPM', 'RSPM', 'Bubble Chart', 'red', True),
    ('SPM', 'SPM', 'Bubble Chart', 'purple', True),
]

for column, display_name, chart_kind, marker_color, as_bubble in scatter_specs:
    size_kwargs = {'s': data['AQI']} if as_bubble else {}

    plt.figure(figsize=(10, 6))
    plt.scatter(data[column], data['AQI'], alpha=0.5, color=marker_color, **size_kwargs)
    plt.title(f'AQI vs. {display_name} {chart_kind}')
    plt.xlabel(f'{display_name} Level')
    plt.ylabel('AQI')
    plt.grid(True)
    plt.show()
