# Erwin Antepuesto

Data: https://archive.ics.uci.edu.

Instruction: Choose a dataset and tell a data story with it. Choose appropriate data visualizations and do not use words; your graphs must tell the story.

Dataset: Air Quality

Link: https://archive.ics.uci.edu/dataset/360/air+quality

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
# The UCI export is semicolon-delimited and writes decimals with a comma
# (e.g. "2,6"), so decimal=',' is required; without it the fractional
# readings are read as text and coerced to NaN further down.
df = pd.read_csv('AirQualityUCI.csv', sep=';', decimal=',')

# Discard records that carry no timestamp (the published file is padded with
# empty ';;;' rows at the end), then build the datetime index explicitly:
# dates are day-first and the time is written with dots ("18.00.00"), a layout
# the generic date parser cannot be relied on to infer.
df = df.dropna(subset=['Date', 'Time'])
df['Date_Time'] = pd.to_datetime(
    df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S'
)
df = df.drop(columns=['Date', 'Time']).set_index('Date_Time')

# Ensure the columns are numeric and handle missing values.
# The dataset tags a missing reading with the sentinel -200; turn it into NaN
# so later dropna()/mean()/corr() calls skip it instead of treating it as data.
for column in ['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']:
    readings = pd.to_numeric(df[column], errors='coerce')
    df[column] = readings.where(readings != -200)


In [None]:
# Time Series Plot of Gas Concentrations
# Draw one line per pollutant on a single shared axis; missing readings are
# removed from each series before it is plotted.
plt.figure(figsize=(14, 6))
ax = plt.gca()
gas_columns = ['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']
for gas in gas_columns:
    observed = df[gas].dropna()
    ax.plot(observed, label=gas)
ax.set_title('Gas Concentrations Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Concentration')
ax.legend()
plt.show()

In [None]:

# Correlation Heatmap
# Compute the pairwise correlation matrix of the four pollutant columns
# (DataFrame.corr() with its defaults) and render it with each cell annotated.
pollutant_columns = ['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']
correlations = df[pollutant_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()



In [None]:

# Scatter Plot Matrix
# Only rows in which all four pollutants have a value are plotted.
pair_grid = sns.pairplot(df[['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']].dropna())
# pairplot builds its own figure made of many subplots, so plt.title() would
# label just the last subplot drawn. Put the heading on the figure instead,
# lifted slightly (y > 1) so it does not overlap the top row of panels.
pair_grid.figure.suptitle('Scatter Plot Matrix', y=1.02)
plt.show()


In [None]:
# Box Plots for Seasonal Variations
# Month -> season number: (month % 12) // 3 + 1 sends Dec/Jan/Feb to 1,
# Mar-May to 2, Jun-Aug to 3 and Sep-Nov to 4.
df['Season'] = df.index.month%12 // 3 + 1
seasons = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'}
df['Season'] = df['Season'].map(seasons)
# Fix the box order explicitly. Left to seaborn, the order follows first
# appearance in the (per-column NaN-filtered) data, so it can differ from one
# pollutant's figure to the next and make the plots hard to compare.
season_order = list(seasons.values())
for column in ['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']:
    plt.figure(figsize=(10, 6))
    # Drop rows that lack a reading for this pollutant only.
    sns.boxplot(x='Season', y=column, data=df.dropna(subset=[column]), order=season_order)
    plt.title(f'{column} Seasonal Variations')
    plt.show()

In [None]:

# Heatmap of Daily Patterns
# Average each pollutant by hour of day (0-23) across the whole record.
df['Hour'] = df.index.hour
daily_patterns = df.groupby('Hour')[['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']].mean()
plt.figure(figsize=(10, 6))
# groupby leaves hours on the rows and pollutants on the columns, and heatmap
# draws rows along the y-axis. Transpose so hours run along the x-axis and
# pollutants along the y-axis, which is what the axis labels below state.
sns.heatmap(daily_patterns.T, cmap='YlGnBu')
plt.title('Heatmap of Daily Patterns')
plt.xlabel('Hour of the Day')
plt.ylabel('Pollutant')
plt.show()


In [None]:

# Line Graphs for Trends Over Time
# Resample the four pollutant columns to month-end bins, average each bin, and
# draw one line per pollutant on the current axes; months with no data for a
# pollutant are left out of that pollutant's line.
trend_columns = ['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']
monthly_means = df[trend_columns].resample('M').mean()
plt.figure(figsize=(14, 6))
for pollutant, monthly_series in monthly_means.items():
    monthly_series.dropna().plot(label=pollutant)
plt.title('Trends Over Time')
plt.xlabel('Time')
plt.ylabel('Average Concentration')
plt.legend()
plt.show()