In [9]:
pip install pandas numpy scikit-learn matplotlib seaborn streamlit

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

# Visualisation settings: use seaborn's white-grid style for the notebook's plots
sns.set(style="whitegrid")

# Task 1 Data Handling

In [None]:
# Folder holding the per-station CSV files
data_folder = 'Assessment Data-20250428/'

# Every station file shares the same naming pattern; only the station name differs
station_file = data_folder + 'PRSA_Data_{station}_20130301-20170228.csv'

# Load needed files (one dataframe per selected station)
dongsi = pd.read_csv(station_file.format(station='Dongsi'))
changping = pd.read_csv(station_file.format(station='Changping'))
huairou = pd.read_csv(station_file.format(station='Huairou'))
aotizhongxin = pd.read_csv(station_file.format(station='Aotizhongxin'))

# Merge data to dataframe; ignore_index gives the combined frame a fresh 0..n-1 index
air_quality_data = pd.concat([dongsi, changping, huairou, aotizhongxin], ignore_index=True)

# Preview merged dataset to ensure data is loaded correctly.
# display() is needed because a bare expression is only rendered when it is the
# last statement of a cell; here the export below follows it.
print("Preview of merged dataset:")
display(air_quality_data.head())

# Export merged dataset (without the index column) for use by later cells
air_quality_data.to_csv("merged_air_data.csv", index=False)

Preview of merged dataset:


The provided dataset contained hourly air quality and weather-based data from 12 monitoring stations in Beijing. For this analysis, four stations were selected to represent a diverse range of environments:

- Dongsi - Urban
- Changping - Suburban
- Huairou - Rural
- Aotizhongxin - Industrial/Hotspot

Each station's data was loaded from its respective file and merged into a single dataframe named 'air_quality_data'. A preview of the merged dataset is output to ensure that the structure and columns are loaded correctly, and the merged dataframe is exported to 'merged_air_data.csv'.

# Task 2 Fundamental Data Understanding

In [None]:
# Reload the merged dataset from the exported CSV
air_quality_data = pd.read_csv("merged_air_data.csv")

# Report the overall size of the dataset
row_count, column_count = air_quality_data.shape
print(f"Dataset contains {row_count:,} rows and {column_count} columns.\n")

# Per-column overview: dtype plus the number of missing and present values
missing_mask = air_quality_data.isna()
data_information = pd.DataFrame({
    "Data Type": air_quality_data.dtypes,
    "Missing Values": missing_mask.sum(),
    "Filled Values": (~missing_mask).sum(),
})

print("Data information:")
display(data_information.T)

# Column groups used for the descriptive statistics
pollution = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
weather = ['TEMP', 'DEWP', 'PRES', 'RAIN', 'WSPM']

# Summary statistics for each group, one row per variable, rounded to 2 d.p.
for heading, group_columns in (("Pollution data:", pollution), ("Weather data:", weather)):
    print(heading)
    display(air_quality_data[group_columns].describe().T.round(2))

Dataset contains 140,256 rows and 18 columns.

Data information:


Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
Data Type,int64,int64,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,float64,object
Missing Values,0,0,0,0,0,3402,2630,3206,4930,7916,4138,144,143,146,146,601,120,0
Filled Values,140256,140256,140256,140256,140256,136854,137626,137050,135326,132340,136118,140112,140113,140110,140110,139655,140136,140256


Pollution data:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PM2.5,136854.0,77.43,78.67,2.0,19.0,53.0,108.0,898.0
PM10,137626.0,101.64,90.71,2.0,34.0,78.0,142.0,999.0
SO2,137050.0,15.75,21.61,0.29,2.0,7.0,19.0,341.0
NO2,135326.0,47.45,33.61,1.03,21.0,40.0,66.0,290.0
CO,132340.0,1189.96,1115.46,100.0,500.0,800.0,1500.0,10000.0
O3,136118.0,57.84,56.25,0.21,13.0,46.0,81.0,1071.0


Weather data:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TEMP,140112.0,13.35,11.51,-19.9,3.0,14.3,23.1,41.4
DEWP,140110.0,2.33,13.86,-43.4,-9.2,2.9,15.0,29.1
PRES,140113.0,1009.94,10.48,982.4,1001.5,1009.5,1018.1,1042.0
RAIN,140110.0,0.06,0.83,0.0,0.0,0.0,0.0,72.5
WSPM,140136.0,1.77,1.25,0.0,0.9,1.4,2.2,12.9


The merged dataset was explored to gain an understanding of its structure and contents. This included the number of rows and columns, data types and the number of filled and missing values.

A table was created to display each column's data type and its number of filled and missing values. The table showed that several of the variables, such as 'CO', 'NO2' and 'O3', had a significant number of missing values that will require preprocessing.

Descriptive statistics (count, mean, standard deviation, minimum, quartiles and maximum) were generated and split into two groups:
- Pollution: PM2.5, PM10, SO2, NO2, CO, O3
- Weather: TEMP, DEWP, PRES, RAIN, WSPM

These statistics showed that PM2.5 reached a maximum value of 898, indicating severe pollution at times. The CO maximum of 10000 is far above its mean of 1189.96, which suggests either an incorrect measurement or an extreme outlier.

This initial analysis indicates that the dataset is suitable for analysing air quality across Beijing, although the data will need cleaning first.