In [1]:
# Install the required libraries (uncomment when running in a fresh environment).
# Use the %pip magic so the packages are installed into this kernel's environment.
# %pip install pandas numpy matplotlib seaborn scikit-learn

In [2]:
# Import necessary libraries

import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations - Linear Algebra

# Scikit-learn tools for modeling and evaluation
# Wrap regressor to handle multiple target outputs
from sklearn.multioutput import MultiOutputRegressor
# Random forest regression estimator
from sklearn.ensemble import RandomForestRegressor
# Helper for splitting a dataset into training and testing sets
from sklearn.model_selection import train_test_split
# Regression metrics: mean squared error and R^2 score
from sklearn.metrics import mean_squared_error, r2_score

# Optional: render floating-point values with two decimal places in pandas output
pd.options.display.float_format = lambda value: '%.2f' % value

In [3]:
# Read the raw measurements; the file is semicolon-delimited, so the
# separator has to be passed explicitly.
DATA_PATH = 'PB_All_2000_2021.csv'
df = pd.read_csv(DATA_PATH, sep=';')

# Peek at the first five rows to sanity-check the parse
df.head()

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,17.02.2000,0.33,2.77,12.0,12.3,9.5,0.06,154.0,0.45,289.5
1,1,11.05.2000,0.04,3.0,51.6,14.61,17.75,0.03,352.0,0.09,1792.0
2,1,11.09.2000,0.03,2.1,24.5,9.87,13.8,0.17,416.0,0.2,2509.0
3,1,13.12.2000,0.17,2.23,35.6,12.4,17.13,0.1,275.2,0.38,1264.0
4,1,02.03.2001,0.0,3.03,48.8,14.69,10.0,0.07,281.6,0.13,1462.0


In [4]:
# Structural overview: column dtypes, non-null counts and memory usage
df.info()

# Per-column count of missing entries, printed below the info summary
missing_counts = df.isna().sum()
print("\nMissing values per column:")
print(missing_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         2861 non-null   int64  
 1   date       2861 non-null   object 
 2   NH4        2858 non-null   float64
 3   BSK5       2860 non-null   float64
 4   Suspended  2845 non-null   float64
 5   O2         2858 non-null   float64
 6   NO3        2860 non-null   float64
 7   NO2        2858 non-null   float64
 8   SO4        2812 non-null   float64
 9   PO4        2833 non-null   float64
 10  CL         2812 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 246.0+ KB

Missing values per column:
id            0
date          0
NH4           3
BSK5          1
Suspended    16
O2            3
NO3           1
NO2           3
SO4          49
PO4          28
CL           49
dtype: int64


In [5]:
# Report the number of rows and columns in the frame
n_rows, n_cols = df.shape
print(f"Dataset contains {n_rows} rows and {n_cols} columns.")

Dataset contains 2861 rows and 11 columns.


In [6]:
# Descriptive statistics for the numeric columns, transposed so that each
# column of the dataset becomes one row of the summary
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,2861.0,12.4,6.08,1.0,8.0,14.0,16.0,22.0
NH4,2858.0,0.76,2.49,0.0,0.08,0.22,0.5,39.43
BSK5,2860.0,4.32,2.97,0.0,2.16,3.8,5.8,50.9
Suspended,2845.0,12.93,16.54,0.0,6.0,10.0,15.0,595.0
O2,2858.0,9.51,4.43,0.0,7.09,9.0,11.52,90.0
NO3,2860.0,4.32,6.88,0.0,1.39,2.8,5.58,133.4
NO2,2858.0,0.25,2.18,0.0,0.03,0.06,0.13,109.0
SO4,2812.0,59.36,96.58,0.0,27.05,37.8,64.64,3573.4
PO4,2833.0,0.42,0.77,0.0,0.13,0.27,0.47,13.88
CL,2812.0,93.73,394.51,0.02,26.8,33.9,45.61,5615.28


In [7]:
# Count the missing entries in every column
df.isna().sum()

id            0
date          0
NH4           3
BSK5          1
Suspended    16
O2            3
NO3           1
NO2           3
SO4          49
PO4          28
CL           49
dtype: int64

In [8]:
# 'date' is loaded as strings (dtype object) in day.month.year form;
# parse it into datetime64 using that exact format.
parsed_dates = pd.to_datetime(df['date'], format='%d.%m.%Y')
df = df.assign(date=parsed_dates)
df

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,2000-02-17,0.33,2.77,12.00,12.30,9.50,0.06,154.00,0.45,289.50
1,1,2000-05-11,0.04,3.00,51.60,14.61,17.75,0.03,352.00,0.09,1792.00
2,1,2000-09-11,0.03,2.10,24.50,9.87,13.80,0.17,416.00,0.20,2509.00
3,1,2000-12-13,0.17,2.23,35.60,12.40,17.13,0.10,275.20,0.38,1264.00
4,1,2001-03-02,0.00,3.03,48.80,14.69,10.00,0.07,281.60,0.13,1462.00
...,...,...,...,...,...,...,...,...,...,...,...
2856,22,2020-10-06,0.05,2.69,3.60,8.28,3.80,0.04,160.00,0.73,77.85
2857,22,2020-10-27,0.00,1.52,0.50,11.26,0.56,0.03,147.20,0.63,71.95
2858,22,2020-12-03,0.03,0.29,0.80,11.09,2.58,0.04,209.92,0.48,61.17
2859,22,2021-01-12,0.00,2.10,0.00,14.31,3.94,0.03,121.60,0.42,63.49


In [9]:
# Re-inspect dtypes and non-null counts; after the conversion above, 'date'
# should be reported as datetime64 instead of object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   id         2861 non-null   int64         
 1   date       2861 non-null   datetime64[ns]
 2   NH4        2858 non-null   float64       
 3   BSK5       2860 non-null   float64       
 4   Suspended  2845 non-null   float64       
 5   O2         2858 non-null   float64       
 6   NO3        2860 non-null   float64       
 7   NO2        2858 non-null   float64       
 8   SO4        2812 non-null   float64       
 9   PO4        2833 non-null   float64       
 10  CL         2812 non-null   float64       
dtypes: datetime64[ns](1), float64(9), int64(1)
memory usage: 246.0 KB


In [10]:
# Order the rows by 'id', then chronologically by 'date' within each id
sort_keys = ['id', 'date']
df = df.sort_values(by=sort_keys)
# Show the leading rows of the sorted frame
df.head()

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,2000-02-17,0.33,2.77,12.0,12.3,9.5,0.06,154.0,0.45,289.5
1,1,2000-05-11,0.04,3.0,51.6,14.61,17.75,0.03,352.0,0.09,1792.0
2,1,2000-09-11,0.03,2.1,24.5,9.87,13.8,0.17,416.0,0.2,2509.0
3,1,2000-12-13,0.17,2.23,35.6,12.4,17.13,0.1,275.2,0.38,1264.0
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.07,281.6,0.13,1462.0


In [11]:
# Derive calendar features from the parsed 'date' column
date_parts = df['date'].dt
df = df.assign(year=date_parts.year, month=date_parts.month)

In [12]:
# Preview the first five rows to confirm the 'year' and 'month' columns exist
df.head(n=5)

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,year,month
0,1,2000-02-17,0.33,2.77,12.0,12.3,9.5,0.06,154.0,0.45,289.5,2000,2
1,1,2000-05-11,0.04,3.0,51.6,14.61,17.75,0.03,352.0,0.09,1792.0,2000,5
2,1,2000-09-11,0.03,2.1,24.5,9.87,13.8,0.17,416.0,0.2,2509.0,2000,9
3,1,2000-12-13,0.17,2.23,35.6,12.4,17.13,0.1,275.2,0.38,1264.0,2000,12
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.07,281.6,0.13,1462.0,2001,3


In [13]:
# List the column labels currently present in the frame
df.columns

Index(['id', 'date', 'NH4', 'BSK5', 'Suspended', 'O2', 'NO3', 'NO2', 'SO4',
       'PO4', 'CL', 'year', 'month'],
      dtype='object')

In [14]:
# Names of six measurement columns in df, grouped under one list.
# NOTE(review): presumably the prediction targets for the model — confirm.
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']

In [15]:
# Add the day-of-month feature.
# 'date' was already parsed to datetime64 with an explicit format, and 'year'
# and 'month' were already derived from it in an earlier cell, so only 'day'
# is new. The column is deliberately not re-parsed: errors='coerce' would
# silently turn any unparseable value into NaT instead of surfacing it.
df['day'] = df['date'].dt.day

In [16]:
# Replace each missing numeric value with the mean of its own column.
# The frame is rebound rather than mutated in place, so re-running the cell
# gives the same result.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [17]:
# Drop the 'id' column and the original 'date' column (year/month/day were
# extracted from it above).
# errors='ignore' keeps the cell idempotent: an in-place drop raised KeyError
# when the cell was run a second time, because the columns were already gone.
df = df.drop(columns=['id', 'date'], errors='ignore')

In [18]:
# Verify the preprocessing result: inspect dtypes and non-null counts,
# then preview the first five rows
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   NH4        2861 non-null   float64
 1   BSK5       2861 non-null   float64
 2   Suspended  2861 non-null   float64
 3   O2         2861 non-null   float64
 4   NO3        2861 non-null   float64
 5   NO2        2861 non-null   float64
 6   SO4        2861 non-null   float64
 7   PO4        2861 non-null   float64
 8   CL         2861 non-null   float64
 9   year       2861 non-null   int32  
 10  month      2861 non-null   int32  
 11  day        2861 non-null   int32  
dtypes: float64(9), int32(3)
memory usage: 234.8 KB


Unnamed: 0,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,year,month,day
0,0.33,2.77,12.0,12.3,9.5,0.06,154.0,0.45,289.5,2000,2,17
1,0.04,3.0,51.6,14.61,17.75,0.03,352.0,0.09,1792.0,2000,5,11
2,0.03,2.1,24.5,9.87,13.8,0.17,416.0,0.2,2509.0,2000,9,11
3,0.17,2.23,35.6,12.4,17.13,0.1,275.2,0.38,1264.0,2000,12,13
4,0.0,3.03,48.8,14.69,10.0,0.07,281.6,0.13,1462.0,2001,3,2
