In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
# Read the semicolon-delimited water-quality measurements into a DataFrame
# and display it as the cell output.
df = pd.read_csv(
    '/content/waterprediction.csv',
    sep=';',
)
df

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,17.02.2000,0.330,2.77,12.0,12.30,9.50,0.057,154.00,0.454,289.50
1,1,11.05.2000,0.044,3.00,51.6,14.61,17.75,0.034,352.00,0.090,1792.00
2,1,11.09.2000,0.032,2.10,24.5,9.87,13.80,0.173,416.00,0.200,2509.00
3,1,13.12.2000,0.170,2.23,35.6,12.40,17.13,0.099,275.20,0.377,1264.00
4,1,02.03.2001,0.000,3.03,48.8,14.69,10.00,0.065,281.60,0.134,1462.00
...,...,...,...,...,...,...,...,...,...,...,...
2856,22,06.10.2020,0.046,2.69,3.6,8.28,3.80,0.038,160.00,0.726,77.85
2857,22,27.10.2020,0.000,1.52,0.5,11.26,0.56,0.031,147.20,0.634,71.95
2858,22,03.12.2020,0.034,0.29,0.8,11.09,2.58,0.042,209.92,0.484,61.17
2859,22,12.01.2021,0.000,2.10,0.0,14.31,3.94,0.034,121.60,0.424,63.49


In [5]:
# Count missing values per column before any imputation
null_counts = df.isna().sum()
print(null_counts)

id            0
date          0
NH4           3
BSK5          1
Suspended    16
O2            3
NO3           1
NO2           3
SO4          49
PO4          28
CL           49
dtype: int64


In [6]:
# Fill missing numeric measurements with each column's overall mean.
# Re-assign rather than using inplace=True so the imputation is an explicit,
# chainable step.
# NOTE(review): the means are computed over the full dataset (before the
# train/test split further down) and the imputed columns include the model
# targets (e.g. SO4, PO4, CL) -- confirm this is acceptable for evaluation.
df = df.fillna(df.mean(numeric_only=True))


In [7]:
# Re-count missing values per column to confirm the imputation left none
remaining_nulls = df.isna().sum()
print(remaining_nulls)


id           0
date         0
NH4          0
BSK5         0
Suspended    0
O2           0
NO3          0
NO2          0
SO4          0
PO4          0
CL           0
dtype: int64


In [8]:
# Summarise the frame: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         2861 non-null   int64  
 1   date       2861 non-null   object 
 2   NH4        2861 non-null   float64
 3   BSK5       2861 non-null   float64
 4   Suspended  2861 non-null   float64
 5   O2         2861 non-null   float64
 6   NO3        2861 non-null   float64
 7   NO2        2861 non-null   float64
 8   SO4        2861 non-null   float64
 9   PO4        2861 non-null   float64
 10  CL         2861 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 246.0+ KB


In [18]:
# The CSV stores dates as 'dd.mm.yyyy' text. Parse them into real datetimes
# before sorting: ordering the raw strings would sort by day-of-month text
# rather than chronologically, and the later `.dt` accessors require a
# datetime column. Values that are already datetime64 pass through unchanged,
# so this cell is safe to re-run.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

# Order observations by station id, then chronologically within each station
df = df.sort_values(by=['id', 'date'])
df.head()

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,year,month
0,1,2000-02-17,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5,2000,2
1,1,2000-05-11,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0,2000,5
2,1,2000-09-11,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0,2000,9
3,1,2000-12-13,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0,2000,12
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0,2001,3


In [19]:
# Derive calendar features from the sampling date.
# The CSV stores dates as 'dd.mm.yyyy' text and the `.dt` accessor raises on
# plain string columns, so parse explicitly here; an already-parsed datetime
# column passes through pd.to_datetime unchanged.
sample_dates = pd.to_datetime(df['date'], format='%d.%m.%Y')
df['year'] = sample_dates.dt.year
df['month'] = sample_dates.dt.month

In [20]:
# Preview the first rows to confirm the new year/month columns were added
df.head()

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,year,month
0,1,2000-02-17,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5,2000,2
1,1,2000-05-11,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0,2000,5
2,1,2000-09-11,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0,2000,9
3,1,2000-12-13,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0,2000,12
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0,2001,3


In [21]:
# List the column names available for feature/target selection
df.columns

Index(['id', 'date', 'NH4', 'BSK5', 'Suspended', 'O2', 'NO3', 'NO2', 'SO4',
       'PO4', 'CL', 'year', 'month'],
      dtype='object')

In [22]:
# Target columns the model predicts, in the order used for evaluation
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']

In [23]:
# Features (independent variables): station id and sampling year.
# Targets (dependent variables): the pollutant concentrations.
# Note: the derived 'month' column is not used as a feature here.
feature_cols = ['id', 'year']
X = df[feature_cols]
y = df[pollutants]

In [24]:
# One-hot encode the station id. drop_first=True omits the first station's
# indicator column, so k stations become k-1 dummy columns; 'year' is kept
# as a numeric column.
X_encoded = pd.get_dummies(
    X,
    columns=['id'],
    drop_first=True,
)

In [33]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [37]:
# Build the base learner, wrap it so a separate forest is fitted for each
# target column, then train on the training split.
base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_forest)
model.fit(X_train, y_train)

In [38]:
# Predict all pollutant targets for the held-out test rows
y_pred = model.predict(X_test)

In [39]:
# Report per-pollutant test metrics. Column position in y_test / y_pred
# follows the order of `pollutants`. R2 can be negative when the model does
# worse than always predicting the test mean.
print("Model Performance on the Test Data:")
for col_idx, pollutant in enumerate(pollutants):
    actual = y_test.iloc[:, col_idx]
    predicted = y_pred[:, col_idx]
    print(f'{pollutant}:')
    print('   MSE:', mean_squared_error(actual, predicted))
    print('   R2:', r2_score(actual, predicted))
    print()

Model Performance on the Test Data:
O2:
   MSE: 22.861110802060825
   R2: 0.007540039572995205

NO3:
   MSE: 17.765613544688055
   R2: 0.5194041858679409

NO2:
   MSE: 5.692627739774803
   R2: -7.247223037386126

SO4:
   MSE: 2688.4436530163603
   R2: 0.31966744857823437

PO4:
   MSE: 0.4466762565622493
   R2: 0.3512239952967634

CL:
   MSE: 34492.87945466878
   R2: 0.730477204034535

