In [1]:
import pandas as pd

# Correct file name from your upload
data = pd.read_csv('/content/water_quality.csv.csv')

# Show the first few rows
data.head()

Unnamed: 0,id;date;NH4;BSK5;Suspended;O2;NO3;NO2;SO4;PO4;CL
0,1;17.02.2000;0.33;2.77;12;12.3;9.5;0.057;154;0...
1,1;11.05.2000;0.044;3;51.6;14.61;17.75;0.034;35...
2,1;11.09.2000;0.032;2.1;24.5;9.87;13.8;0.173;41...
3,1;13.12.2000;0.17;2.23;35.6;12.4;17.13;0.099;2...
4,1;02.03.2001;0;3.03;48.8;14.69;10;0.065;281.6;...


In [2]:
# Read the file again with correct separator
data = pd.read_csv('/content/water_quality.csv.csv', sep=';')

# Show the cleaned version
data.head()


Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,17.02.2000,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5
1,1,11.05.2000,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0
2,1,11.09.2000,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0
3,1,13.12.2000,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0
4,1,02.03.2001,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0


In [3]:
data.isnull().sum()


Unnamed: 0,0
id,0
date,0
NH4,3
BSK5,1
Suspended,16
O2,3
NO3,1
NO2,3
SO4,49
PO4,28


In [4]:
# Just for example: If O2 > 5 and NH4 < 0.2, mark water as "safe" (1), else "unsafe" (0)
data['Safe'] = ((data['O2'] > 5) & (data['NH4'] < 0.2)).astype(int)

# Show the new column
data[['O2', 'NH4', 'Safe']].head()


Unnamed: 0,O2,NH4,Safe
0,12.3,0.33,0
1,14.61,0.044,1
2,9.87,0.032,1
3,12.4,0.17,1
4,14.69,0.0,1


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split: X = input columns, y = output
X = data.drop(['Safe', 'id', 'date'], axis=1)  # drop things we don't want to learn from
y = data['Safe']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Show results
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       304
           1       1.00      1.00      1.00       269

    accuracy                           1.00       573
   macro avg       1.00      1.00      1.00       573
weighted avg       1.00      1.00      1.00       573



In [6]:
data.shape


(2861, 12)

In [8]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,2861.0,12.397064,6.084226,1.0,8.0,14.0,16.0,22.0
NH4,2858.0,0.758734,2.486247,0.0,0.08,0.22,0.5,39.427
BSK5,2860.0,4.316182,2.973997,0.0,2.16,3.8,5.8,50.9
Suspended,2845.0,12.931905,16.543097,0.0,6.0,10.0,15.0,595.0
O2,2858.0,9.508902,4.42826,0.0,7.0925,8.995,11.52,90.0
NO3,2860.0,4.316846,6.881188,0.0,1.39,2.8,5.5825,133.4
NO2,2858.0,0.246128,2.182777,0.0,0.03,0.059,0.12575,109.0
SO4,2812.0,59.362313,96.582641,0.0,27.0525,37.8,64.64,3573.4
PO4,2833.0,0.418626,0.771326,0.0,0.13,0.27,0.47,13.879
CL,2812.0,93.731991,394.512184,0.02,26.8,33.9,45.6075,5615.28


In [9]:
data.isnull().sum()

Unnamed: 0,0
id,0
date,0
NH4,3
BSK5,1
Suspended,16
O2,3
NO3,1
NO2,3
SO4,49
PO4,28


In [11]:
data=data.sort_values(by=['id','date'])
data.head()

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,Safe
43,1,01.12.2010,0.291,3.27,16.7,12.3,59.2,0.09,278.4,0.37,1375.46,0
4,1,02.03.2001,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0,1
41,1,02.06.2010,0.29,3.52,37.2,7.16,57.87,0.21,268.8,0.24,1120.2,0
22,1,02.09.2005,0.0,3.68,28.5,14.2,15.51,0.033,422.4,0.9,2063.8,1
62,1,02.09.2015,0.035,6.28,104.8,7.8,4.68,0.076,694.4,0.53,5615.28,1


In [13]:
# Convert 'date' column to datetime format
data['date'] = pd.to_datetime(data['date'], dayfirst=True)

# Extract year and month
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month

# Show the updated DataFrame
data.head()


Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,Safe,year,month
43,1,2010-12-01,0.291,3.27,16.7,12.3,59.2,0.09,278.4,0.37,1375.46,0,2010,12
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0,1,2001,3
41,1,2010-06-02,0.29,3.52,37.2,7.16,57.87,0.21,268.8,0.24,1120.2,0,2010,6
22,1,2005-09-02,0.0,3.68,28.5,14.2,15.51,0.033,422.4,0.9,2063.8,1,2005,9
62,1,2015-09-02,0.035,6.28,104.8,7.8,4.68,0.076,694.4,0.53,5615.28,1,2015,9


In [14]:
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
data[pollutants].head()


Unnamed: 0,O2,NO3,NO2,SO4,PO4,CL
43,12.3,59.2,0.09,278.4,0.37,1375.46
4,14.69,10.0,0.065,281.6,0.134,1462.0
41,7.16,57.87,0.21,268.8,0.24,1120.2
22,14.2,15.51,0.033,422.4,0.9,2063.8
62,7.8,4.68,0.076,694.4,0.53,5615.28


In [15]:
data.columns

Index(['id', 'date', 'NH4', 'BSK5', 'Suspended', 'O2', 'NO3', 'NO2', 'SO4',
       'PO4', 'CL', 'Safe', 'year', 'month'],
      dtype='object')