In [2]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [3]:
file1 = "scalar_1215309_191a_0eca_90dc.csv" #Prince Rupert Atlin Terminal Station Conductivity Temperature Depth (May 1st, 2024 to May 2nd, 2024)
file2 = "scalar_1215311_9e77_a900_b059.csv" #Prince Rupert Atlin Terminal Station Oxygen Sensor Deployed (May 1st, 2024 to May 2nd, 2024)

In [4]:
df1_cols = ['timeseries_id',
            'time',
            'Sound_Speed',
            'turbidity',
            'Temperature',
            'density',
            'cond',
            'salinity',
            'pressure',
            'latitude',
            'longitude',
            'depth']

df2_cols = ['timeseries_id',
            'time',
            'oxygen_corrected',
            'oxygen_uncorrected',
            'latitude',
            'longitude',
            'depth']

In [5]:
df1 = pd.read_csv(file1,header=None,skiprows=2,names=df1_cols)
df1.head(5)

Unnamed: 0,timeseries_id,time,Sound_Speed,turbidity,Temperature,density,cond,salinity,pressure,latitude,longitude,depth
0,scalar_1215309,2024-05-01T00:39:03.643Z,1480.276013,-0.16,282.2,0.001023,3.2238,29.858658,12.04,54.31877,-130.32105,11.0
1,scalar_1215309,2024-05-01T00:39:04.641Z,1480.308081,-0.07,282.21,0.001023,3.2242,29.854378,12.04,54.31877,-130.32105,11.0
2,scalar_1215309,2024-05-01T00:39:05.642Z,1480.372628,0.02,282.233,0.001023,3.2244,29.837168,12.03,54.31877,-130.32105,11.0
3,scalar_1215309,2024-05-01T00:39:06.641Z,1480.412788,0.21,282.246,0.001023,3.2248,29.830385,12.03,54.31877,-130.32105,11.0
4,scalar_1215309,2024-05-01T00:39:07.641Z,1480.393826,0.25,282.238,0.001023,3.225,29.839124,12.04,54.31877,-130.32105,11.0


In [6]:
df2 = pd.read_csv(file2,header=None,skiprows=2,names=df2_cols)
df2.head(5)

Unnamed: 0,timeseries_id,time,oxygen_corrected,oxygen_uncorrected,latitude,longitude,depth
0,scalar_1215311,2024-05-01T05:24:41.671Z,6.62736,8.045374,54.31877,-130.32105,11.0
1,scalar_1215311,2024-05-01T05:24:42.671Z,6.632853,8.052091,54.31877,-130.32105,11.0
2,scalar_1215311,2024-05-01T05:24:43.671Z,6.632757,8.052091,54.31877,-130.32105,11.0
3,scalar_1215311,2024-05-01T05:24:44.671Z,6.63262,8.052091,54.31877,-130.32105,11.0
4,scalar_1215311,2024-05-01T05:24:45.671Z,6.630587,8.049852,54.31877,-130.32105,11.0


In [7]:
df1['time'] = pd.to_datetime(df1['time'])
df2['time'] = pd.to_datetime(df2['time'])

In [8]:
#Converting the Temperature from Kelvin to Celcius
df1['Temperature'] = df1['Temperature'] - 273.15

In [9]:
df1['Temperature'].head(5)

0    9.050
1    9.060
2    9.083
3    9.096
4    9.088
Name: Temperature, dtype: float64

In [10]:
merged = pd.merge_asof(df1.sort_values('time'), df2.sort_values('time'), on='time', direction='nearest')

In [11]:
merged.columns

Index(['timeseries_id_x', 'time', 'Sound_Speed', 'turbidity', 'Temperature',
       'density', 'cond', 'salinity', 'pressure', 'latitude_x', 'longitude_x',
       'depth_x', 'timeseries_id_y', 'oxygen_corrected', 'oxygen_uncorrected',
       'latitude_y', 'longitude_y', 'depth_y'],
      dtype='object')

In [12]:
features = merged[['Temperature', 'salinity', 'oxygen_corrected']].dropna()

In [13]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [14]:
model = IsolationForest(contamination=0.05, random_state=42)
merged.loc[features.index, 'Alert'] = model.fit_predict(scaled_features)
merged['Alert'] = merged['Alert'].map({1: 'Normal', -1: 'Anomaly Detected'})

In [15]:
output = merged[['time', 'Temperature', 'salinity', 'oxygen_corrected', 'Alert']]

In [16]:
output.head(5)

Unnamed: 0,time,Temperature,salinity,oxygen_corrected,Alert
0,2024-05-01 00:39:03.643000+00:00,9.05,29.858658,6.62736,Normal
1,2024-05-01 00:39:04.641000+00:00,9.06,29.854378,6.62736,Normal
2,2024-05-01 00:39:05.642000+00:00,9.083,29.837168,6.62736,Normal
3,2024-05-01 00:39:06.641000+00:00,9.096,29.830385,6.62736,Normal
4,2024-05-01 00:39:07.641000+00:00,9.088,29.839124,6.62736,Normal


In [17]:
output[output['Alert']=='Anomaly Detected'].head(20)

Unnamed: 0,time,Temperature,salinity,oxygen_corrected,Alert
60797,2024-05-02 05:41:59.461000+00:00,9.392,29.422826,6.609366,Anomaly Detected
60798,2024-05-02 05:42:00.461000+00:00,9.418,29.400468,6.610573,Anomaly Detected
60799,2024-05-02 05:42:01.460000+00:00,9.419,29.39661,6.610743,Anomaly Detected
60800,2024-05-02 05:42:02.460000+00:00,9.518,29.307505,6.617351,Anomaly Detected
60801,2024-05-02 05:42:03.460000+00:00,9.547,29.286877,6.61851,Anomaly Detected
60802,2024-05-02 05:42:04.460000+00:00,9.543,29.289124,6.618378,Anomaly Detected
60803,2024-05-02 05:42:05.460000+00:00,9.531,29.294874,6.636588,Anomaly Detected
60804,2024-05-02 05:42:06.459000+00:00,9.533,29.292232,6.63672,Anomaly Detected
60805,2024-05-02 05:42:07.459000+00:00,9.526,29.296932,6.636452,Anomaly Detected
60806,2024-05-02 05:42:08.459000+00:00,9.561,29.266373,6.652961,Anomaly Detected


In [18]:
anomaly_scores = model.decision_function(scaled_features)
merged.loc[features.index, 'Anomaly_Score'] = anomaly_scores

In [19]:
min_score = merged['Anomaly_Score'].min()
max_score = merged['Anomaly_Score'].max()
merged['Risk_Score'] = merged['Anomaly_Score'].apply(
    lambda x: round(100 * (1 - (x - min_score) / (max_score - min_score)), 2)
)

In [20]:
def interpret_anomaly(row):
    reasons = []
    if row['Temperature'] > 20:
        reasons.append("High temperature")
    if row['oxygen_corrected'] < 5:
        reasons.append("Low dissolved oxygen")
    if row['salinity'] < 28 or row['salinity'] > 35:
        reasons.append("Salinity out of range")
    return "; ".join(reasons) if reasons else "Unusual pattern detected (no single variable outside safe range)"

In [21]:
def generate_detailed_alert(row):
    if row['Alert'] == 'Anomaly Detected':
        return f"Risk Score: {row['Risk_Score']} | {interpret_anomaly(row)}"
    else:
        return "Normal"

In [22]:
merged['Detailed_Alert'] = merged.apply(generate_detailed_alert, axis=1)

In [23]:
enhanced_sample = merged[['time', 'Temperature', 'salinity', 'oxygen_corrected', 'Anomaly_Score', 'Detailed_Alert']].head(10)


In [24]:
merged[['Alert','Detailed_Alert']][merged['Alert']=='Anomaly Detected'].head(30)

Unnamed: 0,Alert,Detailed_Alert
60797,Anomaly Detected,Risk Score: 49.45 | Unusual pattern detected (...
60798,Anomaly Detected,Risk Score: 49.67 | Unusual pattern detected (...
60799,Anomaly Detected,Risk Score: 49.9 | Unusual pattern detected (n...
60800,Anomaly Detected,Risk Score: 56.87 | Unusual pattern detected (...
60801,Anomaly Detected,Risk Score: 61.3 | Unusual pattern detected (n...
60802,Anomaly Detected,Risk Score: 60.14 | Unusual pattern detected (...
60803,Anomaly Detected,Risk Score: 58.22 | Unusual pattern detected (...
60804,Anomaly Detected,Risk Score: 58.6 | Unusual pattern detected (n...
60805,Anomaly Detected,Risk Score: 57.5 | Unusual pattern detected (n...
60806,Anomaly Detected,Risk Score: 63.26 | Unusual pattern detected (...


In [25]:
merged[merged['Alert']=='Anomaly Detected'].head(1)

Unnamed: 0,timeseries_id_x,time,Sound_Speed,turbidity,Temperature,density,cond,salinity,pressure,latitude_x,...,timeseries_id_y,oxygen_corrected,oxygen_uncorrected,latitude_y,longitude_y,depth_y,Alert,Anomaly_Score,Risk_Score,Detailed_Alert
60797,scalar_1215309,2024-05-02 05:41:59.461000+00:00,1481.019566,0.8,9.392,0.001023,3.2089,29.422826,12.33,54.31877,...,scalar_1215311,6.609366,7.975959,54.31877,-130.32105,11.0,Anomaly Detected,-0.002483,49.45,Risk Score: 49.45 | Unusual pattern detected (...


In [26]:
import joblib

joblib.dump(model, "aquahealth_anomalydetection_model.pkl")
joblib.dump(scaler, "aquahealth_feature_scaler.pkl")

['aquahealth_feature_scaler.pkl']