# Illegal Tap Water Detection Using Pipeline Sensor Data
The goal of this notebook is to build a machine learning model that detects water theft through illegal pipeline taps.

## Overview of Dataset
This section gives a brief overview of the dataset, including its columns and data types.

In [1]:
import pandas as pd

# Read the raw sensor log into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer="pipeline_sensor_data.csv")

# Preview the first five rows (five is also the default for `head`).
df.head(n=5)

Unnamed: 0,Time,SensorID,pressure,flow_rate,leak_intensity,illegal_tap_flag,event_type
0,1:00 PM,S001,2.82,123.91,0.0,0,normal
1,1:10 PM,S001,2.71,113.4,0.0,0,normal
2,1:20 PM,S001,2.84,121.48,0.0,0,normal
3,1:30 PM,S001,2.82,121.73,0.0,0,normal
4,1:40 PM,S001,2.75,121.16,0.0,0,normal


## Handling Categorical Variables: Columns like `SensorID`, `Time` and `event_type` need to be encoded numerically
This cell loads the sensor dataset, identifies the categorical columns (`SensorID`, `Time`, and `event_type`), and encodes them into numeric values using pandas’ built-in category codes. The encoded data is then saved as a new CSV file (`encoded_pipeline_sensor_data.csv`) for further processing or machine learning. The resulting `event_type` codes are:

* normal - 5
* leak_low - 3
* blockage - 0
* leak_high - 2
* irregular_flow - 1
* negative_pressure - 4

In [3]:
# Reload the raw sensor log so the encoding starts from the original columns.
df = pd.read_csv('pipeline_sensor_data.csv')
print(df.dtypes)

# SensorID and event_type are nominal labels: pandas assigns category codes in
# sorted label order, which is all that is needed for them.
df['SensorID_encoded'] = df['SensorID'].astype('category').cat.codes
print(df[['SensorID', 'SensorID_encoded']].head())

df['event_type_encoded'] = df['event_type'].astype('category').cat.codes
print(df[['event_type', 'event_type_encoded']].head())

# Time_encoded is used later as the within-sensor sort key, so its codes must
# follow clock order. Plain category codes sort the *strings*, which puts
# "10:00 AM" before "1:00 PM" and "2:00 PM" after "12:00 PM". Parse the labels
# as 12-hour clock times and number them chronologically instead.
# NOTE(review): the column carries no date, so readings that span midnight
# cannot be ordered across days from this column alone - confirm the data
# covers a single day.
unique_times = pd.Series(df['Time'].dropna().unique())
parsed_times = pd.to_datetime(unique_times, format='%I:%M %p', errors='coerce')

if parsed_times.notna().all():
    chronological = parsed_times.sort_values(kind='stable').index
    time_categories = unique_times.loc[chronological].tolist()
    df['Time_encoded'] = pd.Categorical(
        df['Time'], categories=time_categories, ordered=True
    ).codes
else:
    # At least one label is not in "H:MM AM/PM" form; keep the original
    # sorted-label encoding rather than failing the whole cell.
    df['Time_encoded'] = df['Time'].astype('category').cat.codes
print(df[['Time', 'Time_encoded']].head())

# Persist the frame (original columns plus the three *_encoded columns).
df.to_csv('encoded_pipeline_sensor_data.csv', index=False)
df.head()

Time                 object
SensorID             object
pressure            float64
flow_rate           float64
leak_intensity      float64
illegal_tap_flag      int64
event_type           object
dtype: object
  SensorID  SensorID_encoded
0     S001                 0
1     S001                 0
2     S001                 0
3     S001                 0
4     S001                 0
  event_type  event_type_encoded
0     normal                   5
1     normal                   5
2     normal                   5
3     normal                   5
4     normal                   5
      Time  Time_encoded
0  1:00 PM             0
1  1:10 PM             1
2  1:20 PM             2
3  1:30 PM             3
4  1:40 PM             4


Unnamed: 0,Time,SensorID,pressure,flow_rate,leak_intensity,illegal_tap_flag,event_type,SensorID_encoded,event_type_encoded,Time_encoded
0,1:00 PM,S001,2.82,123.91,0.0,0,normal,0,5,0
1,1:10 PM,S001,2.71,113.4,0.0,0,normal,0,5,1
2,1:20 PM,S001,2.84,121.48,0.0,0,normal,0,5,2
3,1:30 PM,S001,2.82,121.73,0.0,0,normal,0,5,3
4,1:40 PM,S001,2.75,121.16,0.0,0,normal,0,5,4


## Splitting Dataset by Sensor Segments for Train and Test
The dataset is split into training and testing sets by sensor segment (unique `SensorID`). We randomly choose 80% of the sensors for training and 20% for testing, so that the model is tested on sensors it has not seen before.

In [58]:
from sklearn.model_selection import train_test_split

# Start from the encoded file written by the encoding step.
df = pd.read_csv('encoded_pipeline_sensor_data.csv')

# One entry per sensor: whole sensors, not individual rows, are assigned to a side.
unique_segments = df['SensorID'].unique()

# Plain random 80/20 split of the sensor ids (no stratification).
# NOTE(review): the recorded run shows event_type_encoded 0 missing from the
# training side; with whole sensors assigned at random, a rare class can land
# entirely in one split.
train_segments, test_segments = train_test_split(
    unique_segments, test_size=0.2, random_state=42
)

# Keep every row of a sensor on the side its id was assigned to.
in_train = df['SensorID'].isin(train_segments)
in_test = df['SensorID'].isin(test_segments)
train_df = df.loc[in_train]
test_df = df.loc[in_test]

# Report sizes, then the normalized class mix of each side.
print(f'Train segments: {len(train_segments)}, data points: {len(train_df)}')
print(f'Test segments: {len(test_segments)}, data points: {len(test_df)}')

for title, part in (
    ('Train class distribution:', train_df),
    ('Test class distribution:', test_df),
):
    print(title)
    print(part['event_type_encoded'].value_counts(normalize=True).sort_index())

# Persist both sides for the later cells that reload them from disk.
train_df.to_csv('train_split.csv', index=False)
test_df.to_csv('test_split.csv', index=False)

Train segments: 160, data points: 8000
Test segments: 40, data points: 2000
Train class distribution:
event_type_encoded
1    0.012125
2    0.017250
3    0.008250
4    0.006125
5    0.956250
Name: proportion, dtype: float64
Test class distribution:
event_type_encoded
0    0.1420
1    0.1820
2    0.1095
3    0.0995
4    0.0230
5    0.4440
Name: proportion, dtype: float64


# Past-based features
We make new features from the past readings of each sensor. Instead of looking at just the current pressure and flow rate, we also add information about how these values changed over time.

In [12]:
import numpy as np
import pandas as pd

# Column identifying a sensor; used as the group key and primary sort key.
ID_COL = 'SensorID_encoded'
# Column ordering readings within a sensor; used as the secondary sort key.
TIME_COL = 'Time_encoded'
# Signals for which lag, delta, rolling-stat and rolling-slope features are built.
SIG_COLS = ['pressure', 'flow_rate']
# Row offsets passed to shift() for the lag features.
LAGS = [1, 3, 5, 10]
# Window lengths (in rows) for the rolling statistics and rolling slopes.
ROLL_WINDOWS = [5, 10, 20]
# Small constant added to denominators so ratios never divide by zero.
EPS = 1e-6

def rolling_slope_past_only(values: np.ndarray, window: int) -> np.ndarray:
    """Least-squares slope of every trailing window of ``values``.

    For each position ``i >= window - 1`` the slope of a straight-line fit of
    ``values[i - window + 1 : i + 1]`` against the steps ``0 .. window - 1`` is
    stored at ``i``.

    Parameters
    ----------
    values : np.ndarray
        1-D array of readings; may contain NaN.
    window : int
        Number of consecutive readings per fit.

    Returns
    -------
    np.ndarray
        Array of the same length as ``values``. Entries stay NaN where fewer
        than ``window`` readings are available, where the window contains a
        NaN, or everywhere when the steps have zero variance (``window <= 1``).
    """
    n_points = len(values)
    slopes = np.full(n_points, np.nan)

    # The step axis is identical for every window, so centre it once.
    steps = np.arange(window, dtype=float)
    centered_steps = steps - steps.mean()
    denom = (centered_steps ** 2).sum()
    if denom == 0:
        return slopes

    # `end` is the exclusive end of the window; the result is stored at end - 1.
    for end in range(window, n_points + 1):
        segment = values[end - window:end]
        if not np.isnan(segment).any():
            numer = (centered_steps * (segment - segment.mean())).sum()
            slopes[end - 1] = numer / denom
    return slopes

def add_past_only_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add lag, delta, rolling-stat and rolling-slope features per sensor.

    The frame is sorted by ``ID_COL`` then ``TIME_COL`` and copied; the input
    is not modified. All rolling features are computed on the signal shifted
    by one row within each sensor, so they only use readings that precede the
    current row.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``ID_COL``, ``TIME_COL`` and every column in ``SIG_COLS``.

    Returns
    -------
    pd.DataFrame
        Sorted copy of ``df`` (original index labels preserved) with, for each
        signal ``s``: ``s_lag{k}`` for k in ``LAGS``; ``s_delta1`` and
        ``s_pct1``; ``s_roll{w}_{mean,std,min,max,med,slope}`` for w in
        ``ROLL_WINDOWS``; plus ``ratio_flow_pressure``. Rows without enough
        history hold NaN in the affected columns.
    """
    df = df.sort_values([ID_COL, TIME_COL]).copy()
    g = df.groupby(ID_COL, group_keys=False)

    # Lags
    for s in SIG_COLS:
        for L in LAGS:
            df[f'{s}_lag{L}'] = g[s].shift(L)

    # Delta and percent change from lag1
    for s in SIG_COLS:
        df[f'{s}_delta1'] = df[s] - df[f'{s}_lag1']
        df[f'{s}_pct1'] = df[f'{s}_delta1'] / (np.abs(df[f'{s}_lag1']) + EPS)

    # Rolling stats (past-only): column suffix -> rolling method name
    stat_methods = {'mean': 'mean', 'std': 'std', 'min': 'min', 'max': 'max', 'med': 'median'}
    for s in SIG_COLS:
        s_past = g[s].shift(1)
        for w in ROLL_WINDOWS:
            roll = s_past.groupby(df[ID_COL]).rolling(window=w, min_periods=w)
            for suffix, method in stat_methods.items():
                # Drop the group level so the result aligns with df's index labels.
                df[f'{s}_roll{w}_{suffix}'] = getattr(roll, method)().reset_index(level=0, drop=True)

    # Rolling slopes (past-only).
    # GroupBy.indices holds *positional* row numbers, not index labels. After
    # sort_values the labels are no longer 0..n-1 in row order, so results are
    # written into a positional array rather than through label-based `.loc`.
    group_positions = [np.asarray(pos) for pos in g.indices.values()]
    for s in SIG_COLS:
        arr = g[s].shift(1).to_numpy()
        for w in ROLL_WINDOWS:
            slopes = np.full(len(df), np.nan)
            for pos in group_positions:
                slopes[pos] = rolling_slope_past_only(arr[pos], w)
            df[f'{s}_roll{w}_slope'] = slopes

    # Simple interaction
    df['ratio_flow_pressure'] = df['flow_rate_lag1'] / (np.abs(df['pressure_lag1']) + EPS)

    return df

# Build features on the entire df first
df_feat = add_past_only_features(df)

# Drop rows without enough history (any NaN left by the lag/rolling features)
df_feat = df_feat.dropna().reset_index(drop=True)

# Time-aware split: within each sensor, the earliest 75% of the remaining rows
# go to train and the rest to test.
df_feat_sorted = df_feat.sort_values([ID_COL, TIME_COL]).reset_index(drop=True)

# Build the mask from each row's rank inside its sensor. This stays aligned
# with df_feat_sorted by index, also works when there is only one sensor, and
# avoids groupby.apply on a frame that still contains the grouping column.
per_sensor = df_feat_sorted.groupby(ID_COL)
row_rank = per_sensor.cumcount()
n_train_rows = (per_sensor[ID_COL].transform('size') * 0.75).astype(int)
mask_feat = row_rank < n_train_rows

# Define features to use in X
# NOTE(review): 'leak_intensity' and 'illegal_tap_flag' are used as inputs
# here - confirm they are available at prediction time and do not leak the label.
FEATURE_COLS = (
    ['pressure','flow_rate','leak_intensity','illegal_tap_flag','SensorID_encoded','ratio_flow_pressure'] +
    [f'pressure_lag{k}' for k in LAGS] + [f'flow_rate_lag{k}' for k in LAGS] +
    ['pressure_delta1','pressure_pct1','flow_rate_delta1','flow_rate_pct1'] +
    ['pressure_roll5_mean','pressure_roll10_std','pressure_roll20_min',
     'flow_rate_roll5_mean','flow_rate_roll10_std','flow_rate_roll20_min',
     'pressure_roll5_slope','pressure_roll10_slope','pressure_roll20_slope',
     'flow_rate_roll5_slope','flow_rate_roll10_slope','flow_rate_roll20_slope']
)

X_all = df_feat_sorted[FEATURE_COLS].copy()
y_all = df_feat_sorted['event_type_encoded'].copy()

X_train, X_test = X_all[mask_feat.values], X_all[~mask_feat.values]
y_train, y_test = y_all[mask_feat.values], y_all[~mask_feat.values]

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (4400, 30) Test: (1600, 30)


## Training the model
We train an XGBoost model to predict the type of pipeline event (for example, normal flow, leakage, or illegal tap) using the sensor data. We read the pre-split training and testing datasets and then use key sensor readings (`pressure`, `flow_rate`, `leak_intensity`, etc.) to predict the column `event_type_encoded`. We also handle class imbalance by giving rare classes more importance during training. The model is evaluated with a classification report (precision, recall, F1-score) and a confusion matrix, which shows how often each class was confused with others. Finally, we map the numeric predictions back to their original event type names to make the results easier to understand.

This gives us a clear view of how well the model can detect different types of pipeline events from the sensor data.

In [2]:
import pandas as pd
# Load the sensor-level training split from disk.
train = pd.read_csv(filepath_or_buffer='train_split.csv')

# Count rows per event label to see how imbalanced the training data is.
event_counts = train['event_type'].value_counts()
print(event_counts)

event_type
normal               7650
leak_high             138
irregular_flow         97
leak_low               66
negative_pressure      49
Name: count, dtype: int64


In [4]:
# Reload both sides of the sensor-level split.
train = pd.read_csv('train_split.csv')
test = pd.read_csv('test_split.csv')

# Print the per-label row counts of each side under its own tag.
for tag, split in (("TRAIN:", train), ("TEST:", test)):
    print(tag, split['event_type'].value_counts())

TRAIN: event_type
normal               7650
leak_high             138
irregular_flow         97
leak_low               66
negative_pressure      49
Name: count, dtype: int64
TEST: event_type
normal               888
irregular_flow       364
blockage             284
leak_high            219
leak_low             199
negative_pressure     46
Name: count, dtype: int64


In [7]:
# Keep only the test rows labelled 'blockage' (uses `test` from the previous cell).
is_blockage = test['event_type'] == 'blockage'
test_blockage = test.loc[is_blockage]
print(test_blockage)


         Time SensorID  pressure  flow_rate  leak_intensity  illegal_tap_flag  \
129   5:50 PM     S017      7.25       5.07             0.0                 1   
130   6:00 PM     S017      7.32       5.22             0.0                 1   
131   6:10 PM     S017      7.24       6.43             0.0                 1   
132   6:20 PM     S017      7.25       5.04             0.0                 1   
133   6:30 PM     S017      7.18       6.87             0.0                 1   
...       ...      ...       ...        ...             ...               ...   
1795  8:30 PM     S171      7.36       5.17             0.0                 1   
1796  8:40 PM     S171      6.68       6.39             0.0                 1   
1797  8:50 PM     S171      6.98       6.21             0.0                 1   
1798  9:00 PM     S171      6.66       4.83             0.0                 1   
1799  9:10 PM     S171      7.07       5.39             0.0                 1   

     event_type  SensorID_e

In [10]:
import pandas as pd

# Reload the sensor-level split from disk.
train = pd.read_csv('train_split.csv')
test_split = pd.read_csv('test_split.csv')

# Pick 70 'blockage' rows from the test side (fixed seed) to move into train.
# NOTE(review): these rows come from sensors that otherwise stay in the test
# split, so train and test no longer contain disjoint sensors.
blockage_rows = test_split['event_type'] == 'blockage'
test_blockage = test_split.loc[blockage_rows]
blockage_sample = test_blockage.sample(n=70, random_state=42)

# Take the sampled rows out of test so no row appears on both sides.
test_split = test_split.drop(index=blockage_sample.index)

# Append them to train, then shuffle so the new rows are mixed in.
train = (
    pd.concat([train, blockage_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Save: train goes to a new file, test overwrites the file it was read from.
# NOTE(review): because 'test_split.csv' is overwritten in place, re-running
# this cell removes a further 70 blockage rows from test each time.
train.to_csv('train.csv', index=False)
test_split.to_csv('test_split.csv', index=False)

In [11]:
import pandas as pd

# Read back the rebalanced training file.
train = pd.read_csv(filepath_or_buffer='train.csv')

# Per-label row counts in the rebalanced training set.
event_counts = train['event_type'].value_counts()
print(event_counts)

event_type
normal               7650
leak_high             138
irregular_flow         97
blockage               70
leak_low               66
negative_pressure      49
Name: count, dtype: int64


In [19]:
import pandas as pd

# Load the current test split and shuffle its rows with a fixed seed.
# NOTE(review): the training cell that follows reloads `test_df` from
# 'test_split.csv', so this shuffled frame is not the one that gets evaluated.
test_df = (
    pd.read_csv('test_split.csv')
    .sample(frac=1, random_state=567)
    .reset_index(drop=True)
)

In [20]:
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix

# 1. LOAD UPDATED TRAIN/TEST SPLIT FROM CSV
train_df = pd.read_csv('train.csv')           # rebalanced training file
test_df = pd.read_csv('test_split.csv')       # current test split

# 2. DEFINE FEATURES & TARGET
# NOTE(review): 'illegal_tap_flag' and 'leak_intensity' are model inputs here -
# confirm they are available at prediction time and do not leak the label.
feature_cols = ['pressure', 'flow_rate', 'leak_intensity', 'illegal_tap_flag', 'SensorID_encoded', 'Time_encoded']
target_col = 'event_type_encoded'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# 3. FIX LABEL ENCODING SO XGBOOST DOESN'T ERROR
# XGBoost needs contiguous class ids 0..num_class-1; fit on the union of both
# splits so a class present on only one side still gets an id.
le = LabelEncoder()
le.fit(list(y_train) + list(y_test))
y_train_clean = le.transform(y_train)
y_test_clean = le.transform(y_test)

# 4. COMPUTE SAMPLE WEIGHTS FOR CLASS IMBALANCE
# 'balanced' class weights, expanded to one weight per training row.
classes = np.unique(y_train_clean)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train_clean)
weight_dict = dict(zip(classes, class_weights))
weights = pd.Series(y_train_clean).map(weight_dict)

# 5. CREATE DMATRIX FOR XGBOOST
dtrain = xgb.DMatrix(X_train, label=y_train_clean, weight=weights)
dtest = xgb.DMatrix(X_test, label=y_test_clean)

# 6. TRAIN XGBOOST MULTI-CLASS MODEL
params = {
    'objective': 'multi:softprob',
    'num_class': len(le.classes_),
    'eval_metric': 'mlogloss',
    'seed': 42
}
bst = xgb.train(params, dtrain, num_boost_round=100)

# 7. EVALUATE
# multi:softprob returns one probability per class; take the most likely class.
y_pred_prob = bst.predict(dtest)
y_pred = y_pred_prob.argmax(axis=1)
print(classification_report(y_test_clean, y_pred))
print(confusion_matrix(y_test_clean, y_pred))

# Decode LabelEncoder ids back to the original event_type_encoded values
# before looking up names: `mapping` below is keyed by the original codes, and
# those equal the encoder ids only when every code 0..k-1 is present.
predicted_labels = le.inverse_transform(y_pred)
actual_labels = le.inverse_transform(y_test_clean)

# Create mapping from encoded labels to human-readable event types
all_labels = pd.concat([
    train_df[['event_type_encoded', 'event_type']],
    test_df[['event_type_encoded', 'event_type']]
]).drop_duplicates()

mapping = dict(zip(all_labels['event_type_encoded'], all_labels['event_type']))

actual_event_types = [mapping[x] for x in actual_labels]
predicted_event_types = [mapping[x] for x in predicted_labels]

print("Actual :", actual_event_types[:20])
print("Predicted:", predicted_event_types[:20])

# Save model
bst.save_model("xgboost_pipeline_event_model.json")

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       214
           1       1.00      0.97      0.98       364
           2       1.00      1.00      1.00       219
           3       1.00      1.00      1.00       199
           4       0.96      1.00      0.98        46
           5       1.00      1.00      1.00       888

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930

[[213   1   0   0   0   0]
 [  8 354   0   0   2   0]
 [  0   0 219   0   0   0]
 [  0   0   0 199   0   0]
 [  0   0   0   0  46   0]
 [  0   0   0   0   0 888]]
Actual : ['normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal']
Predicted: ['normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'n