In [None]:
from scipy.io import arff
import urllib.request
import io

# Download the ARFF file and parse it into `data` (records) and `meta`.
url = "http://www.ece.uah.edu/~thm0009/icsdatasets/water_final.arff"
# The context manager closes the HTTP response even if reading or decoding
# fails, instead of leaving the connection open for the kernel's lifetime.
with urllib.request.urlopen(url) as ftpstream:
    arff_text = ftpstream.read().decode('utf-8')
# loadarff expects a text file-like object, hence the StringIO wrapper.
data, meta = arff.loadarff(io.StringIO(arff_text))

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC

import time

In [None]:
# Wrap the parsed ARFF records in a DataFrame.
df = pd.DataFrame(data)
# The 'result' values are UTF-8 bytes; decode each one and cast it to int so
# the label column is numeric.
df['result'] = [int(raw_label.decode('utf-8')) for raw_label in df['result']]
df

Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,comm_write_fun,resp_read_fun,resp_write_fun,...,'H','L',LL,control_mode,control_scheme,pump,crc_rate,measurement,time,result
0,7.0,7.0,183.0,233.0,9.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,1.0,14.894827,1.13,0
1,7.0,7.0,183.0,233.0,9.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,1.0,16.259237,1.25,0
2,7.0,7.0,183.0,233.0,9.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,1.0,17.254122,1.04,0
3,7.0,7.0,183.0,233.0,9.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,1.0,17.936329,1.07,0
4,7.0,7.0,183.0,233.0,9.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,1.0,18.703810,1.20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236174,7.0,7.0,183.0,233.0,10.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,0.0,89.198410,1.30,0
236175,7.0,7.0,183.0,233.0,10.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,0.0,89.567932,1.20,0
236176,7.0,7.0,183.0,233.0,10.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,0.0,88.317230,1.05,0
236177,7.0,7.0,183.0,233.0,10.0,10.0,3.0,10.0,3.0,10.0,...,80.0,20.0,10.0,2.0,1.0,0.0,0.0,88.374077,1.07,0


In [None]:
# Tally how many rows carry each 'result' label (most frequent first)
# and print the tally.
class_counts = df.loc[:, 'result'].value_counts()
print(class_counts)

0    172415
7     34002
2     12460
1      9187
4      3725
3      1833
5      1320
6      1237
Name: result, dtype: int64


In [None]:
# Target is the 'result' column; every other column is a feature.
y = df['result']
X = df.drop('result', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# XGBoost: time construction + fit only, then score on the held-out split.
start_time = time.perf_counter()
model1 = xgb.XGBClassifier(objective='multi:softmax', num_class=8, random_state=42)
model1.fit(X_train, y_train)
end_time = time.perf_counter()
computation_time = end_time - start_time

y_pred = model1.predict(X_test)

# Precision, recall and F1 are support-weighted averages over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precison: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1_score: {f1:.4f}")
print("Computation time:", computation_time, "seconds")


Accuracy: 0.9668
Precison: 0.9746
Recall: 0.9668
F1_score: 0.9693
Computation time: 24.77116343200001 seconds


In [None]:
# Support-weighted one-vs-rest false positive rate for the current y_pred.
true_labels = np.asarray(y_test)
pred_labels = np.asarray(y_pred)

# Iterate over the label values actually present in y_test rather than
# range(n_classes), so labels that are not exactly 0..n-1 are still matched.
class_labels = np.unique(true_labels)
n_classes = len(class_labels)
overall_FPR = 0

for class_of_interest in class_labels:
  # One-vs-rest view of the chosen class (vectorised, no per-row Python loop)
  binary_true = (true_labels == class_of_interest).astype(int)
  binary_pred = (pred_labels == class_of_interest).astype(int)

  # labels=[0, 1] pins the matrix to 2x2 even if one side is constant,
  # so the four-way unpack below cannot fail.
  conf_matrix = confusion_matrix(binary_true, binary_pred, labels=[0, 1])
  TN, FP, FN, TP = conf_matrix.ravel()

  # FPR = FP / (FP + TN); treated as 0 when the class has no negatives
  negatives = FP + TN
  class_FPR = FP / negatives if negatives else 0.0

  # Weight the FPR by the class's share of the test set
  class_weight = binary_true.sum() / binary_true.size
  overall_FPR += class_weight * class_FPR

print(f"Overall False Positive Rate: {overall_FPR}")

Overall False Positive Rate: 0.015195529412627895


In [None]:
# Decision Tree: time construction + fit only, then score on the held-out split.
start_time = time.perf_counter()
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(X_train, y_train)
end_time = time.perf_counter()
computation_time = end_time - start_time

y_pred = model2.predict(X_test)

# Precision, recall and F1 are support-weighted averages over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precison: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1_score: {f1:.4f}")
print("Computation time:", computation_time, "seconds")


Accuracy: 0.9887
Precison: 0.9892
Recall: 0.9887
F1_score: 0.9889
Computation time: 2.103538350000008 seconds


In [None]:
# Support-weighted one-vs-rest false positive rate for the current y_pred.
true_labels = np.asarray(y_test)
pred_labels = np.asarray(y_pred)

# Iterate over the label values actually present in y_test rather than
# range(n_classes), so labels that are not exactly 0..n-1 are still matched.
class_labels = np.unique(true_labels)
n_classes = len(class_labels)
overall_FPR = 0

for class_of_interest in class_labels:
  # One-vs-rest view of the chosen class (vectorised, no per-row Python loop)
  binary_true = (true_labels == class_of_interest).astype(int)
  binary_pred = (pred_labels == class_of_interest).astype(int)

  # labels=[0, 1] pins the matrix to 2x2 even if one side is constant,
  # so the four-way unpack below cannot fail.
  conf_matrix = confusion_matrix(binary_true, binary_pred, labels=[0, 1])
  TN, FP, FN, TP = conf_matrix.ravel()

  # FPR = FP / (FP + TN); treated as 0 when the class has no negatives
  negatives = FP + TN
  class_FPR = FP / negatives if negatives else 0.0

  # Weight the FPR by the class's share of the test set
  class_weight = binary_true.sum() / binary_true.size
  overall_FPR += class_weight * class_FPR

print(f"Overall False Positive Rate: {overall_FPR}")

Overall False Positive Rate: 0.008786970243834763


In [None]:
# Random Forest: time construction + fit only, then score on the held-out split.
start_time = time.perf_counter()
model3 = RandomForestClassifier(random_state=42)
model3.fit(X_train, y_train)
end_time = time.perf_counter()
computation_time = end_time - start_time

y_pred = model3.predict(X_test)

# Precision, recall and F1 are support-weighted averages over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precison: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1_score: {f1:.4f}")
print("Computation time:", computation_time, "seconds")

Accuracy: 0.9859
Precison: 0.9871
Recall: 0.9859
F1_score: 0.9863
Computation time: 28.046680934999983 seconds


In [None]:
# Support-weighted one-vs-rest false positive rate for the current y_pred.
true_labels = np.asarray(y_test)
pred_labels = np.asarray(y_pred)

# Iterate over the label values actually present in y_test rather than
# range(n_classes), so labels that are not exactly 0..n-1 are still matched.
class_labels = np.unique(true_labels)
n_classes = len(class_labels)
overall_FPR = 0

for class_of_interest in class_labels:
  # One-vs-rest view of the chosen class (vectorised, no per-row Python loop)
  binary_true = (true_labels == class_of_interest).astype(int)
  binary_pred = (pred_labels == class_of_interest).astype(int)

  # labels=[0, 1] pins the matrix to 2x2 even if one side is constant,
  # so the four-way unpack below cannot fail.
  conf_matrix = confusion_matrix(binary_true, binary_pred, labels=[0, 1])
  TN, FP, FN, TP = conf_matrix.ravel()

  # FPR = FP / (FP + TN); treated as 0 when the class has no negatives
  negatives = FP + TN
  class_FPR = FP / negatives if negatives else 0.0

  # Weight the FPR by the class's share of the test set
  class_weight = binary_true.sum() / binary_true.size
  overall_FPR += class_weight * class_FPR

print(f"Overall False Positive Rate: {overall_FPR}")

Overall False Positive Rate: 0.00900704261589686


In [None]:
# Bagging Classifier: time construction + fit only, then score on the held-out split.
start_time = time.perf_counter()
model4 = BaggingClassifier(random_state=42)
model4.fit(X_train, y_train)
end_time = time.perf_counter()
computation_time = end_time - start_time

y_pred = model4.predict(X_test)

# Precision, recall and F1 are support-weighted averages over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precison: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1_score: {f1:.4f}")
print("Computation time:", computation_time, "seconds")

Accuracy: 0.9895
Precison: 0.9899
Recall: 0.9895
F1_score: 0.9896
Computation time: 8.095305785000022 seconds


In [None]:
# Support-weighted one-vs-rest false positive rate for the current y_pred.
true_labels = np.asarray(y_test)
pred_labels = np.asarray(y_pred)

# Iterate over the label values actually present in y_test rather than
# range(n_classes), so labels that are not exactly 0..n-1 are still matched.
class_labels = np.unique(true_labels)
n_classes = len(class_labels)
overall_FPR = 0

for class_of_interest in class_labels:
  # One-vs-rest view of the chosen class (vectorised, no per-row Python loop)
  binary_true = (true_labels == class_of_interest).astype(int)
  binary_pred = (pred_labels == class_of_interest).astype(int)

  # labels=[0, 1] pins the matrix to 2x2 even if one side is constant,
  # so the four-way unpack below cannot fail.
  conf_matrix = confusion_matrix(binary_true, binary_pred, labels=[0, 1])
  TN, FP, FN, TP = conf_matrix.ravel()

  # FPR = FP / (FP + TN); treated as 0 when the class has no negatives
  negatives = FP + TN
  class_FPR = FP / negatives if negatives else 0.0

  # Weight the FPR by the class's share of the test set
  class_weight = binary_true.sum() / binary_true.size
  overall_FPR += class_weight * class_FPR

print(f"Overall False Positive Rate: {overall_FPR}")

Overall False Positive Rate: 0.008463601423746773


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic Regression (one-vs-rest).
# Fitting on the raw features ended with ABNORMAL_TERMINATION_IN_LNSRCH and
# scikit-learn's advice to scale the data, so the features are standardised
# first. The scaler lives inside the pipeline, which means it is fitted on the
# training split only and the same transform is re-applied at predict time.
# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before upgrading.
start_time = time.perf_counter()
model5 = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=42, multi_class='ovr'),  # 'ovr' for multiclass
)
model5.fit(X_train, y_train)
end_time = time.perf_counter()

y_pred = model5.predict(X_test)

# Calculate performance metrics (precision/recall/F1 are support-weighted)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results; the reported time covers scaling + fitting only
print(f"Accuracy: {accuracy:.4f}")
print(f"Precison: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1_score: {f1:.4f}")
computation_time = end_time - start_time
print("Computation time:", computation_time, "seconds")

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7348
Precison: 0.5399
Recall: 0.7348
F1_score: 0.6225
Computation time: 7.616477517999982 seconds


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Support-weighted one-vs-rest false positive rate for the current y_pred.
true_labels = np.asarray(y_test)
pred_labels = np.asarray(y_pred)

# Iterate over the label values actually present in y_test rather than
# range(n_classes), so labels that are not exactly 0..n-1 are still matched.
class_labels = np.unique(true_labels)
n_classes = len(class_labels)
overall_FPR = 0

for class_of_interest in class_labels:
  # One-vs-rest view of the chosen class (vectorised, no per-row Python loop)
  binary_true = (true_labels == class_of_interest).astype(int)
  binary_pred = (pred_labels == class_of_interest).astype(int)

  # labels=[0, 1] pins the matrix to 2x2 even if one side is constant,
  # so the four-way unpack below cannot fail.
  conf_matrix = confusion_matrix(binary_true, binary_pred, labels=[0, 1])
  TN, FP, FN, TP = conf_matrix.ravel()

  # FPR = FP / (FP + TN); treated as 0 when the class has no negatives
  negatives = FP + TN
  class_FPR = FP / negatives if negatives else 0.0

  # Weight the FPR by the class's share of the test set
  class_weight = binary_true.sum() / binary_true.size
  overall_FPR += class_weight * class_FPR

print(f"Overall False Positive Rate: {overall_FPR}")

Overall False Positive Rate: 0.7347997290202388


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbours: time construction + fit only, then score on the
# held-out split.
start_time = time.perf_counter()
k = 5  # number of neighbours consulted per prediction; tune as needed
model6 = KNeighborsClassifier(n_neighbors=k)
model6.fit(X_train, y_train)
end_time = time.perf_counter()
computation_time = end_time - start_time

# Predict labels for the test rows
y_pred = model6.predict(X_test)

# Precision, recall and F1 are support-weighted averages over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precison: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1_score: {f1:.4f}")
print("Computation time:", computation_time, "seconds")

Accuracy: 0.9792
Precison: 0.9821
Recall: 0.9792
F1_score: 0.9801
Computation time: 0.03923214899998584 seconds


In [None]:
# Support-weighted one-vs-rest false positive rate for the current y_pred.
true_labels = np.asarray(y_test)
pred_labels = np.asarray(y_pred)

# Iterate over the label values actually present in y_test rather than
# range(n_classes), so labels that are not exactly 0..n-1 are still matched.
class_labels = np.unique(true_labels)
n_classes = len(class_labels)
overall_FPR = 0

for class_of_interest in class_labels:
  # One-vs-rest view of the chosen class (vectorised, no per-row Python loop)
  binary_true = (true_labels == class_of_interest).astype(int)
  binary_pred = (pred_labels == class_of_interest).astype(int)

  # labels=[0, 1] pins the matrix to 2x2 even if one side is constant,
  # so the four-way unpack below cannot fail.
  conf_matrix = confusion_matrix(binary_true, binary_pred, labels=[0, 1])
  TN, FP, FN, TP = conf_matrix.ravel()

  # FPR = FP / (FP + TN); treated as 0 when the class has no negatives
  negatives = FP + TN
  class_FPR = FP / negatives if negatives else 0.0

  # Weight the FPR by the class's share of the test set
  class_weight = binary_true.sum() / binary_true.size
  overall_FPR += class_weight * class_FPR

print(f"Overall False Positive Rate: {overall_FPR}")

Overall False Positive Rate: 0.012079033742679686
