# This k-Nearest Neighbors model predicts the type of fault using gas ratios:
## Class 0: Partial Discharge
## Class 1: Low Energy Discharge
## Class 2: High Energy Discharge
## Class 3: Thermal Fault - t < 300C
## Class 4: Thermal Fault - 300C < t < 700C
## Class 5: Thermal Fault - t > 700C
## Class 6: No Fault / Insufficient Data

### The data was preprocessed in Excel, where all class labels were also added, for convenience.
### The required dataset format is documented in the repo.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the four CSV files: training features (ratios), training labels
# (classes), test features (test) and test labels (test_classes).
# The ratios are calculated from the raw data beforehand and stored in the CSVs.
# NOTE: every path below is an empty placeholder; point each one at the
# matching CSV file before running this cell.
ratios = pd.read_csv('')
classes = pd.read_csv('')
test = pd.read_csv('')
test_classes = pd.read_csv('')

# Discard the final row of every table (presumably a trailing non-data row
# left over from the spreadsheet export -- TODO confirm).
# The explicit .copy() turns each trimmed slice into an independent
# DataFrame, so the column assignments performed later in this cell do not
# raise pandas' SettingWithCopyWarning.
ratios = ratios.iloc[:-1, :].copy()
classes = classes.iloc[:-1, :].copy()
test = test.iloc[:-1, :].copy()
test_classes = test_classes.iloc[:-1, :].copy()

def _coerce_numeric(frame, columns):
    """Return a copy of ``frame`` in which ``columns`` are numeric.

    Values that cannot be parsed become NaN (``errors='coerce'``) so they can
    be filtered out afterwards.  ``assign`` builds a new DataFrame instead of
    writing into ``frame``, which avoids chained-assignment warnings when
    ``frame`` is a slice of another DataFrame.
    """
    return frame.assign(
        **{name: pd.to_numeric(frame[name], errors='coerce') for name in columns}
    )


def _drop_incomplete_pairs(features, labels):
    """Remove every row that contains a NaN in ``features`` OR in ``labels``.

    Rows are matched by index label, so the two returned frames always hold
    exactly the same samples in the same order.  Dropping NaNs from each
    frame independently would let features and labels fall out of step as
    soon as only one of them has a bad value in some row.

    Assumes row i of the feature file and row i of the label file describe
    the same sample (both carry the default index from read_csv) -- verify
    against the dataset format.
    """
    complete = features.dropna().index.intersection(labels.dropna().index)
    return features.loc[complete], labels.loc[complete]


ratio_columns = ['ratio1', 'ratio2', 'ratio3']

# Force feature and label columns to numeric dtype (unparseable -> NaN).
ratios = _coerce_numeric(ratios, ratio_columns)
classes = _coerce_numeric(classes, ['Class'])
test = _coerce_numeric(test, ratio_columns)
test_classes = _coerce_numeric(test_classes, ['Class'])

# Drop incomplete samples from features and labels together, per data set.
ratios, classes = _drop_incomplete_pairs(ratios, classes)
test, test_classes = _drop_incomplete_pairs(test, test_classes)

# Sanity checks: each expression is True for any column that still holds a
# NaN.  In a notebook the values are only displayed when an expression is the
# last statement of the cell, so here they are evaluated and discarded.
ratios.isnull().any()
classes.isnull().any()
test.isnull().any()
test_classes.isnull().any()

# How the three ratio columns are derived from the raw gas readings:
#data['ratio1'] = data['c2h2']/data['c2h4']
#data['ratio2'] = data['ch4']/data['h2']
#data['ratio3'] = data['c2h4']/data['c2h6']

# Turn every DataFrame into a plain NumPy array for scikit-learn:
# training features / labels first, then test features / labels.
array_ratios, array_classes = ratios.to_numpy(), classes.to_numpy()
array_test, array_test_classes = test.to_numpy(), test_classes.to_numpy()

In [None]:
# k-NN model training and prediction.
# The label arrays are produced by DataFrame.to_numpy() and are therefore
# 2-D column vectors of shape (n, 1).  scikit-learn expects 1-D targets and
# emits a DataConversionWarning otherwise, so flatten them here.
# (Assumes the label tables contain the single 'Class' column -- verify.)
X_train, y_train = array_ratios, array_classes.ravel()
X_test, y_test = array_test, array_test_classes.ravel()

# Classify each test sample by majority vote among its 7 nearest training
# samples.
knn = neighbors.KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Trial with Random Forest classifier:
#clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=180)
#clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)

# Show the predicted class of every test sample as the cell output.
y_pred

In [None]:
from sklearn import metrics

# Fraction of test samples whose predicted class equals the labelled class.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

In [None]:
# Spot check: classify one hand-written sample of three ratio values
# (presumably ordered ratio1, ratio2, ratio3 -- verify against the CSV
# column order).
sample_low_ratios = [[0.05, 0.1, 0.1]]
print(knn.predict(sample_low_ratios))

In [None]:
# Spot check: classify a second hand-written sample of three ratio values
# (presumably ordered ratio1, ratio2, ratio3 -- verify against the CSV
# column order).
sample_high_ratios = [[0.1, 2, 5]]
print(knn.predict(sample_high_ratios))

In [None]:
# Raw confusion-matrix counts for the test set: rows are the actual classes,
# columns the predicted classes.  Left as the last expression so the notebook
# displays it.
raw_counts = confusion_matrix(y_test, y_pred)
raw_counts

In [None]:
# Draw the row-normalised confusion matrix of the k-NN predictions as a heatmap.

# Display names for class ids 0..6, in order.
class_names = ['Class0', 'Class1', 'Class2', 'Class3', 'Class4', 'Class5', 'Class6']

# Request all seven class ids explicitly so the matrix is always 7x7 and its
# rows/columns line up with class_names, even when some class never occurs in
# the test labels or predictions.
matrix = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))

# Convert counts to per-row fractions (share of each actual class assigned to
# each predicted class).  Rows without any sample are left at 0 instead of
# producing NaN from a division by zero.
row_totals = matrix.sum(axis=1, keepdims=True)
matrix = np.divide(matrix, row_totals,
                   out=np.zeros(matrix.shape, dtype=float),
                   where=row_totals != 0)

# Build the plot
plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)
# Passing the tick labels to heatmap() places them at the centre of each cell
# on both axes.
sns.heatmap(matrix, annot=True, annot_kws={'size': 10},
            cmap=plt.cm.Reds, linewidths=0.2,
            xticklabels=class_names, yticklabels=class_names)

# Add labels to the plot
plt.xticks(rotation=25)
plt.yticks(rotation=0)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for k-NN Model')
plt.show()

In [None]:
# Per-class precision, recall and F1 summary of the test-set predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)