In [71]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

class ELMOutlierDetector:
    def __init__(self, hidden_layer_size=100, activation_func='relu'):
        self.hidden_layer_size = hidden_layer_size
        self.activation_func = activation_func

    def fit(self, X):
        input_size = X.shape[1]
        self.hidden_weights = np.random.randn(input_size, self.hidden_layer_size)
        H = np.maximum(0, np.dot(X, self.hidden_weights))
        self.output_weights = np.linalg.pinv(H)
        print(input_size, self.hidden_weights.shape, H.shape, self.output_weights.shape)

    def predict(self, X):
        H = np.maximum(0, np.dot(X, self.hidden_weights))
        # self.output_weights= np.linalg.pinv(self.output_weights)
        print(X.shape,self.hidden_weights.shape, H.shape, self.output_weights.shape, np.dot(H, self.output_weights).shape) 
        predictions = np.dot(H, self.output_weights)
        print(predictions.shape)
        errors = np.linalg.norm(X - np.dot(H, self.hidden_weights), axis=1)
        return errors




# Example usage
file_path = "statlog_data_set.csv"
dataset = pd.DataFrame(pd.read_csv(file_path))

# # Standardize the data
# scaler = StandardScaler()
# dataset_scaled = scaler.fit_transform(dataset)

# Instantiate and use ELMOutlierDetector
elm_detector = ELMOutlierDetector(hidden_layer_size=dataset.shape[1])
elm_detector.fit(dataset)
outlier_scores = elm_detector.predict(dataset)
threshold = np.percentile(outlier_scores, 99.9) #the value is the threshold for the outliers
outliers = dataset[outlier_scores > threshold]
print(outlier_scores,threshold)
print("outliers row values")
print(outliers)


# #comparing the outliers with the data and printing their row numbers    
# for i in range(len(outliers)):
#     print("Row number of the outlier is: ", dataset[dataset.eq(outliers[i]).all(1)].index.values)
#     # print("The outlier is: ", outliers[i])
#     print("\n")



37 (37, 37) (4438, 37) (37, 4438)
(4438, 37) (37, 37) (4438, 37) (37, 4438) (4438, 4438)
(4438, 4438)
[31970.18186175 17563.91405087 21724.35734702 ... 13183.65436399
 12714.33497109 13017.20164931] 17644.587582004657
outliers row values
     a1  a2   a3   a4  a5  a6   a7   a8  a9  a10  ...  a28  a29  a30  a31  \
0    50  12  547   84  25  63   45   12   5  415  ...   12    5  415   50   
2    50  12  547   84  25  63   45  102  83  101  ...   50   12  547   84   
529  41  32  139  150  44  32  139  154  44   29  ...  109   46   40  119   
530  44  32  139  154  44  29  145  150  44   29  ...  139   42   30  135   
588  46  29  138  151  49  32  138  151  46   29  ...  115   44   34  129   

     a32  a33  a34  a35  a36  a37  
0     12  547   84   25   63   45  
2     25   63   45  102   83  101  
529  139   42   30  135  157    2  
530  157   42   30  140  150    2  
588  143   42   29  135  150    2  

[5 rows x 37 columns]
