In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
# Blanket filter: with no category/module given, every warning raised after this
# cell runs is suppressed for the rest of the session.
# NOTE(review): this also hides library deprecation/convergence warnings that may be
# relevant to the clustering code below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# from scipy.stats import shapiro
from scipy.stats import anderson
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from pyclustertend import hopkins
from scipy.stats import shapiro

In [4]:
from sklearn.ensemble import IsolationForest
from pyod.models.mcd import MCD

In [5]:
def _cluster_inliers(features, inlier_mask):
    """Cluster the inlier rows with KMeans and map labels back to all samples.

    Parameters
    ----------
    features : np.ndarray of shape (n_inliers, n_features)
        The rows kept after outlier removal (always 2-D).
    inlier_mask : np.ndarray of bool, shape (n_samples,)
        True where the original sample was kept; ``inlier_mask.sum()`` must
        equal ``len(features)``.

    Returns
    -------
    np.ndarray of float, shape (n_samples,), or str
        ``-1`` for outliers and the KMeans cluster id for inliers, or the
        string ``'Data can not be clustered'`` when clustering is not attempted.
    """
    n_inliers = len(features)
    # silhouette_score needs 2 <= n_clusters <= n_samples - 1, so fewer than
    # three inliers cannot be scored for any k.
    if n_inliers < 3:
        return ('Data can not be clustered')

    # max(1, ...) keeps the Hopkins sample size positive for small inputs.
    # NOTE(review): pyclustertend's documentation appears to describe scores
    # near 0 as *high* cluster tendency (about 0.5 meaning uniform data); if so
    # this gate is inverted. Original threshold kept unchanged — confirm.
    hopkins_stats = hopkins(features, max(1, n_inliers // 10))
    if hopkins_stats < 0.1:
        return ('Data can not be clustered')

    # Candidate k is 2..10, capped so that every k stays scoreable.
    k_values = range(2, min(10, n_inliers - 1) + 1)
    models = [
        KMeans(n_clusters=k, random_state=42, n_init='auto').fit(features)
        for k in k_values
    ]
    sil_score = [silhouette_score(features, model.labels_) for model in models]

    # Reuse the model that actually achieved the best silhouette instead of
    # refitting with a different seed (which may yield a different partition).
    best_model = models[int(np.argmax(sil_score))]

    cluster_label = np.full(inlier_mask.shape, -1.0)
    cluster_label[inlier_mask] = best_model.labels_
    return cluster_label


def outlier_KMeans(data,contamination=0.05,clustering=True):
    """Flag outliers and, optionally, cluster the remaining inliers with KMeans.

    1-D input: an Anderson-Darling normality check (5% critical value) selects
    the rule — |z-score| > 3 when the statistic is below the critical value,
    otherwise the 1.5*IQR fences.
    Input with any other number of dimensions: a Shapiro-Wilk p-value < 0.05
    selects IsolationForest, otherwise pyod's MCD detector.

    Parameters
    ----------
    data : array-like
        1-D sequence of values, or a 2-D (n_samples, n_features) table.
    contamination : float, default 0.05
        Passed to IsolationForest / MCD; not used for 1-D input.
    clustering : bool, default True
        If falsy, only the inlier/outlier labels are returned.

    Returns
    -------
    np.ndarray or str
        * ``clustering`` falsy: ``1`` for inliers, ``-1`` for outliers.
        * ``clustering`` truthy: ``-1`` for outliers and a KMeans cluster id
          (k chosen in 2..10 by silhouette score) for inliers, or the string
          ``'Data can not be clustered'`` when the Hopkins gate rejects the
          inliers or fewer than three inliers remain.
    """
    data = np.asarray(data)

    if data.ndim == 1:
        # Single Dimensional Data
        normality = anderson(data)  # computed once; index 2 is the 5% level
        if normality.statistic < normality.critical_values[2]:
            # data_distribution = 'normal': 3-sigma rule on z-scores.
            # Population std (ddof=0) matches what StandardScaler would use.
            spread = data.std()
            features = (data - data.mean()) / (spread if spread > 0 else 1.0)
            inlier_mask = np.abs(features) <= 3
        else:
            # data_distribution = 'non_normal': Tukey fences on the raw values.
            q1, q3 = np.quantile(data, [0.25, 0.75])
            iqr = q3 - q1
            inlier_mask = (data >= q1 - 1.5 * iqr) & (data <= q3 + 1.5 * iqr)
            features = data

        if not clustering:
            # The mask is used directly; the earlier np.isin lookup compared
            # raw values against z-scores and missed the outliers.
            return np.where(inlier_mask, 1.0, -1.0)

        # scikit-learn estimators and silhouette_score require 2-D input.
        return _cluster_inliers(features[inlier_mask].reshape(-1, 1), inlier_mask)

    # Multi Dimensional Data
    # NOTE(review): shapiro is given the whole array, so it presumably tests
    # all features pooled into one sample rather than multivariate normality.
    if shapiro(data)[1] < 0.05:
        # data_distribution = 'non_normal'
        # Seeded so repeated runs flag the same rows.
        detector = IsolationForest(contamination=contamination, random_state=42)
        anomaly = detector.fit_predict(data)
    else:
        # data_distribution = 'normal'
        detector = MCD(contamination=contamination, random_state=42)
        detector.fit(data)
        # pyod marks outliers as 1 and inliers as 0; convert to the -1 / 1
        # convention used by the other branches.
        anomaly = np.where(detector.predict(data) == 1, -1, 1)

    if not clustering:
        return anomaly

    inlier_mask = anomaly == 1
    return _cluster_inliers(data[inlier_mask], inlier_mask)

### Dataset 1

In [7]:
# Load Dataset 1; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column — presumably a row
# index written out when the CSV was saved; confirm before treating it as a feature.
df = pd.read_csv('../../Outlier_Datasets/Dataset_1.csv')
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,Target
0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0
1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0
2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0
3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0
4,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,0


In [8]:
# Count occurrences of each value in the last column (shown as 'Target' in the output).
df.iloc[:,-1].value_counts()

Target
0    2730
1      55
Name: count, dtype: int64

In [None]:
model = 