In [1]:
#Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EllipticEnvelope
from utils import elliptic_envelope
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
import os

In [2]:
#path of folders and datasets
# NOTE(review): the values below look like blank placeholders - confirm the real
# settings before running the rest of the notebook.
sheet_names = []  # iterated by every per-sheet loop; each name is also the stem of "<sheet>.csv"
xlsx_folder = None  # input_folder of the (commented-out) make_dataset call
result_folder = None  # folder that load_dataset reads the per-sheet CSV files from
anomaly_condition = None  # not referenced by any of the visible cells
cleaning_condition = None  # used inside cleaning() as the key of data[cleaning_condition]

In [3]:
#parameters
# Sheet whose frames are displayed in the cells below and whose model is built with flag = True
test_sheet = None

In [4]:
#A function that makes csv datasets from multi-sheet excels
def make_dataset(input_folder, output_folder, sheet_names):
    """Merge the requested sheets of every workbook in a folder into one CSV per sheet.

    Parameters
    ----------
    input_folder : str
        Folder whose regular files are each read with ``pd.read_excel``;
        sub-directories are skipped.
    output_folder : str
        Folder that receives one ``<sheet>.csv`` per sheet name. It is not
        created here.
    sheet_names : list of str
        Sheets to read from every workbook.

    Returns
    -------
    dict
        Maps each sheet name to the concatenated DataFrame with a fresh
        0..n-1 index. A sheet for which no file was read maps to an empty
        DataFrame.
    """
    frames = {sheet: [] for sheet in sheet_names}
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the generated CSV files reproducible between runs.
    for content in sorted(os.listdir(input_folder)):
        adr = os.path.join(input_folder, content)
        if not os.path.isfile(adr):
            continue
        file_data = pd.read_excel(adr, sheet_name = sheet_names)
        for sheet in sheet_names:
            frames[sheet].append(file_data[sheet])
    total_data = {}
    for sheet in sheet_names:
        # Concatenate once per sheet instead of re-copying the accumulated
        # frame for every file. pd.concat rejects an empty list, so fall back
        # to an empty frame when nothing was read.
        if frames[sheet]:
            total_data[sheet] = pd.concat(frames[sheet], ignore_index = True)
        else:
            total_data[sheet] = pd.DataFrame()
        total_data[sheet].to_csv(os.path.join(output_folder, sheet + ".csv"), index = False)
    return total_data

In [5]:
#A function that loads all sheet datasets from csv files and shuffles them, then puts them in a dict
def load_dataset(data_folder, sheet_names, random_state=None):
    """Read ``<sheet>.csv`` for every sheet name and return the rows in shuffled order.

    Parameters
    ----------
    data_folder : str
        Folder that contains one ``<sheet>.csv`` file per sheet name.
    sheet_names : list of str
        Names of the sheets to load.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        original unseeded shuffle; pass an int to get the same row order on
        every run. The same value is used for every sheet.

    Returns
    -------
    dict
        Maps each sheet name to its shuffled DataFrame with a fresh 0..n-1 index.
    """
    data = {}
    for sheet in sheet_names:
        path = os.path.join(data_folder, sheet + ".csv")
        frame = pd.read_csv(path, low_memory=False)
        # frac = 1 draws every row exactly once, i.e. a full shuffle.
        shuffled = frame.sample(frac = 1, random_state = random_state)
        data[sheet] = shuffled.reset_index(drop = True)
    return data

In [6]:
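# One-off step, left disabled: uncomment to rebuild the per-sheet CSV files in result_folder from the Excel files in xlsx_folder.
#data = make_dataset(xlsx_folder, result_folder, sheet_names)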

In [7]:
# Load and shuffle every per-sheet CSV from result_folder into a dict keyed by sheet name
all_data = load_dataset(result_folder, sheet_names)

In [None]:
# Number of entries in all_data (load_dataset stores one frame per sheet name)
print(len(all_data))

In [9]:
#A function that drops rows of the dataset which have null values in specific columns
def cleaning(dataframe):
    """Drop rows with missing values in the non-text columns, then apply ``cleaning_condition``.

    A column is checked for missing values when the value in its first row is
    neither a ``str`` nor the ``np.nan`` object itself. The names of the
    checked columns are printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the offending rows removed and a fresh 0..n-1 index.

    Notes
    -----
    Reads the notebook-level ``cleaning_condition``. When it is not ``None``
    it is used as-is as the key in ``data[cleaning_condition]`` after the
    null rows are dropped; ``None`` means no additional filter.
    """
    cols = []
    # An empty frame has no first row to inspect, so no column is selected.
    if len(dataframe) > 0:
        for i, name in enumerate(dataframe.columns):
            first = dataframe.iloc[0, i]
            # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
            # NOTE(review): this is an identity test, so NaN values that are
            # NumPy float scalars do not match it - presumably only object
            # columns whose first value is missing are skipped here; confirm.
            if not (isinstance(first, str) or first is np.nan):
                cols.append(name)
    # dropna returns a new frame, so the caller's frame stays untouched.
    data = dataframe.dropna(subset = cols) if cols else dataframe.copy()
    if cleaning_condition is not None:
        data = data[cleaning_condition]
    print(cols)
    return data.reset_index(drop = True)

In [10]:
def normalize(dataframe):
    """Scale ``dataframe`` with a freshly fitted ``StandardScaler``.

    The output of ``fit_transform`` is wrapped in a new DataFrame without
    passing ``index`` or ``columns``, so the labels of ``dataframe`` are not
    forwarded to the result.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(dataframe)
    return pd.DataFrame(data = scaled_values)

In [None]:
# Clean every loaded sheet. row_data keeps the cleaned frames (text columns
# included); data starts as an independent copy that later cells overwrite.
row_data, data = {}, {}
for sheet in sheet_names:
    cleaned_sheet = cleaning(all_data[sheet])
    row_data[sheet] = cleaned_sheet
    data[sheet] = copy.deepcopy(cleaned_sheet)

In [None]:
# Test sheet as loaded from CSV, before cleaning
all_data[test_sheet]

In [None]:
# Test sheet after cleaning()
row_data[test_sheet]

In [None]:
# Drop the columns whose first value is text or missing, then normalize what is left
for sheet in sheet_names:
    cols = []
    for i, col in enumerate(row_data[sheet].columns):
        first = row_data[sheet].iloc[0, i]
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        if isinstance(first, str) or first is np.nan:
            cols.append(col)
    # Build from row_data rather than from data[sheet]: data[sheet] is replaced
    # by the normalized frame below, so re-running this cell on it would try to
    # drop column names that no longer exist.
    data[sheet] = normalize(row_data[sheet].drop(columns = cols))
data[test_sheet]

In [None]:
#Run the elliptic envelope algorithm on all sheets and show some results of the test sheet
results = {}
for sheet in sheet_names:
    # Only the test sheet is built with flag = True; every other sheet relies
    # on the constructor's own default for flag.
    extra_options = {"flag": True} if sheet == test_sheet else {}
    # NOTE(review): 0.01 is presumably the contamination fraction - confirm
    # against utils.elliptic_envelope.
    model = elliptic_envelope.elliptic_envelope(row_data[sheet], data[sheet], 0.01, **extra_options)
    results[sheet] = model.detection()
print(results)