In [1]:
import numpy as np 
import os

def load_files(directory: str) -> np.ndarray:
    """Recursively collect the paths of every ``.csv`` file below ``directory``.

    Args:
        directory: Root folder to walk (sub-folders are included).

    Returns:
        A 1-D NumPy string array of file paths, sorted so that repeated runs see
        the files in the same order. An array (rather than a list) is returned
        because callers use ``.size`` and boolean-mask indexing on the result.
        The array is empty when no CSV file is found.
    """
    # Collect into a list and convert once; growing an array with np.append
    # re-allocates and copies the whole array for every file found.
    csv_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.csv')
    ]
    return np.array(sorted(csv_paths), dtype=str)


In [2]:
import pandas as pd 
# Name of the column holding each record's time; load_data() replaces its raw
# values with pandas datetimes parsed as milliseconds.
timestamp = 'timestamp'
# Name of the column load_data() adds to keep the original (unparsed) timestamp values.
unix_timestamp = 'unix_timestamp'

def load_data(file0: str, file1:str, label_key:str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the normal and anomalous CSV files and prepare them for merging.

    Args:
        file0: Path of the CSV treated as normal data (labelled 0).
        file1: Path of the CSV treated as anomalous data (labelled 1).
        label_key: Name of the label column added to both frames.

    Returns:
        ``(normals, anomalies)``. In each frame the ``timestamp`` column holds
        datetimes parsed from millisecond values and the ``unix_timestamp``
        column keeps the values exactly as they were read from the file.
    """
    normals = pd.read_csv(file0, encoding='latin-1')
    anomalies = pd.read_csv(file1, encoding='latin-1')

    # Label first, so the label column precedes the unix_timestamp column.
    for frame, label in ((normals, 0), (anomalies, 1)):
        frame[label_key] = label

    for frame in (normals, anomalies):
        raw_values = frame[timestamp]
        parsed_values = pd.to_datetime(raw_values, unit='ms')
        frame[unix_timestamp] = raw_values
        frame[timestamp] = parsed_values

    print(f'loaded normals: {normals.shape},  loaded anomalies: {anomalies.shape}')
    return normals, anomalies

In [3]:
from  datetime import timedelta, datetime

def random_time_range(anomalies: pd.DataFrame, iteration: int) -> tuple[datetime, datetime]:
    """Pick a random time window inside the period covered by ``anomalies``.

    The window length is a random 5-29 minutes plus ``2 ** iteration`` minutes,
    so later iterations produce (exponentially) longer windows.

    Args:
        anomalies: Frame whose ``timestamp`` column defines the covered period.
        iteration: Iteration counter of the caller; widens the window.

    Returns:
        ``(random_start, random_end)`` of the selected window.
    """
    stamps = anomalies[timestamp]
    # Step back one second so the very first record always lies inside a
    # window that starts at the beginning of the period.
    start_time = stamps.min() - timedelta(seconds=1)
    end_time = stamps.max()
    print(f'anomalies start_time: {start_time} - end_time: {end_time}')

    # Window length: random base plus a term that grows with the iteration.
    window_minutes = np.random.randint(5,30) + 2 ** iteration
    delta = timedelta(minutes=window_minutes)

    # Latest start that still lets the window end inside the period; falls
    # back to the period start when the window is longer than the period.
    latest_start = np.maximum(end_time - delta, start_time)

    slack_seconds = int((latest_start - start_time).total_seconds() + 1)
    offset = timedelta(seconds=np.random.randint(0, slack_seconds))
    random_start = start_time + offset
    random_end = random_start + delta
    print(f'random time rand selected {random_start} - {random_end}')
    return random_start, random_end

In [4]:

def inject_anomalies(normal:pd.DataFrame, anomalies:pd.DataFrame, start_time: datetime, end_time:datetime) -> tuple[pd.DataFrame, pd.Index]:
    """Replace the normal records inside a time window with the anomalies of that window.

    Args:
        normal: Frame of records that the anomalies are injected into.
        anomalies: Frame of candidate anomaly records.
        start_time: Inclusive start of the window.
        end_time: Inclusive end of the window.

    Returns:
        ``(merged, index)`` where ``index`` holds the index labels (in
        ``anomalies``) of the rows that fall inside the window, and ``merged``
        is:

        * ``normal`` itself, unchanged, when the window contains no anomalies;
        * ``normal`` plus the selected anomalies when the window contains no
          normal records;
        * otherwise ``normal`` without its records inside the window, plus the
          selected anomalies.

        Whenever rows are added the result gets a fresh 0..n-1 index.
    """
    anomalies_condition = (start_time <= anomalies[timestamp]) & (anomalies[timestamp] <= end_time)
    normal_condition = (start_time <= normal[timestamp]) & (normal[timestamp] <= end_time)

    # Anomalies that fall within the window.
    subset_anomalies = anomalies[anomalies_condition]
    index = subset_anomalies.index
    print(f'{subset_anomalies.shape[0]} anomalies selected')

    # Nothing to inject: hand the input back untouched.
    if subset_anomalies.empty:
        return normal, index

    # Normal records that fall within the same window.
    subset_normal = normal[normal_condition]

    # No normal record collides with the anomalies: simply append them.
    if subset_normal.empty:
        print('no normal data within range, appending anomalies')
        return pd.concat([normal, subset_anomalies], ignore_index=True), index

    # Remove the colliding normal records before appending, to avoid
    # unrealistic data where two users interact with one machine at once.
    # Only the rows counted below are dropped. The previous mask ended in a
    # scalar comparison that was always True, so it removed every normal
    # record from the first selected anomaly onwards.
    print(f'{subset_normal.shape[0]} normals selected to be overwritten')
    remaining_normal = normal.drop(subset_normal.index)
    return pd.concat([remaining_normal, subset_anomalies], ignore_index=True), index

In [5]:
def merge_data(normals:pd.DataFrame, anomalies:pd.DataFrame, label_key:str, anomaly_percentage_target:float=0.05 ) -> tuple[pd.DataFrame,float]:
    """Inject anomalies into the normal data until a target anomaly share is reached.

    Each iteration picks a random time window (``random_time_range``) and
    injects the anomalies of that window (``inject_anomalies``). The loop stops
    when the share of rows labelled 1 reaches ``anomaly_percentage_target``,
    when the merged frame reaches the combined size of both inputs, or when no
    anomalies are left.

    Args:
        normals: Frame of normal records (label 0).
        anomalies: Frame of anomaly records (label 1). It is not modified.
        label_key: Name of the label column.
        anomaly_percentage_target: Desired fraction of anomalous rows.

    Returns:
        ``(merged, anomaly_percentage)``: the merged frame ordered by timestamp
        with a fresh 0..n-1 index, and the fraction of its rows labelled 1.
    """
    max_size = normals.shape[0] + anomalies.shape[0]
    anomaly_percentage = 0
    iteration = 0
    # Anomalies not injected yet. It is re-bound (never modified in place) so
    # the caller's frame stays intact.
    remaining = anomalies

    while anomaly_percentage < anomaly_percentage_target and normals.shape[0] < max_size and not remaining.empty:
        iteration += 1
        print(f'current iteration: {iteration} - anomaly_size: {remaining.shape} - anomaly_percentage: {anomaly_percentage} - current_size: {normals.shape[0]} -  max_size: {max_size}')
        print(f'starting shape: {normals.shape}')
        normals, index = inject_anomalies(normals, remaining, *random_time_range(remaining, iteration))
        remaining = remaining.drop(index)
        print(f'ending shape: {normals.shape}')
        total_rows = normals.shape[0]
        # Guard against an empty frame instead of raising ZeroDivisionError.
        anomaly_percentage = int((normals[label_key] == 1).sum()) / total_rows if total_rows else 0.0

    print(f'total iterations: {iteration} - final anomaly_percentage: {anomaly_percentage}')
    # sort_values returns a new frame, so the result must be kept. A stable
    # sort keeps rows with equal timestamps in their current relative order.
    normals = normals.sort_values(timestamp, kind='mergesort', ignore_index=True)
    print(f'merged shape: {normals.shape}')
    return normals, anomaly_percentage

In [6]:
from sklearn.preprocessing import LabelEncoder

def reduce_dimensions(data: pd.DataFrame) -> pd.DataFrame:
    """Return ``data`` without the columns that are excluded from modelling.

    The timestamp column and ``'USER'`` are removed; every other column is
    kept in its original order. Listed columns that are absent from ``data``
    are simply ignored.
    """
    # Further columns that were candidates for removal (all currently kept):
    #   active_apps_average, current_app, penultimate_app, changes_between_apps,
    #   current_app_foreground_time, current_app_average_processes,
    #   current_app_stddev_processes, current_app_average_cpu,
    #   current_app_stddev_cpu, system_average_cpu, system_stddev_cpu,
    #   current_app_average_mem, current_app_stddev_mem, system_average_mem,
    #   system_stddev_mem, received_bytes, sent_bytes
    excluded_columns = [timestamp, 'USER']
    is_excluded = data.columns.isin(excluded_columns)
    return data.iloc[:, ~is_excluded]

def encode_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Label-encode every object-dtype column and return the encoded frame.

    Each object column is replaced by the integer codes produced by a fresh
    ``LabelEncoder``; all other columns are passed through unchanged.

    Args:
        data: Frame to encode. It is left untouched; the work happens on a
            copy (which temporarily doubles the memory used by the frame).

    Returns:
        A new frame with the same columns, where former object columns hold
        integer codes.
    """
    # Work on a copy: the input may be a slice of another frame, and writing
    # into it would either warn or silently modify the caller's data.
    encoded = data.copy()
    for column in encoded.select_dtypes(include=['object']).columns:
        print(f"encoding column: {column}")
        # astype(str): LabelEncoder refuses columns that mix strings with
        # missing values (NaN/None); as text they become their own category.
        codes = LabelEncoder().fit_transform(encoded[column].astype(str))
        # Plain column assignment replaces the column, so it takes the integer
        # dtype of the codes. Assigning through .loc[:, column] may instead
        # keep the old object dtype, depending on the pandas version.
        encoded[column] = codes
    return encoded

In [7]:
from sklearn.preprocessing import StandardScaler

def normalize_data(data: pd.DataFrame, label_key: str) -> tuple[np.ndarray,np.ndarray]:
    """Split a frame into standardized features and labels.

    Args:
        data: Frame containing the feature columns and the label column.
        label_key: Name of the label column.

    Returns:
        ``(features_scaled, labels)``: the non-label columns after
        ``StandardScaler.fit_transform``, and the label column as a 2-D array
        with one column.

    NOTE(review): the scaler is fitted on every row passed in. When this is
    called before the train/test split, statistics of the held-out rows
    influence the scaling of the training rows.
    """
    is_label = data.columns.isin([label_key])
    labels = data.iloc[:, is_label].to_numpy()
    feature_matrix = data.iloc[:, ~is_label].to_numpy()
    return StandardScaler().fit_transform(feature_matrix), labels

In [8]:
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

def train_test_validate_split_data(features: np.ndarray, labels: np.ndarray, train_size: float = 0.70, test_size: float = 0.15, validate_size: float = 0.15, look_back: int = 0, batch_size: int = 128, shuffle: bool = False) -> tuple[TimeseriesGenerator,TimeseriesGenerator,TimeseriesGenerator]:
    """Split the data into train/test/validation parts and wrap each in a TimeseriesGenerator.

    Args:
        features: Feature matrix, one row per record.
        labels: Labels aligned with ``features``.
        train_size: Fraction of rows used for training.
        test_size: Fraction of rows used for the test part.
        validate_size: Fraction of rows used for the validation part.
        look_back: Window length passed to every generator as ``length``.
        batch_size: Batch size of every generator.
        shuffle: When False (default) the rows keep their order, so the
            look-back windows are built from consecutive records: the first
            ``train_size`` share is training data and the rest is divided
            into validation and test. Pass True to shuffle the rows
            (seed 42) before the train split, as earlier versions did.

    Returns:
        ``(train_generator, test_generator, validation_generator)``.
    """
    non_train_size = test_size + validate_size

    # Shuffling individual rows before windowing makes every look-back window
    # a set of unrelated records, so the order is preserved unless requested.
    features_train, features_temp, labels_train, labels_temp = train_test_split(
        features,
        labels,
        train_size=train_size,
        test_size=non_train_size,
        shuffle=shuffle,
        random_state=42 if shuffle else None,
    )

    # Divide the held-out rows between validation and test according to their
    # relative sizes (with the defaults: half each, i.e. 15% of all rows).
    features_validation, features_test, labels_validation, labels_test = train_test_split(
        features_temp,
        labels_temp,
        train_size=validate_size / non_train_size,
        test_size=test_size / non_train_size,
        shuffle=False,
    )

    train_generator = TimeseriesGenerator(features_train, labels_train, length=look_back, batch_size=batch_size)
    test_generator = TimeseriesGenerator(features_test, labels_test, length=look_back, batch_size=batch_size)
    validation_generator = TimeseriesGenerator(features_validation, labels_validation, length=look_back, batch_size=batch_size)
    return train_generator, test_generator, validation_generator


In [9]:
from sklearn.metrics import accuracy_score,mean_absolute_error, root_mean_squared_error, confusion_matrix, ConfusionMatrixDisplay

def safe_divide(numerator: float, denomonator: float) -> float:
    """Divide ``numerator`` by ``denomonator``, returning 0 when the divisor is zero."""
    if denomonator == 0:
        return 0
    return numerator / denomonator

class Results:
    """Evaluation metrics of binary predictions (0 = normal, 1 = anomaly).

    Attributes set by the constructor:
        accuracy, mae, rmse: scores computed by the scikit-learn helpers.
        cm: 2x2 confusion matrix laid out as ``[[TN, FP], [FN, TP]]``
            (rows = actual class, columns = predicted class, class 0 first).
        true_negatives, false_positives, false_negatives, true_positives:
            the four cells of ``cm``.
        total: number of evaluated samples.
        false_positives_rate, false_negatives_rate: false positives /
            false negatives as a share of ALL samples (not of the actual
            negatives / positives).
        precision, recall, f1_score: scores of the anomaly class; 0 when
            undefined.
    """

    def __init__(self, actual, predictions):
        """Compute all metrics from the actual and predicted 0/1 labels."""
        self.accuracy = accuracy_score(actual, predictions)
        self.mae = mean_absolute_error(actual, predictions)
        self.rmse = root_mean_squared_error(actual, predictions)
        # labels=[0, 1] puts the negative class first, which is the layout the
        # TN, FP, FN, TP unpacking below relies on. With the positive class
        # first the four counts (and precision/recall/F1) would be swapped.
        self.cm = confusion_matrix(actual, predictions, labels=[0, 1])
        self.true_negatives, self.false_positives, self.false_negatives, self.true_positives = self.cm.ravel()
        self.total = self.true_negatives + self.false_positives + self.false_negatives + self.true_positives
        # safe_divide keeps an empty evaluation set from dividing by zero.
        self.false_positives_rate = safe_divide(self.false_positives, self.total)
        self.false_negatives_rate = safe_divide(self.false_negatives, self.total)
        self.precision = safe_divide(self.true_positives, (self.true_positives + self.false_positives))
        self.recall = safe_divide(self.true_positives, (self.true_positives + self.false_negatives))
        self.f1_score = 2 * safe_divide((self.precision * self.recall), (self.precision + self.recall))

    def __str__(self):
        """Return a multi-line summary with every score rounded to two decimals."""
        return f'''Results 
    Accuracy: {self.accuracy:.2f}
    Mean Absolute Error: {self.mae:.2f}
    Root Mean Squared Error: {self.rmse:.2f}
    False Positives: {self.false_positives_rate:.2f}
    False Negatives: {self.false_negatives_rate:.2f}
    Precision: {self.precision:.2f}
    Recall: {self.recall:.2f}
    F1 Score: {self.f1_score:.2f}'''


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.metrics import Recall, Precision, TruePositives, TrueNegatives, FalseNegatives, FalsePositives, F1Score


def build_train_test_validate_model(train: TimeseriesGenerator, test:TimeseriesGenerator,  validation: TimeseriesGenerator, epochs:int, look_back:int=0):
    """Build a bidirectional LSTM classifier, train it and score it on held-out data.

    Args:
        train: Generator with the training windows.
        test: Generator passed to ``fit`` as ``validation_data``, i.e. it is
            monitored after every epoch.
        validation: Generator used only after training, for the final
            predictions that are summarized in ``Results``.
        epochs: Number of training epochs.
        look_back: Window length of the generators; it defines the time
            dimension of the model input and must match the generators.

    Returns:
        ``(history, results)``: the object returned by ``model.fit`` and a
        ``Results`` instance for the predictions on ``validation``.
    """
    #build model
    model = Sequential()
    features_count = train.data.shape[1]
    model.add(Input(shape=(look_back,features_count)))
    model.add(Bidirectional(LSTM(50, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01), activation='relu')))
    # model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    learning_rate = 0.0001
    optimizer = Adam(learning_rate=learning_rate)
    # Accuracy is a fraction in [0, 1], so the metric keeps its default
    # floating-point dtype; an integer dtype cannot represent that value.
    accuracy = BinaryAccuracy(name="binary_accuracy", threshold=0.5)
    metrics = [
        accuracy,
        Recall(),
        Precision(),
        TruePositives(),
        TrueNegatives(),
        FalseNegatives(),
        FalsePositives(),
        # Without a threshold F1Score takes the argmax over the outputs; with
        # a single sigmoid output that marks every sample as positive.
        F1Score(threshold=0.5)
    ]
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics)
    model.summary()

    #train the model, monitoring the `test` generator after every epoch
    history = model.fit(train, steps_per_epoch=len(train), epochs=epochs, verbose=1, validation_data=test)

    #score the model on the held-out `validation` generator
    predictions = model.predict(validation)
    predictions = (predictions >= 0.5).astype(int)
    # The first `look_back` targets have no complete window in front of them,
    # so the generator yields no prediction for them.
    actual = validation.targets[look_back:]
    results = Results(actual, predictions)

    return history, results



In [11]:


import matplotlib.pyplot as plt


def plotHistory(axes, data, history):
    """Draw one run's training curves onto a 2x2 grid of axes.

    Args:
        axes: Grid indexable as ``axes[row, col]`` for rows/columns 0 and 1.
        data: Mapping with an ``'anomaly_percentage'`` entry; formatted as a
            percentage it becomes the legend label of every curve.
        history: Mapping with the series ``'loss'``, ``'val_loss'``,
            ``'binary_accuracy'`` and ``'val_binary_accuracy'``.
    """
    anomaly_percentage = f'{data["anomaly_percentage"]:1.2%}'

    # (grid position, history key, panel title, y-axis label or None)
    panels = (
        ((0, 0), 'loss', 'Training Loss', 'Loss'),
        ((0, 1), 'val_loss', 'Validation Loss', None),
        ((1, 0), 'binary_accuracy', 'Training Accuracy', 'Accuracy'),
        ((1, 1), 'val_binary_accuracy', 'Validation Accuracy', None),
    )
    for position, key, title, y_label in panels:
        panel = axes[position]
        panel.plot(history[key], label=anomaly_percentage)
        panel.set_title(title)
        # Only the left column carries a y-axis label.
        if y_label is not None:
            panel.set_ylabel(y_label)
        

def plotConfusionMatrix(matrix):
    """Plot a 2x2 confusion matrix with the classes named 'Normal' and 'Anomaly'.

    The first row/column of ``matrix`` is labelled 'Normal' and the second
    'Anomaly', so the caller must supply the matrix in that class order.
    """
    class_names = ['Normal', 'Anomaly']
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_names).plot()

def plotMetrics(metrics: list):
    """Plot the training curves of all runs on one 2x2 grid, plus one confusion matrix per run.

    Args:
        metrics: One mapping per run with the entries ``'merged'`` (passed to
            ``plotHistory`` as its ``data``), ``'history'`` (the training
            history series) and ``'results'`` (an object with a ``cm``
            confusion matrix).
    """
    fig, axes = plt.subplots(2, 2, sharex=True, sharey='row')
    fig.supxlabel('Epoch')

    for metric in metrics:
        plotHistory(axes, metric['merged'], metric['history'])
        plotConfusionMatrix(metric['results'].cm)

    # Every panel carries the same labels, so the first one feeds the legend.
    handles, labels = fig.axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper center', title='Anomaly Percentage')
    # Lay out `fig` explicitly: each confusion matrix is drawn on a figure of
    # its own, so pyplot's "current figure" is no longer this grid.
    fig.tight_layout()
    plt.show()
    
        

In [None]:
# Root folder of the Behacom CSV files. It defaults to the original location;
# set the BEHACOM_DIR environment variable to run the notebook elsewhere.
working_dir = os.environ.get(
    'BEHACOM_DIR',
    '/Users/osono/Library/CloudStorage/OneDrive-NorthCarolinaA&TStateUniversity/COMP 850 - 1A Big Data Analytics/Project/continuous authentication/datasets/Behacom',
)
os.chdir(working_dir)
current_directory = os.getcwd()
print(f'changed working dir to: {current_directory}')

label_key = 'is_anomaly'
look_back = 3
epochs = 5
batch_size = 10
max_runs = 1  # number of (normal, anomaly) file pairs to evaluate
file_paths = load_files(current_directory)

# NOTE(review): no random seed is set, so the chosen files and the injected
# time windows differ from run to run.
metrics = []
# Each run consumes two distinct files, so at least two must be left;
# np.random.choice(size=2, replace=False) raises on a single remaining file.
while file_paths.size >= 2 and len(metrics) < max_runs:
    choices = np.random.choice(file_paths, size=2, replace=False)
    file_paths = file_paths[np.isin(file_paths, choices, invert=True)]
    normals, anomalies = load_data(choices[0], choices[1], label_key)
    metric = {
        'normals': {
            'path': choices[0],
            'shape': normals.shape
        },
        'anomalies': {
            'path': choices[1],
            'shape': anomalies.shape
        }
    }
    merged, anomaly_percentage = merge_data(normals, anomalies, label_key)
    merged = encode_columns(reduce_dimensions(merged))
    metric['merged'] = {
        'shape': merged.shape,
        'anomaly_percentage': anomaly_percentage
    }

    features, labels = normalize_data(merged, label_key)
    train, test, validation = train_test_validate_split_data(features, labels, look_back=look_back, batch_size=batch_size)
    history, results = build_train_test_validate_model(train, test, validation, epochs=epochs, look_back=look_back)
    metric['history'] = history.history
    metric['results'] = results
    metrics.append(metric)

plotMetrics(metrics)

changed working dir to: /Users/osono/Library/CloudStorage/OneDrive-NorthCarolinaA&TStateUniversity/COMP 850 - 1A Big Data Analytics/Project/continuous authentication/datasets/Behacom
loaded normals: (17284, 12053),  loaded anomalies: (2128, 12053)
current iteration: 1 - anomaly_size: (2128, 12053) - anomaly_percentage: 0 - current_size: 17284 -  max_size: 19412
starting shape: (17284, 12053)
anomalies start_time: 2019-12-02 20:51:43.401000 - end_time: 2019-12-06 11:08:33.639000
random time rand selected 2019-12-03 08:27:52.401000 - 2019-12-03 08:46:52.401000
0 anomalies selected
ending shape: (17284, 12053)
current iteration: 2 - anomaly_size: (2128, 12053) - anomaly_percentage: 0.0 - current_size: 17284 -  max_size: 19412
starting shape: (17284, 12053)
anomalies start_time: 2019-12-02 20:51:43.401000 - end_time: 2019-12-06 11:08:33.639000
random time rand selected 2019-12-04 09:46:44.401000 - 2019-12-04 09:57:44.401000
0 anomalies selected
ending shape: (17284, 12053)
current iteratio

Epoch 1/5


  self._warn_if_super_not_called()
