We are going to create three types of dataset:
1. Random injection - We pick random indexes inside the benign dataset and insert the malicious rows there, keeping their session ID (which is the timestamp as an integer).
2. 5-minute injection - We find a gap of at least 5 minutes in the benign dataset and inject the **whole** attacking dataset into it, so the result contains 5 minutes of malicious activity (one attacking dataset spans 5 minutes).
3. Organized dataset - We combine the malicious and benign datasets (with the pandas `concat` method) and sort the result by session ID, so the rows end up in chronological order.
----


## Preprocessing the dataset

In [56]:
import pandas as pd
import random

In [58]:
# Column names shared by every pipe-separated log file
columns = ['ID', 'Date', 'Time', 'Session_ID', 'Depth', 'Path', 'Label']

# Benign log --> change the file name below as needed.
# Date and Time are dropped because Session_ID is used as the timestamp;
# rows are ordered by Session_ID, re-indexed, and every row is labelled 0.
dataset_benign = (
    pd.read_csv("user1_log.txt", sep='|', header=None, names=columns)
    .drop(columns=['Date', 'Time'])
    .sort_values(by='Session_ID')
    .reset_index(drop=True)
    .assign(Label=0)
)

## Attack logs --> same here, change the file names if needed.
# Every attack row is labelled 1.
malicious_logs1, malicious_logs2, malicious_logs3 = (
    pd.read_csv(log_file, sep='|', header=None, names=columns).assign(Label=1)
    for log_file in ('Attack1_log.txt', 'Attack2_log.txt', 'Attack3_log.txt')
)

# All three attack logs stacked into one frame, without the Date/Time columns
combined_malicious_logs = (
    pd.concat([malicious_logs1, malicious_logs2, malicious_logs3], ignore_index=True)
    .assign(Label=1)
    .drop(columns=['Date', 'Time'])
)
combined_malicious_logs

combined_malicious_logs

Unnamed: 0,ID,Session_ID,Depth,Path,Label
0,0,40675566,4,0\1\2\3\4,1
1,1,40675566,4,0\1\2\3\4,1
2,2,40675566,4,0\1\2\3\4,1
3,3,40675566,4,0\1\2\3\4,1
4,4,40675566,4,0\1\2\3\4,1
...,...,...,...,...,...
21657,14908,40242060,7,0\1\2\3\180\677\695\697,1
21658,14909,40242060,8,0\1\2\3\180\677\695\697\698,1
21659,14910,40242060,8,0\1\2\3\180\677\695\697\698,1
21660,14911,40242060,9,0\1\2\3\180\677\695\697\698\699,1


### 5-Minute Injection ###

In [70]:
def five_minute_injection(benign, attacker, min_gap=301):
    """Insert a whole attack log into a randomly chosen quiet gap of the benign log.

    A gap is a pair of consecutive benign rows whose Session_ID values differ
    by at least ``min_gap``. One such gap is picked at random, the attacker's
    Session_IDs are shifted so that its first row lands one tick after the row
    that opens the gap, and the attacker rows are spliced in right there.

    Args:
        benign: DataFrame with a ``Session_ID`` column convertible to int.
            Rows are expected to already be ordered by Session_ID.
        attacker: DataFrame with a ``Session_ID`` column convertible to int.
            It is NOT modified; a shifted copy is inserted instead.
        min_gap: Minimum Session_ID difference between two consecutive benign
            rows for the spot to qualify as an injection point.

    Returns:
        A new DataFrame (fresh 0..n-1 index) made of the benign rows up to the
        chosen gap, the shifted attacker rows, then the remaining benign rows.
        If ``attacker`` is empty, the benign rows are returned unchanged.

    Raises:
        IndexError: if no pair of consecutive benign rows is at least
            ``min_gap`` apart.
    """
    # Work on a positionally indexed view of the timestamps so that the gap
    # position can be used with .iloc even when `benign` has a custom index.
    session_ids = benign["Session_ID"].astype(int).reset_index(drop=True)
    time_diff = session_ids.shift(-1) - session_ids
    gap_positions = time_diff[time_diff >= min_gap].index.tolist()
    if not gap_positions:
        raise IndexError(
            f"no gap of at least {min_gap} between consecutive benign Session_IDs"
        )
    random_gap = int(random.choice(gap_positions))

    # Shift a copy: writing into `attacker` itself would silently change the
    # caller's frame and corrupt any later reuse of the same attack log.
    shifted_attacker = attacker.copy()
    if not shifted_attacker.empty:
        attacker_ids = shifted_attacker["Session_ID"].astype(int)
        # The first attacker row (by position) is placed one tick after the
        # benign row that opens the gap; the rest keep their relative spacing.
        offset = int(session_ids.iloc[random_gap]) + 1 - int(attacker_ids.iloc[0])
        shifted_attacker["Session_ID"] = attacker_ids + offset

    return pd.concat([
        benign.iloc[:random_gap + 1],
        shifted_attacker,
        benign.iloc[random_gap + 1:],
    ]).reset_index(drop=True)

# Small hand-made example to eyeball the behaviour of five_minute_injection.
# Every field is a string, as in the original fixture; only Session_ID varies.
sample_benign = pd.DataFrame([
    {'ID': "", 'Date': "", 'Time': "", 'Session_ID': session_id, 'Depth': "4", 'Path': "", 'Label': "0"}
    for session_id in ("1", "302", "303", "603", "904")
])
sample_attacker = pd.DataFrame([
    {'ID': "", 'Date': "", 'Time': "", 'Session_ID': session_id, 'Depth': "4", 'Path': "", 'Label': "1"}
    for session_id in ("1", "3")
])

sample_injected_df = five_minute_injection(sample_benign, sample_attacker)
sample_injected_df


Unnamed: 0,ID,Date,Time,Session_ID,Depth,Path,Label
0,,,,1,4,,0
1,,,,302,4,,0
2,,,,303,4,,0
3,,,,603,4,,0
4,,,,604,4,,1
5,,,,606,4,,1
6,,,,904,4,,0


## Sedanspot
Start generating the SedanSpot dataset, which has the format **timestamp, source, destination, weight, label**.

Let's work on the first type of dataset: picking random indexes in the benign dataset and inserting the malicious rows there.

In [26]:
injected_dataset = dataset_benign.copy() # Copy the benign dataset so it does not have to be re-imported every time we work on it
injected_indices = [] # Insertion positions, kept to check that the rows really were inserted at random
f = combined_malicious_logs # Which dataset to insert. Adjust this line to change the file being inserted

# Only these four fields are carried over for an injected row; any other
# benign column (e.g. 'ID') is left missing for the malicious rows.
malicious_rows = f[['Session_ID', 'Depth', 'Path', 'Label']]

# Rebuilding the whole DataFrame once per malicious row is quadratic.
# Instead, keep every row in one pool (benign rows first, malicious rows
# after them) and only track the ORDER of pool positions in a plain list.
row_pool = pd.concat([injected_dataset, malicious_rows], ignore_index=True)
benign_count = len(injected_dataset)
row_order = list(range(benign_count))
for pool_position in range(benign_count, len(row_pool)): # One iteration per malicious row, in file order
    # randint is inclusive on both ends, so a row may also land at the very end
    random_index = random.randint(0, len(row_order))
    injected_indices.append(random_index)
    row_order.insert(random_index, pool_position)

# Materialise the final row order in a single pass
injected_dataset = row_pool.iloc[row_order].reset_index(drop=True)

print(f"Injected {len(f)} malicious rows into the benign dataset.")
print(f"Original benign dataset length: {len(dataset_benign)}")
print(f"Injected dataset length: {len(injected_dataset)}")
print(injected_indices)

NameError: name 'dataset_benign' is not defined

In [None]:
edges = []
prev_path = None

# Walk the rows in order: every row after the first yields one edge from the
# previously seen path to the current one (a self-edge when both are equal).
row_fields = zip(
    injected_dataset['Path'],
    injected_dataset['Session_ID'],
    injected_dataset['Label'],
)
for current_path, timestamp, label in row_fields:
    repeats_previous = prev_path == current_path
    if repeats_previous or prev_path is not None:
        edges.append({
            'src_node': current_path if repeats_previous else prev_path,
            'dst_node': current_path,
            'timestamp': timestamp,
            'weight': 1,  # every edge keeps the default weight of 1
            'label': label,
        })
    prev_path = current_path

# Write the edge list as header-less CSV in the column order below
edges_df = pd.DataFrame(edges)
column_order = ['timestamp', 'src_node', 'dst_node', 'weight', 'label']
edges_df[column_order].to_csv('random_combined.csv', sep=',', header=False, index=False)

Next, work on the 5-minute injection, starting by finding the 5-minute gaps in the benign dataset.

In [6]:
# Sort BEFORE taking the difference, so that time_diff always measures the
# distance between chronologically adjacent sessions. A stable sort keeps rows
# with equal Session_ID in their existing order, so on an already sorted,
# freshly indexed frame the index labels printed below still equal row
# positions (later cells slice with .iloc at those numbers).
dataset_benign = dataset_benign.sort_values(by = "Session_ID", ascending= True, kind = "mergesort")
# time_diff of a row = its Session_ID minus the previous row's Session_ID
dataset_benign['time_diff'] = dataset_benign['Session_ID'].diff()
# Each listed row is the first row AFTER a gap of at least 3000
gaps = dataset_benign[dataset_benign['time_diff'] >= 3000]
print(gaps[['Session_ID', 'time_diff']])

        Session_ID  time_diff
862       35414007     4035.0
2092      35482614    51278.0
3164      35504865     4653.0
4049      35568126    49681.0
6403      35655762    50812.0
...            ...        ...
257213    40319583   176128.0
260670    40404232    48167.0
262195    40422830     3068.0
266526    40491750    49672.0
271546    40574674    48456.0

[80 rows x 2 columns]


In [None]:
# Row position where the attack log is spliced in -- presumably the first gap
# reported by the gap search; verify against the printed gap listing.
gap_position = 862
benign_copy = dataset_benign.copy()
# We insert malicious_logs1 into the gap: benign rows before the position,
# then the attack rows, then the remaining benign rows.
injected_dataset = pd.concat([
    benign_copy.iloc[:gap_position],
    malicious_logs1,
    benign_copy.iloc[gap_position:],
])
print(len(injected_dataset))

Organized version by session ID

In [None]:
# Stack benign and malicious rows under a fresh index, then order the
# combined rows by ascending Session_ID.
injected_dataset = (
    pd.concat((dataset_benign.copy(), combined_malicious_logs), ignore_index=True)
    .sort_values(by="Session_ID", ascending=True)
)
injected_dataset

Code for converting the formatted data into a graph (including self-edges) and writing it to CSV


In [None]:
edges = []
prev_path = None
edge_weights = {}  # (src, dst) -> number of times this edge has been seen so far

row_fields = zip(
    injected_dataset['Path'],
    injected_dataset['Session_ID'],
    injected_dataset['Label'],
)
for current_path, timestamp, label in row_fields:
    if prev_path == current_path:
        # Same path twice in a row: self-edge
        edge_key = (current_path, current_path)
    elif prev_path is None:
        # Nothing to connect from yet: remember the path and move on
        prev_path = current_path
        continue
    else:
        edge_key = (prev_path, current_path)

    # The weight grows by one each time the same (src, dst) pair appears again
    edge_weights[edge_key] = edge_weights.get(edge_key, 0) + 1

    src_node, dst_node = edge_key
    edges.append({
        'src_node': src_node,
        'dst_node': dst_node,
        'timestamp': timestamp,
        'weight': edge_weights[edge_key],
        'label': label,
    })

    prev_path = current_path


# Write the edge list as header-less CSV in the column order below
edges_df = pd.DataFrame(edges)
column_order = ['timestamp', 'src_node', 'dst_node', 'weight', 'label']
edges_df[column_order].to_csv('random_attack3_with.csv', sep=',', header=False, index=False)


## Anomrank ##

In [None]:
injected_dataset = dataset_benign.copy() # Copy the benign dataset so it does not have to be re-imported every time we work on it
injected_indices = [] # Insertion positions, kept to check that the rows really were inserted at random
f = combined_malicious_logs # Which dataset to insert. Adjust this line to change the file being inserted

# Only these four fields are carried over for an injected row; any other
# benign column (e.g. 'ID') is left missing for the malicious rows.
malicious_rows = f[['Session_ID', 'Depth', 'Path', 'Label']]

# Rebuilding the whole DataFrame once per malicious row is quadratic.
# Instead, keep every row in one pool (benign rows first, malicious rows
# after them) and only track the ORDER of pool positions in a plain list.
row_pool = pd.concat([injected_dataset, malicious_rows], ignore_index=True)
benign_count = len(injected_dataset)
row_order = list(range(benign_count))
for pool_position in range(benign_count, len(row_pool)): # One iteration per malicious row, in file order
    # randint is inclusive on both ends, so a row may also land at the very end
    random_index = random.randint(0, len(row_order))
    injected_indices.append(random_index)
    row_order.insert(random_index, pool_position)

# Materialise the final row order in a single pass
injected_dataset = row_pool.iloc[row_order].reset_index(drop=True)

print(f"Injected {len(f)} malicious rows into the benign dataset.")
print(f"Original benign dataset length: {len(dataset_benign)}")
print(f"Injected dataset length: {len(injected_dataset)}")
print(injected_indices)

In [None]:
# Row position where the attack logs are spliced in -- presumably the first
# gap reported by the gap search; verify against the printed gap listing.
gap_position = 862
benign_copy = dataset_benign.copy()
# Benign rows before the position, then all combined attack rows, then the
# remaining benign rows.
injected_dataset = pd.concat([
    benign_copy.iloc[:gap_position],
    combined_malicious_logs,
    benign_copy.iloc[gap_position:],
])
print(len(injected_dataset))

In [None]:
# Stack benign and malicious rows (each keeps its own index labels), then
# order the combined rows by ascending Session_ID.
injected_dataset = (
    pd.concat((dataset_benign.copy(), combined_malicious_logs))
    .sort_values(by="Session_ID", ascending=True)
)

In [None]:
import hashlib

edges = []
prev_path = None
prev_path_hashed = None  # node id of prev_path, carried over from the previous row

row_fields = zip(
    injected_dataset['Path'],
    injected_dataset['Session_ID'],
    injected_dataset['Label'],
)
for current_path, timestamp, label in row_fields:
    # Node id: the path's MD5 digest reduced modulo 10**8
    current_path_hashed = int(hashlib.md5(current_path.encode()).hexdigest(), 16) % (10**8)

    # Every row after the first yields one edge from the previous path to the
    # current one; when both paths are equal this is a self-edge.
    if prev_path is not None:
        edges.append({
            'src_node': prev_path_hashed,
            'dst_node': current_path_hashed,
            'timestamp': timestamp,
            'label': label,
        })
    prev_path = current_path
    prev_path_hashed = current_path_hashed

# Write the edge list as header-less, space-separated text
edges_df = pd.DataFrame(edges)
column_order = ['timestamp', 'src_node', 'dst_node', 'label']
edges_df[column_order].to_csv('organized_combined.txt', sep=' ', header=False, index=False)

MAD

## MIDAS ##

In [96]:
# Build the MIDAS input: the Attack1 log injected into a randomly chosen gap of the benign dataset
midas_dataset = five_minute_injection(dataset_benign, malicious_logs1)

def fill_gaps(wuil_df):
    """Fill every jump in Session_ID with copies of the row that precedes it.

    Whenever the next row's Session_ID is more than one tick ahead of the
    current row's, the current row is duplicated once per missing tick and the
    duplicates receive the missing Session_ID values (current + 1, current + 2,
    ...). Rows that are followed by an equal, adjacent or smaller Session_ID
    are left as they are.

    The whole result is built in one pass. Re-concatenating the frame once per
    gap is quadratic and mixes index labels with row positions, which breaks on
    frames that do not carry a default 0..n-1 index.

    Args:
        wuil_df: DataFrame with a ``Session_ID`` column convertible to int.
            It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Original rows keep their
        Session_ID value untouched; inserted rows get integer Session_IDs and
        copy every other column from the row they follow.
    """
    wuil_df = wuil_df.reset_index(drop=True)
    if wuil_df.empty:
        return wuil_df

    session_ids = wuil_df["Session_ID"].astype(int)
    step_to_next = session_ids.shift(-1) - session_ids
    # A step of k > 1 means the row must appear k times in total (itself plus
    # k - 1 fillers). Every other row, including the last one whose step is
    # NaN, appears exactly once.
    copies = step_to_next.fillna(1).clip(lower=1).astype(int)

    # Position of the source row for each output row, e.g. [0, 1, 1, 1, 2, ...]
    source_rows = wuil_df.index.repeat(copies.to_numpy()).to_numpy()
    filled = wuil_df.iloc[source_rows].reset_index(drop=True)

    # 0 for an original row, 1, 2, ... for the fillers that follow it
    offsets = pd.Series(source_rows).groupby(source_rows).cumcount()

    original_values = filled["Session_ID"].tolist()
    base_ids = session_ids.iloc[source_rows].tolist()
    filled["Session_ID"] = [
        value if offset == 0 else base + offset
        for value, base, offset in zip(original_values, base_ids, offsets.tolist())
    ]
    return filled


# Sanity check: run fill_gaps on a tiny frame whose Session_IDs contain
# a repeat (1, 1) and several jumps (1 -> 4, 4 -> 8, 9 -> 11).
sample_df_with_gaps = pd.DataFrame(
    [{"Session_ID": session_id} for session_id in ("1", "1", "4", "8", "9", "11")]
)
fill_gaps(sample_df_with_gaps)

Unnamed: 0,Session_ID
0,1
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
