In [None]:
import logging 
from torch_geometric import seed_everything
import os
import configparser
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='app.log',
    #filemode='w'
    filemode='a',
)
logging.info('This will get logged to a file')

In [None]:
%load_ext autoreload
%autoreload 1
from benchmark.preprocess import *
from benchmark.constants import *
from benchmark.db_import import *
from benchmark.ground_truth import *
from benchmark.construct_threatrace_graph import *
from benchmark.threatrace import *

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
import itertools
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import f1_score, classification_report, confusion_matrix

from torch_geometric.loader import DataLoader
from torch_geometric.loader.neighbor_loader import NeighborLoader

from torch_geometric.data import HeteroData
from torch.nn import functional as F
from torch.optim import Adam
from torch import nn
import torch

In [None]:
df = pd.read_csv("/Users/robinbuchta/Documents/GitHub/Hands-On-Graph-Neural-Networks-Using-Python/Chapter16/CIDDS-001/traffic/OpenStack/CIDDS-001-internal-week1.csv")
df 

In [None]:
df = df.drop(columns=['Src Pt', 'Dst Pt', 'Flows', 'Tos', 'class', 'attackID', 'attackDescription'])
df['attackType'] = df['attackType'].replace('---', 'benign')
df['Date first seen'] = pd.to_datetime(df['Date first seen'])
df

In [None]:
count_labels = df['attackType'].value_counts() / len(df) * 100
print(count_labels)
plt.pie(count_labels[:3], labels=df['attackType'].unique()[:3], autopct='%.0f%%')
plt.show()

In [None]:
fig, ((ax1, ax2, ax3)) = plt.subplots(1, 3, figsize=(15,5))
df['Duration'].hist(ax=ax1)
ax1.set_xlabel("Duration")
df['Packets'].hist(ax=ax2)
ax2.set_xlabel("Number of packets")
pd.to_numeric(df['Bytes'], errors='coerce').hist(ax=ax3)
ax3.set_xlabel("Number of bytes")
plt.show()

In [None]:
df['weekday'] = df['Date first seen'].dt.weekday
df = pd.get_dummies(df, columns=['weekday']).rename(columns = {'weekday_0': 'Monday',
                                                              'weekday_1': 'Tuesday',
                                                              'weekday_2': 'Wednesday',
                                                              'weekday_3': 'Thursday',
                                                              'weekday_4': 'Friday',
                                                              'weekday_5': 'Saturday',
                                                              'weekday_6': 'Sunday',
                                                             })

df['daytime'] = (df['Date first seen'].dt.second +df['Date first seen'].dt.minute*60 + df['Date first seen'].dt.hour*60*60)/(24*60*60)

In [None]:
def one_hot_flags(input):
    return [1 if char1 == char2 else 0 for char1, char2 in zip('APRSF', input[1:])]

df = df.reset_index(drop=True)
ohe_flags = one_hot_flags(df['Flags'].to_numpy())
ohe_flags = df['Flags'].apply(one_hot_flags).to_list()
df[['ACK', 'PSH', 'RST', 'SYN', 'FIN']] = pd.DataFrame(ohe_flags, columns=['ACK', 'PSH', 'RST', 'SYN', 'FIN'])
df = df.drop(columns=['Date first seen', 'Flags'])


In [None]:
#Functions for deanonymizing IPv4 addresses and converting them to binary representation
import random
import re

ipv4_pattern = r"^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$"

# Dictionary to store deanonymized 3 bytes for each random part
random_to_ip = {}

def generate_random_part_one():
    return random.randint(0, 191)
def generate_random_part():
    return random.randint(0, 255)

def deanonymize_ipv4(anonymized_ipv4):
    specific_ip_dict = {
        "OPENSTACK NET": "100.100.100.100",
        "DNS": "200.200.200.200",
        "EXT_SERVER": "90.90.90.90",
        "ATTACKER1": "10.10.10.10",
        "ATTACKER2": "20.20.20.20",
        "ATTACKER3": "30.30.30.30",
    }
    if anonymized_ipv4 in specific_ip_dict:
        return specific_ip_dict[anonymized_ipv4]
    try:
        random_part, last_byte = anonymized_ipv4.split('_')
        last_byte = int(last_byte)
        
        if not (0 <= last_byte <= 255):
            raise ValueError("Invalid last byte")
        
        if random_part not in random_to_ip:
            random_to_ip[random_part] = f"{generate_random_part_one()}.{generate_random_part()}.{generate_random_part()}"
        
        ip_prefix = random_to_ip[random_part]
        deanonymized_ip = f"{ip_prefix}.{last_byte}"
        return deanonymized_ip
    except (ValueError, IndexError):
        return None  # Return None for any errors during deanonymization
    
def ipv4_to_binary(ipv4_address):
    octets = ipv4_address.split('.')
    binary_encoded = []

    for octet in octets:
        binary_octet = format(int(octet), '08b')  # Convert to 8-bit binary representation
        binary_encoded.append(binary_octet)

    binary_address = ''.join(binary_encoded)
    return binary_address

def ipv4_to_number(ipv4_address):
    octets = ipv4_address.split('.')
    number_encoded = []

    for octet in octets:
        number_octet = format(int(octet), 'd')  # Convert to 8-bit binary representation
        number_encoded.append(number_octet)

    number_address = ''.join(number_encoded)
    return number_address


In [None]:
temp = pd.DataFrame()
df_src_host = pd.DataFrame()
df["src_ip_deanonymized"] = df["Src IP Addr"].apply(lambda x: deanonymize_ipv4(x) if not re.match(ipv4_pattern, x) else x)
df["src_ip_number"] = df["src_ip_deanonymized"].apply(lambda x: ipv4_to_number(x))


In [None]:
# ToDo: Optimize and make deanoymize and binary together in one function
temp = pd.DataFrame()
df_dst_host = pd.DataFrame()
df["dst_ip_deanonymized"] = df["Dst IP Addr"].apply(lambda x: deanonymize_ipv4(x) if not re.match(ipv4_pattern, x) else x)
df["dst_ip_number"] = df["dst_ip_deanonymized"].apply(lambda x: ipv4_to_number(x))


In [None]:
df

In [None]:
m_index = df[pd.to_numeric(df['Bytes'], errors='coerce').isnull() == True].index
df['Bytes'].loc[m_index] = df['Bytes'].loc[m_index].apply(lambda x: 10e6 * float(x.strip().split()[0]))
df['Bytes'] = pd.to_numeric(df['Bytes'], errors='coerce', downcast='integer')

In [None]:
df = pd.get_dummies(df, prefix='', prefix_sep='', columns=['Proto', 'attackType'])
# df = pd.get_dummies(df, prefix='', prefix_sep='', columns=['attackType'])
df.head(5)

In [None]:
#save dataframes to csv files
df.to_csv('/Users/robinbuchta/Documents/GitHub/Hands-On-Graph-Neural-Networks-Using-Python/Chapter16/dataframe_tt.csv', index=False)

# Shortcut

In [1]:
import torch
#!pip install -q torch-scatter~=2.1.0 torch-sparse~=0.6.16 torch-cluster~=1.6.0 torch-spline-conv~=1.2.1 torch-geometric==2.2.0 -f https://data.pyg.org/whl/torch-{torch.__version__}.html

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
import itertools
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import f1_score, classification_report, confusion_matrix

from torch_geometric.loader import DataLoader
from torch_geometric.loader.neighbor_loader import NeighborLoader

from torch_geometric.data import HeteroData
from torch_geometric.data import Data

from torch.nn import functional as F
from torch.optim import Adam
from torch import nn
import torch

In [2]:
#load csv to pandas df
df = pd.read_csv('/Users/robinbuchta/Documents/GitHub/Hands-On-Graph-Neural-Networks-Using-Python/Chapter16/dataframe_tt.csv')

In [3]:
df

Unnamed: 0,Duration,Src IP Addr,Dst IP Addr,Packets,Bytes,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,daytime,ACK,PSH,RST,SYN,FIN,src_ip_deanonymized,src_ip_number,dst_ip_deanonymized,dst_ip_number,ICMP,IGMP,TCP,UDP,benign,bruteForce,dos,pingScan,portScan
0,0.000,192.168.100.5,192.168.220.16,1,108,False,False,True,False,False,False,False,0.000880,1,1,0,0,0,192.168.100.5,1921681005,192.168.220.16,19216822016,False,False,True,False,True,False,False,False,False
1,0.000,192.168.100.5,192.168.220.15,1,108,False,False,True,False,False,False,False,0.000880,1,1,0,0,0,192.168.100.5,1921681005,192.168.220.15,19216822015,False,False,True,False,True,False,False,False,False
2,0.004,192.168.220.15,192.168.100.5,2,174,False,False,True,False,False,False,False,0.000880,1,1,0,0,0,192.168.220.15,19216822015,192.168.100.5,1921681005,False,False,True,False,True,False,False,False,False
3,0.004,192.168.220.16,192.168.100.5,2,174,False,False,True,False,False,False,False,0.000880,1,1,0,0,0,192.168.220.16,19216822016,192.168.100.5,1921681005,False,False,True,False,True,False,False,False,False
4,0.000,192.168.100.5,192.168.220.15,1,108,False,False,True,False,False,False,False,0.000880,1,1,0,0,0,192.168.100.5,1921681005,192.168.220.15,19216822015,False,False,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8451515,0.248,192.168.200.8,EXT_SERVER,2,319,False,True,False,False,False,False,False,0.999954,1,1,0,0,0,192.168.200.8,1921682008,90.90.90.90,90909090,False,False,True,False,True,False,False,False,False
8451516,0.000,10179_174,192.168.210.5,1,54,False,True,False,False,False,False,False,0.999965,1,0,0,0,0,151.27.253.174,15127253174,192.168.210.5,1921682105,False,False,True,False,True,False,False,False,False
8451517,0.000,192.168.210.5,10179_174,1,55,False,True,False,False,False,False,False,0.999954,1,0,0,0,0,192.168.210.5,1921682105,151.27.253.174,15127253174,False,False,True,False,True,False,False,False,False
8451518,0.000,192.168.100.5,192.168.220.6,1,108,False,True,False,False,False,False,False,0.999977,1,1,0,0,0,192.168.100.5,1921681005,192.168.220.6,1921682206,False,False,True,False,True,False,False,False,False


In [4]:
#create new column "host" all values are 0
df['host'] = False

In [5]:
src_ip = df['src_ip_number'].to_numpy() 
print(f"Unique src_ip: {len(np.unique(src_ip))}")
dst_ip = df['dst_ip_number'].to_numpy() 
print(f"Unique dst_ip: {len(np.unique(dst_ip))}")

ip_map = {ip:index for index, ip in enumerate(np.unique(np.append(src_ip, dst_ip)))}

Unique src_ip: 9346
Unique dst_ip: 10104


In [6]:
ip_map

{0: 0,
 74227: 1,
 74235: 2,
 74270: 3,
 85346: 4,
 136464: 5,
 150435: 6,
 224134: 7,
 293317: 8,
 398458: 9,
 402601: 10,
 402602: 11,
 442260: 12,
 613377: 13,
 616816: 14,
 655231: 15,
 677421: 16,
 742124: 17,
 750490: 18,
 903469: 19,
 1019177: 20,
 1072776: 21,
 1081537: 22,
 1117133: 23,
 1136363: 24,
 1203231: 25,
 1245315: 26,
 1245317: 27,
 1245318: 28,
 1245336: 29,
 1414531: 30,
 1460975: 31,
 1467125: 32,
 1597043: 33,
 1610508: 34,
 1664946: 35,
 1692528: 36,
 1723817: 37,
 1871947: 38,
 1878121: 39,
 1883148: 40,
 1884430: 41,
 1884439: 42,
 1886729: 43,
 1886732: 44,
 1886735: 45,
 1886737: 46,
 2029341: 47,
 2070911: 48,
 2141833: 49,
 2181525: 50,
 2353101: 51,
 2354421: 52,
 2413281: 53,
 2444029: 54,
 2461178: 55,
 2654734: 56,
 2725881: 57,
 2731266: 58,
 2881286: 59,
 2922268: 60,
 2968099: 61,
 2979164: 62,
 2992338: 63,
 3160835: 64,
 3160836: 65,
 3160838: 66,
 3164512: 67,
 3227810: 68,
 3227818: 69,
 3267950: 70,
 3314884: 71,
 3314898: 72,
 3382473: 73,
 33

In [7]:
hosts_dummy = np.zeros((len(ip_map), df.shape[1]))

In [8]:
df_hosts_dummy = pd.DataFrame(hosts_dummy, columns=df.columns)

In [9]:
df["ip"] = 0

In [10]:
df_hosts_dummy["host"] = True
df_hosts_dummy["benign"] = True
df_hosts_dummy["ip"] = ip_map.keys()
df_hosts_dummy["src_ip_number"] = 0
df_hosts_dummy["dst_ip_number"] = 0
labels = ['ICMP ', 'IGMP ', 'TCP  ', 'UDP  ']
df_hosts_dummy[labels] = False 


In [11]:
df_hosts_dummy

Unnamed: 0,Duration,Src IP Addr,Dst IP Addr,Packets,Bytes,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,daytime,ACK,PSH,RST,SYN,FIN,src_ip_deanonymized,src_ip_number,dst_ip_deanonymized,dst_ip_number,ICMP,IGMP,TCP,UDP,benign,bruteForce,dos,pingScan,portScan,host,ip
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74227
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74235
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74270
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,85346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,192168210254
10104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,192168210255
10105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,192168220255
10106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,200200200200


In [12]:
df_nodes = pd.concat([df_hosts_dummy, df], ignore_index=True)

In [13]:
df_nodes

Unnamed: 0,Duration,Src IP Addr,Dst IP Addr,Packets,Bytes,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,daytime,ACK,PSH,RST,SYN,FIN,src_ip_deanonymized,src_ip_number,dst_ip_deanonymized,dst_ip_number,ICMP,IGMP,TCP,UDP,benign,bruteForce,dos,pingScan,portScan,host,ip
0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,0
1,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74227
2,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74235
3,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74270
4,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,85346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8461623,0.248,192.168.200.8,EXT_SERVER,2.0,319.0,False,True,False,False,False,False,False,0.999954,1.0,1.0,0.0,0.0,0.0,192.168.200.8,1921682008,90.90.90.90,90909090,False,False,True,False,True,False,False,False,False,False,0
8461624,0.000,10179_174,192.168.210.5,1.0,54.0,False,True,False,False,False,False,False,0.999965,1.0,0.0,0.0,0.0,0.0,151.27.253.174,15127253174,192.168.210.5,1921682105,False,False,True,False,True,False,False,False,False,False,0
8461625,0.000,192.168.210.5,10179_174,1.0,55.0,False,True,False,False,False,False,False,0.999954,1.0,0.0,0.0,0.0,0.0,192.168.210.5,1921682105,151.27.253.174,15127253174,False,False,True,False,True,False,False,False,False,False,0
8461626,0.000,192.168.100.5,192.168.220.6,1.0,108.0,False,True,False,False,False,False,False,0.999977,1.0,1.0,0.0,0.0,0.0,192.168.100.5,1921681005,192.168.220.6,1921682206,False,False,True,False,True,False,False,False,False,False,0


In [14]:
# labels = ['benign', 'bruteForce', 'dos', 'pingScan', 'portScan']
labels = ['ICMP ', 'IGMP ', 'TCP  ', 'UDP  ']
# df_train, df_test = train_test_split(df, random_state=0, test_size=0.2, stratify=df[labels])
# df_val, df_test = train_test_split(df_test, random_state=0, test_size=0.5, stratify=df_test[labels])
df_train = df_nodes[df_nodes["benign"] == 1]

In [15]:
df_test = df_nodes

In [16]:
scaler = PowerTransformer()
df_train[['Duration', 'Packets', 'Bytes']] = scaler.fit_transform(df_train[['Duration', 'Packets', 'Bytes']])
# df_val[['Duration', 'Packets', 'Bytes']] = scaler.transform(df_val[['Duration', 'Packets', 'Bytes']])
df_test[['Duration', 'Packets', 'Bytes']] = scaler.transform(df_test[['Duration', 'Packets', 'Bytes']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[['Duration', 'Packets', 'Bytes']] = scaler.fit_transform(df_train[['Duration', 'Packets', 'Bytes']])


In [17]:
hosts = df_train[df_train["host"] == 1]

ip = hosts['ip'].to_numpy() 

In [18]:
hosts[hosts["ip"] == 0].index.tolist()[0]

0

In [19]:
hosts['ip']

0                   0
1               74227
2               74235
3               74270
4               85346
             ...     
10103    192168210254
10104    192168210255
10105    192168220255
10106    200200200200
10107    255255255255
Name: ip, Length: 10108, dtype: int64

In [20]:
df_train["src_ip_number"]

0                    0
1                    0
2                    0
3                    0
4                    0
              ...     
8461623     1921682008
8461624    15127253174
8461625     1921682105
8461626     1921681005
8461627     1921682206
Name: src_ip_number, Length: 7021005, dtype: int64

In [21]:
flows = df_train[df_train["host"] == 0]
hosts = df_train[df_train["host"] == 1]

In [22]:
for  index, x in hosts["ip"].items():
    print(index, x)

0 0
1 74227
2 74235
3 74270
4 85346
5 136464
6 150435
7 224134
8 293317
9 398458
10 402601
11 402602
12 442260
13 613377
14 616816
15 655231
16 677421
17 742124
18 750490
19 903469
20 1019177
21 1072776
22 1081537
23 1117133
24 1136363
25 1203231
26 1245315
27 1245317
28 1245318
29 1245336
30 1414531
31 1460975
32 1467125
33 1597043
34 1610508
35 1664946
36 1692528
37 1723817
38 1871947
39 1878121
40 1883148
41 1884430
42 1884439
43 1886729
44 1886732
45 1886735
46 1886737
47 2029341
48 2070911
49 2141833
50 2181525
51 2353101
52 2354421
53 2413281
54 2444029
55 2461178
56 2654734
57 2725881
58 2731266
59 2881286
60 2922268
61 2968099
62 2979164
63 2992338
64 3160835
65 3160836
66 3160838
67 3164512
68 3227810
69 3227818
70 3267950
71 3314884
72 3314898
73 3382473
74 3388797
75 3391871
76 3391872
77 3391873
78 3391874
79 3391875
80 3391876
81 3391878
82 3391879
83 3396270
84 3489518
85 3531931
86 3571498
87 3743553
88 3781043
89 4005764
90 4005766
91 4005770
92 4005775
93 4544044
94 46

In [23]:
result_dict = {}
for index, row in hosts["ip"].items():
    result_dict[row] = index

In [24]:
result_dict[1921681005]

3889

In [25]:
flow = list(range(len(flows))) 
print(f"flow: {len(flow)}")
flow = flow + flow
print(f"y: {len(flow)}")

flow: 7010897
y: 14021794


In [26]:
df_train[labels]

Unnamed: 0,ICMP,IGMP,TCP,UDP
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
8461623,False,False,True,False
8461624,False,False,True,False
8461625,False,False,True,False
8461626,False,False,True,False


In [27]:
import numpy as np
# Create a new column based on the condition
def merge_columns(row):
    for i, value in enumerate(row):
        if value != np.nan:
            if value != 0.0:
                if value != "NaN":
                    if value:   
                        return i
    return None

labels = [ 'host', 'ICMP ', 'IGMP ', 'TCP  ', 'UDP  ']
df_train['label_mapped'] = df_train[labels].apply(merge_columns, axis=1)
df_test['label_mapped'] = df_test[labels].apply(merge_columns, axis=1)

df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['label_mapped'] = df_train[labels].apply(merge_columns, axis=1)


Unnamed: 0,Duration,Src IP Addr,Dst IP Addr,Packets,Bytes,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,daytime,ACK,PSH,RST,SYN,FIN,src_ip_deanonymized,src_ip_number,dst_ip_deanonymized,dst_ip_number,ICMP,IGMP,TCP,UDP,benign,bruteForce,dos,pingScan,portScan,host,ip,label_mapped
0,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,0,0
1,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74227,0
2,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74235,0
3,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74270,0
4,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,85346,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8461623,1.949052,192.168.200.8,EXT_SERVER,0.355854,0.563484,False,True,False,False,False,False,False,0.999954,1.0,1.0,0.0,0.0,0.0,192.168.200.8,1921682008,90.90.90.90,90909090,False,False,True,False,True,False,False,False,False,False,0,3
8461624,-0.616728,10179_174,192.168.210.5,-0.851851,-0.993715,False,True,False,False,False,False,False,0.999965,1.0,0.0,0.0,0.0,0.0,151.27.253.174,15127253174,192.168.210.5,1921682105,False,False,True,False,True,False,False,False,False,False,0,3
8461625,-0.616728,192.168.210.5,10179_174,-0.851851,-0.973605,False,True,False,False,False,False,False,0.999954,1.0,0.0,0.0,0.0,0.0,192.168.210.5,1921682105,151.27.253.174,15127253174,False,False,True,False,True,False,False,False,False,False,0,3
8461626,-0.616728,192.168.100.5,192.168.220.6,-0.851851,-0.296885,False,True,False,False,False,False,False,0.999977,1.0,1.0,0.0,0.0,0.0,192.168.100.5,1921681005,192.168.220.6,1921682206,False,False,True,False,True,False,False,False,False,False,0,3


In [28]:
df_train

Unnamed: 0,Duration,Src IP Addr,Dst IP Addr,Packets,Bytes,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,daytime,ACK,PSH,RST,SYN,FIN,src_ip_deanonymized,src_ip_number,dst_ip_deanonymized,dst_ip_number,ICMP,IGMP,TCP,UDP,benign,bruteForce,dos,pingScan,portScan,host,ip,label_mapped
0,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,0,0
1,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74227,0
2,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74235,0
3,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74270,0
4,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,85346,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8461623,1.949052,192.168.200.8,EXT_SERVER,0.355854,0.563484,False,True,False,False,False,False,False,0.999954,1.0,1.0,0.0,0.0,0.0,192.168.200.8,1921682008,90.90.90.90,90909090,False,False,True,False,True,False,False,False,False,False,0,3
8461624,-0.616728,10179_174,192.168.210.5,-0.851851,-0.993715,False,True,False,False,False,False,False,0.999965,1.0,0.0,0.0,0.0,0.0,151.27.253.174,15127253174,192.168.210.5,1921682105,False,False,True,False,True,False,False,False,False,False,0,3
8461625,-0.616728,192.168.210.5,10179_174,-0.851851,-0.973605,False,True,False,False,False,False,False,0.999954,1.0,0.0,0.0,0.0,0.0,192.168.210.5,1921682105,151.27.253.174,15127253174,False,False,True,False,True,False,False,False,False,False,0,3
8461626,-0.616728,192.168.100.5,192.168.220.6,-0.851851,-0.296885,False,True,False,False,False,False,False,0.999977,1.0,1.0,0.0,0.0,0.0,192.168.100.5,1921681005,192.168.220.6,1921682206,False,False,True,False,True,False,False,False,False,False,0,3


In [54]:
#features_host = [f'ipsrc_{i}' for i in range(1, 33)] + [f'ipdst_{i}' for i in range(1, 33)]
# labels = ['benign', 'bruteForce', 'dos', 'pingScan', 'portScan']
labels = ['ICMP ', 'IGMP ', 'TCP  ', 'UDP  ', 'host']
features_host = [f'{i}' for i in range(1, 33)]
features_dst_host = [f'{i}' for i in range(1, 33)]
#features_flow = ['daytime', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Duration', 'Packets', 'Bytes', 'ACK', 'PSH', 'RST', 'SYN', 'FIN', 'ICMP ', 'IGMP ', 'TCP  ', 'UDP  ']
features_flow = ['daytime', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Duration', 'Packets', 'Bytes', 'ACK', 'PSH', 'RST', 'SYN', 'FIN']
#features_flow = ['daytime', 'Monday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Duration', 'Packets', 'Bytes', 'ACK', 'PSH', 'RST', 'SYN', 'FIN', 'ICMP ', 'IGMP ', 'TCP  ', 'UDP  ']


def get_connections(ip_dict, flows): 
    print(f"hosts: {hosts.shape}")
    print(f"flows: {flows.shape}")
    # host_src = [hosts[hosts["ip"] == ip].index.tolist()[0] for ip in flows["src_ip_number"]] 
    # host_dst = [hosts[hosts["ip"] == ip].index.tolist()[0] for ip in flows["dst_ip_number"]] 
    host_src = [ip_dict[ip] for ip in flows["src_ip_number"]] 
    host_dst = [ip_dict[ip] for ip in flows["dst_ip_number"]]
    print(f"host_src: {len(host_src)}")
    print(f"host_dst: {len(host_dst)}")
    print(f"unique host_src: {len(np.unique(host_src))}")
    print(f"unique host_dst: {len(np.unique(host_dst))}")
    flow = list(range(len(hosts),len(flows)+len(hosts))) 
    print(f"flow: {len(flow)}") # -> offset from hosts is needed! 
    x = host_src + host_dst
    print(f"host_src: {len(x)}")
    y = flow + flow
    print(f"flow: {len(y)}")
    #return torch.Tensor([host_src, flow]).long(), torch.Tensor([host_dst, flow]).long()
    #return torch.Tensor([flow, host_src]).long(), torch.Tensor([flow, host_dst]).long()
    
    return torch.Tensor([x, y]).long()

def create_data(df): 
    flows = df[df["host"] == 0]
    hosts = df[df["host"] == 1]
    print(f"Flows: {flows.shape}")
    print(f"Hosts: {hosts.shape}")
    ip = hosts['ip'].to_numpy() 
    print(f"Unique src_ip: {len(np.unique(ip))}")

    ip_dict = {}
    for index, row in hosts["ip"].items():
        ip_dict[row] = index

    
    ip_map = {ip:index for index, ip in enumerate(np.unique(ip))}
    print(f"Unique ip_map: {len(ip_map)}")
    flow_to_host = get_connections(ip_dict, flows)
    print(f"flow_to_host: {flow_to_host.shape}")

    data = Data()

    # data["host"].x = torch.Tensor(host_dummy_features).float()
    data.x = torch.Tensor(df[features_flow].to_numpy(dtype=np.float32)).float() # dtype=np.float32 prevent error : #TypeError: can't convert np.ndarray of type numpy.object_. 
    data.y = torch.Tensor(df["label_mapped"].to_numpy()).long()

    data.edge_index = flow_to_host

    data.train_mask = torch.Tensor(np.ones((len(data.x)))).bool()
    data.test_mask = torch.Tensor(np.ones((len(data.x)))).bool()

    return data

train_data = create_data(df_train)
test_data = create_data(df_test)

Flows: (7010897, 34)
Hosts: (10108, 34)
Unique src_ip: 10108
Unique ip_map: 10108
hosts: (10108, 33)
flows: (7010897, 34)
host_src: 7010897
host_dst: 7010897
unique host_src: 9343
unique host_dst: 9354
flow: 7010897
host_src: 14021794
flow: 14021794
flow_to_host: torch.Size([2, 14021794])
Flows: (8451520, 34)
Hosts: (10108, 34)
Unique src_ip: 10108
Unique ip_map: 10108
hosts: (10108, 33)
flows: (8451520, 34)
host_src: 8451520
host_dst: 8451520
unique host_src: 9346
unique host_dst: 10104
flow: 8451520
host_src: 16903040
flow: 16903040
flow_to_host: torch.Size([2, 16903040])


In [55]:
test_data

Data(x=[8461628, 16], y=[8461628], edge_index=[2, 16903040], train_mask=[8461628], test_mask=[8461628])

In [56]:
train_data

Data(x=[7021005, 16], y=[7021005], edge_index=[2, 14021794], train_mask=[7021005], test_mask=[7021005])

In [57]:
%load_ext autoreload
%autoreload 1
from benchmark.preprocess import *
from benchmark.constants import *
from benchmark.db_import import *
from benchmark.ground_truth import *
from benchmark.construct_threatrace_graph import *
from benchmark.threatrace import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
from torch_geometric.loader import NeighborLoader


In [59]:
ttp = ThreaTracePipeline(train_data, test_data) 

In [60]:
#ttp.delete_old_models()
ttp.pretraining()
ttp.multi_model_training()

1 1.9230844974517822 4.92806941456387e-05 2023-08-10 10:58:18
2 1.6948002576828003 0.10293526354133062 2023-08-10 10:59:26
3 1.6902745962142944 0.2881979146860029 2023-08-10 11:00:10
4 1.461227297782898 0.7660303617502052 2023-08-10 11:00:55
5 1.3520114421844482 0.8469183542811891 2023-08-10 11:01:52
6 1.244035243988037 0.8494648558148015 2023-08-10 11:02:34
7 1.223513126373291 0.8494648558148015 2023-08-10 11:03:17
8 1.0870167016983032 0.8494648558148015 2023-08-10 11:03:58
9 1.1340965032577515 0.8494648558148015 2023-08-10 11:04:35
New Round
false_classified, true_classified
1056908 5964097
Model saved loop_num: 0 2023-08-10 11:08:25
New Round
false_classified, true_classified
10744 1046164
Model saved loop_num: 1 2023-08-10 11:09:08
New Round
false_classified, true_classified
636 10108
Model saved loop_num: 2 2023-08-10 11:09:09
New Round
false_classified, true_classified
246 390
Model saved loop_num: 3 2023-08-10 11:09:10


In [None]:
print("##############################################################################################################")
print("own evaluation")
print("##############################################################################################################")
ttp.reinit_test_data() 
ttp.test_model_performance() 


##############################################################################################################
own evaluation
##############################################################################################################
Loop_num: 0  Accuracy:0.8711  true_classified:7371269  false_classified:1090359
Loop_num: 1  Accuracy:0.1238  true_classified:1047279  false_classified:7414349
Loop_num: 2  Accuracy:0.8738  true_classified:7393818  false_classified:1067810
Loop_num: 3  Accuracy:0.0012  true_classified:10108  false_classified:8451520
Loop_num: 4  Accuracy:0.0011  true_classified:8944  false_classified:8452684
Loop_num: 5  Accuracy:0.0000  true_classified:246  false_classified:8461382
Unique Count of data_flow.y:  (tensor([0, 1, 2, 3, 4]), tensor([  10108,   10177,     246, 7393818, 1047279]))
Unique Count of pred: (tensor([False,  True]), tensor([8460395,    1233]))


In [42]:
df_test

Unnamed: 0,Duration,Src IP Addr,Dst IP Addr,Packets,Bytes,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,daytime,ACK,PSH,RST,SYN,FIN,src_ip_deanonymized,src_ip_number,dst_ip_deanonymized,dst_ip_number,ICMP,IGMP,TCP,UDP,benign,bruteForce,dos,pingScan,portScan,host,ip,label_mapped
0,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,0,0
1,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74227,0
2,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74235,0
3,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,74270,0
4,-0.616728,0.0,0.0,-5.184286,-9.243745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,False,False,False,False,True,0.0,0.0,0.0,0.0,True,85346,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8461623,1.949052,192.168.200.8,EXT_SERVER,0.355854,0.563484,False,True,False,False,False,False,False,0.999954,1.0,1.0,0.0,0.0,0.0,192.168.200.8,1921682008,90.90.90.90,90909090,False,False,True,False,True,False,False,False,False,False,0,3
8461624,-0.616728,10179_174,192.168.210.5,-0.851851,-0.993715,False,True,False,False,False,False,False,0.999965,1.0,0.0,0.0,0.0,0.0,151.27.253.174,15127253174,192.168.210.5,1921682105,False,False,True,False,True,False,False,False,False,False,0,3
8461625,-0.616728,192.168.210.5,10179_174,-0.851851,-0.973605,False,True,False,False,False,False,False,0.999954,1.0,0.0,0.0,0.0,0.0,192.168.210.5,1921682105,151.27.253.174,15127253174,False,False,True,False,True,False,False,False,False,False,0,3
8461626,-0.616728,192.168.100.5,192.168.220.6,-0.851851,-0.296885,False,True,False,False,False,False,False,0.999977,1.0,1.0,0.0,0.0,0.0,192.168.100.5,1921681005,192.168.220.6,1921682206,False,False,True,False,True,False,False,False,False,False,0,3


In [44]:
# create list of indexes for label of flow 
tmp = df_test['benign'].tolist()

# get indexes where label is False
gt = [i for i, x in enumerate(gt) if x == False]

In [45]:
print("##############################################################################################################")
print("2 HOP")
hits = ttp.evaluation(gt)

##############################################################################################################
2 HOP
{'tn': 8461628}
start
{'tn': 8461628}
set every node in gt to fn
{'fn': 1440623, 'tn': 7021005}
gt
{'fn': 1440623, 'tn': 7021005}


100%|██████████| 1233/1233 [21:20<00:00,  1.04s/it]


{'tn': 7021005, 'tp': 1440623}
Intersection Counter: 1216
8461628
1440623 0 7021005 0
TP: 1440623, FP: 0, TN: 7021005, FN: 0
Precision:  1.0
Recall:  1.0
F-Score:  0.99999999995
Accuracy:  1.0


In [46]:
hits = ttp.evaluation_no_hop(gt)

start
{'tn': 8461628}
set every node in gt to fn
{'fn': 1440623, 'tn': 7021005}


100%|██████████| 1233/1233 [00:04<00:00, 260.14it/s]


{'fn': 1439418, 'fp': 15, 'tn': 7020990, 'tp': 1205}
8461628
1205 15 7020990 1439418
TP: 1205, FP: 15, TN: 7020990, FN: 1439418
Precision:  0.9877049180327059
Recall:  0.0008364436774923071
F-Score:  0.001671471859111017
Accuracy:  0.8298869910140224


In [47]:
hits = ttp.evaluation(gt, num_hops=1)

{'tn': 8461628}
start
{'tn': 8461628}
set every node in gt to fn
{'fn': 1440623, 'tn': 7021005}
gt
{'fn': 1440623, 'tn': 7021005}


100%|██████████| 1233/1233 [03:35<00:00,  5.73it/s]


{'fn': 1439418, 'fp': 15, 'tn': 7020990, 'tp': 1205}
Intersection Counter: 1205
8461628
1205 15 7020990 1439418
TP: 1205, FP: 15, TN: 7020990, FN: 1439418
Precision:  0.9877049180327059
Recall:  0.0008364436774923071
F-Score:  0.001671471859111017
Accuracy:  0.8298869910140224


In [None]:
from torch_geometric.transforms import AddMetaPaths

metapaths = [[("flow", "host"), ("host", "flow")]]

train_data = AddMetaPaths(metapaths)(train_data)


In [None]:
train_data

In [None]:
# Read config file
config_dir = "../configs"
config_file_record_03 = config_dir + '/eng3/cadets/cadets_03_record.ini'
config_rec3 = configparser.ConfigParser()
config_rec3.read(config_file_record_03)


In [None]:
# read tar file and extract to a folder + preprocess
process_darpa_data(config_rec3)


In [None]:
#setup and import data to mariaDB 
import_nodes_and_edges(config_rec3)

In [None]:
# create train and test data
mypygdata = MyPyGDataset(config_rec3, own_timestamp_percent=20, extension="20percentTrain") # 4min
data_train = mypygdata[0][0]
data_test = mypygdata[0][1]
print(data_train)
print(data_test)

In [None]:
# create ground truth 
gt_file1 = config_rec3['Files']['ground_truth'] # -> special case for record3, because here 2 attacks are given 
gt_file2 = config_rec3['Files']['ground_truth1'] # -> special case for record3, because here 2 attacks are given 
gts = [gt_file1, gt_file2]
gt = get_atk_nodes_for_evaluation(config_rec3, multi=True, gts=gts)

In [None]:
# create threatrace GNN anomaly detection Pipeline 
ttp = ThreaTracePipeline(config_rec3, data_train, data_test) 

In [None]:
#ttp.delete_old_models()
ttp.pretraining()
ttp.multi_model_training()

In [None]:
print("##############################################################################################################")
print("own evaluation")
print("##############################################################################################################")
ttp.reinit_test_data() 
ttp.test_model_performance() 


In [None]:
print("##############################################################################################################")
print("2 HOP")
hits = ttp.evaluation(gt)

# TP: 18234, FP: 0, TN: 549289, FN: 6815
# Precision:  0.9999999999999946
# Recall:  0.7279332508283736
# F-Score:  0.8425478824917251
# Accuracy:  0.9881341648994146

In [None]:
hits = ttp.evaluation_no_hop(gt)


In [None]:
ttp.get_detection_insights()

In [None]:
hits = ttp.evaluation(gt, num_hops=1)

In [None]:
def create_boolean_tensor(length, true_positions):
    tensor = np.zeros(length, dtype=bool)
    tensor[true_positions] = True
    return torch.tensor(tensor)

In [None]:
x = len(ttp.test_data.test_mask)  # The length of the tensor
positions_list = np.random.choice(len(ttp.test_data.x), 3323).tolist()  # List of positions to set as True
print(positions_list)
boolean_tensor = create_boolean_tensor(x, positions_list)
print(boolean_tensor)
random_hits = ttp.evaluation_no_hop(gt, boolean_tensor) 

In [None]:
hits = ttp.evaluation(gt, hits=boolean_tensor,num_hops=1)

In [None]:
x = len(ttp.test_data.test_mask)  # The length of the tensor
positions_list = np.random.choice(len(ttp.test_data.x), 113).tolist()  # List of positions to set as True
print(positions_list)
boolean_tensor = create_boolean_tensor(x, positions_list)
print(boolean_tensor)
random_hits = ttp.evaluation(gt, boolean_tensor) 

# Accuracy:  0.9999947765949667
# TP: 18437, FP: 0, TN: 549289, FN: 6612
# Precision:  0.9999999999999947
# Recall:  0.7360373667611453
# F-Score:  0.8479510646616337
# Accuracy:  0.9884876153066661

In [None]:
# hit_neighbor_count_list = []
# for i in tqdm(range(ttp.test_data.num_nodes)):
#     i_as_list = [i]
#     hit_neighbors, edge_index, mapping, edge_mask = k_hop_subgraph(i_as_list, num_hops=2, edge_index=ttp.test_data_undirected_edge_index)
#     hit_neighbor_count_list.append(len(hit_neighbors))
# print('Average number of hit neighbors: ', np.mean(hit_neighbor_count_list))
# print('Max number of hit neighbors: ', np.max(hit_neighbor_count_list))
# print('Min number of hit neighbors: ', np.min(hit_neighbor_count_list))



# 100%|██████████| 574338/574338 [25:11:59<00:00,  6.33it/s]        
# Average number of hit neighbors:  225559.81589760733
# Max number of hit neighbors:  428045
# Min number of hit neighbors:  1

In [None]:
# # save list hit_neighbor_count_list to file 
# with open('hit_neighbor_count_list_sorted.txt', 'w') as filehandle:
#     for listitem in hit_neighbor_count_list:
#         filehandle.write('%s\n' % listitem)

In [None]:
# #sort list hit_neighbor_count_list high to low
# hit_neighbor_count_list.sort(reverse=True)

In [None]:
len(ttp.test_data.x)

In [None]:
import matplotlib.pyplot as plt

# Assuming you have an array with neighbor count for each node
# Replace 'neighbor_count_array' with your actual data
# For demonstration, I'll create a random array of neighbor counts
np.random.seed(42)
neighbor_count_array = np.random.randint(0, 100, size=1000)

# Get the indices of the top 500 nodes based on neighbor count
top_500_nodes_indices = np.argsort(neighbor_count_array)[-500:][::-1]

# Get the neighbor count values for the top 500 nodes
top_500_neighbor_count = neighbor_count_array[top_500_nodes_indices]

# Plot the neighbor count for the top 500 nodes
plt.figure(figsize=(12, 6))
plt.bar(range(1, 501), top_500_neighbor_count)
plt.xlabel('Node Index (Top 500 Nodes)')
plt.ylabel('Neighbor Count')
plt.title('Neighbor Count for Top 500 Nodes')
plt.grid(axis='y')
plt.show()