In [None]:
# Mount Google Drive at /content/drive; the later cells read and write files
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression

In [None]:
def dados_tratar(df):
    """Add a coarse ``attack_cat`` column derived from the ``label`` column.

    Every row starts as ``'Normal'``; rows whose ``label`` appears in one of
    the groups below are then overwritten with that group's name. Labels that
    are not listed in any group therefore stay ``'Normal'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``attack_cat``.
    """
    # Category name -> fine-grained labels that belong to it.
    labels_by_category = {
        'DDoS': [
            'DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood',
            'DDoS-UDP_Flood', 'DDoS-TCP_Flood', 'DDoS-ICMP_Flood',
            'DDoS-SynonymousIP_Flood', 'DDoS-ACK_Fragmentation',
            'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
            'DDoS-SlowLoris', 'DDoS-HTTP_Flood',
        ],
        'DoS': [
            'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-HTTP_Flood',
        ],
        'Mirai': [
            'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain',
        ],
        'Recon': [
            'Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan',
            'VulnerabilityScan', 'Recon-HostDiscovery',
        ],
        'Spoofing': ['DNS_Spoofing', 'MITM-ArpSpoofing'],
        'Web': [
            'BrowserHijacking', 'Backdoor_Malware', 'XSS',
            'Uploading_Attack', 'SqlInjection', 'CommandInjection',
        ],
        'BruteForce': ['DictionaryBruteForce'],
    }

    # Default category for every row; overwritten below where a group matches.
    df['attack_cat'] = 'Normal'
    for category_name, member_labels in labels_by_category.items():
        is_member = df['label'].isin(member_labels)
        df.loc[is_member, 'attack_cat'] = category_name

    return df


if __name__ == "__main__":
    # Build the sampled dataset: read every raw CSV, keep a random fraction of
    # its rows, concatenate the samples, add the coarse attack category and
    # write features / 8-class labels / 34-class labels to separate CSV files.

    # --- Configuration ------------------------------------------------------
    # Fraction of rows kept from each raw CSV file.
    SAMPLE_FRAC = 0.05
    # Seed for the row sampler so that a re-run selects the same rows.
    RANDOM_SEED = 42
    # Directory containing the raw CSV files.
    directory = '/content/drive/MyDrive/intrusion/ciciot2023/01-RawDataset'
    # Directory that receives the three output CSV files.
    output_directory = '/content/drive/MyDrive/intrusion/ciciot2023/02-TreatedDataset'

    pd.set_option('display.max_columns', None)
    print("Loading dataset...")

    # Sorted so the files are always processed (and sampled) in the same order.
    csv_files = sorted(f for f in os.listdir(directory) if f.endswith(".csv"))
    if not csv_files:
        # Fail early with a clear message instead of letting pd.concat raise
        # its generic "No objects to concatenate" ValueError later on.
        raise ValueError(f"No .csv files found in {directory}")

    # One generator shared by all files: the draws are reproducible as a whole
    # while each file still gets its own, different selection of rows.
    rng = np.random.RandomState(RANDOM_SEED)

    all_sampled_data = []  # List to store sampled dataframes

    for i, filename in enumerate(csv_files, start=1):
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        print(f"Tamanho df: {df.shape}")
        sampled_df = df.sample(frac=SAMPLE_FRAC, random_state=rng)
        print(f"Tamanho sampled_df: {sampled_df.shape}")
        # Release the full file before loading the next one to limit peak memory.
        del df
        gc.collect()
        all_sampled_data.append(sampled_df)
        print(f"{i}. Processed: {filename}")

    # ignore_index=True gives the combined frame a unique 0..n-1 index instead
    # of repeating each file's original row numbers.
    concatenated_df = pd.concat(all_sampled_data, ignore_index=True)
    print(f'Dados importados com shape: {concatenated_df.shape}')

    # dados_tratar adds 'attack_cat' to the frame it receives and returns that
    # same frame, so `df` and `concatenated_df` refer to the same data.
    df = dados_tratar(concatenated_df)
    print("Distribuição do label:")
    print(df['label'].value_counts())
    print(f'Dados para AL com shape: {df.shape}')

    print("Salvar dataset")
    # Create the destination up front so the export cannot fail on a missing
    # directory after all the loading work has been done.
    os.makedirs(output_directory, exist_ok=True)
    # The three files are written from the same frame without index or header,
    # so row k of each file describes the same sample.
    df.drop(['label', 'attack_cat'], axis=1).to_csv(
        os.path.join(output_directory, 'CICIoT2023_features.csv'),
        index=False, header=False)
    df["attack_cat"].to_csv(
        os.path.join(output_directory, 'CICIoT2023_8_labels.csv'),
        index=False, header=False)
    df["label"].to_csv(
        os.path.join(output_directory, 'CICIoT2023_34_labels.csv'),
        index=False, header=False)
    print("Salvar dataset... DONE")

Loading dataset...
Tamanho df: (238687, 47)
Tamanho sampled_df: (11934, 47)
1. Processed: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Tamanho df: (218805, 47)
Tamanho sampled_df: (10940, 47)
2. Processed: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Tamanho df: (275258, 47)
Tamanho sampled_df: (13763, 47)
3. Processed: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Tamanho df: (231023, 47)
Tamanho sampled_df: (11551, 47)
4. Processed: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Tamanho df: (227491, 47)
Tamanho sampled_df: (11375, 47)
5. Processed: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Tamanho df: (240046, 47)
Tamanho sampled_df: (12002, 47)
6. Processed: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Tamanho df: (233793, 47)
Tamanho sampled_df: (11690, 47)
7. Processed: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Tamanho df: (227910, 47)
Tamanho sampled_df: (11396, 47)
8. Processed: part-00007-363

In [None]:
# Print the number of sampled rows for each value of the 'label' column.
# Depends on `concatenated_df` built by the data-loading cell above, so that
# cell must have been run first in the current kernel.
print(concatenated_df['label'].value_counts())

label
DDoS-ICMP_Flood            360433
DDoS-UDP_Flood             270879
DDoS-TCP_Flood             225137
DDoS-PSHACK_Flood          204628
DDoS-SYN_Flood             202382
DDoS-RSTFINFlood           201733
DDoS-SynonymousIP_Flood    179455
DoS-UDP_Flood              166179
DoS-TCP_Flood              133526
DoS-SYN_Flood              101903
BenignTraffic               55626
Mirai-greeth_flood          49261
Mirai-udpplain              44467
Mirai-greip_flood           37398
DDoS-ICMP_Fragmentation     22912
MITM-ArpSpoofing            15393
DDoS-UDP_Fragmentation      14361
DDoS-ACK_Fragmentation      14320
DNS_Spoofing                 8670
Recon-HostDiscovery          6554
Recon-OSScan                 4938
Recon-PortScan               4163
DoS-HTTP_Flood               3630
VulnerabilityScan            1822
DDoS-HTTP_Flood              1423
DDoS-SlowLoris               1195
DictionaryBruteForce          623
BrowserHijacking              277
CommandInjection              266
SqlInjec