In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# List the contents of the dataset's csv directory on the mounted drive.
!ls "/content/drive/MyDrive/CiC-DataSet/Complete_Dataset/csv"

Mounted at /content/drive
 CICIoT2023  'README_csv - README.pdf'


## Classification: 34 classes (33 attack classes + 1 benign)

In [None]:
import pandas as pd
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Folder on the mounted Drive that holds the CICIoT2023 CSV files.
DATASET_DIRECTORY = '/content/drive/MyDrive/CiC-DataSet/Complete_Dataset/csv/CICIoT2023'

# Names of all CSV files in that folder, in lexicographic order so the split
# below selects the same files on every run.
df_sets = sorted(
    entry for entry in os.listdir(DATASET_DIRECTORY) if entry.endswith('.csv')
)

# File-level hold-out: the first 80% of the files are used for training,
# the remaining files for testing.
split_index = int(len(df_sets) * 0.8)
training_sets, test_sets = df_sets[:split_index], df_sets[split_index:]

# Feature columns handed to the scaler and the classifier, in this exact order.
# The names are also used as `usecols` when reading the CSVs, so they must match
# the file headers character for character (including 'Magnitue').
X_columns = (
    ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration']
    + ['Rate', 'Srate', 'Drate']
    # "*_flag_number" columns
    + ['fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
       'ack_flag_number', 'ece_flag_number', 'cwr_flag_number']
    # "*_count" columns
    + ['ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count']
    # columns named after protocols
    + ['HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC']
    # remaining columns
    + ['Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight']
)

# Name of the target column.
y_column = 'label'

# Feature standardiser; its statistics are accumulated with partial_fit.
scaler = StandardScaler()

# Linear SVM (hinge loss) optimised with SGD.
# random_state fixes the sample shuffling performed inside every partial_fit
# call, so repeated runs of the notebook produce the same model and metrics.
# NOTE: max_iter is documented as affecting fit() only; partial_fit runs a
# single epoch per call, so max_iter/tol do not shape incremental training.
svm_model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)


# First pass over the training files: update the scaler's running statistics
# and gather every label value that appears.
print("Fitting scaler on training data...")
needed_columns = X_columns + [y_column]
all_labels = set()
for train_set in tqdm(training_sets, desc="Scaling Training Sets"):
    file_path = os.path.join(DATASET_DIRECTORY, train_set)
    try:
        # Read the file in 5000-row chunks rather than all at once.
        reader = pd.read_csv(file_path, usecols=needed_columns, chunksize=5000)
        for part in reader:
            scaler.partial_fit(part[X_columns])
            all_labels |= set(part[y_column].unique())
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing file {train_set}: {e}")


# Turn the set into a sorted list; this list is what is later passed as
# `classes` to the classifier's partial_fit.
all_labels = sorted(all_labels)
print(f"Unique labels found: {all_labels}")

print("Scaler fitted successfully.")

# Second pass over the training files: one partial_fit call per file on the
# standardised features.
print("Training SVM Model...")
columns_to_load = X_columns + [y_column]
for train_set in tqdm(training_sets, desc="Training SVM"):
    file_path = os.path.join(DATASET_DIRECTORY, train_set)
    try:
        train_frame = pd.read_csv(file_path, usecols=columns_to_load)
        if not train_frame.empty:
            # Standardise in place with the statistics gathered in the first pass.
            train_frame[X_columns] = scaler.transform(train_frame[X_columns])
            svm_model.partial_fit(
                train_frame[X_columns],
                train_frame[y_column],
                classes=all_labels,
            )
        else:
            print(f"Skipping empty file: {file_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing file {train_set}: {e}")

print("SVM Model trained successfully.")


# Ground-truth labels and model predictions accumulated over all test files.
y_test, y_pred = [], []

print("Evaluating SVM Model...")
for test_set in tqdm(test_sets, desc="Evaluating Test Sets"):
    file_path = os.path.join(DATASET_DIRECTORY, test_set)
    try:
        test_data = pd.read_csv(file_path, usecols=X_columns + [y_column])
        if test_data.empty:
            print(f"Skipping empty file: {file_path}")
            continue

        test_data[X_columns] = scaler.transform(test_data[X_columns])

        # Predict first, then extend both lists together. If predict() raised
        # after y_test had already been extended, the except branch below would
        # leave the two lists with different lengths and every metric call
        # would fail on the length mismatch.
        predictions = svm_model.predict(test_data[X_columns])
        y_test.extend(test_data[y_column].values)
        y_pred.extend(predictions)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing file {test_set}: {e}")


if len(y_test) == 0 or len(y_pred) == 0:
    print("No predictions or labels collected. Check data integrity.")
else:
    # Macro averaging weights every class equally regardless of its support.
    # zero_division=0 makes the handling of classes with an undefined score
    # explicit: they count as 0 (the same value the default uses) without
    # emitting an UndefinedMetricWarning in the middle of the results.
    print("##### Evaluation Results #####")
    print('Accuracy Score = ', accuracy_score(y_test, y_pred))
    print('Recall Score = ', recall_score(y_test, y_pred, average='macro', zero_division=0))
    print('Precision Score = ', precision_score(y_test, y_pred, average='macro', zero_division=0))
    print('F1 Score = ', f1_score(y_test, y_pred, average='macro', zero_division=0))


Fitting scaler on training data...


Scaling Training Sets: 100%|██████████| 135/135 [07:30<00:00,  3.34s/it]


Unique labels found: ['Backdoor_Malware', 'BenignTraffic', 'BrowserHijacking', 'CommandInjection', 'DDoS-ACK_Fragmentation', 'DDoS-HTTP_Flood', 'DDoS-ICMP_Flood', 'DDoS-ICMP_Fragmentation', 'DDoS-PSHACK_Flood', 'DDoS-RSTFINFlood', 'DDoS-SYN_Flood', 'DDoS-SlowLoris', 'DDoS-SynonymousIP_Flood', 'DDoS-TCP_Flood', 'DDoS-UDP_Flood', 'DDoS-UDP_Fragmentation', 'DNS_Spoofing', 'DictionaryBruteForce', 'DoS-HTTP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-UDP_Flood', 'MITM-ArpSpoofing', 'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain', 'Recon-HostDiscovery', 'Recon-OSScan', 'Recon-PingSweep', 'Recon-PortScan', 'SqlInjection', 'Uploading_Attack', 'VulnerabilityScan', 'XSS']
Scaler fitted successfully.
Training SVM Model...


Training SVM: 100%|██████████| 135/135 [12:54<00:00,  5.73s/it]


SVM Model trained successfully.
Evaluating SVM Model...


Evaluating Test Sets: 100%|██████████| 34/34 [02:24<00:00,  4.26s/it]


##### Evaluation Results #####
Accuracy Score =  0.7871270089508278
Recall Score =  0.42767336250608096


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision Score =  0.5286743006990764
F1 Score =  0.43372292838904086


## Classification: 8 classes (7 attack categories + 1 benign)

In [None]:
import os
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tqdm import tqdm


# Folder on the mounted Drive that holds the CICIoT2023 CSV files.
DATASET_DIRECTORY = '/content/drive/MyDrive/CiC-DataSet/Complete_Dataset/csv/CICIoT2023'

# Feature columns, in the order they are fed to the scaler and the classifier.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Name of the target column.
y_column = 'label'

# Coarse category -> the fine-grained dataset labels grouped under it.
_labels_by_category = {
    'DDoS': [
        'DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood', 'DDoS-UDP_Flood',
        'DDoS-TCP_Flood', 'DDoS-ICMP_Flood', 'DDoS-SynonymousIP_Flood',
        'DDoS-ACK_Fragmentation', 'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
        'DDoS-SlowLoris', 'DDoS-HTTP_Flood',
    ],
    'DoS': ['DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-HTTP_Flood'],
    'Mirai': ['Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain'],
    'Recon': [
        'Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan', 'VulnerabilityScan',
        'Recon-HostDiscovery',
    ],
    'Spoofing': ['DNS_Spoofing', 'MITM-ArpSpoofing'],
    'Benign': ['BenignTraffic'],
    'Web': [
        'BrowserHijacking', 'Backdoor_Malware', 'XSS', 'Uploading_Attack',
        'SqlInjection', 'CommandInjection',
    ],
    'BruteForce': ['DictionaryBruteForce'],
}

# Flattened lookup used with Series.map: fine-grained label -> coarse category.
dict_7classes = {
    label: category
    for category, members in _labels_by_category.items()
    for label in members
}

# os.listdir returns names in arbitrary order. Sort them so the train/test
# split selects the same files on every run and matches the split used by the
# 34-class experiment, which sorts the same listing.
df_sets = sorted(f for f in os.listdir(DATASET_DIRECTORY) if f.endswith('.csv'))

# File-level 80/20 split: leading files for training, remaining files for testing.
split_index = int(len(df_sets) * 0.8)
training_sets = df_sets[:split_index]
test_sets = df_sets[split_index:]

# Accumulate per-feature statistics over every training file, chunk by chunk.
print("Fitting scaler on training data...")
scaler = StandardScaler()
columns_to_read = X_columns + [y_column]
for train_set in tqdm(training_sets, desc="Fitting Scaler"):
    file_path = os.path.join(DATASET_DIRECTORY, train_set)
    try:
        # Read the file in 5000-row chunks rather than all at once.
        reader = pd.read_csv(file_path, chunksize=5000, usecols=columns_to_read)
        for part in reader:
            scaler.partial_fit(part[X_columns])
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {file_path}: {e}")

print("Scaler fitted successfully.")

print("Training the SVM (using SGDClassifier)...")
# Linear SVM (hinge loss) optimised with SGD.
# random_state fixes the sample shuffling performed inside every partial_fit
# call, so repeated runs of the notebook produce the same model and metrics.
# NOTE: max_iter is documented as affecting fit() only; partial_fit runs a
# single epoch per call, so max_iter/tol do not shape incremental training.
svm_model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)
# Every coarse category the mapping can produce, passed as `classes` to
# partial_fit.
all_labels = sorted(set(dict_7classes.values()))

for train_set in tqdm(training_sets, desc="Training SVM"):
    file_path = os.path.join(DATASET_DIRECTORY, train_set)
    try:
        for chunk in pd.read_csv(file_path, chunksize=5000, usecols=X_columns + [y_column]):
            # Standardise the features with the already-fitted scaler.
            chunk[X_columns] = scaler.transform(chunk[X_columns])
            # Collapse the fine-grained labels into the coarse categories.
            chunk[y_column] = chunk[y_column].map(dict_7classes)
            svm_model.partial_fit(chunk[X_columns], chunk[y_column], classes=all_labels)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {file_path}: {e}")

print("SVM Model trained successfully.")

print("Evaluating SVM Model...")
# Ground-truth categories and model predictions accumulated over all test files.
y_test, y_pred = [], []

for test_set in tqdm(test_sets, desc="Evaluating Test Sets"):
    file_path = os.path.join(DATASET_DIRECTORY, test_set)
    try:
        for chunk in pd.read_csv(file_path, chunksize=5000, usecols=X_columns + [y_column]):
            chunk[X_columns] = scaler.transform(chunk[X_columns])
            chunk[y_column] = chunk[y_column].map(dict_7classes)
            # Predict first, then extend both lists together. If predict()
            # raised after y_test had already been extended, the except branch
            # below would leave the lists with different lengths and every
            # metric call would fail on the length mismatch.
            predictions = svm_model.predict(chunk[X_columns])
            y_test.extend(chunk[y_column].values)
            y_pred.extend(predictions)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {file_path}: {e}")


if len(y_test) > 0 and len(y_pred) > 0:
    # Macro averaging weights every class equally regardless of its support.
    # zero_division=0 makes the handling of classes with an undefined score
    # explicit: they count as 0 (the same value the default uses) without
    # emitting an UndefinedMetricWarning.
    print("##### Evaluation Results #####")
    print("Accuracy Score = ", accuracy_score(y_test, y_pred))
    print("Recall Score = ", recall_score(y_test, y_pred, average='macro', zero_division=0))
    print("Precision Score = ", precision_score(y_test, y_pred, average='macro', zero_division=0))
    print("F1 Score = ", f1_score(y_test, y_pred, average='macro', zero_division=0))
else:
    print("No predictions or labels collected. Check data integrity.")


Fitting scaler on training data...


Fitting Scaler: 100%|██████████| 135/135 [05:30<00:00,  2.45s/it]


Scaler fitted successfully.
Training the SVM (using SGDClassifier)...


Training SVM: 100%|██████████| 135/135 [08:25<00:00,  3.74s/it]


SVM Model trained successfully.
Evaluating SVM Model...


Evaluating Test Sets: 100%|██████████| 34/34 [03:07<00:00,  5.51s/it]


##### Evaluation Results #####
Accuracy Score =  0.8230735478876973
Recall Score =  0.46898625413482875
Precision Score =  0.6774741457451166
F1 Score =  0.5017935247751959


## Classification: 2 classes (1 attack + 1 benign)

In [None]:
import os
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tqdm import tqdm

# Folder on the mounted Drive that holds the CICIoT2023 CSV files.
DATASET_DIRECTORY = '/content/drive/MyDrive/CiC-DataSet/Complete_Dataset/csv/CICIoT2023'

# Feature columns, in the order they are fed to the scaler and the classifier.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Name of the target column.
y_column = 'label'

# Every fine-grained label the mapping covers.
_known_labels = (
    'DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood',
    'DDoS-UDP_Flood', 'DDoS-TCP_Flood', 'DDoS-ICMP_Flood',
    'DDoS-SynonymousIP_Flood', 'DDoS-ACK_Fragmentation',
    'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
    'DDoS-SlowLoris', 'DDoS-HTTP_Flood',
    'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-HTTP_Flood',
    'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain',
    'Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan',
    'VulnerabilityScan', 'Recon-HostDiscovery',
    'DNS_Spoofing', 'MITM-ArpSpoofing',
    'BenignTraffic',
    'BrowserHijacking', 'Backdoor_Malware', 'XSS', 'Uploading_Attack',
    'SqlInjection', 'CommandInjection', 'DictionaryBruteForce',
)

# Binary lookup used with Series.map: 'BenignTraffic' -> 'Benign', every other
# known label -> 'Attack'.
dict_2classes = {
    label: ('Benign' if label == 'BenignTraffic' else 'Attack')
    for label in _known_labels
}

# os.listdir returns names in arbitrary order. Sort them so the train/test
# split selects the same files on every run and matches the split used by the
# 34-class experiment, which sorts the same listing.
df_sets = sorted(f for f in os.listdir(DATASET_DIRECTORY) if f.endswith('.csv'))

# File-level 80/20 split: leading files for training, remaining files for testing.
split_index = int(len(df_sets) * 0.8)
training_sets = df_sets[:split_index]
test_sets = df_sets[split_index:]

# Accumulate per-feature statistics over every training file, chunk by chunk.
print("Fitting scaler on training data...")
scaler = StandardScaler()
columns_to_read = X_columns + [y_column]
for train_set in tqdm(training_sets, desc="Fitting Scaler"):
    file_path = os.path.join(DATASET_DIRECTORY, train_set)
    try:
        # Read the file in 5000-row chunks rather than all at once.
        reader = pd.read_csv(file_path, chunksize=5000, usecols=columns_to_read)
        for part in reader:
            scaler.partial_fit(part[X_columns])
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {file_path}: {e}")

print("Scaler fitted successfully.")

print("Training the SVM (using SGDClassifier)...")
# Linear SVM (hinge loss) optimised with SGD.
# random_state fixes the sample shuffling performed inside every partial_fit
# call, so repeated runs of the notebook produce the same model and metrics.
# NOTE: max_iter is documented as affecting fit() only; partial_fit runs a
# single epoch per call, so max_iter/tol do not shape incremental training.
svm_model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)
# The two target values produced by dict_2classes, passed as `classes` to
# partial_fit.
classes = ['Attack', 'Benign']

for train_set in tqdm(training_sets, desc="Training SVM"):
    file_path = os.path.join(DATASET_DIRECTORY, train_set)
    try:
        for chunk in pd.read_csv(file_path, chunksize=5000, usecols=X_columns + [y_column]):
            # Standardise the features with the already-fitted scaler.
            chunk[X_columns] = scaler.transform(chunk[X_columns])
            # Collapse the fine-grained labels into 'Attack' / 'Benign'.
            chunk[y_column] = chunk[y_column].map(dict_2classes)
            svm_model.partial_fit(chunk[X_columns], chunk[y_column], classes=classes)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {file_path}: {e}")

print("SVM Model trained successfully.")

print("Evaluating SVM Model...")
# Ground-truth classes and model predictions accumulated over all test files.
y_test, y_pred = [], []

for test_set in tqdm(test_sets, desc="Evaluating Test Sets"):
    file_path = os.path.join(DATASET_DIRECTORY, test_set)
    try:
        for chunk in pd.read_csv(file_path, chunksize=5000, usecols=X_columns + [y_column]):
            chunk[X_columns] = scaler.transform(chunk[X_columns])
            chunk[y_column] = chunk[y_column].map(dict_2classes)
            # Predict first, then extend both lists together. If predict()
            # raised after y_test had already been extended, the except branch
            # below would leave the lists with different lengths and every
            # metric call would fail on the length mismatch.
            predictions = svm_model.predict(chunk[X_columns])
            y_test.extend(chunk[y_column].values)
            y_pred.extend(predictions)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {file_path}: {e}")

if len(y_test) > 0 and len(y_pred) > 0:
    # Macro averaging weights both classes equally regardless of their support.
    # zero_division=0 makes the handling of a class with an undefined score
    # explicit: it counts as 0 (the same value the default uses) without
    # emitting an UndefinedMetricWarning.
    print("##### Evaluation Results #####")
    print("Accuracy Score = ", accuracy_score(y_test, y_pred))
    print("Recall Score = ", recall_score(y_test, y_pred, average='macro', zero_division=0))
    print("Precision Score = ", precision_score(y_test, y_pred, average='macro', zero_division=0))
    print("F1 Score = ", f1_score(y_test, y_pred, average='macro', zero_division=0))
else:
    print("No predictions or labels collected. Check data integrity.")


Fitting scaler on training data...


Fitting Scaler: 100%|██████████| 135/135 [04:58<00:00,  2.21s/it]


Scaler fitted successfully.
Training the SVM (using SGDClassifier)...


Training SVM: 100%|██████████| 135/135 [06:29<00:00,  2.88s/it]


SVM Model trained successfully.
Evaluating SVM Model...


Evaluating Test Sets: 100%|██████████| 34/34 [03:01<00:00,  5.33s/it]


##### Evaluation Results #####
Accuracy Score =  0.9871150942427299
Recall Score =  0.8353270133620774
Precision Score =  0.8718356978563582
F1 Score =  0.8525845306892493
