<a href="https://colab.research.google.com/github/sajan-sarker/web-attack-detection/blob/main/code/web_attack_dataset_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import math, copy
import csv

*****
# HTTP Params Dataset

**Dataset Link:** \
httpparamsds: [https://www.kaggle.com/datasets/evg3n1j/httpparamsdataset](https://www.kaggle.com/datasets/evg3n1j/httpparamsdataset)

In [None]:
httpparamds = pd.read_csv('/content/payloaddataset.csv')

In [None]:
print(httpparamds.head())

            payload  length attack_type label
0    c/ caridad s/n      14        norm  norm
1      campello, el      12        norm  norm
2             40184       5        norm  norm
3  1442431887503330      16        norm  norm
4             nue37       5        norm  norm


In [None]:
attack_type_counts = httpparamds['attack_type'].value_counts()

print("Attack payload counts for each category:")
print(attack_type_counts)

Attack payload counts for each category:
attack_type
norm              19304
sqli              10852
xss                 532
path-traversal      290
cmdi                 89
Name: count, dtype: int64


*****
# Command Injection Attack Dataset

**Dataset Link:** \
cmdi: [https://www.kaggle.com/datasets/sanketpawase/os-command-injection](https://www.kaggle.com/datasets/sanketpawase/os-command-injection)

In [None]:
# Load the raw OS command-injection dataset (columns: 'sentence', 'Label').
# keep_default_na=False stops pandas from converting payload text such as
# "null", "NA", "None" or "nan" into NaN: here those strings are data, and a
# later fillna("") would otherwise replace them with empty, zero-length payloads.
cmdi = pd.read_csv('/content/command injection.csv', keep_default_na=False)

In [None]:
print(cmdi.head())

                                            sentence  Label
0  &lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/passw...      1
1  &lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/shado...      1
2        &lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;      1
3        &lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;      1
4                                    /index.html|id|      1


In [None]:
cmdi.shape

(2106, 2)

In [None]:
level_counts = cmdi['Label'].value_counts()

print(level_counts)

Label
0    1591
1     515
Name: count, dtype: int64


**Write a new csv file**

In [None]:
# Map the raw command-injection frame ('sentence', 'Label') onto the shared
# schema: payload, length, attack_type, label.
#
# The raw `cmdi` frame is deliberately NOT modified. The previous version renamed
# columns in place and overwrote `label` with strings, so running the cell a
# second time compared 'norm'/'anom' against 0 and relabelled every row as an
# attack. Deriving new objects keeps the cell idempotent.
cmdi_renamed = cmdi.rename(columns={'sentence': 'payload', 'Label': 'label'})

# Missing payloads become empty strings so that a length can always be computed.
cmdi_payloads = cmdi_renamed['payload'].fillna("").astype(str)

# Label 0 is benign; any other value (including missing) is treated as an attack,
# matching the original `x == 0` test.
is_benign = cmdi_renamed['label'].eq(0)

cmdi_output_df = pd.DataFrame({
    'payload': cmdi_payloads,
    'length': cmdi_payloads.str.len(),
    'attack_type': np.where(is_benign, 'norm', 'cmdi'),
    'label': np.where(is_benign, 'norm', 'anom'),
}).reset_index(drop=True)

In [None]:
cmdi_output_df.shape

(2106, 4)

In [None]:
cmdi_output_df.head()

Unnamed: 0,payload,length,attack_type,label
0,&lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/passw...,59,cmdi,anom
1,&lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/shado...,59,cmdi,anom
2,&lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;,43,cmdi,anom
3,&lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;,43,cmdi,anom
4,/index.html|id|,15,cmdi,anom


In [None]:
cmdi_output_df.to_csv('/content/command_injection_processed.csv', index=False)

# Create Synthetic Command Injection Data

In [None]:
import random

# Building blocks combined into synthetic command-injection payloads.
basic_commands = ["ls", "cat /etc/passwd", "uname -a", "id", "whoami", "ps aux", "df -h"]
injection_techniques = [";", "|", "&&", "||", "`", "$(", ">", "<", "&>", "2>&1"]
env_variables = ["$PATH", "$HOME", "$USER", "$SHELL", "$PWD"]
malicious_env = ["export PATH=/malicious/path", "APPHOME=/tmp/malicious"]

num_payloads = 500
# Fixed seed: without it every Restart & Run All writes a different synthetic
# dataset, so the merged CSV (and anything trained on it) is not reproducible.
CMDI_SYNTH_SEED = 42


def generate_cmdi_payloads(count, seed=CMDI_SYNTH_SEED):
  """Return `count` synthetic command-injection payload strings.

  Each payload is `<command><separator>rm -rf /tmp/test`, optionally followed by
  an environment variable reference (50%) or, failing that, an environment
  assignment (a further 50% of the remainder), and optionally by one more
  command (50%). The same separator is reused throughout a payload.

  A private `random.Random(seed)` is used so results are deterministic for a
  given seed and the global `random` state is left untouched.
  """
  rng = random.Random(seed)
  generated = []
  for _ in range(count):
    command = rng.choice(basic_commands)
    tech = rng.choice(injection_techniques)

    # Base payload
    payload = f"{command}{tech}rm -rf /tmp/test"

    # Variation: append an environment reference or a malicious assignment
    if rng.random() < 0.5:
      env = rng.choice(env_variables)
      payload = f"{payload}{tech}{env}"
    elif rng.random() < 0.5:
      env_set = rng.choice(malicious_env)
      payload = f"{payload}{tech}{env_set}"

    # Variation: chain one more command
    if rng.random() < 0.5:
      extra_command = rng.choice(basic_commands)
      payload = f"{payload}{tech}{extra_command}"
    generated.append(payload)
  return generated


# NOTE(review): separators such as "`" and "$(" are never closed, and ">" / "<"
# are redirections rather than command separators, so some generated strings are
# not valid shell. Confirm this is acceptable for the intended training data.
payloads = generate_cmdi_payloads(num_payloads)

payload_data = {
    "payload": payloads,
    "length": [len(p) for p in payloads],
    "attack_type": ["cmdi"] * len(payloads),
    "label": ["anom"] * len(payloads)
}
cmdi_syn_output_df = pd.DataFrame(payload_data)

In [None]:
cmdi_syn_output_df.head()

Unnamed: 0,payload,length,attack_type,label
0,uname -a2>&1rm -rf /tmp/test2>&1$HOME,37,cmdi,anom
1,id>rm -rf /tmp/test>$PATH,25,cmdi,anom
2,ps aux;rm -rf /tmp/test;$SHELL;whoami,37,cmdi,anom
3,uname -a||rm -rf /tmp/test,26,cmdi,anom
4,whoami|rm -rf /tmp/test|APPHOME=/tmp/malicious|id,49,cmdi,anom


In [None]:
cmdi_syn_output_df.shape

(500, 4)

In [None]:
cmdi_syn_output_df.to_csv('/content/command_injection_synthetic_processed.csv', index=False)

In [None]:
# Combine the processed real command-injection data with the synthetic payloads.
#
# keep_default_na=False: the processed file contains empty-string payloads (missing
# sentences were filled with ""), and payload text may legitimately be "null",
# "NA", "None", etc. With pandas' default NA parsing all of those come back as NaN,
# which contradicts the stored `length` column and turns string payloads into floats.
df_a = pd.read_csv('/content/command_injection_processed.csv', keep_default_na=False)
df_b = pd.read_csv('/content/command_injection_synthetic_processed.csv', keep_default_na=False)

merged_df = pd.concat([df_a, df_b], ignore_index=True)

In [None]:
print(merged_df.shape)
merged_df.head()

(2606, 4)


Unnamed: 0,payload,length,attack_type,label
0,&lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/passw...,59,cmdi,anom
1,&lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/shado...,59,cmdi,anom
2,&lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;,43,cmdi,anom
3,&lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;,43,cmdi,anom
4,/index.html|id|,15,cmdi,anom


In [None]:
merged_df.to_csv('/content/command_injection_payload_processed.csv', index=False)

*****
# Path Traversal Attack Dataset

**Dataset Link:** \
1. pt1 & pt2: [https://github.com/swisskyrepo/PayloadsAllTheThings/tree/master](https://github.com/swisskyrepo/PayloadsAllTheThings/tree/master)

2. pt3: [https://github.com/1N3/IntruderPayloads/tree/master](https://github.com/1N3/IntruderPayloads/tree/master)

In [None]:
def _read_stripped_lines(path, encoding='utf-8'):
  """Return every line of the text file at `path`, stripped of surrounding whitespace.

  Blank lines are kept (as empty strings), so the result has one entry per line.
  """
  with open(path, mode='r', encoding=encoding) as handle:
    return [raw.strip() for raw in handle]


# One list per path-traversal source file.
pt1 = _read_stripped_lines('/content/deep_traversal.txt')
pt2 = _read_stripped_lines('/content/directory_traversal.txt')
pt3 = _read_stripped_lines('/content/traversal.txt')

In [None]:
print(f"Total data length: {len(pt1)+len(pt2)+len(pt3)}")

Total data length: 5548


In [None]:
# All path-traversal payloads, in source order (pt1, then pt2, then pt3).
payloads = [*pt1, *pt2, *pt3]

# Attach the shared-schema columns: every row here is a path-traversal attack.
df = pd.DataFrame({'payload': payloads}).assign(
    length=lambda frame: frame['payload'].map(len),
    attack_type='path-traversal',
    label='anom',
)

# Rebuild from records with an explicit column order for the output frame.
data_dict = df.to_dict(orient='records')
pathtrav_output_df = pd.DataFrame(data_dict, columns=['payload', 'length', 'attack_type', 'label'])

In [None]:
pathtrav_output_df.head()

Unnamed: 0,payload,length,attack_type,label
0,../{FILE},9,path-traversal,anom
1,../../{FILE},12,path-traversal,anom
2,../../../{FILE},15,path-traversal,anom
3,../../../../{FILE},18,path-traversal,anom
4,../../../../../{FILE},21,path-traversal,anom


In [None]:
pathtrav_output_df.shape

(5548, 4)

In [None]:
pathtrav_output_df.to_csv('/content/path_traversal_payloads_processed.csv', index=False)

*****
# XSS Attack Dataset

**Dataset Link:** \
1. xss1: [https://github.com/payloadbox/xss-payload-list](https://github.com/payloadbox/xss-payload-list)

2. xss2: [https://github.com/7ioSecurity/XSS-Payloads](https://github.com/7ioSecurity/XSS-Payloads)

3. xss3: [https://github.com/pgaijin66/XSS-Payloads/tree/master](https://github.com/pgaijin66/XSS-Payloads/tree/master)

In [None]:
# (path, encoding) for each XSS payload list.
# NOTE(review): the third file is read as ISO-8859-1 rather than UTF-8 —
# presumably it is not valid UTF-8; confirm against the source file.
xss_sources = (
    ('/content/xss-payload-list.txt', 'utf-8'),
    ('/content/xss_payloads_2016.txt', 'utf-8'),
    ('/content/payload.txt', 'ISO-8859-1'),
)

# Read each file into its own list, one whitespace-stripped entry per line
# (blank lines are kept as empty strings).
xss_parts = []
for source_path, source_encoding in xss_sources:
  with open(source_path, mode='r', encoding=source_encoding) as source:
    xss_parts.append([entry.strip() for entry in source])

xss1, xss2, xss3 = xss_parts

In [None]:
print(f"Total data length: {len(xss1)+len(xss2)+len(xss3)}")

Total data length: 7465


In [None]:
# All XSS payloads, in source order (xss1, then xss2, then xss3).
payloads = []
for part in (xss1, xss2, xss3):
  payloads.extend(part)

# Shared schema: payload text, its character length, and constant class columns.
df = pd.DataFrame(payloads, columns=['payload'])
df['length'] = [len(item) for item in df['payload']]
df['attack_type'] = 'xss'
df['label'] = 'anom'

# Rebuild from records with an explicit column order for the output frame.
output_columns = ['payload', 'length', 'attack_type', 'label']
data_dict = df.to_dict(orient='records')
xss_output_df = pd.DataFrame(data_dict, columns=output_columns)

In [None]:
print(xss_output_df.shape)
xss_output_df.head()

(7465, 4)


Unnamed: 0,payload,length,attack_type,label
0,"""-prompt(8)-""",13,xss,anom
1,'-prompt(8)-',13,xss,anom
2,""";a=prompt,a()//",16,xss,anom
3,"';a=prompt,a()//",16,xss,anom
4,"'-eval(""window['pro'%2B'mpt'](8)"")-'",36,xss,anom


In [None]:
xss_output_df.to_csv('/content/xss_payloads_processed.csv', index=False)

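Expected class counts after merging (new sources + HTTP Params dataset):\
norm- 19,304 + 1,591 -> 20,895\
xss- 7,465 + 532 -> 7,997\
sqli- 10,852 \
cmdi- 515 + 500 (synthetic) + 89 -> 1,104\
path- 5,548 + 290 -> 5,838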

*****
# Creating a Unified Main Dataset through Dataset Merging

In [None]:
# Load the four inputs of the unified dataset.
#
# keep_default_na=False: payload cells are raw request text, so values such as
# "null", "NA", "None" or an empty string must stay strings. With the pandas
# defaults they are parsed as NaN, which silently drops the payload text and
# leaves it inconsistent with the stored `length` column.
httpparamds = pd.read_csv('/content/payloaddataset.csv', keep_default_na=False)
cmdi = pd.read_csv('/content/command_injection_payload_processed.csv', keep_default_na=False)
path_traversal = pd.read_csv('/content/path_traversal_payloads_processed.csv', keep_default_na=False)
xss = pd.read_csv('/content/xss_payloads_processed.csv', keep_default_na=False)

In [None]:
print(httpparamds.shape)
httpparamds.head()

Unnamed: 0,payload,length,attack_type,label
0,c/ caridad s/n,14,norm,norm
1,"campello, el",12,norm,norm
2,40184,5,norm,norm
3,1442431887503330,16,norm,norm
4,nue37,5,norm,norm


In [None]:
attack_type_counts = httpparamds['attack_type'].value_counts()
print(httpparamds.columns)
print(attack_type_counts)

Index(['payload', 'length', 'attack_type', 'label'], dtype='object')
attack_type
norm              19304
sqli              10852
xss                 532
path-traversal      290
cmdi                 89
Name: count, dtype: int64


In [None]:
print(cmdi.shape)
cmdi.head()

(2606, 4)


Unnamed: 0,payload,length,attack_type,label
0,&lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/passw...,59,cmdi,anom
1,&lt;!--#exec%20cmd=&quot;/bin/cat%20/etc/shado...,59,cmdi,anom
2,&lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;,43,cmdi,anom
3,&lt;!--#exec%20cmd=&quot;/usr/bin/id;--&gt;,43,cmdi,anom
4,/index.html|id|,15,cmdi,anom


In [None]:
attack_type_counts = cmdi['attack_type'].value_counts()
print(cmdi.columns)
print(attack_type_counts)

Index(['payload', 'length', 'attack_type', 'label'], dtype='object')
attack_type
norm    1591
cmdi    1015
Name: count, dtype: int64


In [None]:
print(path_traversal.shape)
path_traversal.head()

(5548, 4)


Unnamed: 0,payload,length,attack_type,label
0,../{FILE},9,path-traversal,anom
1,../../{FILE},12,path-traversal,anom
2,../../../{FILE},15,path-traversal,anom
3,../../../../{FILE},18,path-traversal,anom
4,../../../../../{FILE},21,path-traversal,anom


In [None]:
attack_type_counts = path_traversal['attack_type'].value_counts()
print(path_traversal.columns)
print(attack_type_counts)

Index(['payload', 'length', 'attack_type', 'label'], dtype='object')
attack_type
path-traversal    5548
Name: count, dtype: int64


In [None]:
print(xss.shape)
xss.head()

(7465, 4)


Unnamed: 0,payload,length,attack_type,label
0,"""-prompt(8)-""",13,xss,anom
1,'-prompt(8)-',13,xss,anom
2,""";a=prompt,a()//",16,xss,anom
3,"';a=prompt,a()//",16,xss,anom
4,"'-eval(""window['pro'%2B'mpt'](8)"")-'",36,xss,anom


In [None]:
attack_type_counts = xss['attack_type'].value_counts()
print(xss.columns)
print(attack_type_counts)

Index(['payload', 'length', 'attack_type', 'label'], dtype='object')
attack_type
xss    7465
Name: count, dtype: int64


In [None]:
# Merge the four frames into one dataset, but only if they share the same column set.
columns_httpparamds = set(httpparamds.columns)
columns_cmdi = set(cmdi.columns)
columns_path_traversal = set(path_traversal.columns)
columns_xss = set(xss.columns)

# Compare every frame against the HTTP Params schema and collect the offenders.
column_sets = {
    'cmdi': columns_cmdi,
    'path_traversal': columns_path_traversal,
    'xss': columns_xss,
}
mismatched = sorted(name for name, cols in column_sets.items() if cols != columns_httpparamds)

# Fail loudly on a mismatch. Previously the cell did nothing in that case, leaving
# `dataset` undefined (or stale from an earlier run) for the cells that follow.
if mismatched:
  raise ValueError(
      f"Cannot merge: columns of {mismatched} differ from httpparamds "
      f"columns {sorted(columns_httpparamds)}"
  )

dataset = pd.concat([httpparamds, cmdi, path_traversal, xss], ignore_index=True)
print("Merged Complete!")

Merged Complete!


In [None]:
dataset.shape

(46686, 4)

In [None]:
dataset.to_csv('payloads_dataset.csv', index=False)

Checking the new dataset: row counts per attack type and per label.

In [None]:
attack_type_counts = dataset['attack_type'].value_counts()
print(dataset.columns)
print(attack_type_counts)

Index(['payload', 'length', 'attack_type', 'label'], dtype='object')
attack_type
norm              20895
sqli              10852
xss                7997
path-traversal     5838
cmdi               1104
Name: count, dtype: int64


In [None]:
label_counts = dataset['label'].value_counts()
print(dataset.columns)
print(label_counts)

Index(['payload', 'length', 'attack_type', 'label'], dtype='object')
label
anom    25791
norm    20895
Name: count, dtype: int64
