### This notebook Analyzes and Processes HTTP Requests and Responses

#### HTTP Request Parser for .csv Files

In [1]:
import re
import csv
from urllib.parse import urlparse, parse_qs

def extract_parameters(param_str):
    """
    Extracts parameters from a URL-encoded string and returns as a formatted string.
    """
    params = parse_qs(param_str)
    return '; '.join([f'{k}={v[0]}' for k, v in params.items()])

def count_special_characters(s):
    """
    Counts occurrences of special characters and patterns commonly associated with attacks.
    """
    # Define patterns associated with attacks
    attack_patterns = {
        'single_quote': r"'",
        'double_quote': r'"',
        'backslash': r'\\',
        'semicolon': r';',
        'double_dash': r'--',
        'asterisk': r'\*',
        'hash': r'#',
        'percent': r'%',
        'ampersand': r'&',
        'pipe': r'\|',
        'question_mark': r'\?',
        'equal_sign': r'=',
        'parentheses': r'\(|\)',
        'angle_brackets': r'<|>',
        'curly_brackets': r'\{|\}',
        'square_brackets': r'\[|\]',
        'dollar_sign': r'\$',
        'at_symbol': r'@',
        'tilde': r'~',
        'backtick': r'`',
        'slash': r'/',
        'colon': r':',
        'exclamation_mark': r'!',
        'javascript_scheme': r'javascript:',
        'eval_function': r'eval\(',
        'alert_function': r'alert\(',
        'file_inclusion': r'file://',
        'path_traversal': r'\.\./',
        'localhost': r'localhost',
        'root_path': r'root'
    }
    
    # Count occurrences of each pattern
    total_count = sum(len(re.findall(pattern, s, re.IGNORECASE)) for pattern in attack_patterns.values())
    
    return total_count

def parse_txt_file(file_in):
    """
    Parses a .txt file and returns a list of log entries.
    """
    with open(file_in, 'r', encoding='utf-8') as fin:
        return fin.readlines()

def parse_file(file_in, file_out):
    # Determine file type
    if file_in.endswith('.txt'):
        lines = parse_txt_file(file_in)
    else:
        raise ValueError("Unsupported file type. Only .txt and .xml files are supported.")

    with open(file_out, 'w', newline='', encoding='utf-8') as fout:
        csv_writer = csv.writer(fout)
        # Write the header
        csv_writer.writerow([
            'Method', 'Full URL', 'URL Path', 'Query Params', 'Query Params Length', 'Number of Query Params', 'Body Params', 
            'Content-Length', 'Content-Type', 'User-Agent', 'Host', 'Accept', 
            'Accept-Encoding', 'Accept-Charset', 'Accept-Language', 'Pragma',
            'Connection', 'Body Length', 'URL Length', 'Special Characters Count in URL', 'Special Characters Count in Query Params'
        ])
        
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            if line.startswith(('GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'TRACE', 'PATCH', 'OPTIONS')):
                # Extract Method
                method = line.split(' ')[0]
                
                # Extract Full URL
                full_url = line.split(' ')[1]
                
                # Extract URL Path
                url_path = urlparse(full_url).path
                
                # Extract Query Parameters
                query_params = parse_qs(urlparse(full_url).query)
                query_params_str = extract_parameters(urlparse(full_url).query)
                query_params_length = len(urlparse(full_url).query)
                num_query_params = len(query_params)
                
                # Count special characters
                url_special_characters_count = count_special_characters(url_path)
                query_special_characters_count = count_special_characters(urlparse(full_url).query)
                
                # Initialize other fields
                content_length = ''
                content_type = ''
                user_agent = ''
                host = ''
                accept = ''
                accept_encoding = ''
                accept_charset = ''
                accept_language = ''
                pragma = ''
                connection = ''
                body_params_str = ''
                
                headers = {}
                body = ''
                
                # Loop through headers and body
                j = 1
                while i + j < len(lines) and not lines[i + j].strip() == '':
                    header_line = lines[i + j].strip()
                    if header_line.startswith('Content-Length:'):
                        content_length = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Content-Type:'):
                        content_type = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('User-Agent:'):
                        user_agent = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Host:'):
                        host = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Accept:'):
                        accept = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Accept-Encoding:'):
                        accept_encoding = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Accept-Charset:'):
                        accept_charset = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Accept-Language:'):
                        accept_language = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Pragma:'):
                        pragma = header_line.split(':', 1)[1].strip()
                    elif header_line.startswith('Connection:'):
                        connection = header_line.split(':', 1)[1].strip()
                    j += 1
                
                # The body is the line after the headers
                if i + j < len(lines):
                    body = lines[i + j].strip()
                    # Extract body parameters (if the body is URL-encoded)
                    body_params_str = extract_parameters(body)
                
                # Calculate lengths for additional features
                body_length = len(body)
                url_length = len(full_url)
                    
                # Append the extracted features to the CSV
                csv_writer.writerow([
                    method, full_url, url_path, query_params_str, query_params_length, num_query_params, body_params_str, 
                    content_length, content_type, user_agent, host, accept, 
                    accept_encoding, accept_charset, accept_language, pragma,
                    connection, body_length, url_length,
                    url_special_characters_count, query_special_characters_count
                ])
                
                i += j + 1
            else:
                i += 1

In [3]:
parse_file('dataset/CSIC 2010/anomalousTrafficTest.txt', 'anomalous_parsed_data.csv')

In [5]:
parse_file('dataset/CSIC 2010/normalTrafficTraining.txt', 'normal_parsed_data.csv')

##### Analysis of Parsed Dataset

In [6]:
import pandas as pd

In [7]:
anomaly_file_parsed = 'anomalous_parsed_data.csv'
anomaly_file_parsed=pd.read_csv('anomalous_parsed_data.csv')
anomaly_file_parsed.head()

Unnamed: 0,Method,Full URL,URL Path,Query Params,Query Params Length,Number of Query Params,Body Params,Content-Length,Content-Type,User-Agent,...,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Pragma,Connection,Body Length,URL Length,Special Characters Count in URL,Special Characters Count in Query Params
0,GET,http://localhost:8080/tienda1/publico/anadir.j...,/tienda1/publico/anadir.jsp,id=2; nombre=Jam�n Ib�rico; precio=85; cantida...,146,5,,,,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,...,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,no-cache,close,0,195,3,18
1,POST,http://localhost:8080/tienda1/publico/anadir.jsp,/tienda1/publico/anadir.jsp,,0,0,,146.0,application/x-www-form-urlencoded,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,...,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,no-cache,close,0,48,3,0
2,GET,http://localhost:8080/tienda1/publico/anadir.j...,/tienda1/publico/anadir.jsp,id=2/; nombre=Jam�n Ib�rico; precio=85; cantid...,77,5,,,,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,...,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,no-cache,close,0,126,3,13
3,POST,http://localhost:8080/tienda1/publico/anadir.jsp,/tienda1/publico/anadir.jsp,,0,0,,77.0,application/x-www-form-urlencoded,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,...,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,no-cache,close,0,48,3,0
4,GET,http://localhost:8080/asf-logo-wide.gif~,/asf-logo-wide.gif~,,0,0,,,,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,...,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,no-cache,close,0,40,2,0


In [8]:
n_features=anomaly_file_parsed.shape[1]
n_samples =anomaly_file_parsed.shape[0]

In [9]:
print(f'number of features: {n_features}')
missing_values_count = anomaly_file_parsed.isnull().sum()
missing_values_count[0:n_features]

number of features: 21


Method                                          0
Full URL                                        0
URL Path                                      214
Query Params                                15468
Query Params Length                             0
Number of Query Params                          0
Body Params                                 25065
Content-Length                              15088
Content-Type                                15088
User-Agent                                      0
Host                                            0
Accept                                          0
Accept-Encoding                                 0
Accept-Charset                                  0
Accept-Language                                 0
Pragma                                          0
Connection                                      0
Body Length                                     0
URL Length                                      0
Special Characters Count in URL                 0


In [10]:
for feature in anomaly_file_parsed.columns:
    if feature in anomaly_file_parsed.columns:
        unique_count = anomaly_file_parsed[feature].nunique()
        print(f"Number of unique values for {feature}: {unique_count}")
    else:
        print(f"Column '{feature}' does not exist in the DataFrame.")

Number of unique values for Method: 3
Number of unique values for Full URL: 8666
Number of unique values for URL Path: 1612
Number of unique values for Query Params: 7042
Number of unique values for Query Params Length: 383
Number of unique values for Number of Query Params: 5
Number of unique values for Body Params: 0
Number of unique values for Content-Length: 382
Number of unique values for Content-Type: 1
Number of unique values for User-Agent: 1
Number of unique values for Host: 2
Number of unique values for Accept: 1
Number of unique values for Accept-Encoding: 1
Number of unique values for Accept-Charset: 1
Number of unique values for Accept-Language: 1
Number of unique values for Pragma: 1
Number of unique values for Connection: 1
Number of unique values for Body Length: 1
Number of unique values for URL Length: 416
Number of unique values for Special Characters Count in URL: 7
Number of unique values for Special Characters Count in Query Params: 64


#### Combining Normal and Anomalous Request Datasets Randomly

In [1]:
import pandas as pd

def combine_csv_randomly(file1, file2, output_file):
    # Load the CSV files into DataFrames
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Shuffle the rows of both DataFrames
    df1_shuffled = df1.sample(frac=1).reset_index(drop=True)
    df2_shuffled = df2.sample(frac=1).reset_index(drop=True)

    # Add a new column to identify the source of the data
    df1_shuffled['request_type'] = '1'  # normal dataset as 1
    df2_shuffled['request_type'] = '0'  # anomalous dataset as 0

    # Combine the two DataFrames
    combined_df = pd.concat([df1_shuffled, df2_shuffled], ignore_index=True)

    # Shuffle the combined DataFrame to mix rows from both sources
    combined_df_shuffled = combined_df.sample(frac=1).reset_index(drop=True)

    # Save the combined DataFrame to a new CSV file
    combined_df_shuffled.to_csv(output_file, index=False)

    print(f"Combined CSV file '{output_file}' has been created.")



In [2]:

combine_csv_randomly('normal_parsed_data.csv', 'anomalous_parsed_data.csv', 'combined_parsed_requests.csv')

Combined CSV file 'combined_parsed_requests.csv' has been created.


#### Request Parser for nginx Logs

In [13]:
import csv
import re
from datetime import datetime, timedelta
import os
from collections import defaultdict

def parse_nginx_log(input_file_path, output_file_path):
    log_pattern = re.compile(
        r'(?P<ip>\S+) - - \[(?P<timestamp>.*?)\] "(?P<method>\S+) (?P<path>\S+) HTTP/\S+" (?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
    )

    fieldnames = [
        'IP Address', 'Timestamp', 'HTTP Method', 'Request Path', 'Status Code', 
        'Response Size', 'Referrer', 'User Agent', 'Request Length', 'Query Parameters Count', 
        'Is Secure', 'Time of Day', 'Day of Week', 'User-Agent Length', 'Referrer Length', 
        'Status Code Category', 'Request Frequency', 'Status Code Distribution', 
        'Request Size Distribution', 'User-Agent Diversity', 'Time Interval Between Requests', 
        'Path Frequency', 'Suspicious Patterns', 'SQL Injection Detected', 'XSS Detected',
        'Command Injection Detected', 'Insecure Deserialization Detected', 'File Inclusion Detected'
    ]

    ip_request_times = defaultdict(list)
    ip_status_code_counts = defaultdict(lambda: defaultdict(int))
    ip_request_sizes = defaultdict(list)
    ip_user_agents = defaultdict(set)
    ip_path_counts = defaultdict(lambda: defaultdict(int))

    # OWASP Top 10 attack patterns
    sql_injection_pattern = re.compile(r"('|\"|;|--|/\*|\*/|SELECT|INSERT|DELETE|UNION|DROP|UPDATE|ALTER)", re.IGNORECASE)
    xss_pattern = re.compile(r"(<script>|<img|<iframe|onerror|onload|javascript:|<svg)", re.IGNORECASE)
    command_injection_pattern = re.compile(r"(\||;|&&|wget|curl|\$\(.*\)|`.*`|bash|sh)", re.IGNORECASE)
    insecure_deserialization_pattern = re.compile(r"(base64_decode|O:\d+|s:\d+|C:\d+|object|Serialized)", re.IGNORECASE)
    file_inclusion_pattern = re.compile(r"(\.\./|\.\./etc/passwd|\.\./\.env|\.\./config|\.\.php|\.\.html)", re.IGNORECASE)

    with open(output_file_path, mode='w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        with open(input_file_path, 'r') as log_file:
            for log_entry in log_file:
                match = log_pattern.match(log_entry)
                if match:
                    log_data = match.groupdict()

                    timestamp = datetime.strptime(log_data['timestamp'], '%d/%b/%Y:%H:%M:%S %z')
                    log_data['Timestamp'] = timestamp.isoformat()
                    log_data['IP Address'] = log_data.pop('ip')
                    log_data['HTTP Method'] = log_data.pop('method')
                    log_data['Request Path'] = log_data.pop('path')
                    log_data['Status Code'] = log_data.pop('status')
                    log_data['Response Size'] = log_data.pop('size')
                    log_data['Referrer'] = log_data.pop('referrer')
                    log_data['User Agent'] = log_data.pop('user_agent')

                    request_length = len(log_entry)
                    query_parameters_count = len(re.findall(r'[?&]', log_data['Request Path']))
                    is_secure = 'Yes' if log_data['Request Path'].startswith('https://') else 'No'
                    time_of_day = timestamp.strftime('%H:%M:%S')
                    day_of_week = timestamp.strftime('%A')
                    user_agent_length = len(log_data['User Agent'])
                    referrer_length = len(log_data['Referrer'])
                    status_code_category = f"{log_data['Status Code'][0]}xx"
                    
                    ip_request_times[log_data['IP Address']].append(timestamp)
                    ip_status_code_counts[log_data['IP Address']][log_data['Status Code']] += 1
                    ip_request_sizes[log_data['IP Address']].append(int(log_data['Response Size']))
                    ip_user_agents[log_data['IP Address']].add(log_data['User Agent'])
                    ip_path_counts[log_data['IP Address']][log_data['Request Path']] += 1

                    request_frequency = len(ip_request_times[log_data['IP Address']])
                    status_code_distribution = ip_status_code_counts[log_data['IP Address']]
                    request_size_distribution = sum(ip_request_sizes[log_data['IP Address']]) / len(ip_request_sizes[log_data['IP Address']]) if ip_request_sizes[log_data['IP Address']] else 0
                    user_agent_diversity = len(ip_user_agents[log_data['IP Address']])
                    if len(ip_request_times[log_data['IP Address']]) > 1:
                        time_intervals = [ip_request_times[log_data['IP Address']][i] - ip_request_times[log_data['IP Address']][i-1] for i in range(1, len(ip_request_times[log_data['IP Address']]))]
                        time_interval_between_requests = sum(time_intervals, timedelta()) / len(time_intervals)
                    else:
                        time_interval_between_requests = timedelta()
                    path_frequency = ip_path_counts[log_data['IP Address']][log_data['Request Path']]

                    # Suspicious pattern detection
                    suspicious_patterns = ['admin', 'wp-login', 'phpmyadmin']
                    suspicious_patterns_found = any(pattern in log_data['Request Path'] for pattern in suspicious_patterns)

                    # OWASP Top 10 Detection
                    sql_injection_detected = bool(sql_injection_pattern.search(log_data['Request Path']))
                    xss_detected = bool(xss_pattern.search(log_data['Request Path'])) or bool(xss_pattern.search(log_data['Referrer']))
                    command_injection_detected = bool(command_injection_pattern.search(log_data['Request Path']))
                    insecure_deserialization_detected = bool(insecure_deserialization_pattern.search(log_data['Request Path']))
                    file_inclusion_detected = bool(file_inclusion_pattern.search(log_data['Request Path']))

                    log_data['Request Length'] = request_length
                    log_data['Query Parameters Count'] = query_parameters_count
                    log_data['Is Secure'] = is_secure
                    log_data['Time of Day'] = time_of_day
                    log_data['Day of Week'] = day_of_week
                    log_data['User-Agent Length'] = user_agent_length
                    log_data['Referrer Length'] = referrer_length
                    log_data['Status Code Category'] = status_code_category
                    log_data['Request Frequency'] = request_frequency
                    log_data['Status Code Distribution'] = str(status_code_distribution)
                    log_data['Request Size Distribution'] = request_size_distribution
                    log_data['User-Agent Diversity'] = user_agent_diversity
                    log_data['Time Interval Between Requests'] = str(time_interval_between_requests)
                    log_data['Path Frequency'] = path_frequency
                    log_data['Suspicious Patterns'] = 'Yes' if suspicious_patterns_found else 'No'
                    log_data['SQL Injection Detected'] = 'Yes' if sql_injection_detected else 'No'
                    log_data['XSS Detected'] = 'Yes' if xss_detected else 'No'
                    log_data['Command Injection Detected'] = 'Yes' if command_injection_detected else 'No'
                    log_data['Insecure Deserialization Detected'] = 'Yes' if insecure_deserialization_detected else 'No'
                    log_data['File Inclusion Detected'] = 'Yes' if file_inclusion_detected else 'No'

                    row_data = {field: log_data.get(field, '') for field in fieldnames}
                    writer.writerow(row_data)

    print(f"CSV file '{output_file_path}' has been created successfully.")






In [14]:
parse_nginx_log('dataset/nginx_logs/nginx_logs/access.log', 'access_logs.csv')

CSV file 'access_logs.csv' has been created successfully.
