In [1]:
import re
import pandas as pd
import numpy as np
from urllib.parse import urlparse, parse_qs
from sklearn.preprocessing import LabelEncoder, StandardScaler
import datetime

def extract_features_from_request(request):
    features = {}

    # 1. HTTP Method
    method_match = re.match(r'^(GET|POST|PUT|DELETE|HEAD|OPTIONS|CONNECT|TRACE|PATCH)', request)
    method = method_match.group(0) if method_match else 'UNKNOWN'
    features['http_method'] = method

    # 2. URI and URI Length
    uri_match = re.search(r'\s(\/[^\s]*)\s', request)
    uri = uri_match.group(1) if uri_match else '/'
    features['uri'] = uri
    features['uri_length'] = len(uri)
    
    # 3. Number of Parameters and Length of Parameter Values
    parsed_uri = urlparse(uri)
    params = parse_qs(parsed_uri.query)
    features['num_parameters'] = len(params)
    features['param_lengths'] = sum(len(value[0]) for value in params.values())

    # 4. Special Characters in URI/Parameters
    special_chars_count = len(re.findall(r'[\'\";<>=/\\%]', uri))
    features['num_special_chars'] = special_chars_count
    
    # 5. HTTP Headers
    headers = {}
    for line in request.split('\n')[1:]:
        if ': ' in line:
            key, value = line.split(': ', 1)
            headers[key.lower()] = value.strip()

    features['user_agent'] = headers.get('user-agent', 'UNKNOWN')
    features['host'] = headers.get('host', 'UNKNOWN')
    features['content_type'] = headers.get('content-type', 'UNKNOWN')
    features['content_length'] = int(headers.get('content-length', 0))
    features['referer'] = headers.get('referer', 'UNKNOWN')
    features['num_cookies'] = len(headers.get('cookie', '').split(';')) if 'cookie' in headers else 0

    # 6. Payload Anomalies (e.g., SQL Injection)
    payload = request.split('\n')[-1].strip() if method in ['POST', 'PUT'] else ''
    sql_keywords = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'DROP', 'UNION']
    features['sql_keywords_count'] = sum(payload.upper().count(keyword) for keyword in sql_keywords)
    
    # 7. Entropy Measures (for URI)
    uri_entropy = -sum((uri.count(char)/len(uri)) * np.log2(uri.count(char)/len(uri)) for char in set(uri)) if uri else 0
    features['uri_entropy'] = uri_entropy

    # 8. Binary Features for Attack Detection
    features['contains_select'] = int('SELECT' in payload.upper())

    # 9. Bigram Features (for URI)
    tokens = re.findall(r'\w+', uri)
    bigrams = [' '.join(bigram) for bigram in zip(tokens[:-1], tokens[1:])]
    features['num_bigrams'] = len(bigrams)

    # 10. Request Anomalies
    invalid_methods = ['TRACE', 'CONNECT', 'PATCH']
    features['invalid_http_method'] = int(method in invalid_methods)
    
    # 11. Behavioral Patterns (Request Length)
    features['request_length'] = len(request)

    return features

def process_requests(requests):
    features_list = [extract_features_from_request(req) for req in requests]
    return pd.DataFrame(features_list)

def preprocess_data(df):
    # Label Encoding for categorical features
    label_encoders = {}
    categorical_features = ['http_method', 'user_agent', 'host', 'content_type', 'referer']

    for col in categorical_features:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Normalization and Scaling
    scaler = StandardScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])

    return df, label_encoders, scaler

def save_to_csv(df, filename='csic2010_features.csv'):
    df.to_csv(filename, index=False)
    print(f"Features extracted and saved to {filename}")

def read_requests_from_file(file_path):
    with open(file_path, 'r') as file:
        requests = file.read().split('\n\n')  # Assuming requests are separated by a blank line
    return requests

# Main function to handle the entire process
def main(input_file, output_file='csic2010_features.csv'):
    raw_requests = read_requests_from_file(input_file)
    df_features = process_requests(raw_requests)
    df_preprocessed, label_encoders, scaler = preprocess_data(df_features)
    save_to_csv(df_preprocessed, output_file)

# Example usage:
# main('raw_requests.txt', 'csic2010_features.csv')


In [2]:
main('anomalousTrafficTest.txt', 'anomalous_csic2010_features.csv')

ValueError: could not convert string to float: '/'

In [8]:
import re
import pandas as pd
import numpy as np
from urllib.parse import urlparse, parse_qs
from sklearn.preprocessing import LabelEncoder, StandardScaler

def extract_features_from_request(request):
    features = {}

    # 1. HTTP Method
    method_match = re.match(r'^(GET|POST|PUT|DELETE|HEAD|OPTIONS|CONNECT|TRACE|PATCH)', request)
    method = method_match.group(0) if method_match else 'UNKNOWN'
    features['http_method'] = method

    # 2. URI and URI Length
    uri_match = re.search(r'\s(\/[^\s]*)\s', request)
    uri = uri_match.group(1) if uri_match else '/'
    features['uri'] = uri
    features['uri_length'] = len(uri)
    
    # 3. Number of Parameters and Length of Parameter Values
    parsed_uri = urlparse(uri)
    params = parse_qs(parsed_uri.query)
    features['num_parameters'] = len(params)
    features['param_lengths'] = sum(len(value[0]) for value in params.values())

    # 4. Special Characters in URI/Parameters
    special_chars_count = len(re.findall(r'[\'\";<>=/\\%]', uri))
    features['num_special_chars'] = special_chars_count
    
    # 5. HTTP Headers
    headers = {}
    for line in request.split('\n')[1:]:
        if ': ' in line:
            key, value = line.split(': ', 1)
            headers[key.lower()] = value.strip()

    features['user_agent'] = headers.get('user-agent', 'UNKNOWN')
    features['host'] = headers.get('host', 'UNKNOWN')
    features['content_type'] = headers.get('content-type', 'UNKNOWN')
    features['content_length'] = int(headers.get('content-length', 0))
    features['referer'] = headers.get('referer', 'UNKNOWN')
    features['num_cookies'] = len(headers.get('cookie', '').split(';')) if 'cookie' in headers else 0

    # 6. Payload Anomalies (e.g., SQL Injection)
    payload = request.split('\n')[-1].strip() if method in ['POST', 'PUT'] else ''
    sql_keywords = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'DROP', 'UNION']
    features['sql_keywords_count'] = sum(payload.upper().count(keyword) for keyword in sql_keywords)
    
    # 7. Entropy Measures (for URI)
    uri_entropy = -sum((uri.count(char)/len(uri)) * np.log2(uri.count(char)/len(uri)) for char in set(uri)) if uri else 0
    features['uri_entropy'] = uri_entropy

    # 8. Binary Features for Attack Detection
    features['contains_select'] = int('SELECT' in payload.upper())

    # 9. Bigram Features (for URI)
    tokens = re.findall(r'\w+', uri)
    bigrams = [' '.join(bigram) for bigram in zip(tokens[:-1], tokens[1:])]
    features['num_bigrams'] = len(bigrams)

    # 10. Request Anomalies
    invalid_methods = ['TRACE', 'CONNECT', 'PATCH']
    features['invalid_http_method'] = int(method in invalid_methods)
    
    # 11. Behavioral Patterns (Request Length)
    features['request_length'] = len(request)

    return features

def process_requests(requests):
    features_list = [extract_features_from_request(req) for req in requests]
    return pd.DataFrame(features_list)

def preprocess_data(df):
    # Label Encoding for categorical features
    label_encoders = {}
    categorical_features = ['http_method', 'user_agent', 'host', 'content_type', 'referer']

    for col in categorical_features:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Selecting numeric columns for scaling
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    return df, label_encoders, scaler

def save_to_csv(df, filename='csic2010_features.csv'):
    df.to_csv(filename, index=False)
    print(f"Features extracted and saved to {filename}")

def read_requests_from_file(file_path):
    with open(file_path, 'r') as file:
        requests = file.read().split('\n\n')  # Assuming requests are separated by a blank line
    return requests

# Main function to handle the entire process
def main(input_file, output_file='csic2010_features.csv'):
    raw_requests = read_requests_from_file(input_file)
    df_features = process_requests(raw_requests)
    df_preprocessed, label_encoders, scaler = preprocess_data(df_features)
    save_to_csv(df_preprocessed, output_file)

# Example usage:
# main('raw_requests.txt', 'csic2010_features.csv')


In [9]:
main('anomalousTrafficTest.txt', 'csic2010_features.csv')

Features extracted and saved to csic2010_features.csv


In [10]:
import pandas as pd

# Function to read raw HTTP requests from a file
def read_requests_from_file(file_path):
    with open(file_path, 'r') as file:
        requests = file.read().split('\n\n')  # Assuming requests are separated by a blank line
    return requests

# Function to extract basic features from raw HTTP requests
def extract_basic_features(requests):
    data = []
    for request in requests:
        request_lines = request.split('\n')
        
        if not request_lines:
            continue

        # Extract HTTP Method, URI, and Protocol
        request_line = request_lines[0].strip()
        request_parts = request_line.split(' ', 2)
        
        # Check if the request line is valid
        if len(request_parts) < 3:
            print(f"Skipping invalid request line: {request_line}")
            continue
        
        method, uri, _ = request_parts
        
        # Extract headers and other fields
        headers = dict(line.split(': ', 1) for line in request_lines[1:] if ': ' in line)
        user_agent = headers.get('User-Agent', '')
        host = headers.get('Host', '')
        content_type = headers.get('Content-Type', '')
        content_length = headers.get('Content-Length', '0')
        referer = headers.get('Referer', '')
        num_cookies = headers.get('Cookie', '').count(';') + 1
        
        # Append extracted features
        data.append({
            'http_method': method,
            'uri': uri,
            'user_agent': user_agent,
            'host': host,
            'content_type': content_type,
            'content_length': int(content_length) if content_length.isdigit() else 0,
            'referer': referer,
            'num_cookies': num_cookies
        })

    # Create DataFrame
    df = pd.DataFrame(data)
    return df

# Function to save DataFrame to CSV
def save_to_csv(df, filename='csic2010_basic_features.csv'):
    df.to_csv(filename, index=False)
    print(f"Features extracted and saved to {filename}")

# Main function to handle the entire process
def main(input_file, output_file='csic2010_basic_features.csv'):
    raw_requests = read_requests_from_file(input_file)
    df_features = extract_basic_features(raw_requests)
    save_to_csv(df_features, output_file)

# Example usage
# main('raw_requests.txt', 'csic2010_basic_features.csv')


In [11]:
main('anomalousTrafficTest.txt', 'anomalous_csic2010_basic_features.csv')

Skipping invalid request line: 
Skipping invalid request line: id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito
Skipping invalid request line: 
Skipping invalid request line: id=2%2F&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=49&B1=A%F1adir+al+carrito
Skipping invalid request line: 
Skipping invalid request line: 
Skipping invalid request line: modo=entrar&login=bob%40%3CSCRipt%3Ealert%28Paros%29%3C%2FscrIPT%3E.parosproxy.org&pwd=84m3ri156&remember=on&B1=Entrar
Skipping invalid request line: 
Skipping invalid request line: modo=entrar&login=grimshaw&pwd=G%2F%2FlAc%2CIAr&remember=on&B1=Entrar
Skipping invalid request line: 
Skipping invalid request line: modo=entrar&login=grimshaw&pwd=84m3ri156&rememberA=on&B1=Entrar
Skipping invalid request line: 
Skipping invalid request line: idA=2
Skipping invalid request line: 
Skipping invalid request line: errorMsg=%2B
Skipping invalid request line:

In [12]:
import re
import math
import csv
from collections import Counter
from urllib.parse import unquote, urlparse, parse_qs

class AdvancedHTTPRequestParser:
    def __init__(self, raw_request):
        self.raw_request = raw_request
        self.headers = {}
        self.body = ''
        self.query_string = ''
        self.method = ''
        self.url = ''
        self.path = ''
        self.parse_request()

    def parse_request(self):
        lines = self.raw_request.split('\n')
    
        if len(lines) == 0 or not lines[0].strip():
            print("Skipping empty or malformed request")
            return False

        request_line = lines[0].strip()

        try:
            self.method, full_url, _ = request_line.split(' ')
        except ValueError:
            print(f"Skipping malformed request line: {request_line}")
            return False

        parsed_url = urlparse(full_url)
        self.url = parsed_url.geturl()
        self.path = parsed_url.path
        self.query_string = parsed_url.query

        header_section = True
        for line in lines[1:]:
            line = line.strip()
            if header_section:
                if line == '':
                    header_section = False
                else:
                    try:
                        key, value = line.split(': ', 1)
                        self.headers[key] = value
                    except ValueError:
                        print(f"Skipping malformed header: {line}")
                        continue  # Skip lines that aren't valid headers
            else:
                self.body += line + '\n'

        self.body = self.body.strip()
        return True


    def parse_parameters(self, param_string):
        params = parse_qs(param_string)
        parsed_params = {key: unquote(value[0]) if value else '' for key, value in params.items()}
        return parsed_params

    def get_query_params(self):
        return self.parse_parameters(self.query_string)

    def get_body_params(self):
        content_type = self.headers.get('Content-Type', '')
        if content_type == 'application/x-www-form-urlencoded':
            return self.parse_parameters(self.body)
        elif content_type == 'application/json':
            try:
                import json
                return json.loads(self.body)
            except json.JSONDecodeError:
                return {}
        elif content_type.startswith('multipart/form-data'):
            return self.parse_multipart_body(self.body, content_type)
        return {}

    def parse_multipart_body(self, body, content_type):
        boundary = content_type.split("boundary=")[-1]
        parts = body.split('--' + boundary)
        params = {}
        for part in parts:
            if 'Content-Disposition' in part:
                header, value = part.split('\r\n\r\n', 1)
                name_match = re.search(r'name="(.+?)"', header)
                if name_match:
                    name = name_match.group(1)
                    params[name] = value.strip()
        return params

    def calculate_entropy(self, string):
        if not string:
            return 0
        entropy = 0
        for x in Counter(string).values():
            p_x = x / len(string)
            entropy += - p_x * math.log2(p_x)
        return entropy

    def character_diversity(self, string):
        return len(set(string)) / len(string) if string else 0

    def contains_attack_signature(self, string):
        attack_patterns = [
            re.compile(r"(?i)(SELECT|INSERT|DELETE|UPDATE|DROP|UNION).*--"),
            re.compile(r"(?i)<script.*?>.*?</script>"),
            re.compile(r"(?i)or\s+1=1"),
            re.compile(r"(?i)char\((\d{1,3},?)+\)"),
            re.compile(r"(?i)admin.*'--")
        ]
        for pattern in attack_patterns:
            if pattern.search(string):
                return True
        return False

    def extract_features(self):
        features = {}

        query_params = self.get_query_params()
        body_params = self.get_body_params()

        features['num_query_params'] = len(query_params)
        features['num_body_params'] = len(body_params)
        features['num_headers'] = len(self.headers)
        features['url_length'] = len(self.url)
        features['path_length'] = len(self.path)

        for param_name, param_value in {**query_params, **body_params}.items():
            features[f'entropy_{param_name}'] = self.calculate_entropy(param_value)
            features[f'diversity_{param_name}'] = self.character_diversity(param_value)
            features[f'contains_attack_signature_{param_name}'] = int(self.contains_attack_signature(param_value))

        features['body_length'] = len(self.body)

        for header_name, header_value in self.headers.items():
            features[f'header_length_{header_name}'] = len(header_value)
            features[f'entropy_header_{header_name}'] = self.calculate_entropy(header_value)
            features[f'contains_attack_signature_header_{header_name}'] = int(self.contains_attack_signature(header_value))

        return features

def process_request_dataset(file_path):
    features_list = []
    with open(file_path, 'r') as file:
        raw_requests = file.read().split('\n\n')  # Assumes each request is separated by a blank line
        for raw_request in raw_requests:
            if raw_request.strip():  # Skip empty requests
                parser = AdvancedHTTPRequestParser(raw_request)
                if parser.parse_request():  # Proceed only if parsing was successful
                    features = parser.extract_features()
                    features_list.append(features)
                else:
                    print("Skipping malformed request")
    return features_list


def save_features_to_csv(features_list, output_file):
    if not features_list:
        print("No features to save.")
        return

    # Extracting the header from the first feature set (keys)
    headers = features_list[0].keys()

    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(features_list)
    print(f"Features saved to {output_file}")



In [13]:
# Example usage
file_path = 'anomalousTrafficTest.txt'
output_csv = 'anomalous_output_features.csv'

features_list = process_request_dataset(file_path)
save_features_to_csv(features_list, output_csv)


Skipping empty or malformed request
Skipping empty or malformed request
Skipping malformed request
Skipping malformed request line: id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito
Skipping malformed request line: id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito
Skipping malformed request
Skipping empty or malformed request
Skipping empty or malformed request
Skipping malformed request
Skipping malformed request line: id=2%2F&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=49&B1=A%F1adir+al+carrito
Skipping malformed request line: id=2%2F&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=49&B1=A%F1adir+al+carrito
Skipping malformed request
Skipping empty or malformed request
Skipping empty or malformed request
Skipping malformed request
Skipping empty or malformed request
Skipping empty or malformed req

ValueError: dict contains fields not in fieldnames: 'contains_attack_signature_login', 'diversity_login', 'entropy_modo', 'entropy_remember', 'entropy_login', 'entropy_pwd', 'diversity_modo', 'contains_attack_signature_modo', 'contains_attack_signature_remember', 'diversity_remember', 'diversity_pwd', 'contains_attack_signature_pwd'

In [14]:
import re
import pandas as pd
import numpy as np
from urllib.parse import urlparse, parse_qs, unquote

def extract_features_from_request(request):
    features = {}

    # Split the request into lines
    lines = request.split('\n')
    
    # Extract the request line (e.g., "GET /path?query HTTP/1.1")
    request_line = lines[0].strip()
    
    # Extract HTTP Method, URL, and HTTP Version
    method_match = re.match(r'^(GET|POST|PUT|DELETE|HEAD|OPTIONS|CONNECT|TRACE|PATCH)\s+(\S+)\s+(HTTP/\d\.\d)$', request_line)
    if method_match:
        method = method_match.group(1)
        url = method_match.group(2)
        http_version = method_match.group(3)
    else:
        method = 'UNKNOWN'
        url = '/'
        http_version = 'UNKNOWN'
    
    features['http_method'] = method
    features['http_version'] = http_version
    
    # Parse URL
    parsed_url = urlparse(url)
    features['uri'] = parsed_url.path
    features['uri_length'] = len(parsed_url.path)
    
    # Parse query parameters
    params = parse_qs(parsed_url.query)
    features['num_parameters'] = len(params)
    features['param_lengths'] = sum(len(unquote(value[0])) for value in params.values())
    
    # Special characters in URI/Parameters
    special_chars_count = len(re.findall(r'[\'\";<>=/\\%]', url))
    features['num_special_chars'] = special_chars_count
    
    # Parse headers
    headers = {}
    for line in lines[1:]:
        if ': ' in line:
            key, value = line.split(': ', 1)
            headers[key.lower()] = value.strip()

    features['user_agent'] = headers.get('user-agent', 'UNKNOWN')
    features['host'] = headers.get('host', 'UNKNOWN')
    features['content_type'] = headers.get('content-type', 'UNKNOWN')
    features['content_length'] = int(headers.get('content-length', 0))
    features['referer'] = headers.get('referer', 'UNKNOWN')
    features['num_cookies'] = len(headers.get('cookie', '').split(';')) if 'cookie' in headers else 0

    # Extract and analyze body for POST requests
    if method == 'POST':
        body_index = request.find('\n\n') + 2
        body = request[body_index:].strip()
    else:
        body = ''
    
    sql_keywords = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'DROP', 'UNION']
    features['sql_keywords_count'] = sum(body.upper().count(keyword) for keyword in sql_keywords)
    
    # Entropy Measures (for URI)
    uri_entropy = -sum((parsed_url.path.count(char)/len(parsed_url.path)) * np.log2(parsed_url.path.count(char)/len(parsed_url.path)) for char in set(parsed_url.path)) if parsed_url.path else 0
    features['uri_entropy'] = uri_entropy
    
    # Binary Features for Attack Detection
    features['contains_select'] = int('SELECT' in body.upper())

    # Bigram Features (for URI)
    tokens = re.findall(r'\w+', parsed_url.path)
    bigrams = [' '.join(bigram) for bigram in zip(tokens[:-1], tokens[1:])]
    features['num_bigrams'] = len(bigrams)

    # Request Anomalies
    invalid_methods = ['TRACE', 'CONNECT', 'PATCH']
    features['invalid_http_method'] = int(method in invalid_methods)
    
    # Behavioral Patterns (Request Length)
    features['request_length'] = len(request)

    return features

def save_to_csv(features_list, filename='csic2010_features.csv'):
    df = pd.DataFrame(features_list)
    df.to_csv(filename, index=False)
    print(f"Features extracted and saved to {filename}")

def read_requests_from_file(file_path):
    with open(file_path, 'r') as file:
        requests = file.read().split('\n\n')  # Assuming requests are separated by a blank line
    return requests

def main(input_file, output_file='csic2010_features.csv'):
    raw_requests = read_requests_from_file(input_file)
    features_list = [extract_features_from_request(request) for request in raw_requests if request.strip()]
    save_to_csv(features_list, output_file)

# Example usage:
# main('anomalousTrafficTest.txt', 'anomalous_csic2010_features.csv')


In [15]:
main('anomalousTrafficTest.txt', 'anomalous_csic2010_features.csv')

Features extracted and saved to anomalous_csic2010_features.csv


### Parser 

In [19]:
import csv
import re
from datetime import datetime, timedelta
import os
from collections import defaultdict

def parse_nginx_log(input_file_path, output_file_path):
    log_pattern = re.compile(
        r'(?P<ip>\S+) - - \[(?P<timestamp>.*?)\] "(?P<method>\S+) (?P<path>\S+) HTTP/\S+" (?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
    )

    # Define fieldnames consistent with the log data structure
    fieldnames = [
        'IP Address', 'Timestamp', 'HTTP Method', 'Request Path', 'Status Code', 
        'Response Size', 'Referrer', 'User Agent', 'Request Length', 'Query Parameters Count', 
        'Is Secure', 'Time of Day', 'Day of Week', 'User-Agent Length', 'Referrer Length', 
        'Status Code Category', 'Request Frequency', 'Status Code Distribution', 
        'Request Size Distribution', 'User-Agent Diversity', 'Time Interval Between Requests', 
        'Path Frequency', 'Suspicious Patterns'
    ]

    # Data structures to calculate additional features
    ip_request_times = defaultdict(list)
    ip_status_code_counts = defaultdict(lambda: defaultdict(int))
    ip_request_sizes = defaultdict(list)
    ip_user_agents = defaultdict(set)
    ip_path_counts = defaultdict(lambda: defaultdict(int))

    # Open the CSV file for writing
    with open(output_file_path, mode='w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        # Open and process the input log file
        with open(input_file_path, 'r') as log_file:
            for log_entry in log_file:
                match = log_pattern.match(log_entry)
                if match:
                    log_data = match.groupdict()

                    # Process the timestamp and other fields
                    timestamp = datetime.strptime(log_data['timestamp'], '%d/%b/%Y:%H:%M:%S %z')
                    log_data['Timestamp'] = timestamp.isoformat()
                    log_data['IP Address'] = log_data.pop('ip')
                    log_data['HTTP Method'] = log_data.pop('method')
                    log_data['Request Path'] = log_data.pop('path')
                    log_data['Status Code'] = log_data.pop('status')
                    log_data['Response Size'] = log_data.pop('size')
                    log_data['Referrer'] = log_data.pop('referrer')
                    log_data['User Agent'] = log_data.pop('user_agent')

                    # Calculate additional fields
                    request_length = len(log_entry)
                    query_parameters_count = len(re.findall(r'[?&]', log_data['Request Path']))
                    is_secure = 'Yes' if log_data['Request Path'].startswith('https://') else 'No'
                    time_of_day = timestamp.strftime('%H:%M:%S')
                    day_of_week = timestamp.strftime('%A')
                    user_agent_length = len(log_data['User Agent'])
                    referrer_length = len(log_data['Referrer'])
                    status_code_category = f"{log_data['Status Code'][0]}xx"
                    
                    # Update data structures for feature calculation
                    ip_request_times[log_data['IP Address']].append(timestamp)
                    ip_status_code_counts[log_data['IP Address']][log_data['Status Code']] += 1
                    ip_request_sizes[log_data['IP Address']].append(int(log_data['Response Size']))
                    ip_user_agents[log_data['IP Address']].add(log_data['User Agent'])
                    ip_path_counts[log_data['IP Address']][log_data['Request Path']] += 1

                    # Calculate new features
                    request_frequency = len(ip_request_times[log_data['IP Address']])
                    status_code_distribution = ip_status_code_counts[log_data['IP Address']]
                    request_size_distribution = sum(ip_request_sizes[log_data['IP Address']]) / len(ip_request_sizes[log_data['IP Address']]) if ip_request_sizes[log_data['IP Address']] else 0
                    user_agent_diversity = len(ip_user_agents[log_data['IP Address']])
                    if len(ip_request_times[log_data['IP Address']]) > 1:
                        time_intervals = [ip_request_times[log_data['IP Address']][i] - ip_request_times[log_data['IP Address']][i-1] for i in range(1, len(ip_request_times[log_data['IP Address']]))]
                        time_interval_between_requests = sum(time_intervals, timedelta()) / len(time_intervals)
                    else:
                        time_interval_between_requests = timedelta()
                    path_frequency = ip_path_counts[log_data['IP Address']][log_data['Request Path']]
                    
                    # Example of a simple suspicious pattern check (customize as needed)
                    suspicious_patterns = ['admin', 'wp-login', 'phpmyadmin']
                    suspicious_patterns_found = any(pattern in log_data['Request Path'] for pattern in suspicious_patterns)

                    # Add calculated fields to log_data
                    log_data['Request Length'] = request_length
                    log_data['Query Parameters Count'] = query_parameters_count
                    log_data['Is Secure'] = is_secure
                    log_data['Time of Day'] = time_of_day
                    log_data['Day of Week'] = day_of_week
                    log_data['User-Agent Length'] = user_agent_length
                    log_data['Referrer Length'] = referrer_length
                    log_data['Status Code Category'] = status_code_category
                    log_data['Request Frequency'] = request_frequency
                    log_data['Status Code Distribution'] = str(status_code_distribution)
                    log_data['Request Size Distribution'] = request_size_distribution
                    log_data['User-Agent Diversity'] = user_agent_diversity
                    log_data['Time Interval Between Requests'] = str(time_interval_between_requests)
                    log_data['Path Frequency'] = path_frequency
                    log_data['Suspicious Patterns'] = 'Yes' if suspicious_patterns_found else 'No'

                    # Ensure log_data only contains the specified fieldnames
                    row_data = {field: log_data.get(field, '') for field in fieldnames}
                    writer.writerow(row_data)

    print(f"CSV file '{output_file_path}' has been created successfully.")

# Interactive input
input_file_path = input("Please enter the path to the input Nginx log file: ").strip()
output_file_path = input("Please enter the path to the output CSV file: ").strip()

# Validate input file
if not os.path.isfile(input_file_path):
    print(f"Error: The file {input_file_path} does not exist.")
else:
    # Ensure the output directory exists
    output_dir = os.path.dirname(output_file_path)
    if output_dir and not os.path.exists(output_dir):
        print(f"Error: The directory {output_dir} does not exist.")
    else:
        parse_nginx_log(input_file_path, output_file_path)


CSV file 'access_logs.csv' has been created successfully.


In [1]:
import pandas as pd

# Load the CSV file
file_path = 'access_logs.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
df.head()


Unnamed: 0,IP Address,Timestamp,HTTP Method,Request Path,Status Code,Response Size,Referrer,User Agent,Request Length,Query Parameters Count,...,User-Agent Length,Referrer Length,Status Code Category,Request Frequency,Status Code Distribution,Request Size Distribution,User-Agent Diversity,Time Interval Between Requests,Path Frequency,Suspicious Patterns
0,172.18.28.9,2024-09-10T00:00:13+05:30,GET,/socket.io/?EIO=4&transport=polling&t=P7O3UpE....,200,1,https://ss.dmrc.org/,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,276,4,...,111,20,2xx,1,"defaultdict(<class 'int'>, {'200': 1})",1.0,1,0:00:00,1,No
1,172.18.28.9,2024-09-10T00:00:14+05:30,POST,/socket.io/?EIO=4&transport=polling&t=P7O3a-e&...,200,2,https://ss.dmrc.org/,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,275,4,...,111,20,2xx,2,"defaultdict(<class 'int'>, {'200': 2})",1.5,1,0:00:01,1,No
2,172.18.28.9,2024-09-10T00:00:31+05:30,GET,/socket.io/?EIO=4&transport=polling&t=P7O3Z9u&...,200,1,http://localhost/,Mozilla/5.0 (Linux; Android 9; SM-M307F Build/...,320,4,...,160,17,2xx,3,"defaultdict(<class 'int'>, {'200': 3})",1.333333,2,0:00:09,1,No
3,172.18.28.9,2024-09-10T00:00:31+05:30,GET,/socket.io/?EIO=4&transport=polling&t=P7O3ZA4&...,200,1,http://localhost/,Mozilla/5.0 (Linux; Android 9; SM-M307F Build/...,320,4,...,160,17,2xx,4,"defaultdict(<class 'int'>, {'200': 4})",1.25,2,0:00:06,1,No
4,172.18.28.9,2024-09-10T00:00:31+05:30,GET,/socket.io/?EIO=4&transport=polling&t=P7O3ZAE&...,200,1,http://localhost/,Mozilla/5.0 (Linux; Android 9; SM-M307F Build/...,320,4,...,160,17,2xx,5,"defaultdict(<class 'int'>, {'200': 5})",1.2,2,0:00:04.500000,1,No


In [22]:
pip install pycaret

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
pip install --upgrade scipy scikitplot



Collecting scipy
  Downloading scipy-1.14.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ---------------------------------------- 60.8/60.8 kB 1.6 MB/s eta 0:00:00


ERROR: Ignored the following versions that require a different python version: 1.6.2 Requires-Python >=3.7,<3.10; 1.6.3 Requires-Python >=3.7,<3.10; 1.7.0 Requires-Python >=3.7,<3.10; 1.7.1 Requires-Python >=3.7,<3.10; 1.7.2 Requires-Python >=3.7,<3.11; 1.7.3 Requires-Python >=3.7,<3.11; 1.8.0 Requires-Python >=3.8,<3.11; 1.8.0rc1 Requires-Python >=3.8,<3.11; 1.8.0rc2 Requires-Python >=3.8,<3.11; 1.8.0rc3 Requires-Python >=3.8,<3.11; 1.8.0rc4 Requires-Python >=3.8,<3.11; 1.8.1 Requires-Python >=3.8,<3.11
ERROR: Could not find a version that satisfies the requirement scikitplot (from versions: none)
ERROR: No matching distribution found for scikitplot

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
pip install scipy==1.8.0

Note: you may need to restart the kernel to use updated packages.


ERROR: Ignored the following yanked versions: 1.11.0, 1.14.0rc1
ERROR: Ignored the following versions that require a different python version: 1.6.2 Requires-Python >=3.7,<3.10; 1.6.3 Requires-Python >=3.7,<3.10; 1.7.0 Requires-Python >=3.7,<3.10; 1.7.1 Requires-Python >=3.7,<3.10; 1.7.2 Requires-Python >=3.7,<3.11; 1.7.3 Requires-Python >=3.7,<3.11; 1.8.0 Requires-Python >=3.8,<3.11; 1.8.0rc1 Requires-Python >=3.8,<3.11; 1.8.0rc2 Requires-Python >=3.8,<3.11; 1.8.0rc3 Requires-Python >=3.8,<3.11; 1.8.0rc4 Requires-Python >=3.8,<3.11; 1.8.1 Requires-Python >=3.8,<3.11
ERROR: Could not find a version that satisfies the requirement scipy==1.8.0 (from versions: 0.8.0, 0.9.0, 0.10.0, 0.10.1, 0.11.0, 0.12.0, 0.12.1, 0.13.0, 0.13.1, 0.13.2, 0.13.3, 0.14.0, 0.14.1, 0.15.0, 0.15.1, 0.16.0, 0.16.1, 0.17.0, 0.17.1, 0.18.0, 0.18.1, 0.19.0, 0.19.1, 1.0.0, 1.0.1, 1.1.0, 1.2.0, 1.2.1, 1.2.2, 1.2.3, 1.3.0, 1.3.1, 1.3.2, 1.3.3, 1.4.0, 1.4.1, 1.5.0, 1.5.1, 1.5.2, 1.5.3, 1.5.4, 1.6.0, 1.6.1, 1.9.0rc1, 1.

In [7]:
import pandas as pd
from pycaret.clustering import setup, create_model, evaluate_model, assign_model, save_model, plot_model

# Load the CSV file
file_path = 'access_logs2.csv'
df = pd.read_csv(file_path)

# Initialize the PyCaret clustering environment
clustering_setup = setup(data=df, session_id=123, normalize=True, verbose=False, memory=None)

# Create a specific clustering model, e.g., KMeans
kmeans_model = create_model('kmeans')

# Evaluate the model
evaluate_model(kmeans_model)

# Assign clusters to the data
clustered_data = assign_model(kmeans_model)

# Display the first few rows of the clustered data
print(clustered_data.head())

# Save the model
save_model(kmeans_model, 'clustering_model')

# Optional: Plot clusters
plot_model(kmeans_model, plot='cluster')


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,-0.0206,1.6519,23.8724,0,0,0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

    IP Address                  Timestamp HTTP Method  \
0  172.18.28.9  2024-09-10T00:00:13+05:30         GET   
1  172.18.28.9  2024-09-10T00:00:14+05:30        POST   
2  172.18.28.9  2024-09-10T00:00:31+05:30         GET   
3  172.18.28.9  2024-09-10T00:00:31+05:30         GET   
4  172.18.28.9  2024-09-10T00:00:31+05:30         GET   

                                        Request Path  Status Code  \
0  /socket.io/?EIO=4&transport=polling&t=P7O3UpE....          200   
1  /socket.io/?EIO=4&transport=polling&t=P7O3a-e&...          200   
2  /socket.io/?EIO=4&transport=polling&t=P7O3Z9u&...          200   
3  /socket.io/?EIO=4&transport=polling&t=P7O3ZA4&...          200   
4  /socket.io/?EIO=4&transport=polling&t=P7O3ZAE&...          200   

   Response Size              Referrer  \
0              1  https://ss.dmrc.org/   
1              2  https://ss.dmrc.org/   
2              1     http://localhost/   
3              1     http://localhost/   
4              1     http://loca