In [3]:
import re
import json
import pandas as pd
from bs4 import BeautifulSoup

# 🔹 Log File Path
log_file = "server_logs.txt"  # Change this to your actual log file

# 🔹 Define the compartments (keys) to extract
compartments = [
    "Host", "User-Agent", "Accept", "Accept-Language", "Accept-Encoding", "Content-Type",
    "Content-Length", "Origin", "Connection", "Referer", "Cookie", "Upgrade-Insecure-Requests",
    "Sec-Fetch-Dest", "Sec-Fetch-Mode", "Sec-Fetch-Site", "Sec-Fetch-User", "Ip",
    "Date", "Server", "Expires", "Cache-Control", "Pragma", "Vary", "Keep-Alive"
]

# 🔹 Regex pattern to match key-value pairs
# Keys are escaped so they are always matched literally. The lookbehind
# rejects a key that is merely the tail of a longer header name; without it
# "Set-Cookie: ..." is captured as "Cookie" and "X-Forwarded-Host: ..." as
# "Host", silently overwriting the real value.
pattern = (
    r"(?<![\w-])(?P<key>"
    + "|".join(re.escape(name) for name in compartments)
    + r"): (?P<value>.+)"
)

# 🔹 Dictionary to store extracted log data
log_data = {}

# 🔹 Read the log file (the handle is only needed for the read itself)
with open(log_file, "r", encoding="utf-8") as file:
    content = file.read()

# 🔹 Extract key-value pairs (headers)
# When a header occurs more than once in the file, the last occurrence wins.
for key, value in re.findall(pattern, content):
    log_data[key] = value

# 🔹 Extract full HTML content (everything from the doctype to end of file)
# The doctype declaration is case-insensitive in HTML, so match it that way.
html_match = re.search(r"<!DOCTYPE html>[\s\S]+", content, re.IGNORECASE)
if html_match:
    raw_html = html_match.group(0)  # Capture the full HTML content
    log_data["HTML"] = raw_html  # Store raw HTML

    # 🔹 Parse HTML with BeautifulSoup
    soup = BeautifulSoup(raw_html, "html.parser")

    # Extract useful HTML elements.
    # `.string` is None when <title> is empty or contains nested tags, so fall
    # back to "No Title" in that case too; str() turns the NavigableString
    # into a plain string.
    title_tag = soup.title
    if title_tag is not None and title_tag.string is not None:
        log_data["HTML_Title"] = str(title_tag.string)
    else:
        log_data["HTML_Title"] = "No Title"
    log_data["HTML_Headings"] = [h.get_text() for h in soup.find_all(['h1', 'h2', 'h3'])]
    log_data["HTML_Links"] = [a['href'] for a in soup.find_all('a', href=True)]
    log_data["HTML_Text"] = soup.get_text()

# 🔹 Print extracted log data
for key, value in log_data.items():
    # str() makes the 300-character cap apply to list values as well
    # (slicing a list would cap the number of elements instead).
    print(f"{key}: {str(value)[:300]}\n")  # Print first 300 characters for readability

# 🔹 Save to JSON (Optional)
with open("parsed_logs.json", "w", encoding="utf-8") as json_file:
    json.dump(log_data, json_file, indent=4, ensure_ascii=False)

# 🔹 Save to CSV (Optional)
df = pd.DataFrame([log_data])  # One row; list values become a single cell each
df.to_csv("parsed_logs.csv", index=False)


Host: localhost

User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0

Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8

Accept-Language: en-US,en;q=0.5

Accept-Encoding: gzip, deflate, br

Content-Type: text/html;charset=utf-8

Content-Length: 4270

Origin: http://localhost

Connection: Keep-Alive

Referer: http://localhost/DVWA/vulnerabilities/exec/

Cookie: security=low; PHPSESSID=6pm6rok1377ehidgaft7949hos

Upgrade-Insecure-Requests: 1

Sec-Fetch-Dest: document

Sec-Fetch-Mode: navigate

Sec-Fetch-Site: same-origin

Sec-Fetch-User: ?1

Date: Tue, 22 Oct 2024 02:21:00 GMT

Server: Apache/2.4.58 (Debian)

Expires: Tue, 23 Jun 2009 12:00:00 GMT

Cache-Control: no-cache, must-revalidate

Pragma: no-cache

Vary: Accept-Encoding

Keep-Alive: timeout=5, max=100

HTML: <!DOCTYPE html>

<html lang="en-GB">

	<head>
		<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

		<title>Vulnerability: Comman