In [None]:
import pandas as pd
import re
from collections import Counter

# Read the capture export from disk into a DataFrame.
file_path = 'cybersecl3.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Pull out the Info column (the text that is scanned for SMTP commands below).
info_column = data.loc[:, 'Info']

# SMTP commands to look for in each Info entry
smtp_commands = ["EHLO", "MAIL FROM", "RCPT TO", "DATA", "QUIT"]

# Single alternation pattern built from smtp_commands. Scanning once with
# findall() yields the commands in the order they appear in the text and keeps
# repeats, which is what the sequence/anomaly reporting needs.
_smtp_command_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(cmd) for cmd in smtp_commands) + r")\b"
)

# Shared accumulators filled in by analyze_smtp_commands()
command_frequency = Counter()   # command -> number of occurrences seen
command_sequences = []          # one list of commands per Info entry that matched


def analyze_smtp_commands(info):
    """Record the SMTP commands found in one Info entry.

    Every occurrence of a command listed in ``smtp_commands`` is counted in
    ``command_frequency``, and the ordered list of occurrences (duplicates
    included) is appended to ``command_sequences``. Entries with no command
    add nothing.

    Args:
        info: The Info text of one row. Non-string values (for example the
            float NaN pandas uses for an empty cell) are ignored instead of
            raising ``TypeError`` inside the regex engine.

    Returns:
        None. Results are accumulated in the module-level containers.
    """
    if not isinstance(info, str):
        return

    commands = _smtp_command_pattern.findall(info)
    if commands:
        command_frequency.update(commands)
        command_sequences.append(commands)

# Run the analyzer over every Info entry. It is called purely for its side
# effects on command_frequency / command_sequences, so a plain loop is used
# rather than Series.apply (which would build and discard a result Series).
for info in info_column:
    analyze_smtp_commands(info)

# Frequency count of SMTP commands
print("Frequency of SMTP Commands:")
for cmd, freq in command_frequency.items():
    print(f"{cmd}: {freq}")

# One line per recorded command sequence
print("\nCommand Sequences:")
for sequence in command_sequences:
    print(" -> ".join(sequence))

# Most frequent command. most_common(1) returns an empty list when no command
# was found at all, so guard before indexing instead of raising IndexError.
top_commands = command_frequency.most_common(1)
most_frequent_command = top_commands[0] if top_commands else None
if most_frequent_command is not None:
    print(f"\nMost frequent command: {most_frequent_command[0]} with {most_frequent_command[1]} occurrences.")
else:
    print("\nNo SMTP commands were found in the Info column.")

# Flag sequences that repeat MAIL FROM or RCPT TO.
# NOTE: the number printed is the 1-based position within command_sequences,
# which only holds entries where at least one command matched, so it is not
# necessarily the row number in the CSV.
print("\nAnomalies or Unusual Patterns:")
for position, sequence in enumerate(command_sequences, start=1):
    if sequence.count("MAIL FROM") > 1 or sequence.count("RCPT TO") > 1:
        print(f"Unusual sequence at line {position}: {' -> '.join(sequence)}")


Frequency of SMTP Commands:
EHLO: 1000
MAIL FROM: 1000
RCPT TO: 1000
DATA: 1000
QUIT: 1000

Command Sequences:
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DATA -> QUIT
EHLO -> MAIL FROM -> RCPT TO -> DA

In [None]:
import pandas as pd
from collections import Counter

# Read the capture export from disk into a DataFrame.
file_path = 'cybersecl3.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Split out the three columns used by the analysis below, in one unpacking step.
source_ips, destination_ips, info_column = (
    data[column] for column in ('Source', 'Destination', 'Info')
)

# Peer maps built from the packets that carry a mail transaction:
#   source_to_recipients:   source IP      -> set of destination IPs it sent to
#   recipient_from_sources: destination IP -> set of source IPs it received from
source_to_recipients = {}
recipient_from_sources = {}

# Analyze each SMTP packet
for src_ip, dest_ip, info in zip(source_ips, destination_ips, info_column):
    # A non-string Info value (e.g. the float NaN pandas uses for an empty
    # cell) cannot be searched with `in`; skip it instead of raising TypeError.
    if not isinstance(info, str):
        continue

    # Only rows whose Info text contains BOTH a MAIL FROM and a RCPT TO are counted.
    if "MAIL FROM" in info and "RCPT TO" in info:
        # The endpoints are identified by packet IP addresses; the mailbox
        # addresses inside the commands are not parsed here.
        sender = src_ip
        recipient = dest_ip

        # Track every destination a given source has sent to
        source_to_recipients.setdefault(sender, set()).add(recipient)

        # Track every source a given destination has received from
        recipient_from_sources.setdefault(recipient, set()).add(sender)

# An IP is flagged only when it is linked to MORE than this many distinct peers.
SUSPICIOUS_PEER_THRESHOLD = 5
# How many of the busiest IPs to list in each "Top ..." table.
TOP_IP_COUNT = 5

# Identify suspicious behavior: sources with many destinations, and
# destinations with many sources.
suspicious_sources = {
    src: recipients
    for src, recipients in source_to_recipients.items()
    if len(recipients) > SUSPICIOUS_PEER_THRESHOLD
}
suspicious_recipients = {
    recipient: sources
    for recipient, sources in recipient_from_sources.items()
    if len(sources) > SUSPICIOUS_PEER_THRESHOLD
}

# Summarize findings
print("Suspicious Behavior Analysis:")
print("\nSources sending emails to multiple recipients:")
for src, recipients in suspicious_sources.items():
    print(f"{src} -> {len(recipients)} recipients")

print("\nRecipients receiving emails from multiple sources:")
for recipient, sources in suspicious_recipients.items():
    print(f"{recipient} <- {len(sources)} sources")

# Overall frequency of each IP. These counters cover every row of the Source /
# Destination columns, not only the rows that matched the mail-command filter.
source_frequency = Counter(source_ips)
destination_frequency = Counter(destination_ips)

# List the IPs that appear most often on each side
print("\nTop Source IPs by Email Exchanges:")
for ip, count in source_frequency.most_common(TOP_IP_COUNT):
    print(f"{ip}: {count} exchanges")

print("\nTop Destination IPs by Email Exchanges:")
for ip, count in destination_frequency.most_common(TOP_IP_COUNT):
    print(f"{ip}: {count} exchanges")


Suspicious Behavior Analysis:

Sources sending emails to multiple recipients:

Recipients receiving emails from multiple sources:

Top Source IPs by Email Exchanges:
172.16.0.3: 231 exchanges
192.168.1.30: 208 exchanges
192.168.1.20: 200 exchanges
192.168.1.10: 182 exchanges
10.0.0.5: 179 exchanges

Top Destination IPs by Email Exchanges:
192.168.1.30: 217 exchanges
172.16.0.3: 210 exchanges
10.0.0.5: 201 exchanges
192.168.1.10: 193 exchanges
192.168.1.20: 179 exchanges


In [None]:
import pandas as pd
import re

# Read the capture export from disk into a DataFrame.
file_path = 'cybersecl3.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Narrow the frame to the three columns the session parser reads.
session_columns = ['Source', 'Destination', 'Info']
sessions = data[session_columns]

# Regular expressions for SMTP commands and email validation.
# mail_from_regex / rcpt_to_regex match the command followed by an address in
# angle brackets and capture the address in group 1.
mail_from_regex = r"MAIL FROM:\s*<([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})>"
rcpt_to_regex = r"RCPT TO:\s*<([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})>"
quit_regex = r"\bQUIT\b"
data_regex = r"\bDATA\b"
# End-of-data marker: a line consisting of a single dot. The inline (?m) flag
# lets ^ and $ anchor at every line, so the marker is also found inside a
# multi-line Info value (without it, only a value that is "." as a whole
# matches). The optional \r accepts CRLF line endings.
end_of_data_regex = r"(?m)^\.\r?$"

# Session store: source IP -> {'commands': [...], 'issues': [...]}.
# NOTE(review): rows are grouped by source IP only, so every row from the same
# source is folded into one "session" regardless of destination.
smtp_sessions = {}
current_session_id = 0  # not referenced anywhere in this cell

# Walk the rows in file order and record which SMTP commands each source issued.
for src, info in zip(sessions['Source'], sessions['Info']):
    session = smtp_sessions.setdefault(src, {'commands': [], 'issues': []})

    # A non-string Info value (e.g. the float NaN pandas uses for an empty
    # cell) cannot be searched by the regex engine; skip it instead of raising
    # TypeError.
    if not isinstance(info, str):
        continue

    # MAIL FROM: detect the command itself first, then validate its address
    # with the strict pattern. Testing the strict pattern for both steps (as
    # before) meant the "invalid address" branch could never run.
    # NOTE(review): the strict pattern also rejects an empty path "<>" --
    # confirm whether that should count as invalid.
    if re.search(r"\bMAIL FROM:", info):
        session['commands'].append('MAIL FROM')
        if not re.search(mail_from_regex, info):
            session['issues'].append("Invalid MAIL FROM address")

    # RCPT TO: same detect-then-validate approach.
    if re.search(r"\bRCPT TO:", info):
        session['commands'].append('RCPT TO')
        if not re.search(rcpt_to_regex, info):
            session['issues'].append("Invalid RCPT TO address")

    if re.search(data_regex, info):
        session['commands'].append('DATA')

    if re.search(end_of_data_regex, info):
        session['commands'].append('END OF DATA')

    if re.search(quit_regex, info):
        session['commands'].append('QUIT')

# Commands considered by the ordering check below (constant, so defined once
# instead of on every loop iteration).
expected_flow = ['EHLO', 'MAIL FROM', 'RCPT TO', 'DATA', 'END OF DATA', 'QUIT']

# Analyze sessions for issues. Findings are appended to each session's own
# 'issues' list, after any issues recorded while parsing.
malformed_sessions = []
for session_id, session in smtp_sessions.items():
    commands = session['commands']
    issues = session['issues']

    # Check for missing QUIT command
    has_quit = 'QUIT' in commands
    if not has_quit:
        issues.append("Missing QUIT command")

    # Check for unclosed DATA command
    if 'DATA' in commands and 'END OF DATA' not in commands:
        issues.append("Unfinished DATA command")

    # Check command sequence: report any expected command whose FIRST
    # occurrence comes after the first QUIT. This is only meaningful when a
    # QUIT exists; calling commands.index('QUIT') without one raises
    # ValueError, which previously crashed on exactly the sessions flagged as
    # "Missing QUIT command".
    if has_quit:
        quit_position = commands.index('QUIT')
        for command in expected_flow:
            if command in commands and commands.index(command) > quit_position:
                issues.append(f"Out-of-order command: {command}")

    # Append session if any issues are found
    if issues:
        malformed_sessions.append({'session_id': session_id, 'issues': issues})

# Print the report: a header, then either one section per malformed session
# or a single "nothing found" line.
print("Malformed SMTP Sessions Report:")
if not malformed_sessions:
    print("No malformed sessions detected.")
else:
    for entry in malformed_sessions:
        session_label = entry['session_id']
        print(f"\nSession ID: {session_label}")
        print("Issues:")
        # One bullet per recorded issue, in the order they were detected
        for problem in entry['issues']:
            print(f"  - {problem}")


Malformed SMTP Sessions Report:

Session ID: 192.168.1.20
Issues:
  - Unfinished DATA command

Session ID: 10.0.0.5
Issues:
  - Unfinished DATA command

Session ID: 192.168.1.30
Issues:
  - Unfinished DATA command

Session ID: 172.16.0.3
Issues:
  - Unfinished DATA command

Session ID: 192.168.1.10
Issues:
  - Unfinished DATA command
