## Table of contents
-  [Task A](#Task_A)
-  [Task B](#Task_B)
-  [Task C](#Task_C)

In [83]:
# !pip install beautifulsoup4
# !pip install html5lib
# !pip install python-whois

## Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

import whois

# <a id='Task_A'>Task A. Automation Scripting</a>

### A1

In [84]:
# gets HTML of webpage
url = 'https://www.secureworks.com/blog/opsec-mistakes-reveal-cobalt-mirage-threat-actors'
result = requests.get(url)
soup = BeautifulSoup(result.text, "html.parser")

# extracts IP addresses, URLs and hashes from table
table = soup.find_all('table')
df = pd.read_html(str(table))[0]
display(df)

Unnamed: 0,Indicator,Type,Context
0,gupdate.us,Domain name,TunnelFish C2 server used by COBALT MIRAGE
1,msupdate.top,Domain name,TunnelFish C2 server used by COBALT MIRAGE
2,193.142.59.174,IP address,Hosting TunnelFish domains used by COBALT MIRAGE
3,172.245.26.118,IP address,Staging and distributing COBALT MIRAGE malware
4,mssync.one,Domain name,Suspected C2 server linked to COBALT MIRAGE
5,upmirror.top,Domain name,Suspected C2 server linked to COBALT MIRAGE
6,104.168.117.149,IP address,Hosting COBALT MIRAGE domains
7,69314c1969f28bfab34683769286326e25d9a0f07 c4b...,SHA256 hash,TunnelFish malware used by COBALT MIRAGE
8,f38f3a1cda90229434e8ab8c59342838106b9778,SHA1 hash,TunnelFish malware used by COBALT MIRAGE
9,00e4c488558492b80fd27d51b159a099,MD5 hash,TunnelFish malware used by COBALT MIRAGE


In [102]:
#removes all whitespaces and display results in a list
extracted = [x.replace(" ", "") for x in df['Indicator']]
print("Extracted IP addresses, URLs and hashes:", extracted)

Extracted IP addresses, URLs and hashes: ['gupdate.us', 'msupdate.top', '193.142.59.174', '172.245.26.118', 'mssync.one', 'upmirror.top', '104.168.117.149', '69314c1969f28bfab34683769286326e25d9a0f07c4bad3443d08efe4f43e0a8', 'f38f3a1cda90229434e8ab8c59342838106b9778', '00e4c488558492b80fd27d51b159a099']


### A2

In [91]:
#gets domain and ip addresses 
df_domains = df.loc[df['Type'].isin(['Domain name', 'IP address'])]
domains = df_domains['Indicator']
# print(domains)

#empty df to store results
results_df = pd.DataFrame()

for domain in domains:
    #extracting whois information for each domain
    whois_info = whois.whois(domain)
    whois_info_df = pd.DataFrame(whois_info.items())
    
    #transpose df for csv format
    whois_info_df = whois_info_df.T
    #set header and reset index
    whois_info_df = whois_info_df.rename(columns=whois_info_df.iloc[0]).drop(df.index[0]).reset_index(drop=True)
    
    #add domain extracted as new column
    whois_info_df.insert(0, 'Domain/IP', domain)
    
    #append results
    results_df = pd.concat([results_df, whois_info_df], ignore_index=True)

#export to csv, fill NA values
display(results_df)
results_df.to_csv("taskA.csv", encoding='utf-8', index=False, na_rep='NA')

Unnamed: 0,Domain/IP,domain_name,domain__id,whois_server,registrar,registrar_id,registrar_url,registrar_email,registrar_phone,status,...,updated_date,referral_url,emails,dnssec,name,org,address,city,state,country
0,gupdate.us,gupdate.us,DD745E60FBAD445BBA4F822873796C6A3-GDREG,whois.namecheap.com,NAMECHEAP INC,1068.0,http://www.namecheap.com,abuse@namecheap.com,"[+1.6613102107, +1.9854014545]",[clientTransferProhibited https://icann.org/ep...,...,"[2023-06-06 09:22:11, 0001-01-01 00:00:00]",,,,,,,,,
1,msupdate.top,,,,,,,,,,...,,,,,,,,,,
2,193.142.59.174,,,,,,,,,ASSIGNED PA,...,,,"[abuse@ripe.net, hostmaster@ripe.net, 'abuse@h...",,,,,,,
3,172.245.26.118,"[COLOCROSSING.COM, colocrossing.com]",,whois.names4ever.com,Hostopia Canada Corp,,,,,[clientTransferProhibited https://icann.org/ep...,...,2023-06-22 15:16:46,,"[dns@cs.aplus.net, dns@aplus.net, corp-domains...","[unsigned, Unsigned]",Domain Admin,"Deluxe Small Business Sales, Inc. Aplus.net",801 Marquette Ave S 801 Marquette Ave S,Minneapolois,MN,US
4,mssync.one,mssync.one,,https://porkbun.com/whois,Porkbun,,,,,[pendingDelete https://icann.org/epp#pendingDe...,...,2023-08-09 12:04:40,,abuse@porkbun.com,unsigned,REDACTED FOR PRIVACY,"Private by Design, LLC",REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,NC,US
5,upmirror.top,upmirror.top,,whois.namesilo.com,"NameSilo,LLC",,,,,[clientTransferProhibited https://icann.org/ep...,...,2023-08-02 17:33:53,,abuse@namesilo.com,unsigned,REDACTED FOR PRIVACY,See PrivacyGuardian.org,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,AZ,US
6,104.168.117.149,"[COLOCROSSING.COM, colocrossing.com]",,whois.names4ever.com,Hostopia Canada Corp,,,,,[clientTransferProhibited https://icann.org/ep...,...,2023-06-22 15:16:46,,"[dns@cs.aplus.net, dns@aplus.net, corp-domains...","[unsigned, Unsigned]",Domain Admin,"Deluxe Small Business Sales, Inc. Aplus.net",801 Marquette Ave S 801 Marquette Ave S,Minneapolois,MN,US


# <a id='Task_B'>Task B. Cyber Threat Analysis</a>

### B1. From the extracted IOCs, outline the type of enrichments that can facilitate cyber threat investigation.

From the extracted WHOIS data, we can perform further analysis to facilitate cyber threat investigation. For example, with the domain names and IP addresses, we can check if the domains have been previously associated with malicious activity in threat intelligence databases.
Furthermore, information such as the domain registrar, creation date, and owner details could shed light on the threat actors or organizations involved. Geolocation data such as address, postal code and country can also help to identify the physical location of the IP addresses to understand potential geographic targets of the threat.

As for the file hashes, we can query threat intelligence databases to determine if the hashes are associated with known malware or malicious files. In addition, we will be able to compare the extracted hashes against our organisation's internal hash database to identify any matches or similarities. Lastly, it is possible to examine metadata from files, such as creation and modification dates, author information, and digital signatures to uncover more information about the files, similar to what the article has mentioned.

Email addresses can also be checked if they have been linked to phishing campaigns, spam, or other malicious activities using known databases and CVE reports.

As for the URLS, we can investigate the content to identify any malicious payloads or indicators on the associated web pages by scanning to assess if the URLs are known to host malware or phishing content.

Lastly, with names and entities, we can perform Open Source Intelligence (OSINT) to gather more information about individuals, organizations, or companies mentioned in the report. For example, just as the article mentioned, 
social media activities can be tracked to uncover more information on social media profiles, posts, and interactions to gather additional context about the named individuals or entities.

### B2. How would you surface potentially unknown IOCs from the list of IOCs in the report?

Firstly, we should start by following the points mentioned in section B1 above, which may be helpful in helping us to surface potentially unknown IOCs as well.

Additionally, we can look for patterns or anomalies within the known IOCs. This could include variations in domain names, slight modifications to file hashes, or similarities in naming conventions. Comparing known IOCs against each other can help us to identify any commonalities that might suggest a larger infrastructure or campaign.

For domains and URLs, we can use domain generation algorithms (DGAs) to generate potential domain variations that may be associated with the same threat actor, and check for subdomains or subdirectories that might have been missed in the initial analysis.

For IP addresses, it is worth investigating IP addresses that are in the same subnet or IP range as the known IP addresses. Search for IP addresses that have communicated with the known IP addresses can potentially indicate a larger network of compromised systems. Furthermore, we can deploy honeypots or network sensors to capture and analyze traffic associated with the known IOCs. This may reveal new and previously unknown connections.

Lastly, we can make use of machine learning. As the data is usually very large and complex, machine learning models can help us in identifying potentially unknown IOCs by analysing the relationships and patterns within the known IOCs and discover hidden connections.

# <a id='Task_C'>Task C. Analytics Development</a>

By inspection, each line of the log file contains fields such as:
1. Timestamp
2. Unique Request ID
3. Source IP
4. Source Port Number
5. Destination IP
6. Destination Port Number
7. HTTP Request Method 
8. Domain Name
9. Requested Path
10. User-Agent info
11. HTTP Response Code (and status)

We will be using the relevant fields **1, 3, 9, 10 and 11** for the analysis.

Assumptions:
The enterprise web server has a decently robust cybersecurity defence set up, hence only HTTP error codes will be considered. Attackers have not found a way to send malicious request (200), the server does not redirect upon errors (302).

<h2>Algorithm</h2>

- Firstly, if the User-Agent contains keywords of common scanning and recon tools such as "Nmap" or "DirBuster" etc, or "Nikto" "Gobuster", immediately flag as suspicious.  


- Next, if there are sensitive path access attempts, such as  "/etc/passwd" or "/admin", count the number of times this occurs and if it exceeds a predetermined threshold value, flag it as suspicious.


- Similarly, if there are a large number of bad request attempts from a particular IP address (eg error code >400), it may indicate that there are some brute force efforts aimed at discovering vulnerabilities or sensitive information on the web server. 


- Lastly, to be safe, we will do a IP Frequency Analysis where we calculate the frequency of requests from each IP address over a specific time window (e.g., one hour). Flag IP addresses with an abnormally high request frequency compared to the average. High-frequency requests could indicate automated scanning.

In this case, we will also flag highly suspicious IPs that met most of the criteria (3 out of 4 checks), in addition to the list of suspicious IPs.

We may also adjust the threshold values based on known activity and traffic patterns of the web sever. In this case low values were chosen as a safer measure.

In [81]:
# Load log file to dataframe for analysis
file_path = 'http.log'
colnames=["timestamp","source_ip","requested_path","user_agent","response_code"]
log_df = pd.read_csv(file_path, delimiter='\t',names=colnames, usecols = [0,2,9,11,14])

display(log_df.head())

  log_df = pd.read_csv(file_path, delimiter='\t',names=colnames, usecols = [0,2,9,11,14])


Unnamed: 0,timestamp,source_ip,requested_path,user_agent,response_code
0,1331901000.0,192.168.202.79,/DEASLog02.nsf,Mozilla/5.0 (compatible; Nmap Scripting Engine...,404
1,1331901000.0,192.168.202.79,/DEASLog03.nsf,Mozilla/5.0 (compatible; Nmap Scripting Engine...,404
2,1331901000.0,192.168.202.79,/DEASLog04.nsf,Mozilla/5.0 (compatible; Nmap Scripting Engine...,404
3,1331901000.0,192.168.202.79,/DEASLog05.nsf,Mozilla/5.0 (compatible; Nmap Scripting Engine...,404
4,1331901000.0,192.168.202.79,/DEASLog.nsf,Mozilla/5.0 (compatible; Nmap Scripting Engine...,404


In [103]:
# Initialize data structures and constants
results = []  # List to store suspicious ips

common_scanning_tools = ["Nmap","DirBuster","Nikto","GoBuster"]
sensitive_paths = ["/etc/passwd", "/admin","/login/admin","/admin.php"]
http_error_codes = ["204","400","401","403","404","429","500","503"]
sensitive_path_threshold = 20
http_error_threshold = 20
requests_threshold = 3600

# Select all rows with common tools used in reconnaissance
mask = log_df['user_agent'].str.contains('|'.join(common_scanning_tools), case=False)
# Apply the mask to the DataFrame to select the desired rows
filtered_df = log_df[mask]
# Add suspicious ips to results
results += list(filtered_df['source_ip'].unique())


# Check for sensitive path access
mask = log_df['requested_path'].str.contains('|'.join(sensitive_paths), case=False)
# Apply the mask to the DataFrame to select the desired rows
filtered_df = log_df[mask]
# Count number of times a particular ip tried to access sensitive paths
filtered_df = filtered_df.groupby(['source_ip']).size().reset_index(name='counts')
# Filter results based on threshold
filtered_df = filtered_df[filtered_df['counts'] >= sensitive_path_threshold]
# Append ips to result
results += list(filtered_df['source_ip'])


# Check for HTTP codes of interest
filtered_df = log_df[log_df['response_code'].isin(http_error_codes)]
# Count number of times a particular ip encounted http error codes
filtered_df = filtered_df.groupby(['source_ip']).size().reset_index(name='counts')
# Filter results based on threshold
filtered_df = filtered_df[filtered_df['counts'] >= http_error_threshold]
# Append ips to result
results += list(filtered_df['source_ip'])


# Check frequency of requests
all_ips = list(log_df['source_ip'].unique())
for source_ip in all_ips:
    filtered_df = log_df[log_df['source_ip'] == source_ip].reset_index()
    # Select middle entry
    middle_index = len(filtered_df) // 2
    # Get timestamp of middle index of particular IP
    timestamp = filtered_df.iloc[middle_index]['timestamp']

    # Get all entrys within 1hr period (+- 30mins)
    lower_bound = timestamp - 1800
    upper_bound = timestamp + 1800

    filtered_df = filtered_df[(filtered_df['timestamp'] >= lower_bound) & (filtered_df['timestamp'] <= upper_bound)]
    # If there are too many requests in 1hr, flag as suspicious
    if len(filtered_df) > 3600:
        results.append(source_ip)
        
# Get unique IPs
suspicious_ips = list(set(results))
print("Suspicious IPs:", suspicious_ips)

# Get IPs that fulfills 3 out of 4 conditions
highly_suspicious_ips = [item for item in set(results) if results.count(item) >= 3 ]
print("\nHighly Suspicious IPs:", highly_suspicious_ips)

Suspicious IPs: ['192.168.202.136', '192.168.202.112', '192.168.202.100', '192.168.202.110', '192.168.202.103', '192.168.202.118', '192.168.202.144', '192.168.202.125', '192.168.202.138', '192.168.202.102', '192.168.202.4', '2001:dbb:c18:202:20c:29ff:fe41:4be7', '192.168.203.64', '192.168.202.76', '192.168.203.61', '2001:dbb:c18:202:20c:29ff:fe93:571e', '192.168.202.96', '192.168.202.140', '192.168.203.45', '192.168.202.79', '192.168.202.68', '192.168.203.63', '192.168.202.141', '192.168.202.94', '192.168.202.90', '192.168.204.45', '192.168.202.108']

Highly Suspicious IPs: ['192.168.202.110', '192.168.202.118', '192.168.202.125', '192.168.202.138', '192.168.202.102', '192.168.202.140', '192.168.202.79', '192.168.203.63']
