# In [None]:  (Jupyter cell prompt left over from the notebook export)
# Import libraries
import pandas as pd
import re
import ipaddress
import tldextract


# Load the input CSV; the path is relative to the current working directory.
df = pd.read_csv('./df_pre_process_I.csv')


# Create the IOC column with the value 'Não' ("No") on every row.
# The search steps below overwrite it with 'Sim' ("Yes") on rows where they find an IoC.
df['IOC'] = 'Não'

# Module-level handle on the 'full_text_ioc' column; the buscar_IoC_* functions
# read the text of each post from it by index label.
text_post = df['full_text_ioc']

#--------------------------------------------------------------
    
# Search for IPv4 IoCs    
def buscar_IoC_ip(pattern, *, frame=None, output_path='ipv4_list.txt'):
    """Flag posts that mention a public IPv4 address and dump the addresses.

    Every regex match found in the ``full_text_ioc`` column is parsed with
    :mod:`ipaddress`; only addresses that are global and not reserved are
    kept.  Rows with at least one kept address get ``IP4 = 1`` and
    ``IOC = 'Sim'``.

    Args:
        pattern: Regular expression (string or compiled) for IPv4 candidates.
        frame: DataFrame to scan and flag in place.  Defaults to the
            module-level ``df``.
        output_path: Text file that receives one kept address per line
            (duplicates preserved, in order of appearance).

    Returns:
        None.  Results are the in-place flags and the written file.
    """
    if frame is None:
        frame = df
    compiled = re.compile(pattern)
    ipv4_list = []  # kept IPv4 addresses, in order of appearance

    for index, text in frame['full_text_ioc'].items():
        # Empty CSV cells are loaded as NaN (a float), which ``re`` rejects
        # with a TypeError; such rows simply have nothing to search.
        if not isinstance(text, str):
            continue
        for found in compiled.finditer(text):
            candidate = found.group()
            try:
                ip_obj = ipaddress.ip_address(candidate)
            except ValueError:
                # The regex can be looser than ``ipaddress`` (e.g. octets
                # with a leading zero, which recent Python versions refuse).
                # A malformed candidate is not an IoC and must not abort
                # the whole run.
                continue
            # For a single address the reserved check on the address itself
            # is equivalent to building a /32 network around it.
            if ip_obj.is_global and not ip_obj.is_reserved:
                ipv4_list.append(candidate)
                frame.at[index, 'IP4'] = 1
                frame.at[index, 'IOC'] = 'Sim'

    # Save the list of IPv4s to a text file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(ipv4 + '\n' for ipv4 in ipv4_list)
            
# Call the search function with the IPv4 pattern.
# Pattern anatomy:
#   (?<!\d)(?<!\d\.)   the match may not start right after a digit, nor right
#                      after a digit followed by a dot
#   25[0-5] | 2[0-4][0-9] | 1?[0-9][0-9]?
#                      one octet; the last alternative also accepts a leading
#                      zero such as "01"
#   (\.<octet>){3}     three more dot-separated octets
#   (?!\d)             the match may not be followed by another digit
buscar_IoC_ip(r"(?<!\d)(?<!\d\.)(25[0-5]|2[0-4][0-9]|1?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|1?[0-9][0-9]?)){3}(?!\d)")

#--------------------------------------------------------------


# Search for URL IoCs 
def buscar_IoC_url(pattern, *, frame=None, output_path='url_list.txt'):
    """Flag posts that contain a URL and dump every URL found.

    Each match of ``pattern`` in the ``full_text_ioc`` column is recorded;
    the row gets ``URL = 1`` and ``IOC = 'Sim'``.

    Args:
        pattern: Regular expression (string or compiled) that matches a URL.
        frame: DataFrame to scan and flag in place.  Defaults to the
            module-level ``df``.
        output_path: Text file that receives one URL per line (duplicates
            preserved, in order of appearance).

    Returns:
        None.  Results are the in-place flags and the written file.
    """
    if frame is None:
        frame = df
    compiled = re.compile(pattern)
    url_list = []  # URLs found, in order of appearance

    for index, text in frame['full_text_ioc'].items():
        # Empty CSV cells are loaded as NaN (a float), which ``re`` rejects
        # with a TypeError; such rows simply have nothing to search.
        if not isinstance(text, str):
            continue
        for found in compiled.finditer(text):
            url_list.append(found.group(0))
            frame.at[index, 'URL'] = 1
            frame.at[index, 'IOC'] = 'Sim'

    # Save the list of URLs to a text file.  The encoding is explicit because
    # ``\w`` in a str pattern also matches non-ASCII letters, which a
    # locale-dependent default encoding may be unable to write.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(url + '\n' for url in url_list)

# Call the search function with the URL pattern.
# Pattern anatomy:
#   (http|ftp|https):\/\/         scheme followed by "://"
#   [\w_-]+(?:(?:\.[\w_-]+)+)     host: two or more dot-separated labels made
#                                 of word characters, "_" or "-"
#   [\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]
#                                 at least one further character; the final
#                                 one may not be ".", "," or ":", so trailing
#                                 punctuation is left out of the match
buscar_IoC_url(r"(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])")

#--------------------------------------------------------------

# Search for Email IoCs
def buscar_IoC_email(pattern, *, frame=None, output_path='email_list.txt'):
    """Flag posts that contain an e-mail address and dump every address found.

    Each match of ``pattern`` in the ``full_text_ioc`` column is recorded;
    the row gets ``EML = 1`` and ``IOC = 'Sim'``.

    Args:
        pattern: Regular expression (string or compiled) that matches an
            e-mail address.
        frame: DataFrame to scan and flag in place.  Defaults to the
            module-level ``df``.
        output_path: Text file that receives one address per line
            (duplicates preserved, in order of appearance).

    Returns:
        None.  Results are the in-place flags and the written file.
    """
    if frame is None:
        frame = df
    compiled = re.compile(pattern)
    email_list = []  # e-mail addresses found, in order of appearance

    for index, text in frame['full_text_ioc'].items():
        # Empty CSV cells are loaded as NaN (a float), which ``re`` rejects
        # with a TypeError; such rows simply have nothing to search.
        if not isinstance(text, str):
            continue
        for found in compiled.finditer(text):
            email_list.append(found.group(0))
            frame.at[index, 'EML'] = 1
            frame.at[index, 'IOC'] = 'Sim'

    # Save the list of email addresses to a text file (explicit encoding so
    # the result does not depend on the platform's locale).
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(email + '\n' for email in email_list)

# Call the search function with the email address pattern.
# Pattern anatomy:
#   [a-zA-Z0-9.]+          local part: ASCII letters, digits and dots only
#                          (characters such as "_", "-" or "+" are not in the class)
#   @[a-zA-Z0-9]+          "@" and a first domain label of letters/digits
#   \.[a-zA-Z]+(\.[a-zA-Z]+)*
#                          one or more further dot-separated labels, letters only
buscar_IoC_email(r"[a-zA-Z0-9.]+@[a-zA-Z0-9]+\.[a-zA-Z]+(\.[a-zA-Z]+)*")

#--------------------------------------------------------------


# Search for Domain IoCs
def buscar_IoC_dominio(pattern, *, frame=None, output_path='dominio_list.txt'):
    """Flag posts that mention a bare domain name and dump the domains found.

    Before searching, each post is cleaned in this order so that only
    free-standing domains remain:

    1. http(s) URLs are removed,
    2. anything shaped like ``text@text`` (e-mails) is removed,
    3. known false positives (``stdio.h``, ``mr.robot``, ...) are removed,
    4. ``.<suffix>`` endings that are file extensions or ordinary words
       glued to a full stop are removed.

    Each match of ``pattern`` in the cleaned text is recorded; the row gets
    ``DOM = 1`` and ``IOC = 'Sim'``.

    Args:
        pattern: Regular expression (string or compiled) that matches a
            domain name.
        frame: DataFrame to scan and flag in place.  Defaults to the
            module-level ``df``.
        output_path: Text file that receives one domain per line
            (duplicates preserved, in order of appearance).

    Returns:
        None.  Results are the in-place flags and the written file.
    """
    if frame is None:
        frame = df
    domain_re = re.compile(pattern)

    # The cleanup expressions do not depend on the row, so they are compiled
    # once here instead of being rebuilt for every post.
    url_re = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    email_re = re.compile(r'\S+@\S+')

    # Exclude non-relevant terms.  Raw strings keep ``\.`` a regex escape
    # rather than an (invalid) Python string escape.
    expressoes_excluir = [
        r"rc\.local", r"stdio\.h", r"stdlib\.h", r"locale\.h", r"mr\.robot", r"compensa\.eu",
        r"shaman\.eu", r"entendi\.eu", r"legal\.faz", r"anno\.eu", r"saúde\.se", r"boxer\.Eu",
    ]
    excluded_re = re.compile(
        r'\b(?:' + '|'.join(expressoes_excluir) + r')\b', flags=re.IGNORECASE
    )

    # Exclude non-relevant extensions (file types and ordinary words that
    # follow a full stop without a space).
    extensoes_conhecidas = [
        "a", "i", "s", "n", "o", "e", "t", "x", "h", "c", "j", "w", "p", "cpturbo", "gado", "cgi", "jsp",
        "asp", "md5", "sha1", "db", "xhtml", "você", "vocês", "py", "log", "swf", "jpg", "jpeg", "avi",
        "zip", "rar", "mp3", "mp4", "xml", "exe", "php", "htm", "html", "xghtml", "shtml", "pdf", "txt",
        "json", "img", "css", "png", "bat", "só", "não", "queria", "mas", "dizem", "vou", "ele", "esqueci",
        "estava", "obrigado", "escutei", "enable", "disable", "default", "acho", "achei", "estou", "fazer",
        "sempre", "minha", "que", "append", "quem", "connection", "connecting", "disconnected", "recomendam",
        "basta", "respondendo", "write", "minha", "digo", "depois", "quando", "almocei", "comecei", "pequisei",
        "tomo", "todos", "isso", "assim", "tentei", "alimentado", "ela", "fez", "stone", "fiquei", "disse",
        "talvez", "estamos", "essa", "fico", "melhor", "lugar", "acabei", "esta", "enabled", "delay", "pensei",
        "consideram", "agora", "disabled", "qual", "bolso", "ainda", "irado",
    ]
    extension_re = re.compile(
        r'\.(?:' + '|'.join(extensoes_conhecidas) + r')\b', flags=re.IGNORECASE
    )

    dominio_list = []  # domains found, in order of appearance
    for index, text in frame['full_text_ioc'].items():
        # Empty CSV cells are loaded as NaN (a float), which ``re`` rejects
        # with a TypeError; such rows simply have nothing to search.
        if not isinstance(text, str):
            continue

        # The order matters: URLs and e-mails go first so their host parts
        # are not reported a second time as bare domains.
        cleaned = url_re.sub('', text)
        cleaned = email_re.sub('', cleaned)
        cleaned = excluded_re.sub('', cleaned)
        cleaned = extension_re.sub('', cleaned)

        for found in domain_re.finditer(cleaned):
            dominio_list.append(found.group())
            frame.at[index, 'DOM'] = 1
            frame.at[index, 'IOC'] = 'Sim'

    # Save the list of domains to a text file (explicit encoding so the
    # result does not depend on the platform's locale).
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(dominio + '\n' for dominio in dominio_list)

# Call the search function with the domain pattern.
# Pattern anatomy:
#   [a-zA-Z0-9-]*[a-zA-Z][a-zA-Z0-9-]{1,62}   first label: letters, digits and
#                                              hyphens, containing a letter that
#                                              is followed by 1-62 more characters
#   \.[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]{0,24}   a dot and a second label of
#                                              letters/digits with at least one letter
#   (\.[a-zA-Z]{2,8}){0,2}                     up to two more suffixes of 2-8 letters
buscar_IoC_dominio(r"([a-zA-Z0-9-]*[a-zA-Z][a-zA-Z0-9-]{1,62}\.[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]{0,24}(\.[a-zA-Z]{2,8}){0,2})")

                   
# ---------------------------------------------------------------------------------

# Search for CVE IoCs
def buscar_IoC_cve(pattern, *, frame=None, output_path='cve_list.txt'):
    """Flag posts that cite a CVE identifier and dump every identifier found.

    Each match of ``pattern`` in the ``full_text_ioc`` column is recorded;
    the row gets ``CVE = 1`` and ``IOC = 'Sim'``.

    Args:
        pattern: Regular expression (string or compiled) that matches a CVE
            identifier.
        frame: DataFrame to scan and flag in place.  Defaults to the
            module-level ``df``.
        output_path: Text file that receives one identifier per line
            (duplicates preserved, in order of appearance).

    Returns:
        None.  Results are the in-place flags and the written file.
    """
    if frame is None:
        frame = df
    compiled = re.compile(pattern)
    cve_list = []  # CVE identifiers found, in order of appearance

    for index, text in frame['full_text_ioc'].items():
        # Empty CSV cells are loaded as NaN (a float), which ``re`` rejects
        # with a TypeError; such rows simply have nothing to search.
        if not isinstance(text, str):
            continue
        for found in compiled.finditer(text):
            cve_list.append(found.group())
            frame.at[index, 'CVE'] = 1
            frame.at[index, 'IOC'] = 'Sim'

    # Save the list of CVEs to a text file (explicit encoding so the result
    # does not depend on the platform's locale).
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(cve + '\n' for cve in cve_list)

# Call the search function with the CVE pattern:
# the literal "CVE-", a 4-digit block, "-", then a block of 4 to 7 digits.
buscar_IoC_cve(r"CVE-\d{4}-\d{4,7}")

#--------------------------------------------------------------



###################################################################################
###################################################################################

# Using the IOC-FINDER tool
from ioc_finder import find_iocs
from tqdm import tqdm
import time

# One accumulator list per IoC kind delegated to ioc-finder; the keys are the
# entries read from the find_iocs() result below.
ioc_lists = {
    ioc_kind: []
    for ioc_kind in (
        'ssdeeps',
        'md5s',
        'sha1s',
        'sha256s',
        'sha512s',
        'asns',
        'ipv6s',
        'mac_addresses',
        'registry_key_paths',
    )
}

for index, row in tqdm(df.iterrows(), total=len(df)):
    # The cell is stringified and any leading/trailing square brackets are
    # stripped before it is handed to ioc-finder.
    iocs = find_iocs(str(row['full_text_ioc']).strip('[]'))

    for ioc_type in ioc_lists:
        found = iocs[ioc_type]
        if not found:
            continue

        ioc_lists[ioc_type].extend(found)
        # The per-type flag column is the upper-cased key (e.g. 'MD5S').
        df.at[index, ioc_type.upper()] = 1
        df.at[index, 'IOC'] = 'Sim'

        # Every hash flavour additionally raises the generic HSH flag.
        if ioc_type in ('ssdeeps', 'md5s', 'sha1s', 'sha256s', 'sha512s'):
            df.at[index, 'HSH'] = 1

# Write each accumulator to '<type>_list.txt', one value per line.
for ioc_type, ioc_list in ioc_lists.items():
    with open(f'{ioc_type}_list.txt', 'w') as f:
        f.writelines(str(ioc_value) + '\n' for ioc_value in ioc_list)
###################################################################################
###################################################################################


# Save the flagged DataFrame as CSV, without writing the index as a column.
df.to_csv('df_ioc_explorer.csv', index=False)


# Save the same data as an XLSX workbook, again without the index.
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed for this call - confirm in the target environment.
df.to_excel('df_ioc_explorer.xlsx', index=False)