# In [None]:
# Import libraries
import pandas as pd


# Load the input dataset from a CSV located relative to the working directory
input_csv_path = './df_pre_process_II.csv'
df = pd.read_csv(input_csv_path)


# Tagging posts based on the occurrence of keywords
from gensim import corpora, models


# Tokenize every post in the 'full_text' column on whitespace.
# Empty text fields come back from read_csv as NaN (a float), which has no
# .split(); treat missing text as an empty document so it yields an empty
# token list instead of raising AttributeError.
docs = df['full_text'].fillna('').astype(str).str.split()

########

# Keywords (Portuguese and English) whose presence marks a post as related to
# cyber threats. Entries are lowercase and are compared against whole tokens
# as exact strings.
threat_keywords = [
    'cpf', 'cpfs', 'cve', 'password', 'passwords', 'senha',
    'senhas', 'hack', 'hacker', 'hackers', 'hacking', 'virus',
    'malware', 'spyware', 'phishing', 'fishing', 'spam', 'trojan',
    'criptografia', 'rootkit', 'backdoor', 'worm', 'botnet', 'vazamento',
    'vazamentos', 'dados', 'spoofing', 'wordlist', 'ransomware', 'injection',
    'sqlinjection', 'ddos', 'exploit', 'keylogger', 'vulnerabilidade',
    'vulnerabilidades', 'hash', 'hashes',
]



# Build the token <-> id vocabulary from the tokenized posts, then encode each
# post as a bag-of-words vector of (token_id, count) pairs.
dictionary = corpora.Dictionary(docs)
corpus = list(map(dictionary.doc2bow, docs))

# Fit a 10-topic LDA model on the bag-of-words corpus.
lda_model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=100,  # fixed seed so repeated runs give the same topics
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Label each document by keyword occurrence: 'Sim' when at least one of its
# tokens is a threat keyword, 'Não' otherwise.
#
# The earlier implementation ran LDA inference on every document and walked
# the returned per-word topic list, but it only ever looked at the word ids,
# and that list carries one entry per distinct token of the document. The
# label therefore never depended on the topic model, so the tokens are matched
# directly; this yields the same labels without one inference call and one
# extra doc2bow conversion per document.
threat_keyword_set = frozenset(threat_keywords)  # O(1) membership tests
labels = [
    'Não' if threat_keyword_set.isdisjoint(doc) else 'Sim'
    for doc in docs
]

# Attach the per-post keyword label ('Sim' / 'Não') as the KEYWORD column
df['KEYWORD'] = labels

# Persist the labeled dataset in both CSV and Excel form, without the index
csv_output_path = 'df_ioc_explorer_and_keywords.csv'
xlsx_output_path = 'df_ioc_explorer_and_keywords.xlsx'
df.to_csv(csv_output_path, index=False)
df.to_excel(xlsx_output_path, index=False)