In [4]:
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai chromadb bs4 jq

Note: you may need to restart the kernel to use updated packages.


In [1]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = ""
#cvc

In [2]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = ""

In [3]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
from langchain_community.document_loaders import WebBaseLoader,JSONLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [5]:
import pandas as pd

df = pd.read_excel('mitigations.xlsx')
df

Unnamed: 0,TechniqueID,MitigationID,MitigationName,MitigationDescription
0,T1548,M1047,Audit,Check for common UAC bypass weaknesses on Wind...
1,T1548,M1038,Execution Prevention,System settings can prevent applications from ...
2,T1548,M1028,Operating System Configuration,Applications with known vulnerabilities or kno...
3,T1548,M1026,Privileged Account Management,Remove users from the local administrator grou...
4,T1548,M1022,Restrict File and Directory Permissions,The sudoers file should be strictly edited suc...
...,...,...,...,...
463,T1047,M1040,Behavior Prevention on Endpoint,"On Windows 10, enable Attack Surface Reduction..."
464,T1047,M1038,Execution Prevention,Use application control configured to block ex...
465,T1047,M1026,Privileged Account Management,Prevent credential overlap across systems of a...
466,T1047,M1018,User Account Management,"By default, only administrators are allowed to..."


In [7]:
import json
search_technique_id = 'T1210'  

filtered_df = df[df['TechniqueID'] == search_technique_id]

mitigation_descriptions = list(set(filtered_df['MitigationDescription']))

technique_mitigation_dict = {search_technique_id: mitigation_descriptions}

output_json_path = 'mitigation_descriptions.json' 
with open(output_json_path, 'w') as json_file:
    json.dump(technique_mitigation_dict, json_file, indent=4)

In [12]:
loader = JSONLoader(
    file_path='mitigation_descriptions.json',
    jq_schema = '.[] | .[] | {text: .}',
    text_content=False)

data = loader.load()

In [13]:
len(data)

8

In [14]:
data

[Document(page_content='{"text": "Segment networks and systems appropriately to reduce access to critical systems and services to controlled methods."}', metadata={'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json', 'seq_num': 1}),
 Document(page_content='{"text": "Regularly scan the internal network for available services to identify new and potentially vulnerable services."}', metadata={'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json', 'seq_num': 2}),
 Document(page_content='{"text": "Develop a robust cyber threat intelligence capability to determine what types and levels of threat may use software exploits and 0-days against a particular organization."}', metadata={'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json', 'seq_num': 3}),
 Document(page_content='{"text": "Minimize permissions and access for service accounts to limit impact of exploitation."}', metadata={'source': '/home/is1ab/code/SEBERT_RAG_CT

In [16]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
splits = text_splitter.split_documents(data)

In [17]:
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [9]:
#base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 3})

In [10]:
#relevant_docs = base_retriever.get_relevant_documents("Based on this question, what is the most likely technique ID")

In [11]:
#len(relevant_docs)

In [24]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from langchain_community.llms import LlamaCpp


In [27]:


#model_path = "llama.cpp/models/llama-2-7b-chat/llama-2_q4.gguf"
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
model_path = "/home/is1ab/code/SEBERT_RAG_CTI/RAG/llama-2-7b-chat.Q2_K.gguf"

# llm = ChatOpenAI(model="gpt-4")

llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=100,
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)

In [30]:
from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# Assuming docs are being passed from a retrieval system
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# This function is expected to fetch and format the question based on CTI and TTP_ID
def format_question(cti, ttp_id):
    return f"CTI: {cti} TTP_ID: {ttp_id}"


prompt_template = """Please provide a mitigation report based on the context of the following Cyber Threat Intelligence (CTI) and associated Techniques. I will supply you with several mitigation methods corresponding to these Techniques. Your task is to draft a mitigation measures report specifically for this CTI scenario.

{context}

Question: {question}
Helpful Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

from langchain.chains.question_answering import load_qa_chain
system_template = """Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)


PROMPT_SELECTOR = ConditionalPromptSelector(
    default_prompt=PROMPT, conditionals=[(is_chat_model, CHAT_PROMPT)]
)

# Loading the QA chain
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT_SELECTOR.get_prompt(llm))

In [41]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
cti = "APT28 exploited a Windows SMB Remote Code Execution Vulnerability to conduct lateral movement"
ttp_id = "T1210"

question = format_question(cti, ttp_id)
retrieved_docs = retriever.invoke(question)
retrieved_docs

[Document(page_content='{"text": "Develop a robust cyber threat intelligence capability to determine what types and levels of threat may use software exploits and 0-days against a particular organization."}', metadata={'seq_num': 3, 'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json'}),
 Document(page_content='exploits and weaknesses in these systems may still exist. [42]"}', metadata={'seq_num': 6, 'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json'}),
 Document(page_content='Experience Toolkit (EMET) can be used to mitigate some exploitation behavior. [43] Control flow integrity checking is another way to potentially identify and stop a software exploit from occurring.', metadata={'seq_num': 7, 'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json'})]

In [45]:

cti = "APT28 exploited a Windows SMB Remote Code Execution Vulnerability to conduct lateral movement"
ttp_id = "T1210"

context = format_docs(retrieved_docs)
question = format_question(cti, ttp_id)


input_data = {
    "input_documents": data,
    "context": context,
    "question": question
}


print(input_data)
output = chain.run(input_data)

print(output)



{'input_documents': [Document(page_content='{"text": "Segment networks and systems appropriately to reduce access to critical systems and services to controlled methods."}', metadata={'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json', 'seq_num': 1}), Document(page_content='{"text": "Regularly scan the internal network for available services to identify new and potentially vulnerable services."}', metadata={'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json', 'seq_num': 2}), Document(page_content='{"text": "Develop a robust cyber threat intelligence capability to determine what types and levels of threat may use software exploits and 0-days against a particular organization."}', metadata={'source': '/home/is1ab/code/SEBERT_RAG_CTI/RAG/mitigation_descriptions.json', 'seq_num': 3}), Document(page_content='{"text": "Minimize permissions and access for service accounts to limit impact of exploitation."}', metadata={'source': '/home/is1ab/c

In [48]:
print(output)

Based on the context, the organization can take several steps to mitigate this threat. Firstly, they should segment their network and systems to reduce access to critical systems and services. Regular scanning of the internal network can help identify new and potentially vulnerable services. 

Service accounts should have minimal permissions and access to reduce the impact of exploitation. The organization should also keep their software updated and use patch management for their internal enterprise endpoints and servers.

Implementing a robust cyber threat intelligence capability can also be beneficial for identifying potential threats that might use software exploits and 0-days. 

They can also use security applications like Windows Defender Exploit Guard (WDEG) and the Enhanced Mitigation Experience Toolkit (EMET) to mitigate some exploitation behavior. 

Finally, they should minimize available services to only those that are necessary. This can help reduce the potential avenues of 

In [47]:
print(context)

{"text": "Develop a robust cyber threat intelligence capability to determine what types and levels of threat may use software exploits and 0-days against a particular organization."}

exploits and weaknesses in these systems may still exist. [42]"}

Experience Toolkit (EMET) can be used to mitigate some exploitation behavior. [43] Control flow integrity checking is another way to potentially identify and stop a software exploit from occurring.
