In [None]:
# Cell 1: Install, Download, Process, and Save RAG Database #2
# NOTE: this cell itself only installs packages; the download / process / save
# steps named in the title are carried out by the cell that follows.

# --- 1. INSTALL LIBRARIES ---
print("Installing libraries for RAG Database #2...")
# `!` hands the line to the system shell; `-q` keeps pip's output short.
# First group: data acquisition and tabular parsing
# (openpyxl is presumably the engine pandas uses for .xlsx files - confirm).
!pip install -q mitreattack-python pandas openpyxl requests tqdm
# Second group: document handling, embeddings, and the FAISS vector store.
!pip install -q langchain langchain_community langchain_huggingface faiss-cpu sentence-transformers

Installing libraries for RAG Database #2...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.8/556.8 kB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.4/91.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.8/177.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m110.5 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
# cell 2-

import requests
import json
import os
import pandas as pd
from tqdm.notebook import tqdm
from mitreattack.attackToExcel import attackToExcel
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from google.colab import drive

# --- 2. DEFINE PATHS AND DOWNLOAD DATA ---
# Scratch directory that receives every raw download made by this notebook.
RAW_DATA_DIR_2 = "/content/tactic_data_raw"
os.makedirs(RAW_DATA_DIR_2, exist_ok=True)
# Accumulator for the Document objects appended by the processing steps below.
all_tactic_docs = []

# Download MITRE ATT&CK
print("\n[*] Fetching MITRE ATT&CK Enterprise data...")
try:
    # Exports the enterprise-attack domain as Excel workbooks under RAW_DATA_DIR_2.
    attackToExcel.export("enterprise-attack", output_dir=RAW_DATA_DIR_2)
except Exception as err:
    # Report the failure in the notebook output, then abort the cell:
    # nothing downstream can work without this data.
    print(f"❌ Could not download MITRE ATT&CK data. Error: {err}")
    raise
else:
    print("[+] MITRE ATT&CK data saved successfully.")

# Download Exploit-DB CSV
# Fetches the Exploit-DB exploit index (files_exploits.csv) and stores it in
# the raw-data directory so the processing step can load it with pandas.
# Any network/HTTP failure is reported and re-raised to stop the cell.
print("\n[*] Fetching Exploit-DB data (via Searchsploit CSV)...")
exploitdb_csv_path = os.path.join(RAW_DATA_DIR_2, "exploitdb_files.csv")
try:
    url = "https://gitlab.com/exploit-database/exploitdb/-/raw/main/files_exploits.csv"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Requests falls back to ISO-8859-1 for text/* responses that declare no
    # charset; decoding that way and re-encoding as UTF-8 would corrupt any
    # non-ASCII characters. Pin the decoding to UTF-8 so the bytes round-trip
    # (undecodable bytes are still replaced rather than raising).
    response.encoding = "utf-8"
    # newline="" writes the payload's line endings untouched on every platform.
    with open(exploitdb_csv_path, "w", encoding='utf-8', newline="") as f:
        f.write(response.text)
    print("[+] Exploit-DB data saved.")
except requests.exceptions.RequestException as e:
    print(f"❌ Could not download Exploit-DB data. Error: {e}")
    raise

# --- 3. PROCESS MITRE ATT&CK DATA ---
# Loads the exported techniques workbook and turns each row into one Document
# (ID, name, description, tactics) appended to all_tactic_docs.
print("\n[*] Processing MITRE ATT&CK data...")
try:
    mitre_file_path = os.path.join(RAW_DATA_DIR_2, "enterprise-attack", "enterprise-attack-techniques.xlsx")
    mitre_techniques_df = pd.read_excel(mitre_file_path)
except FileNotFoundError:
    # The export step did not leave the workbook where it is expected.
    print(f"❌ MITRE ATT&CK file not found at '{mitre_file_path}'.")
    raise
else:
    technique_rows = mitre_techniques_df.iterrows()
    for _, technique in tqdm(technique_rows, total=len(mitre_techniques_df), desc="Processing MITRE ATT&CK"):
        # One labelled field per line; this text is what gets embedded later.
        technique_text = "\n".join([
            f"MITRE ATT&CK Technique ID: {technique['ID']}",
            f"Name: {technique['name']}",
            f"Description: {technique['description']}",
            f"Tactic(s): {technique['tactics']}",
        ])
        all_tactic_docs.append(
            Document(
                page_content=technique_text,
                metadata={"source": "mitre_attack", "id": technique['ID']},
            )
        )
    print(f"[+] Processed {len(all_tactic_docs)} MITRE ATT&CK techniques.")

# --- 4. PROCESS METASPLOIT DATA FROM EXPLOIT-DB CSV ---
# Loads the downloaded Exploit-DB index, keeps only the rows whose description
# mentions "Metasploit" (case-insensitive), and appends one Document per row
# to all_tactic_docs. Re-raises FileNotFoundError if the CSV is missing.
print("\n[*] Processing Exploit-DB file to find Metasploit modules...")
try:
    exploitdb_df = pd.read_csv(exploitdb_csv_path)
    # na=False: rows with a missing description are treated as non-matching.
    metasploit_df = exploitdb_df[exploitdb_df['description'].str.contains('Metasploit', case=False, na=False)].copy()

    for _, row in tqdm(metasploit_df.iterrows(), total=metasploit_df.shape[0], desc="Processing Metasploit Modules"):
        # An empty `codes` cell is read by pandas as NaN; str(NaN) would put
        # the literal text "nan" into the document, so render it as "N/A".
        raw_codes = row.get('codes', '')
        if pd.isna(raw_codes) or not str(raw_codes).strip():
            cve_codes = "N/A"
        else:
            # Codes are ';'-separated in the CSV; make the list human-readable.
            cve_codes = str(raw_codes).replace(';', ', ')
        page_content = (
            f"Metasploit Module Exploit: {row['description']}\n"
            f"File Path: {row['file']}\n"
            f"Type: {row['type']}\n"
            f"Platform: {row['platform']}\n"
            f"Associated CVEs: {cve_codes}"
        )
        doc = Document(page_content=page_content, metadata={"source": "metasploit_from_exploitdb", "id": row['id']})
        all_tactic_docs.append(doc)
    print(f"[+] Processed {len(metasploit_df)} Metasploit modules from Exploit-DB.")
except FileNotFoundError:
    print("❌ 'exploitdb_files.csv' not found. Download failed in a previous step.")
    raise

# Summary of everything collected by the two processing steps above.
print(f"\n[+] Created a total of {len(all_tactic_docs)} tactic and technique documents.")

# --- 5. CHUNK AND SAVE THE FINAL VECTOR DATABASE ---
# Split the documents into overlapping chunks, embed them, build a FAISS
# index, and save that index to a folder on the mounted Google Drive.
print("\n[*] Chunking documents...")
# chunk_size / chunk_overlap are passed as 1000 / 100; documents longer than
# the chunk size are split into several chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunked_docs = text_splitter.split_documents(all_tactic_docs)
print(f"[+] Split documents into {len(chunked_docs)} chunks.")

print("\n[*] Initializing embedding model...")
# Sentence-transformers model used to embed every chunk.
# NOTE(review): whoever loads this index later presumably needs the same model - confirm.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("[*] Mounting Google Drive...")
# Drive is mounted before the index is built, so a failed mount stops the cell
# before the embedding work starts.
drive.mount('/content/drive')

# Destination folder on Google Drive for the saved index.
drive_db_path = "/content/drive/My Drive/sih rag/rag 2"
print(f"[*] Ensuring directory exists: '{drive_db_path}'")
os.makedirs(drive_db_path, exist_ok=True)

print(f"[*] Creating FAISS vector database from {len(chunked_docs)} chunks...")
# Embeds all chunks with embedding_model and builds the FAISS index in one call.
vector_db = FAISS.from_documents(chunked_docs, embedding_model)

print(f"[*] Saving database to Google Drive at: '{drive_db_path}'")
vector_db.save_local(drive_db_path)

print(f"\n✅ Vector Database #2 (MITRE & Metasploit) built and saved successfully!")

[32m2025-10-13 23:32:27.119[0m | [1mINFO    [0m | [36mmitreattack.attackToExcel.attackToExcel[0m:[36mget_stix_data[0m:[36m71[0m - [1mDownloading ATT&CK data from github.com/mitre/cti[0m



[*] Fetching MITRE ATT&CK Enterprise data...


[32m2025-10-13 23:32:35.340[0m | [1mINFO    [0m | [36mmitreattack.attackToExcel.attackToExcel[0m:[36mexport[0m:[36m322[0m - [1m************ Exporting enterprise-attack to Excel ************[0m
parsing techniques: 100%|██████████| 679/679 [00:00<00:00, 1576.53it/s]
parsing relationships for type=technique: 100%|██████████| 20411/20411 [00:08<00:00, 2294.83it/s]
parsing tactics: 100%|██████████| 14/14 [00:00<00:00, 20800.66it/s]
parsing software: 100%|██████████| 755/755 [00:00<00:00, 39576.32it/s]
parsing relationships for type=software: 100%|██████████| 20411/20411 [00:05<00:00, 3879.44it/s]
parsing groups: 100%|██████████| 166/166 [00:00<00:00, 30852.77it/s]
parsing relationships for type=group: 100%|██████████| 20411/20411 [00:02<00:00, 7886.23it/s]
parsing campaigns: 100%|██████████| 47/47 [00:00<00:00, 21855.02it/s]
parsing relationships for type=campaign: 100%|██████████| 20411/20411 [00:00<00:00, 30151.59it/s]
parsing mitigations: 100%|██████████| 44/44 [00:00<00:00, 

[+] MITRE ATT&CK data saved successfully.

[*] Fetching Exploit-DB data (via Searchsploit CSV)...
[+] Exploit-DB data saved.

[*] Processing MITRE ATT&CK data...


Processing MITRE ATT&CK:   0%|          | 0/679 [00:00<?, ?it/s]

[+] Processed 679 MITRE ATT&CK techniques.

[*] Processing Exploit-DB file to find Metasploit modules...


Processing Metasploit Modules:   0%|          | 0/2464 [00:00<?, ?it/s]

[+] Processed 2464 Metasploit modules from Exploit-DB.

[+] Created a total of 3143 tactic and technique documents.

[*] Chunking documents...
[+] Split documents into 4054 chunks.

[*] Initializing embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[*] Mounting Google Drive...
Mounted at /content/drive
[*] Ensuring directory exists: '/content/drive/My Drive/sih rag/rag 2'
[*] Creating FAISS vector database from 4054 chunks...
[*] Saving database to Google Drive at: '/content/drive/My Drive/sih rag/rag 2'

✅ Vector Database #2 (MITRE & Metasploit) built and saved successfully!
