In [None]:
# Cell 1: Install, Download, Process, and Save RAG Database #1b

# --- 1. INSTALL LIBRARIES ---
# Notebook shell escape (`!`): installs the HTTP, dataframe, progress-bar,
# LangChain, FAISS and sentence-transformers packages that the imports below
# rely on. `-q` keeps pip's output short.
# NOTE(review): none of the packages are version-pinned, so pip may pull in
# versions that conflict with what the hosting runtime pins for itself --
# check the resolver output after running and pin if reproducibility matters.
print("Installing libraries for RAG Database #1b...")
!pip install -q requests pandas tqdm langchain langchain_community langchain_huggingface faiss-cpu sentence-transformers

import requests
import json
import os
import pandas as pd
from tqdm.notebook import tqdm
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from google.colab import drive

# --- 2. DEFINE PATHS AND DOWNLOAD DATA ---
# Scratch directory for the raw feed downloads; created when it is missing.
RAW_DATA_DIR = "/content/exploit_data_raw"
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# Collects every Document built from both feeds in the processing section.
all_exploit_docs = []

# Download CISA KEV
# Fetch the catalog JSON, keep the parsed payload in `cisa_data` for the
# processing step, and store a copy on disk. Any failure is reported and then
# re-raised so the rest of the cell does not run on missing data.
print("[*] Fetching CISA KEV (full catalog)...")
try:
    url_cisa = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response_cisa = requests.get(url_cisa, headers=headers, timeout=30)
    # Turn HTTP error statuses into exceptions instead of parsing an error page.
    response_cisa.raise_for_status()
    cisa_data = response_cisa.json()

    cisa_json_path = os.path.join(RAW_DATA_DIR, "cisa_kev.json")
    with open(cisa_json_path, "w") as cisa_file:
        cisa_file.write(json.dumps(cisa_data))
    print("[+] CISA KEV data saved.")
except Exception as e:
    print(f"❌ Could not download CISA KEV data. Error: {e}")
    raise

# Download Exploit-DB
# Fetch the Exploit-DB index CSV and store it under RAW_DATA_DIR; the saved
# path is kept in `exploitdb_csv_path` for the processing step. Any failure is
# reported and then re-raised so the rest of the cell does not run on missing
# data.
print("\n[*] Fetching Exploit-DB data (via Searchsploit CSV)...")
try:
    url_edb = "https://gitlab.com/exploit-database/exploitdb/-/raw/main/files_exploits.csv"
    response_edb = requests.get(url_edb, timeout=30)
    # Turn HTTP error statuses into exceptions instead of saving an error page.
    response_edb.raise_for_status()
    exploitdb_csv_path = os.path.join(RAW_DATA_DIR, "exploitdb_files.csv")
    # Save the body byte-for-byte. `response.text` decodes with whatever charset
    # Requests infers from the headers (ISO-8859-1 when a text/* response
    # declares none), and re-encoding that guess as UTF-8 would corrupt any
    # non-ASCII characters before pandas reads the file back.
    with open(exploitdb_csv_path, "wb") as f:
        f.write(response_edb.content)
    print("[+] Exploit-DB data saved.")
except Exception as e:
    print(f"❌ Could not download Exploit-DB data. Error: {e}")
    raise

# --- 3. PROCESS ALL DATA ---
# Build one Document per CISA KEV entry. Fields missing from an entry are
# rendered as 'N/A' in the text; the CVE ID is also kept in the metadata.
print("\n[*] Processing CISA KEV data...")
kev_entries = cisa_data.get('vulnerabilities', [])
for vuln in tqdm(kev_entries, desc="Processing CISA KEV"):
    text_parts = [
        f"CISA Known Exploited Vulnerability: {vuln.get('vulnerabilityName', 'N/A')}\n",
        f"CVE ID: {vuln.get('cveID', 'N/A')}\n",
        f"Vendor: {vuln.get('vendorProject', 'N/A')}\n",
        f"Product: {vuln.get('product', 'N/A')}\n",
        f"Description: {vuln.get('shortDescription', 'N/A')}\n",
        f"Required Action: {vuln.get('requiredAction', 'N/A')}",
    ]
    kev_metadata = {"source": "cisa_kev", "cve_id": vuln.get('cveID', 'N/A')}
    all_exploit_docs.append(
        Document(page_content="".join(text_parts), metadata=kev_metadata)
    )
print(f"[+] Created {len(cisa_data.get('vulnerabilities', []))} documents from CISA KEV.")


# Build one Document per row of the Exploit-DB index CSV and report totals.
print("\n[*] Processing Exploit-DB data...")
# keep_default_na=False: by default pandas converts empty cells (and literal
# strings such as "NA" or "NULL") to NaN, which would then be formatted as the
# text "nan" inside the documents that get embedded.
exploitdb_df = pd.read_csv(exploitdb_csv_path, keep_default_na=False)
# Iterate over plain dict records; iterrows() would build a Series per row.
for row in tqdm(exploitdb_df.to_dict("records"), desc="Processing Exploit-DB"):
    raw_codes = str(row.get('codes', '')).strip()
    # Entries without any associated code use the same 'N/A' placeholder as
    # the CISA KEV documents.
    cve_codes = raw_codes.replace(';', ', ') if raw_codes else 'N/A'
    page_content = (
        f"Exploit Title: {row['description']}\n"
        f"EDB-ID: {row['id']}\n"
        f"File Path: {row['file']}\n"
        f"Type: {row['type']}\n"
        f"Platform: {row['platform']}\n"
        f"Associated Codes: {cve_codes}"
    )
    doc = Document(
        page_content=page_content,
        metadata={"source": "exploit_db", "edb_id": row['id']}
    )
    all_exploit_docs.append(doc)
print(f"[+] Created {len(exploitdb_df)} documents from Exploit-DB.")

print(f"\n[+] Created a total of {len(all_exploit_docs)} enriched exploit documents.")

# --- 4. CHUNK AND SAVE FINAL DATABASE ---
# Split every collected document into chunks of at most 1000 characters,
# with 100 characters of overlap between neighbouring chunks.
print("\n[*] Chunking documents...")
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunked_docs = text_splitter.split_documents(all_exploit_docs)
print(f"[+] Split documents into {len(chunked_docs)} chunks.")

# Embed the chunks, build the FAISS index and persist it to Google Drive.
print("\n[*] Initializing embedding model...")
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Destination folder for the saved index, inside the mounted Drive.
drive_db_path = "/content/drive/My Drive/sih rag/rag 1 b database"

print("[*] Mounting Google Drive...")
drive.mount('/content/drive')
# The folder can only be created once the Drive mount point exists.
os.makedirs(drive_db_path, exist_ok=True)

print(f"[*] Creating FAISS vector database from {len(chunked_docs)} chunks...")
vector_db = FAISS.from_documents(
    documents=chunked_docs,
    embedding=embedding_model,
)

print(f"[*] Saving database to Google Drive at: '{drive_db_path}'")
vector_db.save_local(folder_path=drive_db_path)

print("\n✅ Vector Database #1b (CISA & Exploit-DB) built and saved successfully!")

Installing libraries for RAG Database #1b...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m[*] Fetching CISA KEV (full catalog)...
[+] CISA KEV data saved.

[*] Fetching Exploit-DB data (via Searchsploit CSV)...
[+] Exploit-DB data saved.

[*] Processing CISA KEV data...


Processing CISA KEV:   0%|          | 0/1436 [00:00<?, ?it/s]

[+] Created 1436 documents from CISA KEV.

[*] Processing Exploit-DB data...


Processing Exploit-DB:   0%|          | 0/46920 [00:00<?, ?it/s]

[+] Created 46920 documents from Exploit-DB.

[+] Created a total of 48356 enriched exploit documents.

[*] Chunking documents...
[+] Split documents into 48369 chunks.

[*] Initializing embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[*] Mounting Google Drive...
Mounted at /content/drive
[*] Creating FAISS vector database from 48369 chunks...
[*] Saving database to Google Drive at: '/content/drive/My Drive/sih rag/rag 1 b database'

✅ Vector Database #1b (CISA & Exploit-DB) built and saved successfully!
