In [2]:
import os
import shutil
from pathlib import Path
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from tqdm import tqdm

# Build a local Chroma vector database from a directory tree of extracted
# .txt files: load every file as a Document, split the documents into
# overlapping chunks, embed the chunks with a sentence-transformers model and
# persist them batch by batch.

# 1. Paths
BASE_DIR = Path(r"C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden")
TEXT_SOURCE_DIR = BASE_DIR / "data" / "02_processed" / "extracted_text"
DB_OUTPUT_DIR = BASE_DIR / "data" / "03_vector_db" / "green_power_sweden_db_local"

# 2. Model
print("Laddar modell...")
# Embedding on CPU is the bottleneck of this script, so use CUDA when torch
# reports it as available and otherwise fall back to CPU as before.
try:
    import torch

    EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    EMBEDDING_DEVICE = "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": EMBEDDING_DEVICE},
)

# 3. Read files
print(f"Läser textfiler från {TEXT_SOURCE_DIR}...")
documents = []
skipped_files = []
all_files = list(TEXT_SOURCE_DIR.rglob("*.txt"))

if not all_files:
    # SystemExit works the same in a plain script and in a notebook, unlike
    # the interactive-only exit() helper, and signals failure to the caller.
    raise SystemExit("FEL: Hittade inga textfiler! Har du packat upp dem lokalt?")

for file_path in tqdm(all_files, desc="Läser filer"):
    try:
        text = file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as err:
        # Best effort: keep going, but record and report the file instead of
        # dropping it silently. tqdm.write keeps the progress bar intact.
        skipped_files.append(file_path)
        tqdm.write(f"Hoppar över {file_path}: {err}")
        continue
    # Metadata "source" is the file name without its extension. Path.stem
    # strips only the final suffix, whereas str.replace would also remove any
    # ".txt" occurring earlier in the name.
    documents.append(Document(page_content=text, metadata={"source": file_path.stem}))

print(f"Laddade {len(documents)} dokument.")
if skipped_files:
    print(f"Varning: {len(skipped_files)} filer kunde inte läsas och hoppades över.")

# 4. Chunk
print("Chunkar dokument...")
# NOTE(review): the all-mpnet-base-v2 model card states that input beyond its
# maximum sequence length (384 word pieces) is truncated; 2000-character
# chunks may exceed that, in which case the tail of a chunk is not embedded
# - confirm and consider a smaller chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=400,
    separators=["\n\n", "\n", " ", ""],
)
chunks = text_splitter.split_documents(documents)
print(f"Skapade {len(chunks)} chunks.")

# 5. Build the database (batched to limit memory use)
print(f"Bygger databas i {DB_OUTPUT_DIR}...")
if DB_OUTPUT_DIR.exists():
    # Remove the output of any previous attempt so the build starts clean.
    shutil.rmtree(DB_OUTPUT_DIR)

vectordb = Chroma(
    persist_directory=str(DB_OUTPUT_DIR),
    embedding_function=embedding_model,
)

BATCH_SIZE = 100  # Small batches to go easy on the local machine
total_chunks = len(chunks)

for i in tqdm(range(0, total_chunks, BATCH_SIZE), desc="Skapar embeddings"):
    batch = chunks[i:i + BATCH_SIZE]
    vectordb.add_documents(batch)

print("Klar! Databasen är byggd lokalt och kompatibel.")

Laddar modell...
Läser textfiler från C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\02_processed\extracted_text...


Läser filer: 100%|██████████| 4483/4483 [00:05<00:00, 870.15it/s] 


Laddade 4483 dokument.
Chunkar dokument...
Skapade 165583 chunks.
Bygger databas i C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\03_vector_db\green_power_sweden_db_local...


Skapar embeddings:   0%|          | 6/1656 [21:21<97:51:28, 213.51s/it]


KeyboardInterrupt: 