### Text-extrahering: konverterar alla filer till JSON-filer med extraherad text

##### **Process:**
1. Läs `pdf_analysis_report.csv` för att veta vilka PDF:er som är Text vs OCR.
2. Definiera en "worker"-funktion för varje filtyp.
3. Loopa igenom PDF-rapporten och extrahera text från alla PDF:er.
4. Loopa igenom övriga filer (`.xlsx`, `.docx` etc.) och extrahera text.
5. Spara varje fils extraherade text (sida för sida, med metadata) som en ny `.json`-fil.
   * Dessa sparas i `02_processed/extracted_text/`.

### Importer

In [1]:
import os
import shutil
import pandas as pd
import pdfplumber
import docx
import email
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm.notebook import tqdm
from email.parser import BytesParser
from email import policy

# OCR-specific imports. They are optional: if either package is missing,
# OCR_ENABLED is set to False and a warning is printed instead of failing.
try:
    import pytesseract
    from pdf2image import convert_from_path
    OCR_ENABLED = True
    
    # === IMPORTANT ON WINDOWS ===
    # If Poppler is not on your PATH, you must point out where it is installed,
    # e.g.: POPPLER_PATH = r"C:\Program Files\Poppler\poppler-25.07.0\Library\bin"
    POPPLER_PATH = None 
    
    # If Tesseract is not on your PATH, set its location explicitly,
    # e.g.: pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

except ImportError:
    # NOTE: POPPLER_PATH is not defined on this path; only OCR_ENABLED is.
    OCR_ENABLED = False
    print("VARNING: pytesseract eller pdf2image saknas. OCR-extrahering kommer att misslyckas.")
    print("Kör 'pip install pytesseract pdf2image' och installera Tesseract/Poppler.")


### Sätt upp sökvägar

In [2]:
# Project root and the data folders derived from it.
BASE_DIR = Path(r"C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden")
DATA_DIR = BASE_DIR.joinpath("data")
CLEAN_DATA_DIR = DATA_DIR.joinpath("01_raw")  # cleaned input folder
PROCESSED_DIR = DATA_DIR.joinpath("02_processed")

# Destination folder for all extracted text; created on demand.
TEXT_OUTPUT_DIR = PROCESSED_DIR.joinpath("extracted_text")
TEXT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Location of the PDF analysis report that drives the PDF processing step.
ANALYSIS_RESULT_FILE = PROCESSED_DIR.joinpath("pdf_analysis_report.csv")

# Echo the configuration so the notebook output documents what was used.
print(
    f"Indata från: {CLEAN_DATA_DIR}",
    f"Utdata (text) till: {TEXT_OUTPUT_DIR}",
    f"Läser PDF-plan från: {ANALYSIS_RESULT_FILE}",
    f"OCR-funktionalitet är: {'AKTIVERAD' if OCR_ENABLED else 'AVSTÄNGD'}",
    sep="\n",
)


Indata från: C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\01_raw
Utdata (text) till: C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\02_processed\extracted_text
Läser PDF-plan från: C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\02_processed\pdf_analysis_report.csv
OCR-funktionalitet är: AKTIVERAD


### Definiera extraherings-funktioner

In [3]:
import json

# 1. Save function (writes JSON with metadata)
def save_json(original_path: Path, pages_data: list, output_dir: Path, base_raw_dir: Path):
    """Write extracted pages plus file metadata to ``<output_dir>/<file name>.json``.

    Args:
        original_path: The source file the text was extracted from.
        pages_data: List of page dicts; stored unchanged under the "pages" key.
        output_dir: Directory that receives the JSON file.
        base_raw_dir: Root directory used to express ``original_path`` as a
            relative path, so the folder structure is kept in the metadata.

    Returns:
        "success" when the file was written, otherwise
        "error_saving: <exception text>".
    """
    try:
        # Store the path relative to the raw-data root; if the file does not
        # live under that root, fall back to the bare file name.
        try:
            location = original_path.relative_to(base_raw_dir)
        except ValueError:
            location = original_path.name

        payload = {
            "filename": original_path.name,
            "full_path": str(location),
            "pages": pages_data,
        }

        target = output_dir / f"{original_path.name}.json"
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
    except Exception as exc:
        return f"error_saving: {exc}"
    return "success"

# 2. PDF extraction (embedded text layer)
def extract_text_from_text_pdf(file_path: Path) -> tuple[list, str]:
    """Extract the text of every page in a PDF using pdfplumber.

    Args:
        file_path: Path to the PDF file.

    Returns:
        ``(pages, status)``. ``pages`` is a list of
        ``{"page_number": n, "text": "..."}`` dicts with 1-based page numbers;
        pages for which no text is returned are left out. ``status`` is
        "success", or "error_pdfplumber: <exception text>" (with an empty
        list) if anything fails.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            numbered_text = (
                (number, page.extract_text())
                for number, page in enumerate(pdf.pages, start=1)
            )
            pages_data = [
                {"page_number": number, "text": page_text}
                for number, page_text in numbered_text
                if page_text
            ]
        return pages_data, "success"
    except Exception as exc:
        return [], f"error_pdfplumber: {exc}"

# 3. PDF extraction (OCR)
def extract_text_from_ocr_pdf(file_path: Path) -> tuple[list, str]:
    """Render each PDF page to an image and run Tesseract OCR (Swedish) on it.

    Args:
        file_path: Path to the PDF file.

    Returns:
        ``(pages, status)``. ``pages`` is a list of
        ``{"page_number": n, "text": "..."}`` dicts with 1-based page numbers.
        ``status`` is one of:
          * "success_ocr" - at least one page produced text,
          * "ocr_no_text_found" - OCR ran but no page contained text,
          * "error_ocr_disabled" - the OCR packages could not be imported,
          * "error_ocr: <exception text>" - rendering or OCR failed.
        For every status except "success_ocr" the list is empty.
    """
    if not OCR_ENABLED:
        return [], "error_ocr_disabled"
    pages_data = []
    try:
        images = convert_from_path(file_path, poppler_path=POPPLER_PATH)
        for page_number, img in enumerate(images, start=1):
            text = pytesseract.image_to_string(img, lang='swe')
            # OCR output for a blank page is typically not an empty string but
            # whitespace / a page-separator form feed, which is truthy. Test
            # the stripped text so blank pages are skipped and
            # "ocr_no_text_found" can actually be reported.
            if text and text.strip():
                pages_data.append({"page_number": page_number, "text": text})
        if not pages_data:
            return [], "ocr_no_text_found"
        return pages_data, "success_ocr"
    except Exception as e:
        return [], f"error_ocr: {e}"

# 4. Excel extraction (the whole workbook is reported as page 1)
def extract_text_from_xlsx(file_path: Path) -> tuple[list, str]:
    """Dump every sheet of an Excel workbook as plain text.

    Each sheet is rendered as a "--- Flik: <sheet name> ---" header line
    followed by the cell values as a table (no index, no header row) and a
    blank line. All sheets are concatenated into one pseudo-page.

    Args:
        file_path: Path to the workbook.

    Returns:
        ``([{"page_number": 1, "text": ...}], "success")`` on success, or
        ``([], "error_excel: <exception text>")`` if reading fails.
    """
    try:
        sections = []
        # Context manager closes the workbook handle even if a sheet fails to
        # parse; an unclosed handle keeps the file locked on Windows.
        with pd.ExcelFile(file_path) as xls:
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
                # DataFrame.to_string() on an empty frame returns the
                # "Empty DataFrame / Columns / Index" repr, which is not
                # document text - emit only the sheet header in that case.
                body = "" if df.empty else df.to_string(index=False, header=False)
                sections.append(f"--- Flik: {sheet_name} ---\n{body}\n\n")
        return [{"page_number": 1, "text": "".join(sections)}], "success"
    except Exception as e:
        return [], f"error_excel: {e}"

# 5. Word extraction
def extract_text_from_docx(file_path: Path) -> tuple[list, str]:
    """Collect the text of every paragraph in a .docx file, one per line.

    Only ``doc.paragraphs`` is read; the whole document is reported as a
    single pseudo-page.

    Args:
        file_path: Path to the .docx file.

    Returns:
        ``([{"page_number": 1, "text": ...}], "success")`` on success, or
        ``([], "error_docx: <exception text>")`` if reading fails.
    """
    try:
        document = docx.Document(file_path)
        # Every paragraph is terminated by a newline, including the last one.
        body = "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
    except Exception as exc:
        return [], f"error_docx: {exc}"
    return [{"page_number": 1, "text": body}], "success"

# 6. HTML extraction
def extract_text_from_html(file_path: Path) -> tuple[list, str]:
    """Parse an HTML file and return its text with one string per line.

    The file is read as UTF-8 with undecodable bytes ignored, parsed with
    BeautifulSoup's ``html.parser`` and flattened via ``get_text`` using a
    newline separator and whitespace stripping.

    Args:
        file_path: Path to the HTML file.

    Returns:
        ``([{"page_number": 1, "text": ...}], "success")`` on success, or
        ``([], "error_html: <exception text>")`` if reading/parsing fails.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            visible_text = BeautifulSoup(handle, 'html.parser').get_text(
                separator="\n", strip=True
            )
    except Exception as exc:
        return [], f"error_html: {exc}"
    return [{"page_number": 1, "text": visible_text}], "success"

# 7. EML extraction
def _decode_text_part(part) -> str:
    """Return the content of a text MIME part, decoded with its declared charset."""
    try:
        # get_content() honours the part's Content-Type charset parameter.
        return part.get_content()
    except (LookupError, UnicodeError):
        # Unknown or bogus charset label: fall back to lenient UTF-8 decoding.
        raw = part.get_payload(decode=True) or b""
        return raw.decode('utf-8', errors='ignore')


def extract_text_from_eml(file_path: Path) -> tuple[list, str]:
    """Extract the plain-text body of an .eml message.

    All ``text/plain`` parts are concatenated. If that yields nothing, the
    message's preferred body (plain, then HTML) is used instead. Text is
    decoded with the charset declared by each part rather than assuming
    UTF-8, so ISO-8859-1 / Windows-1252 mail keeps characters such as å/ä/ö.

    NOTE: when the fallback selects an HTML body, the markup is returned
    as-is; it is not converted to plain text here.

    Args:
        file_path: Path to the .eml file.

    Returns:
        ``([{"page_number": 1, "text": ...}], "success")`` on success, or
        ``([], "error_eml: <reason>")`` if parsing fails or the message has
        no text body at all.
    """
    try:
        with open(file_path, 'rb') as f:
            msg = BytesParser(policy=policy.default).parse(f)

        # walk() yields the message itself for non-multipart mail, so a single
        # loop covers both the multipart and the single-part case.
        text = "".join(
            _decode_text_part(part)
            for part in msg.walk()
            if part.get_content_type() == "text/plain"
        )

        if not text:
            body = msg.get_body(preferencelist=('plain', 'html'))
            if body is None:
                # e.g. a message that only consists of attachments
                return [], "error_eml: no text/plain or text/html body found"
            text = _decode_text_part(body)

        return [{"page_number": 1, "text": text}], "success"
    except Exception as e:
        return [], f"error_eml: {e}"

# 8. Plain-text extraction
def extract_text_from_txt(file_path: Path) -> tuple[list, str]:
    """Read a text file as UTF-8, silently dropping undecodable bytes.

    Args:
        file_path: Path to the text file.

    Returns:
        ``([{"page_number": 1, "text": ...}], "success")`` on success, or
        ``([], "error_txt: <exception text>")`` if the file cannot be read.
    """
    try:
        content = Path(file_path).read_text(encoding='utf-8', errors='ignore')
    except Exception as exc:
        return [], f"error_txt: {exc}"
    return [{"page_number": 1, "text": content}], "success"

### Ladda PDF-analysrapporten

In [4]:
# Load the PDF analysis report; fall back to an empty frame if it is missing
# so that the following cells can still run (and skip the PDF step).
try:
    df_analysis = pd.read_csv(ANALYSIS_RESULT_FILE)
except FileNotFoundError:
    print(f"FEL: Kunde inte hitta {ANALYSIS_RESULT_FILE}")
    print("Se till att du har kört notebook 02 först.")
    df_analysis = pd.DataFrame()
else:
    print(f"Laddade {len(df_analysis)} rader från PDF-rapporten.")
    # Breakdown of how many PDFs fall into each status category
    print(df_analysis['status'].value_counts())

Laddade 4502 rader från PDF-rapporten.
status
text_based       4372
ocr_candidate     130
Name: count, dtype: int64


### Bearbeta alla PDF-filer – använder rätt funktion (text eller OCR) för varje fil

In [5]:
# Processes every PDF listed in the analysis report created in notebook 02.
# Incremental: PDFs whose JSON output already exists are skipped.

pdf_results = []        # (file name, status) per report row
skipped_pdf_count = 0   # number of PDFs skipped because output already existed

if not df_analysis.empty:
    print("\nStartar bearbetning av PDF-filer (inkrementell)...")

    for _, row in tqdm(df_analysis.iterrows(), total=len(df_analysis), desc="Bearbetar PDF:er"):
        file_path = Path(row['full_path'])
        status = row['status']

        # Skip files that already have a JSON result from an earlier run.
        output_json_path = TEXT_OUTPUT_DIR / f"{file_path.name}.json"
        if output_json_path.exists():
            pdf_results.append((file_path.name, "skipped_exists"))
            skipped_pdf_count += 1
            continue

        if not file_path.exists():
            pdf_results.append((file_path.name, "error_file_not_found"))
            continue

        # Pick the extractor that matches the report's classification.
        if status == 'text_based':
            pages_data, extract_status = extract_text_from_text_pdf(file_path)
        elif status == 'ocr_candidate':
            pages_data, extract_status = extract_text_from_ocr_pdf(file_path)
        else:
            extract_status = f"skipped_{status}"
            pdf_results.append((file_path.name, extract_status))
            continue

        # A failed extraction must not leave a JSON file behind: the
        # existence check above would otherwise treat the file as done and
        # never retry it (e.g. after OCR has been installed).
        # Empty JSON files written by earlier failed runs have to be deleted
        # by hand to be picked up again.
        if extract_status.startswith("error"):
            pdf_results.append((file_path.name, extract_status))
            continue

        # Save as JSON; CLEAN_DATA_DIR is passed so the relative folder path
        # can be stored in the metadata.
        save_status = save_json(file_path, pages_data, TEXT_OUTPUT_DIR, CLEAN_DATA_DIR)

        # Report the extraction status unless saving itself failed.
        if save_status == "success":
            pdf_results.append((file_path.name, extract_status))
        else:
            pdf_results.append((file_path.name, save_status))

    print(f"Klar med PDF-bearbetning. {skipped_pdf_count} filer hoppades över (fanns redan).")
else:
    print("\nHoppar över PDF-bearbetning (ingen rapport hittades eller så var den tom).")


Startar bearbetning av PDF-filer (inkrementell)...


Bearbetar PDF:er:   0%|          | 0/4502 [00:00<?, ?it/s]

Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value
Cannot set gray non-stroke color because /'P10' is an invalid float value
Cannot set gray non-stroke color because /'P266' is an invalid float value
Cannot set gray non-stroke color because /'P121' is an invalid float value
Cannot set gray non-stroke color because /'P14

Klar med PDF-bearbetning. 133 filer hoppades över (fanns redan).


### Bearbeta övriga filtyper - letar upp alla andra filer vi kan läsa och bearbetar dem.

In [6]:
# Collect every non-PDF file type we know how to read from the raw-data tree.
file_types_to_process = ['.xlsx', '.docx', '.html', '.eml', '.txt']
other_files = []
for ext in file_types_to_process:
    other_files.extend(CLEAN_DATA_DIR.rglob(f'*{ext}'))

print(f"\nHittade {len(other_files)} övriga filer att bearbeta.")

other_results = []        # (file name, status) per file
skipped_other_count = 0   # number of files skipped because output already existed

for file_path in tqdm(other_files, desc="Bearbetar övriga filer"):

    # Skip files that already have a JSON result from an earlier run.
    output_json_path = TEXT_OUTPUT_DIR / f"{file_path.name}.json"
    if output_json_path.exists():
        other_results.append((file_path.name, "skipped_exists"))
        skipped_other_count += 1
        continue

    ext = file_path.suffix.lower()
    pages_data = []  # list of page dicts
    extract_status = ""

    # Dispatch on the (lower-cased) extension; every extractor returns
    # (pages, status).
    if ext == '.xlsx':
        pages_data, extract_status = extract_text_from_xlsx(file_path)
    elif ext == '.docx':
        pages_data, extract_status = extract_text_from_docx(file_path)
    elif ext == '.html':
        pages_data, extract_status = extract_text_from_html(file_path)
    elif ext == '.eml':
        pages_data, extract_status = extract_text_from_eml(file_path)
    elif ext == '.txt':
        pages_data, extract_status = extract_text_from_txt(file_path)

    # A failed extraction must not leave a JSON file behind: the existence
    # check above would otherwise treat the file as done and never retry it.
    # Empty JSON files written by earlier failed runs have to be deleted by
    # hand to be picked up again.
    if extract_status.startswith("error"):
        other_results.append((file_path.name, extract_status))
        continue

    # Save as JSON; CLEAN_DATA_DIR is passed so the relative folder path can
    # be stored in the metadata.
    save_status = save_json(file_path, pages_data, TEXT_OUTPUT_DIR, CLEAN_DATA_DIR)

    # Report the extraction status unless saving itself failed.
    if save_status == "success":
        other_results.append((file_path.name, extract_status))
    else:
        other_results.append((file_path.name, save_status))

print(f"Klar med bearbetning av övriga filer. {skipped_other_count} filer hoppades över (fanns redan).")


Hittade 131 övriga filer att bearbeta.


Bearbetar övriga filer:   0%|          | 0/131 [00:00<?, ?it/s]

  warn(msg)


Klar med bearbetning av övriga filer. 17 filer hoppades över (fanns redan).


### Slutsummering

In [8]:
# This cell can be re-run on its own at any time. pandas and Path are imported
# again here so that it also works right after a kernel restart.
import pandas as pd
from pathlib import Path

# Re-declare the paths (in case the kernel was restarted).
BASE_DIR = Path(r"C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden")
DATA_DIR = BASE_DIR.joinpath("data")
PROCESSED_DIR = DATA_DIR.joinpath("02_processed")
TEXT_OUTPUT_DIR = PROCESSED_DIR.joinpath("extracted_text")

print("--- Summering PDF-bearbetning ---")
if 'pdf_results' not in locals():
    # Kernel was restarted: the per-file results are gone, show a notice only.
    print("('pdf_results' finns ej i minnet, visar totalt antal filer istället)")
else:
    # Results from the current session are available: show status counts.
    df_pdf_res = pd.DataFrame(pdf_results, columns=['filename', 'status'])
    print(df_pdf_res['status'].value_counts())

print("\n--- Summering Övriga filer ---")
if 'other_results' not in locals():
    # Kernel was restarted: the per-file results are gone, show a notice only.
    print("('other_results' finns ej i minnet, visar totalt antal filer istället)")
else:
    # Results from the current session are available: show status counts.
    df_other_res = pd.DataFrame(other_results, columns=['filename', 'status'])
    print(df_other_res['status'].value_counts())


print("\n--- Total inventering på disk ---")
print(f"Totala antalet textfiler skapade i {TEXT_OUTPUT_DIR}:")
# Count the outputs on disk; they are named "<original name>.<ext>.json".
total_txt_files = sum(1 for _ in TEXT_OUTPUT_DIR.glob('*.*.json'))

print(total_txt_files)

--- Summering PDF-bearbetning ---
status
success                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                4239
skipped_exists                                                                                                                                                                                                                                                                                                                                                                                                                                                                     