### Text-extrahering, konverterar alla filer till rena .txt-filer

##### **Process:**
1. Läs `pdf_analysis_report.csv` för att veta vilka PDF:er som är Text vs OCR.
2. Definiera en "worker"-funktion för varje filtyp.
3. Loopa igenom PDF-rapporten och extrahera text från alla PDF:er.
4. Loopa igenom övriga filer (`.xlsx`, `.docx` etc.) och extrahera text.
5. Spara varje fils extraherade text som en ny `.txt`-fil.
   * Dessa sparas i `02_processed/extracted_text/`.

### Importer

In [2]:
import os
import shutil
import pandas as pd
import pdfplumber
import docx
import email
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm.notebook import tqdm
from email.parser import BytesParser
from email import policy

# OCR-specifika importer
try:
    import pytesseract
    from pdf2image import convert_from_path
    OCR_ENABLED = True
    
    # === VIKTIGT PÅ WINDOWS ===
    # Om du inte lagt Poppler i din PATH, måste du peka ut var den finns
    # t.ex: POPPLER_PATH = r"C:\Program Files\Poppler\poppler-25.07.0\Library\bin"
    POPPLER_PATH = None 
    
    # Om du inte lagt Tesseract i din PATH
    # t.ex: pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

except ImportError:
    OCR_ENABLED = False
    print("VARNING: pytesseract eller pdf2image saknas. OCR-extrahering kommer att misslyckas.")
    print("Kör 'pip install pytesseract pdf2image' och installera Tesseract/Poppler.")


### Sätt upp sökvägar

In [4]:
BASE_DIR = Path(r"C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden")
DATA_DIR = BASE_DIR / "data"
CLEAN_DATA_DIR = DATA_DIR / "01_raw" # Vår städade indata-mapp
PROCESSED_DIR = DATA_DIR / "02_processed"

# Vår nya mapp för all extraherad text
TEXT_OUTPUT_DIR = PROCESSED_DIR / "extracted_text"
TEXT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Sökväg till vår PDF-rapport
ANALYSIS_RESULT_FILE = PROCESSED_DIR / "pdf_analysis_report.csv"

print(f"Indata från: {CLEAN_DATA_DIR}")
print(f"Utdata (text) till: {TEXT_OUTPUT_DIR}")
print(f"Läser PDF-plan från: {ANALYSIS_RESULT_FILE}")
print(f"OCR-funktionalitet är: {'AKTIVERAD' if OCR_ENABLED else 'AVSTÄNGD'}")


Indata från: C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\01_raw
Utdata (text) till: C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\02_processed\extracted_text
Läser PDF-plan från: C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\02_processed\pdf_analysis_report.csv
OCR-funktionalitet är: AKTIVERAD


### Definiera extraherings-funktioner

In [5]:
# Skapar en funktion för varje filtyp för att hålla koden ren.

def save_text(original_path: Path, text_content: str, output_dir: Path):
    """Sparar den extraherade texten till en .txt-fil."""
    try:
        # Skapa ett unikt filnamn, t.ex. "min_fil.pdf" -> "min_fil.pdf.txt"
        output_filename = f"{original_path.name}.txt"
        output_path = output_dir / output_filename
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text_content)
        return "success"
    except Exception as e:
        return f"error_saving: {e}"

def extract_text_from_text_pdf(file_path: Path) -> tuple[str, str]:
    """Extraherar text från en digital, textbaserad PDF."""
    text = ""
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n\n--- Sida Slut ---\n\n"
        return text, "success"
    except Exception as e:
        return "", f"error_pdfplumber: {e}"

def extract_text_from_ocr_pdf(file_path: Path) -> tuple[str, str]:
    """Försöker extrahera text från en skannad PDF med OCR."""
    if not OCR_ENABLED:
        return "", "error_ocr_disabled"
    
    text = ""
    try:
        # 1. Konvertera PDF till en lista med bilder
        # Använd poppler_path om du definierat det
        images = convert_from_path(file_path, poppler_path=POPPLER_PATH)
        
        # 2. Kör OCR på varje bild
        for i, img in enumerate(images):
            # Använd SVENSKT språkpaket!
            page_text = pytesseract.image_to_string(img, lang='swe')
            if page_text:
                text += page_text + f"\n\n--- Sida {i+1} (OCR) Slut ---\n\n"
        
        if not text:
            return "", "ocr_no_text_found"
            
        return text, "success_ocr"
    except Exception as e:
        return "", f"error_ocr: {e}"

def extract_text_from_xlsx(file_path: Path) -> tuple[str, str]:
    """Extraherar text från alla flikar i en Excel-fil."""
    text = ""
    try:
        xls = pd.ExcelFile(file_path)
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
            text += f"--- Flik: {sheet_name} ---\n"
            # Konvertera hela dataframe till en sträng
            text += df.to_string(index=False, header=False) + "\n\n"
        return text, "success"
    except Exception as e:
        return "", f"error_excel: {e}"

def extract_text_from_docx(file_path: Path) -> tuple[str, str]:
    """Extraherar text från en Word .docx-fil."""
    text = ""
    try:
        doc = docx.Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text, "success"
    except Exception as e:
        return "", f"error_docx: {e}"

def extract_text_from_html(file_path: Path) -> tuple[str, str]:
    """Extraherar text från en HTML-fil."""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f, 'html.parser')
            text = soup.get_text(separator="\n", strip=True)
        return text, "success"
    except Exception as e:
        return "", f"error_html: {e}"

def extract_text_from_eml(file_path: Path) -> tuple[str, str]:
    """Extraherar textinnehållet från en .eml e-postfil."""
    try:
        with open(file_path, 'rb') as f:
            msg = BytesParser(policy=policy.default).parse(f)
        
        text = ""
        # Försök få brödtexten
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    text += part.get_payload(decode=True).decode('utf-8', errors='ignore')
        else:
            if msg.get_content_type() == "text/plain":
                text = msg.get_payload(decode=True).decode('utf-8', errors='ignore')

        if not text: # Fallback
             text = msg.get_body(preferencelist=('plain', 'html')).get_content()

        return text, "success"
    except Exception as e:
        return "", f"error_eml: {e}"

def extract_text_from_txt(file_path: Path) -> tuple[str, str]:
    """Läser en .txt-fil."""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        return text, "success"
    except Exception as e:
        return "", f"error_txt: {e}"

### Ladda PDF-analysrapporten

In [6]:
try:
    df_analysis = pd.read_csv(ANALYSIS_RESULT_FILE)
    print(f"Laddade {len(df_analysis)} rader från PDF-rapporten.")
    print(df_analysis['status'].value_counts()) # <-- Utskriften du saknade!
except FileNotFoundError:
    print(f"FEL: Kunde inte hitta {ANALYSIS_RESULT_FILE}")
    print("Se till att du har kört notebook 02 först.")
    df_analysis = pd.DataFrame() # Skapa tom dataframe

Laddade 4502 rader från PDF-rapporten.
status
text_based       4372
ocr_candidate     130
Name: count, dtype: int64


###  Bearbeta alla PDF-filer - använder rätt funktion (text eller OCR) för varje fil.

In [7]:
# Laddar filen vi skapade i notebook 02.
# Loopar igenom DataFramen, men **hoppar över filer som redan existerar**.

pdf_results = []
skipped_pdf_count = 0

if not df_analysis.empty:
    print("\nStartar bearbetning av PDF-filer (inkrementell)...")
    
    for _, row in tqdm(df_analysis.iterrows(), total=len(df_analysis), desc="Bearbetar PDF:er"):
        file_path = Path(row['full_path'])
        status = row['status']
        
        # --- KONTROLL-STEG ---
        output_txt_path = TEXT_OUTPUT_DIR / f"{file_path.name}.txt"
        if output_txt_path.exists():
            pdf_results.append((file_path.name, "skipped_exists"))
            skipped_pdf_count += 1
            continue
        # --------------------------
            
        if not file_path.exists():
            pdf_results.append((file_path.name, "error_file_not_found"))
            continue
            
        text_content, extract_status = "", ""
        if status == 'text_based':
            text_content, extract_status = extract_text_from_text_pdf(file_path)
        elif status == 'ocr_candidate':
            text_content, extract_status = extract_text_from_ocr_pdf(file_path)
        else:
            extract_status = f"skipped_{status}"
            pdf_results.append((file_path.name, extract_status))
            continue
        
        save_status = save_text(file_path, text_content, TEXT_OUTPUT_DIR)
        
        if save_status == "success":
            pdf_results.append((file_path.name, extract_status))
        else:
            pdf_results.append((file_path.name, save_status))

    print(f"Klar med PDF-bearbetning. {skipped_pdf_count} filer hoppades över (fanns redan).")
else:
    print("\nHoppar över PDF-bearbetning (ingen rapport hittades eller så var den tom).")


Startar bearbetning av PDF-filer (inkrementell)...


Bearbetar PDF:er:   0%|          | 0/4502 [00:00<?, ?it/s]

Cannot set gray non-stroke color because /'P61' is an invalid float value
Cannot set gray non-stroke color because /'P70' is an invalid float value
Cannot set gray non-stroke color because /'P74' is an invalid float value
Cannot set gray non-stroke color because /'P76' is an invalid float value


Klar med PDF-bearbetning. 3617 filer hoppades över (fanns redan).


### Bearbeta övriga filtyper - letar upp alla andra filer vi kan läsa och bearbetar dem.

In [8]:
file_types_to_process = ['.xlsx', '.docx', '.html', '.eml', '.txt']
other_files = []
for ext in file_types_to_process:
    other_files.extend(CLEAN_DATA_DIR.rglob(f'*{ext}'))

print(f"\nHittade {len(other_files)} övriga filer att bearbeta.")

other_results = []
skipped_other_count = 0
for file_path in tqdm(other_files, desc="Bearbetar övriga filer"):
    
    # === KONTROLL-STEG ===
    output_txt_path = TEXT_OUTPUT_DIR / f"{file_path.name}.txt"
    if output_txt_path.exists():
        other_results.append((file_path.name, "skipped_exists"))
        skipped_other_count += 1
        continue
    # ===========================
    
    ext = file_path.suffix.lower()
    text_content, extract_status = "", ""
    
    if ext == '.xlsx':
        text_content, extract_status = extract_text_from_xlsx(file_path)
    elif ext == '.docx':
        text_content, extract_status = extract_text_from_docx(file_path)
    elif ext == '.html':
        text_content, extract_status = extract_text_from_html(file_path)
    elif ext == '.eml':
        text_content, extract_status = extract_text_from_eml(file_path)
    elif ext == '.txt':
        text_content, extract_status = extract_text_from_txt(file_path)
    
    save_status = save_text(file_path, text_content, TEXT_OUTPUT_DIR)
    
    if save_status == "success":
        other_results.append((file_path.name, extract_status))
    else:
        other_results.append((file_path.name, save_status))

print(f"Klar med bearbetning av övriga filer. {skipped_other_count} filer hoppades över (fanns redan).")


Hittade 131 övriga filer att bearbeta.


Bearbetar övriga filer:   0%|          | 0/131 [00:00<?, ?it/s]

Klar med bearbetning av övriga filer. 131 filer hoppades över (fanns redan).


### Slutsummering

In [10]:
# Den här cellen kan du nu köra om och om igen. Den enda importen den behöver är `pandas`, som importera IGEN här, för säkerhets skull.
import pandas as pd
from pathlib import Path

# Definiera sökvägar igen (ifall kerneln startats om)
BASE_DIR = Path(r"C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden")
DATA_DIR = BASE_DIR / "data"
PROCESSED_DIR = DATA_DIR / "02_processed"
TEXT_OUTPUT_DIR = PROCESSED_DIR / "extracted_text"

print("--- Summering PDF-bearbetning ---")
if 'pdf_results' in locals():
    # Om vi körde allt nu, visa den nya rapporten
    df_pdf_res = pd.DataFrame(pdf_results, columns=['filename', 'status'])
    print(df_pdf_res['status'].value_counts())
else:
    # Om kerneln startats om, visa bara en notis
    print("('pdf_results' finns ej i minnet, visar totalt antal filer istället)")

print("\n--- Summering Övriga filer ---")
if 'other_results' in locals():
    # Om vi körde allt nu, visa den nya rapporten
    df_other_res = pd.DataFrame(other_results, columns=['filename', 'status'])
    print(df_other_res['status'].value_counts())
else:
    # Om kerneln startats om, visa bara en notis
    print("('other_results' finns ej i minnet, visar totalt antal filer istället)")


print(f"\n--- Total inventering på disk ---")
print(f"Totala antalet textfiler skapade i {TEXT_OUTPUT_DIR}:")
total_txt_files = len(list(TEXT_OUTPUT_DIR.glob('*.*.txt'))) # Säkrare räkning
print(total_txt_files)

--- Summering PDF-bearbetning ---
status
skipped_exists                                                                                           3617
success                                                                                                   862
error_ocr: tesseract is not installed or it's not in your PATH. See README file for more information.      23
Name: count, dtype: int64

--- Summering Övriga filer ---
status
skipped_exists    131
Name: count, dtype: int64

--- Total inventering på disk ---
Totala antalet textfiler skapade i C:\Users\Dator\Documents\Data_Science\11_Examensarbete\green_power_sweden\data\02_processed\extracted_text:
4483
