In [None]:
!pip install -q sentence-transformers faiss-cpu googlesearch-python beautifulsoup4 lxml

In [None]:
from sentence_transformers import SentenceTransformer
from googlesearch import search
import faiss
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Load the sentence-embedding model once at notebook level; film_suche() reads this
# global. The checkpoint is a multilingual one (the original note states it covers German).
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
def scrape_text_from_url(url):
    """Download a page and return the whitespace-normalised text of its <p> elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text of all paragraphs joined by single spaces. An empty string is
        returned when the request fails, the server answers with an HTTP error
        status, parsing fails, or the page yields 100 characters of text or fewer.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Without this, 403/404/5xx error pages would be scraped as if they were content.
        response.raise_for_status()
        # Pass the raw bytes so the parser can detect the charset from the document
        # itself; response.text falls back to ISO-8859-1 when the Content-Type header
        # carries no charset, which garbles umlauts on German pages.
        soup = BeautifulSoup(response.content, 'lxml')
        paragraphs = soup.find_all('p')
        text = ' '.join(p.get_text() for p in paragraphs)
        text = re.sub(r'\s+', ' ', text).strip()
    except Exception as exc:
        # Scraping stays best-effort (one bad URL must not abort the search), but
        # only ordinary errors are absorbed: KeyboardInterrupt and SystemExit
        # propagate so the user can still stop a long run.
        print(f"Fehler bei {url}: {exc}")
        return ''
    # Very short results are treated as "no usable content".
    return text if len(text) > 100 else ''

In [None]:
def film_suche(query, num_results=10, top_k=5):
    """Search German film sites for `query` and print the best-matching pages.

    The query is sent to the web search restricted to a fixed list of sites, each
    hit is scraped with scrape_text_from_url(), the scraped texts and the query
    are embedded with the notebook-level `model`, and the texts nearest to the
    query (L2 distance in a FAISS flat index) are printed with their source URL
    and the first 500 characters.

    Args:
        query: Free-text description of the film being looked for.
        num_results: Number of search results requested from the web search.
        top_k: Maximum number of matches to print. Fewer are printed when fewer
            pages could be scraped.

    Returns:
        None. All results are written to stdout.
    """
    print("Suche im Web…")
    # Restrict the search to German-language film sites.
    sites = ["de.wikipedia.org", "filmstarts.de", "moviepilot.de"]
    query_sites = " OR ".join(f"site:{s}" for s in sites)
    urls = list(search(f"{query} {query_sites}", num_results=num_results, lang='de'))
    print(f"{len(urls)} URLs gefunden.")

    # Keep texts and their URLs in parallel lists so an index position maps to both.
    docs, sources = [], []
    for url in urls:
        text = scrape_text_from_url(url)
        if text:
            docs.append(text)
            sources.append(url)
    print(f"{len(docs)} Texte extrahiert.")
    if not docs:
        print("Keine Inhalte gefunden.")
        return

    # Embed the documents and build an exact L2 index over them.
    doc_emb = model.encode(docs, convert_to_numpy=True)
    index = faiss.IndexFlatL2(doc_emb.shape[1])
    index.add(doc_emb)

    q_emb = model.encode([query], convert_to_numpy=True)
    # Never ask for more neighbours than there are documents: FAISS pads missing
    # results with label -1, which as a Python index would silently select the
    # last document and print it as a bogus (possibly repeated) hit.
    k = min(top_k, len(docs))
    _, neighbours = index.search(q_emb, k)

    print("\nTop-Ergebnisse:")
    for idx in neighbours[0]:
        print("---")
        print(f"Quelle: {sources[idx]}")
        print(docs[idx][:500] + '...')

In [None]:
# Example call: request 15 search results and print at most the 5 closest matches.
film_suche("Film mit Zeitreise und Motorrad in den 80ern", num_results=15, top_k=5)