In [1]:
import os
from os import path
import sys

import json
import re
import torch

from datetime import datetime

import uuid

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http import models
from qdrant_client.http.models import CollectionStatus
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

import transformers
from sentence_transformers import SentenceTransformer

from tqdm import tqdm

import spacy
nlp = spacy.load("de_core_news_lg")

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available.")

device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"using {device} as device")

Device 0: NVIDIA RTX 6000 Ada Generation
using cuda as device


In [2]:
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("config.ini")

print("using the following configuration:\n")

WEBSITE_STORAGE_PATH = cfg["STORAGE"]["PATH"]

QDRANT_URL = cfg["QDRANT"]["URL"]
QDRANT_WEBSITES = cfg["QDRANT"]["COMPANY_WEBSITES"]

JSON_FILE_PATH = cfg["JSON_DATA"]["PATH"]
JSON_COMPANY_ID = cfg["JSON_DATA"]["COMPANY_ID"]
JSON_COMPANY_NAME = cfg["JSON_DATA"]["COMPANY_NAME"]
 
SENTENCE_TRANSFORMER = cfg["LLM"]["SENTENCE_TRANSFORMER"]

print("------ WEBSITE STORAGE ----------")
print("downloaded data:       ", WEBSITE_STORAGE_PATH)
print()
print("------ QDRANT VECTOR STORE ------")
print("webservice url:        ", QDRANT_URL)
print("collection name:       ", QDRANT_WEBSITES)
print()
print("------ JSON_DATA ----------------")
print("JSON data file path:   ", JSON_FILE_PATH)
print("company id field:      ", JSON_COMPANY_ID)
print("company name field:    ", JSON_COMPANY_NAME)
print()
print("------ LLM ----------------------")
print("sentence transformer:  ", SENTENCE_TRANSFORMER)
print()

using the following configuration:

------ WEBSITE STORAGE ----------
downloaded data:        /local/innecs-local/uk-website-data

------ QDRANT VECTOR STORE ------
webservice url:         http://qdrant:6333
collection name:        ies_uk_website_sentences

------ JSON_DATA ----------------
JSON data file path:    /local/innecs-local/uk-website-data.json
company id field:       company_id
company name field:     company_name

------ LLM ----------------------
sentence transformer:   all-MiniLM-L6-v2



# Load creditreform metadata

In [3]:
f = open(JSON_FILE_PATH)
data = json.loads(f.read())
f.close()

In [4]:
print(json.dumps(data[0], indent=4))

{
    "company_name": "KRELLIAN LTD",
    "company_id": "12543219",
    "weburl": "http://krellian.com",
    "category": "Private Limited Company",
    "status": "Active",
    "incorporation_date": "01/04/2020",
    "country_of_origin": "United Kingdom",
    "address_posttown": "NEWCASTLE UPON TYNE",
    "address_postcode": "NE1 2DF",
    "address_country": "ENGLAND",
    "address_lines": [
        "TOFFEE FACTORY",
        "TOFFEE FACTORY"
    ],
    "sic_code_texts": [
        "47410 - Retail sale of computers, peripheral units and software in specialised stores"
    ],
    "storage_folder": "12543219_krellian-com"
}


# Load website data and embedd

In [5]:
model = SentenceTransformer(SENTENCE_TRANSFORMER)

qdrant = QdrantClient(url=QDRANT_URL)
# qdrant = QdrantClient(host="172.18.0.4", port=6333, timeout=20)

qdrant_collection_name = QDRANT_WEBSITES

qdrant.recreate_collection(
    collection_name=qdrant_collection_name,
    vectors_config=VectorParams(
        size=model.get_sentence_embedding_dimension(), 
        distance=Distance.COSINE,
        on_disk=True,
    ),
)

def create_index(field, type):
    qdrant.create_payload_index(
        collection_name=qdrant_collection_name,
        field_name=field,
        field_schema=type,        
    )
    
create_index("company_id", "keyword")
create_index("company_name", "keyword")
create_index("idx", "integer")
create_index("filename", "keyword")

  qdrant = QdrantClient(url=QDRANT_URL)
  qdrant.recreate_collection(


KeyboardInterrupt: 

In [None]:
company_dict = dict()
with open(JSON_FILE_PATH) as f:
    data = json.load(f)
    print(json.dumps(data[0:2], indent=4))
    for entry in data:
        company_dict[entry[JSON_COMPANY_ID]] = entry

In [None]:
import pickle

error_counter = 0

def handle_websitefolder(websitefolder, companydata):
    global error_counter

    no_prev_entries = 0
    
    for file in os.listdir(websitefolder):
        filepath = path.join(websitefolder, file)
        if path.isfile(filepath) and filepath.endswith(".txt"):
            with open(filepath, "r") as f:
                sentences = []
                doc = nlp(f.read())
                sentences = list([sent.text for sent in doc.sents])
                embeddings = model.encode(sentences)
                
                payloads = [
                    {
                        "filepath": filepath, 
                        "filename": file, 
                        "company_id": companydata[JSON_COMPANY_ID], 
                        "name": companydata[JSON_COMPANY_NAME], 
                        "idx": i, 
                        "text": sentence.strip(), 
                        "company_data": companydata
                    } for i, sentence in enumerate(sentences)
                ]
                
                ids = [(no_prev_entries + i + 1) for i in range(len(payloads))]
                no_prev_entries += len(payloads)

                batch_size = 100
                for i in range(0, len(payloads), batch_size):
                    batch_payloads = payloads[i:i + batch_size]
                    batch_vectors = embeddings[i:i + batch_size]
                    try:
                        qdrant.upsert(
                            collection_name=qdrant_collection_name,
                            points=models.Batch(
                                ids = [str(uuid.uuid4()) for i in range(len(batch_vectors))],
                                vectors=batch_vectors,
                                payloads=batch_payloads
                            )
                        )
                    except Exception as e: 
                        with open(f"logs/error-logs.log", "a") as f:
                            f.write(str(e) + "\n")
                        print(e)
                        error_counter += 1
                        with open(f"logs/error-payloads_{error_counter}.pkl", "wb") as f:
                            pickle.dump(payloads, f)    

with open("logs/progress.log", "w") as progress_log:
    print("parsing website folder at:", WEBSITE_STORAGE_PATH)
    print("writing progress log at: 'logs/progress.log'")
    progress_log.write(str(datetime.now()) + "\n")
    for folder in tqdm(os.listdir(WEBSITE_STORAGE_PATH), file=progress_log):
        websitefolder = path.join(WEBSITE_STORAGE_PATH, folder)
        if path.isdir(websitefolder):
            company_id = folder[0: folder.find("_")]
            company_data = company_dict[company_id]            
            handle_websitefolder(websitefolder, company_data)