In [1]:
import ray
import torch
import os
import langchain_community
from ray.data import ActorPoolStrategy
from tqdm import tqdm
import pandas as pd
from ray.data import from_pandas
from functools import partial
import torch
from functools import partial
from bioBERT_encoder import bioBERTEncoder
from medCPT_encoder import medCPTArticleEncoder

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}.")

Using cuda.


### Initializing Ray

In [3]:
runtime_env = {
    "pip": [
        "langchain-text-splitters",
        "langchain_community", 
        "sentence_transformers"
    ],
}

if not ray.is_initialized():
    ray.init(runtime_env=runtime_env)
else:
    ray.shutdown()

2024-05-02 15:37:12,105	INFO worker.py:1567 -- Connecting to existing Ray cluster at address: 10.10.2.206:6379...
2024-05-02 15:37:12,116	INFO worker.py:1743 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


In [4]:
available_resources = ray.available_resources()
print("Verfügbare Ressourcen:", available_resources)

Verfügbare Ressourcen: {'CPU': 32.0, 'object_store_memory': 17714153471.0, 'memory': 40311046965.0, 'GPU': 4.0, 'accelerator_type:T4': 4.0, 'node:10.10.3.5': 1.0, 'node:10.10.3.72': 1.0, 'node:10.10.2.206': 1.0, 'node:__internal_head__': 1.0, 'node:10.10.2.65': 1.0}


In [5]:
directory_path = "data/pubmed/chunk/"
file_names = os.listdir(directory_path)
file_paths = [os.path.join(directory_path, file_name) for file_name in file_names]
jsonl_file_paths = [file_path for file_path in file_paths if file_path.endswith('.jsonl')]

jsonl_file_paths[:5]

['data/pubmed/chunk/pubmed23n0046.jsonl',
 'data/pubmed/chunk/pubmed23n0050.jsonl',
 'data/pubmed/chunk/pubmed23n0003.jsonl',
 'data/pubmed/chunk/pubmed23n0117.jsonl',
 'data/pubmed/chunk/pubmed23n0068.jsonl']

### Using only head node for embedding.

Initializing BioBERT Embedding Model

In [6]:
encoder = bioBERTEncoder()

Iterating through every JSONL file adding the attribute "embeddings"

In [10]:
import os
import json
from pathlib import Path

# Definiere die Pfade für die Quell- und Zielverzeichnisse
source_directory = Path('data/pubmed/chunk')
target_directory = Path('data/pubmed/embedded')
target_directory.mkdir(parents=True, exist_ok=True)

# Iteriert durch jede Datei im Quellverzeichnis
for file_name in tqdm(os.listdir(source_directory)):
    if file_name.endswith('.jsonl'):
        source_file = source_directory / file_name
        target_file = target_directory / file_name

        # Erstellt eine neue Datei im Zielverzeichnis
        with open(target_file, 'w') as target:
            with open(source_file, 'r') as source:
                for line in source:
                    # Jede Zeile ist ein JSON-Objekt
                    item = json.loads(line)
                    # Verarbeite das Item mit EmbedChunks
                    embedded_item = encoder([item])[0]  # [0], weil embedder eine Liste zurückgibt
                    # Schreibe das bearbeitete Objekt in die Zieldatei
                    target.write(json.dumps(embedded_item) + '\n')
            #print(f"{target_file} has been successfully written to data/pubmed/embedded")
                    
print("Alle Dateien wurden verarbeitet und gespeichert.")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [13:00:29<00:00, 473.02s/it]

Alle Dateien wurden verarbeitet und gespeichert.





To improve performance we'll try to distribute the embedding process on the Ray cluster using 4 nodes with GPUs. With one node the embedding of 1.8 mio documents took 14 hours.

### MedCPT
now encode with article encoder of MedCPT

In [5]:
encoder = medCPTArticleEncoder()

In [None]:
import os
import json
from pathlib import Path

# Definiere die Pfade für die Quell- und Zielverzeichnisse
source_directory = Path('data/pubmed/chunk')
target_directory = Path('data/pubmed/embedded_MedCPT')
target_directory.mkdir(parents=True, exist_ok=True)

# Iteriert durch jede Datei im Quellverzeichnis
for file_name in tqdm(os.listdir(source_directory)):
    if file_name.endswith('.jsonl'):
        source_file = source_directory / file_name
        target_file = target_directory / file_name

        # Erstellt eine neue Datei im Zielverzeichnis
        with open(target_file, 'w') as target:
            with open(source_file, 'r') as source:
                for line in source:
                    # Jede Zeile ist ein JSON-Objekt
                    item = json.loads(line)
                    # Verarbeite das Item mit EmbedChunks
                    embedded_item = encoder([item])[0]  # [0], weil embedder eine Liste zurückgibt
                    # Schreibe das bearbeitete Objekt in die Zieldatei
                    target.write(json.dumps(embedded_item) + '\n')
            #print(f"{target_file} has been successfully written to data/pubmed/embedded")
                    
print("Alle Dateien wurden verarbeitet und gespeichert.")