In [1]:
import os

# Single cache location shared by both environment variables below.
# NOTE(review): machine-specific absolute path; presumably these are set before
# the `transformers` import further down so the library picks them up - confirm.
CACHE_DIR = '/home/smoeding/caches/'
os.environ.update(TRANSFORMERS_CACHE=CACHE_DIR, XDG_CACHE_HOME=CACHE_DIR)
import math

import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, GenerationConfig
import torch
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import math
from sklearn.metrics import accuracy_score
import plotly.express as px
from plotly import graph_objects as go



In [2]:
# Load the ICD code table (judging by the file name: ICD-10-GM 2024).
# The file is semicolon-separated and read without a header row, so the
# resulting columns are labelled with integers.
icd_codes = pd.read_csv(
    "icd10gm2024syst_kodes.txt",
    sep=";",
    header=None,
)

In [3]:
# Preview the first rows; columns carry integer labels because the file was
# read with header=None.
icd_codes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,3,N,X,1,A00,A00.-,A00,A00,Cholera,Cholera,...,1,9,9,9999,9999,9,J,J,J,J
1,4,T,X,1,A00,A00.0,A00.0,A000,"Cholera durch Vibrio cholerae O:1, Biovar chol...",Cholera,...,1,9,9,9999,9999,9,J,J,J,J
2,4,T,X,1,A00,A00.1,A00.1,A001,"Cholera durch Vibrio cholerae O:1, Biovar eltor",Cholera,...,1,9,9,9999,9999,9,J,J,J,J
3,4,T,X,1,A00,A00.9,A00.9,A009,"Cholera, nicht näher bezeichnet",Cholera,...,1,9,9,9999,9999,9,J,J,J,J
4,3,N,X,1,A00,A01.-,A01,A01,Typhus abdominalis und Paratyphus,Typhus abdominalis und Paratyphus,...,2,9,9,9999,9999,9,N,J,J,J


In [4]:
# Unique disease names from column 9 of the code table.
# De-duplicate while keeping file order: iterating a `set` of strings yields an
# order that changes between interpreter runs (hash randomization), which would
# make the row order of the embeddings and the t-SNE input non-reproducible.
# Missing values are dropped so that every entry is a string.
diseases = icd_codes[9].dropna().drop_duplicates().tolist()

In [5]:
# Drop names containing any of these phrases. Matching is a case-sensitive
# substring test, exactly as written here.
EXCLUDED_PHRASES = ("Sonstige", "Nicht belegte Schlüssel", "Nicht näher bezeichnet")
diseases = [
    name
    for name in diseases
    if not any(phrase in name for phrase in EXCLUDED_PHRASES)
]


In [6]:
# Number of disease names left after the filtering step.
len(diseases)

1377

In [7]:
# Checkpoint identifier; reused when the tokenizer is loaded.
model_name = "LeoLM/leo-mistral-hessianai-7b"

# Loading options kept in one place: automatic device placement and
# bfloat16 weights.
load_kwargs = {"device_map": "auto", "torch_dtype": torch.bfloat16}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenization is later requested with padding=True, which needs a pad token id;
# reuse the end-of-sequence id for that purpose.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [9]:
def get_embeddings(model, tokenizer, text):
    """Embed ``text`` by mean-pooling the last hidden layer over real tokens.

    Args:
        model: Callable model that accepts the tokenizer output as keyword
            arguments together with ``output_hidden_states=True``, exposes a
            ``device`` attribute, and returns an object with ``hidden_states``.
        tokenizer: Callable tokenizer invoked with ``return_tensors="pt"``,
            ``truncation=True`` and ``padding=True``.
        text: A single string or a list of strings.

    Returns:
        A tensor with one pooled vector per input text (the token axis, dim 1,
        is averaged away), in the dtype of the hidden states.

    Padding positions are excluded from the average via the attention mask, so
    embedding a text inside a padded batch gives the same vector as embedding
    it alone. If the tokenizer output has no ``attention_mask``, all positions
    are averaged.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    last_hidden = outputs.hidden_states[-1]

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        return last_hidden.mean(dim=1)  # No mask available: plain mean over tokens

    # Accumulate in float32 for accuracy, then cast back to the original dtype.
    # The mask is moved to the hidden-state device in case they differ.
    mask = attention_mask.to(device=last_hidden.device, dtype=torch.float32).unsqueeze(-1)
    token_sum = (last_hidden.float() * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1.0)  # Guard against an all-padding row
    embeddings = (token_sum / token_count).to(last_hidden.dtype)

    return embeddings

In [10]:
# Extra probe terms that are embedded together with the ICD names and
# highlighted in the final scatter plot.
test_diseases = [
    "Alzheimer",
    "Tremor",
    "Unwillkürliche Bewegung",
]

In [11]:
# Add the probe terms to the list of names to embed. A new list is built and
# terms already present are skipped, so re-running this cell does not append
# the probe terms a second time (the in-place `+=` did).
diseases = diseases + [term for term in test_diseases if term not in diseases]

# One forward pass per name; each result is moved to the CPU right away so the
# collected embeddings do not accumulate on the model's device.
embeddings = [get_embeddings(model, tokenizer, disease).cpu() for disease in diseases]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
# Project the embeddings to 2-D; the fixed random_state keeps the layout
# repeatable for identical input.
tsne = TSNE(n_components=2, random_state=1)

# Flatten each embedding to 1-D without hard-coding the hidden size (was
# reshape(4096)), so the cell keeps working if a model with another width is
# loaded. Tensors are cast to float32 first, as before.
embedding_matrix = np.stack([tensor.float().numpy().reshape(-1) for tensor in embeddings])
X_tsne = tsne.fit_transform(embedding_matrix)



In [14]:
# All points: 2-D t-SNE coordinates plus the disease name, drawn in light blue.
df = pd.DataFrame({'x': X_tsne[:,0], 'y': X_tsne[:,1], 'label': diseases})
df['color'] = 'lightblue'

# Additional DataFrame for specific diseases with red color.
# Select the rows in one step instead of growing a frame with pd.concat inside
# a loop; the first occurrence of each probe term is used, as before. An empty
# `test_diseases` now yields an empty frame that still has all four columns.
highlight_rows = [diseases.index(disease) for disease in test_diseases]
additional_df = df.iloc[highlight_rows].assign(color='red').reset_index(drop=True)


# Background trace: every disease as a marker, with its name as hover text.
all_points = go.Scatter(
    x=df['x'],
    y=df['y'],
    mode='markers',
    marker=dict(color=df['color']),
    text=df['label'],
    textposition='top center',
)

# Foreground trace: the highlighted terms, drawn with a visible text label.
highlighted_points = go.Scatter(
    x=additional_df['x'],
    y=additional_df['y'],
    mode='markers+text',
    marker=dict(color=additional_df['color']),
    text=additional_df['label'],
    textposition='top center',
)

fig = go.Figure(data=[all_points, highlighted_points])
fig.update_layout(
    title_text='ICD10 diseases german word embeddings',
    hovermode='closest',
    showlegend=False,
    width=800,
    height=800,
)
fig.show()