In [1]:
from contextlib import contextmanager
from sklearn.metrics import precision_recall_curve, accuracy_score, roc_auc_score
from sklearn.metrics import f1_score, recall_score, precision_score
from collections import OrderedDict, defaultdict
from itertools import repeat
from datetime import datetime
from pathlib import Path
from collections import defaultdict
from scipy.sparse import linalg
import sklearn
import matplotlib.cm as cm
import pandas as pd
import torch.nn.functional as F
import torch.nn as nn
import math
import tqdm
import shutil
import queue
import random
import time
import json
import torch
import h5py
import logging
import numpy as np
import os
import sys
import pickle
import scipy.sparse as sp
import wandb
from sentence_transformers import SentenceTransformer
from constants import CORTEX_REGIONS_DESCRIPTIONS, ELECTRODES_BROADMANN_MAPPING, BROADMANN_AREA_DESCRIPTIONS
from collections import defaultdict
import torch
import os
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


2024-05-11 05:53:17.122053: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def get_electrode_descriptions(electrode_brodmann_map, brodmann_area_descrips):
    """Look up the Brodmann-area description for every electrode.

    Args:
        electrode_brodmann_map (dict): maps each electrode name to a Brodmann area key
        brodmann_area_descrips (dict): maps each Brodmann area key to its description

    Returns:
        dict: electrode name -> description of its Brodmann area, in the
            iteration order of ``electrode_brodmann_map``

    Raises:
        KeyError: if an electrode maps to an area that is missing from
            ``brodmann_area_descrips``
    """
    return {
        name: brodmann_area_descrips[area]
        for name, area in electrode_brodmann_map.items()
    }
    
import pandas as pd
def preprocess_data(csv_file, base_dir='/home/user/Downloads/'):
    """Load the seizure-count CSV, attach each row's file text, and binarize the labels.

    Args:
        csv_file (str): path to a CSV with a ``filename`` column and one
            numeric column per seizure type listed in ``seizure_types`` below.
        base_dir (str): directory the ``filename`` entries are resolved
            against. Defaults to the location this notebook was written for.

    Returns:
        tuple: ``(data, seizure_types)`` where ``data`` is the DataFrame with an
            added ``text`` column and each seizure-type column reduced to 0/1
            (1 when the original value is > 0), and ``seizure_types`` is the
            list of label column names.
    """
    data = pd.read_csv(csv_file)

    texts = []
    unreadable = []
    for file_path in data['filename']:
        # Normalize backslash separators to forward slashes before joining.
        full_path = os.path.join(base_dir, file_path.replace('\\', '/'))
        try:
            with open(full_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except (UnicodeDecodeError, FileNotFoundError):
            # Best effort: keep the row with empty text, but remember the path
            # so the gap is reported instead of passing unnoticed.
            text = ''
            unreadable.append(full_path)
        texts.append(text)
    data['text'] = texts

    if unreadable:
        logging.getLogger(__name__).warning(
            "%d of %d files could not be read and were given empty text (first: %s)",
            len(unreadable), len(texts), unreadable[0],
        )

    seizure_types = ['cpsz', 'gnsz', 'fnsz', 'absz', 'tnsz', 'tcsz', 'spsz', 'mysz']
    # Vectorized presence flag; NaN compares as not-greater-than-zero and becomes 0.
    data[seizure_types] = data[seizure_types].gt(0).astype(int)

    return data, seizure_types


In [7]:
import os
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

def get_semantic_embeds(
    csv_file='/DATA/seizure_count.csv',
    model_name='sentence-transformers/all-mpnet-base-v2',
    test_size=0.2,
    random_state=42,
):
    """Embed the train/validation texts and the electrode/cortex descriptions.

    Args:
        csv_file (str): CSV passed to ``preprocess_data``.
        model_name (str): SentenceTransformer model identifier to load.
        test_size (float): fraction of rows held out as the validation split.
        random_state (int): seed for the train/validation split.

    Returns:
        tuple: ``(train_embeddings, val_embeddings, description_embeddings)``,
            each a tensor produced by ``SentenceTransformer.encode`` with
            ``convert_to_tensor=True``. The description tensor holds the
            electrode descriptions followed by the cortex-region descriptions.
    """
    # Set before the model is created; presumably silences the tokenizers
    # parallelism warning -- TODO confirm.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    st_model = SentenceTransformer(model_name)

    # Only the frame is needed here; the label-column list is not used.
    data, _ = preprocess_data(csv_file)
    train_data, val_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # Embed the text of each split separately.
    train_embeddings = st_model.encode(list(train_data['text']), convert_to_tensor=True)
    val_embeddings = st_model.encode(list(val_data['text']), convert_to_tensor=True)

    # Electrode descriptions first, then cortex-region descriptions.
    node_descriptions = get_electrode_descriptions(
        ELECTRODES_BROADMANN_MAPPING, BROADMANN_AREA_DESCRIPTIONS
    )
    descriptions = list(node_descriptions.values()) + list(CORTEX_REGIONS_DESCRIPTIONS.values())

    description_embeddings = st_model.encode(descriptions, convert_to_tensor=True)

    return train_embeddings, val_embeddings, description_embeddings


In [8]:
# Unpack in the order get_semantic_embeds returns them:
# a = train text embeddings, b = validation text embeddings, c = description embeddings.
a,b,c=get_semantic_embeds()

In [13]:
# Report the dimensions of the train (a), validation (b) and description (c) embeddings.
print("Size of train embeddings:", a.shape)
print("Size of validation embeddings:", b.shape)
print("Size of description embeddings:", c.shape)

Size of train embeddings: torch.Size([1138, 768])
Size of validation embeddings: torch.Size([285, 768])
Size of description embeddings: torch.Size([25, 768])


In [10]:
# Display the validation embeddings tensor as the cell output.
b

tensor([[-0.0254, -0.0167,  0.0169,  ..., -0.0350, -0.0460, -0.0333],
        [-0.0405, -0.0535,  0.0193,  ..., -0.0456, -0.0073, -0.0172],
        [-0.0182, -0.0097,  0.0273,  ..., -0.0406, -0.0360, -0.0270],
        ...,
        [-0.0430, -0.0401,  0.0096,  ..., -0.0499, -0.0382, -0.0340],
        [-0.0079, -0.0567,  0.0167,  ..., -0.0509, -0.0341, -0.0344],
        [-0.0207, -0.0281,  0.0100,  ..., -0.0383, -0.0360, -0.0416]],
       device='cuda:0')

In [11]:
# Display the description embeddings tensor as the cell output.
c

tensor([[ 0.0198, -0.0619, -0.0255,  ..., -0.0266,  0.0161, -0.0167],
        [ 0.0135, -0.0590, -0.0275,  ..., -0.0193,  0.0202, -0.0129],
        [ 0.0307, -0.0438, -0.0280,  ..., -0.0353,  0.0168, -0.0239],
        ...,
        [ 0.0021, -0.0331, -0.0372,  ..., -0.0159,  0.0293, -0.0088],
        [ 0.0144, -0.0657, -0.0281,  ..., -0.0214,  0.0156, -0.0443],
        [ 0.0160, -0.0452, -0.0007,  ..., -0.0107,  0.0222, -0.0350]],
       device='cuda:0')

In [14]:
import torch

def aggregate_embeddings(embeddings, num_nodes=25):
    """Average embeddings along dim 0 and tile the result once per node.

    Args:
        embeddings (torch.Tensor): tensor whose first dimension is averaged away
        num_nodes (int): how many copies of the mean row to return

    Returns:
        torch.Tensor: the dim-0 mean of ``embeddings`` repeated ``num_nodes``
            times along dim 0 (every row is identical)
    """
    pooled = embeddings.mean(dim=0, keepdim=True)
    return pooled.repeat(num_nodes, 1)


# Pool each split's embeddings into a single mean row tiled over the default node count.
train_aggregated, val_aggregated = map(aggregate_embeddings, (a, b))

# All three tensors should now share the same dimensions.
print("Size of aggregated train embeddings:", train_aggregated.shape)
print("Size of aggregated validation embeddings:", val_aggregated.shape)
print("Size of description embeddings:", c.shape)


Size of aggregated train embeddings: torch.Size([25, 768])
Size of aggregated validation embeddings: torch.Size([25, 768])
Size of description embeddings: torch.Size([25, 768])


In [15]:
# Stack the three equally-sized tensors along a new leading dimension.
# NOTE(review): combined_embeddings is not referenced by any later cell shown here.
combined_embeddings = torch.stack((train_aggregated, val_aggregated, c))

print("Shape of combined embeddings:", combined_embeddings.size())

Shape of combined embeddings: torch.Size([3, 25, 768])


In [16]:
# Element-wise mean of the aggregated train, aggregated validation and
# description embeddings. The two aggregated tensors have identical rows, so
# only the description term varies from row to row.
# NOTE(review): validation-split embeddings are blended into this result --
# confirm that is intended if it is later used as a model input.
average_embeddings = (train_aggregated + val_aggregated + c) / 3

print("Shape of average embeddings:", average_embeddings.shape)

Shape of average embeddings: torch.Size([25, 768])


In [19]:
# Copy the tensor to host memory first (the recorded outputs show the
# embeddings on cuda:0), then convert it to a NumPy array.
average_embeddings1= average_embeddings.cpu()  
avg_embeddings_np = average_embeddings1.numpy() 

In [20]:
# Write the averaged embeddings to avg_embeddings.npy (path is relative to the working directory).
np.save('avg_embeddings.npy', avg_embeddings_np)