In [1]:
import pandas as pd
from typing import List, Dict, Tuple
import pickle
from neo4j import GraphDatabase
import json
from transformers import AutoTokenizer, AutoModel
from neo4j_helpers import NodeModel, RelationshipModel, create_nodes, create_relationships
from data_processing_helpers import flatten_dict

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

train_data_path = 'data/train_data.json'
val_data_path = 'data/val_data.json'

file_path = 'data/val_result.pkl'

with open(train_data_path, 'r') as file:
    train_data = [json.loads(line) for line in file.readlines()]

with open(val_data_path, 'r') as file:
    val_data = [json.loads(line) for line in file.readlines()]
                         
# Open the file in read-binary mode and load the list of dictionaries
with open(file_path, 'rb') as file:
    val_result_list = pickle.load(file)

train_list = [flatten_dict(item) for item in train_data]
val_list = [flatten_dict(item) for item in val_data]

train_df = pd.DataFrame(train_list)
val_df = pd.DataFrame(val_list)

result_list, label_list = zip(*val_result_list)
result_df, label_df = pd.DataFrame(result_list), pd.DataFrame(label_list)


In [2]:
train_df.shape, val_df.shape

((14339, 6), (3585, 6))

In [3]:
train_df.head()

Unnamed: 0,text,object,object_type,predicate,subject,subject_type
0,产后抑郁症@区分产后抑郁症与轻度情绪失调（产后忧郁或“婴儿忧郁”）是重要的，因为轻度情绪失调...,轻度情绪失调,疾病,鉴别诊断,产后抑郁症,疾病
1,类风湿关节炎@尺侧偏斜是由于MCP关节炎症造成的。,尺侧偏斜,症状,临床表现,MCP关节炎症,疾病
2,唇腭裂@ ### 腭瘘 | 存在差异 | 低 大约 10% 至 20% 颚成形术发生腭瘘。 ...,婴儿伤口,社会学,风险评估因素,腭瘘,疾病
3,成人哮喘@ 应在低剂量 ICS 的基础上加用一种 LABA， 或将ICS增加到中等剂量。 成...,ICS,药物,药物治疗,成人哮喘,疾病
4,口咽癌@[ 声嘶及发声障碍的评估 ](/topics/zh-cn/845) ### 手术或放...,放疗后吞咽困难,疾病,并发症,口咽癌,疾病


In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en")
model = AutoModel.from_pretrained("BAAI/bge-large-en")
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [5]:
from functools import partial

from typing import List
def sentences2embeddings(sentence: str, model, tokenizer) -> pd.Series:
    sentence_list = [sentence]
    # Tokenize sentences
    encoded_input = tokenizer(sentence_list, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, cls pooling.
        sentence_embeddings = model_output[0][:, 0]
    # normalize embeddings
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

    embeddings = sentence_embeddings.numpy()[0].tolist()

    return embeddings


embedding_func = partial(sentences2embeddings, model=model, tokenizer=tokenizer)   

In [6]:
embeddings = embedding_func('abc')

In [7]:
len(embeddings)

1024

In [8]:
# label_mask = label_df.map(lambda x: isinstance(x, str)).all(axis=1)
# label_df_filtered = label_df[label_mask]

# result_mask = result_df.map(lambda x: isinstance(x, str)).all(axis=1)
# result_df_filtered = result_df[result_mask]

In [None]:
# result_df.shape, result_df_filtered.shape, label_df.shape, label_df_filtered.shape

In [9]:
import re

# Define a function to wrap numbers with backticks
def wrap_numbers_with_backticks(text):
    return re.sub(r'(\d+)', r'`\1`', text)


# Define a regular expression pattern for special characters
pattern = r'[^\u4e00-\u9fffA-Za-z\s]'

for col in ['subject', 'object', 'predicate']:
    # Apply the function to the column
    train_df[col] = train_df[col].str.replace(pattern, '', regex=True)
    train_df = train_df.drop(train_df[train_df[col] == ''].index)
    train_df[col] = train_df[col].str.replace(' ', '_')
    
    # Remove special characters from 'Column1'
    # label_df_filtered[col] = label_df_filtered[col].str.replace(pattern, '', regex=True)
    # label_df_filtered = label_df_filtered.drop(label_df_filtered[label_df_filtered[col] == ''].index)
    # label_df_filtered[col] = label_df_filtered[col].str.replace(' ', '_')
    
    # # Apply the function to the column
    # result_df_filtered[col] = result_df_filtered[col].str.replace(pattern, '', regex=True)
    # result_df_filtered = result_df_filtered.drop(result_df_filtered[result_df_filtered[col] == ''].index)
    # result_df_filtered[col] = result_df_filtered[col].str.replace(' ', '_')

    # Apply the function to the column
    # result_df_filtered[col] = result_df_filtered[col].apply(wrap_numbers_with_backticks)


In [None]:
node_knowledge_train = NodeModel(
    label=("hard_coded_label", "Knowledge"),
    id_prop=('text', 'text'),
    properties={
        "text": ("text_embedding", embedding_func)
    }
)

node_subject_train = NodeModel(
    # label=("column_name_label", "subject"),
    label=("hard_coded_label", "Subject"),
    id_prop=('subject', 'name'),
    properties={
        "subject_type": ("type", None)
    },
    # extra_labels=['Subject', 'Result']
)

node_object_train = NodeModel(
    # label=("column_name_label", "object"),
    label=("hard_coded_label", "Object"),
    id_prop=('object', 'name'),
    properties={
        "object_type": ("type", None)
    },
)


# node_subject_label = NodeModel(
#     label=("column_name_label", "subject"),
#     id_prop=('subject', 'name'),
#     properties={
#         "subject_type": "type"
#     },
#     extra_labels=['Subject', 'Label']
# )

# node_object_label = NodeModel(
#     label=("column_name_label", "object"),
#     id_prop=('object', 'name'),
#     properties={
#         "object_type": "type"
#     },
#     extra_labels=['Object', 'Label']
# )

# node_subject_result = NodeModel(
#     # label=("column_name_label", "subject"),
#     label=("hard_coded_label", "Subject"),
#     id_prop=('subject', 'name'),
#     properties={
#         "subject_type": "type"
#     },
#     # extra_labels=['Subject', 'Result']
# )

# node_object_result = NodeModel(
#     # label=("column_name_label", "object"),
#     label=("hard_coded_label", "Object"),
#     id_prop=('object', 'name'),
#     properties={
#         "object_type": "type"
#     },
#     # extra_labels=['Object', 'Result']
# )

train_df_copy = train_df.loc[:1000, :].copy()

with driver.session() as session:
    session.write_transaction(create_nodes, train_df_copy, [node_knowledge_train, node_subject_train, node_object_train])

    # session.write_transaction(create_nodes, label_df_filtered, [node_subject_label, node_object_label])
    # session.write_transaction(create_nodes, result_df_filtered, [node_subject_result, node_object_result])



In [11]:
train_df_copy = train_df.loc[:1000, :].copy()

rel_train_knowledge2subject = RelationshipModel(
    source_node='Knowledge',
    target_node='Subject',
    # rel_label=('column_name_label', 'predicate'),
    rel_label=('hard_coded_label', 'HAS_SUBJECT'),
    source_id=('text', 'text'),
    target_id=('subject', 'name'),
    # extra_labels=['Predicate', 'Result']
)

rel_train_subject2object = RelationshipModel(
    source_node='Subject',
    target_node='Object',
    # rel_label=('column_name_label', 'predicate'),
    rel_label=('hard_coded_label', 'PREDICATE'),
    source_id=('subject', 'name'),
    target_id=('object', 'name'),
    properties={
        "predicate": "type"
    }
)

# rel_label = RelationshipModel(
#     source_node='Subject',
#     target_node='Object',
#     rel_label=('column_name_label', 'predicate'),
#     source_id=('subject', 'name'),
#     target_id=('object', 'name'),
#     extra_labels=['Predicate', 'Label']
# )

# rel_result = RelationshipModel(
#     source_node='Subject',
#     target_node='Object',
#     # rel_label=('column_name_label', 'predicate'),
#     rel_label=('hard_coded_label', 'Predicate'),
#     source_id=('subject', 'name'),
#     target_id=('object', 'name'),
#     # extra_labels=['Predicate', 'Result']
# )

with driver.session() as session:
    session.write_transaction(create_relationships, train_df_copy, [rel_train_knowledge2subject, rel_train_subject2object])
    # session.write_transaction(create_relationships, label_df_filtered, [rel_label])
    # session.write_transaction(create_relationships, result_df_filtered, [rel_result])

  session.write_transaction(create_relationships, train_df_copy, [rel_train_knowledge2subject, rel_train_subject2object])
