In [5]:
import json
import logging
import math
import os
import random
import re
import shutil
import sys
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from icecream import ic

# Debug-print setup for icecream: includeContext=True asks ic() to report the call site
# alongside each value, and argToStringFunction=str formats arguments with str().
ic.configureOutput(includeContext=True, argToStringFunction=str)
ic.lineWrapWidth = 120
# Append the absolute path of the current working directory to the module search path.
sys.path.append(os.path.abspath('.'))

# Seed both the stdlib and the NumPy global random generators with the same fixed value.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)


def read_chem_x_gene(data_file='/mnt/nas1/corpus-bio-nlp/NER/PGx_CTD_chem_x_gene.csv'):
    """Load the chemical/gene NER table, drop generic entity names and sort by sentence.

    Side effects (all files are written to the current working directory):
      * ``possible_general_entities_file.log`` - unique entity names containing one of
        the keywords 'gene', 'protein', 'chemical', 'drug', ordered by word count.
      * ``PGx_CTD_chem_x_gene_pgx-starts.csv`` - rows whose ``uid`` contains ``"pgx_"``.
      * ``PGx_CTD_chem_x_gene_sentence_same_sorted.csv`` - the returned frame.

    Args:
        data_file: path of the input CSV. It must provide at least the columns
            ``sentence``, ``entity_type``, ``entity_name`` and ``id``. Defaults to the
            location used originally, so existing zero-argument calls keep working.

    Returns:
        DataFrame without the rows whose ``entity_name`` is 'genes' or 'proteins',
        with column ``id`` renamed to ``uid``, sorted by ``sentence``.
    """
    df_data = pd.read_csv(Path(data_file))
    print(df_data.columns.to_list())
    print(df_data.entity_type.unique())
    # Computed once and reused for every membership test below.
    unique_entity_names = df_data.entity_name.unique().tolist()
    print(len(unique_entity_names), unique_entity_names)
    for generic_name in ('genes', 'proteins', 'chemicals'):
        print(generic_name in unique_entity_names)

    # check short general entities
    # A name is listed once per keyword it contains; the isinstance guard keeps a
    # missing (NaN) entity name from raising TypeError in the substring test.
    short_general_entities = ['gene', 'protein', 'chemical', 'drug']
    possible_general_entities = [
        unique_entity
        for short_entity in short_general_entities
        for unique_entity in unique_entity_names
        if isinstance(unique_entity, str) and short_entity in unique_entity
    ]
    possible_general_entities.sort(key=lambda name: len(name.split()))
    possible_general_entities_file = 'possible_general_entities_file.log'
    # Explicit encoding so non-ASCII entity names are written the same on every platform.
    with open(possible_general_entities_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(possible_general_entities))

    # filter general entities; drug-like names are intentionally kept
    # general_entities = ['genes', 'proteins', 'multidrug', 'drug']
    general_entities = ['genes', 'proteins']
    df_data = df_data[~df_data.entity_name.isin(general_entities)].rename(columns={"id": "uid"})

    # save and check uid contains pgx_
    # NOTE(review): the output file is named "pgx-starts" but the filter is a substring
    # match, not a prefix match - confirm which one is intended.
    _df_data = df_data[df_data["uid"].str.contains("pgx_")]
    ic(len(df_data), len(_df_data))
    file = 'PGx_CTD_chem_x_gene_pgx-starts.csv'
    _df_data.to_csv(file, index=False)

    ic(len(df_data), df_data.sentence.nunique())

    file = 'PGx_CTD_chem_x_gene_sentence_same_sorted.csv'
    df_data = df_data.sort_values(by=['sentence'])
    df_data.to_csv(file, index=False)
    return df_data


def merge_ner(x):
    """Collect the entity annotations of one sentence group into a single list.

    Args:
        x: DataFrame holding the rows of one sentence, with columns ``sentence``,
            ``entity_ini``, ``entity_end``, ``entity_name`` and ``entity_type``.

    Returns:
        A one-element Series whose value is a list of
        ``[ini, end, entity_name, entity_type]`` entries with integer offsets.
        Rows with a missing offset are skipped. When ``sentence[ini:end]`` differs
        from ``entity_name`` the mismatch is logged with ``ic``; the entity is kept
        with a corrected ``end`` only if the name starts exactly at ``ini``,
        otherwise it is dropped.
    """
    sent = x.sentence.tolist()[0]
    lst_ents = []
    annotations = zip(
        x.entity_ini.tolist(),
        x.entity_end.tolist(),
        x.entity_name.tolist(),
        x.entity_type.tolist(),
    )
    for ini, end, entity_name, entity_type in annotations:
        if pd.isna(ini) or pd.isna(end):
            continue
        # A single NaN turns the whole offset column into floats, and float slice
        # indices raise TypeError, so convert before slicing.
        ini, end = int(ini), int(end)
        entity_in_sent = sent[ini:end]
        if entity_in_sent == entity_name:
            lst_ents.append([ini, end, entity_name, entity_type])
            continue
        ic(entity_in_sent, entity_name, ini, end, sent)
        # Typical mismatch: the annotated span swallowed trailing punctuation.
        corrected_end = ini + len(entity_name)
        if sent[ini:corrected_end] == entity_name:
            lst_ents.append([ini, corrected_end, entity_name, entity_type])
    return pd.Series([lst_ents])


# Load the cleaned annotation table, then collapse it to one row per sentence whose
# `entity_info` holds every validated [ini, end, name, type] entry of that sentence.
df_data = read_chem_x_gene()
df_data_ner = (
    df_data.groupby(by=["sentence"])
    .apply(merge_ner)
    .reset_index()
    .rename(columns={0: "entity_info"})
)
ic(df_data_ner.columns.to_list())
# Re-attach the uid of each sentence.
df_data_ner = df_data_ner.merge(df_data[["uid", "sentence"]].drop_duplicates(), on=["sentence"])
# drop_duplicates returns a new frame; the result was previously discarded, so a
# sentence carrying several uids would have stayed duplicated after the merge.
df_data_ner = df_data_ner.drop_duplicates(subset=["sentence"])
print(len(df_data_ner))

ic| 263826966.py:59 in read_chem_x_gene()- len(df_data): 6537, len(_df_data): 3441
ic| 263826966.py:63 in read_chem_x_gene()- len(df_data): 6537, df_data.sentence.nunique(): 1655
ic| 263826966.py:79 in merge_ner()
    entity_in_sent: H(2)O(2),
    entity_name: H(2)O(2)
    ini: 110
    end: 119
    sent: Apoptosis induced by 1 microM of As(2)O(3) in NKM-1 cells was accompanied by an increased cellular content of H(2)O(2), a decreased mitochondrial membrane potential (Deltapsim), and activation of caspase-3.
ic| 263826966.py:79 in merge_ner()
    entity_in_sent: As(2)O(3)).
    entity_name: As(2)O(3))
    ini: 202
    end: 213
    sent: OBJECTIVE: To investigate molecular mechanism of tissue factor (TF) expression on acute promyelocytic leukemia cell line NB4 cells down-regulated by all-trans retinoic acid (ATRA) and arsenic trioxide (As(2)O(3)).


['sentence', 'entity_type', 'entity_name', 'entity_ini', 'entity_end', 'text', 'id']
['Chemical' 'Gene']
3013 ['estrone', 'estradiol', 'testosterone', 'androstenedione', 'dehydroepiandrosterone', 'dehydroepiandrosterone sulfate', 'PACAP', 'c-fos gene', 'PACAP - ( 6-38 ) - NH2', 'PACAP receptor antagonist', 'amphetamine', 'RGS2', 'STZ', 'streptozotocin', 'iNOS', 'clotrimazole', 'cisplatin', 'p53', 'CYP2D6', 'alpha-hydroxymetoprolol', 'Norepinephrine', 'NE', 'CRH', 'folate', 'MTHFR', 'nNOS', 'METH', '5-fluorouracil', 'levamisole', 'chemotherapy with 5-fluorouracil and levamisole', 'TPMT', '6-mercaptopurine', 'thioguanine', 'mdr1a', 'digoxin', 'CYP', 'CYP3A4', 'haloperidol', 'Haloperidol', 'antibody to hepatitis B e', 'alanine transaminase', 'IFN-alpha', 'ribavirin', 'anti-HBe', 'ALT', 'interferon alfa', 'combination therapy with ribavirin and IFN-alpha', 'paclitaxel', 'DOR-1', 'morphine', 'quinpirole', 'RU-24969', 'c-Fos', 'D2-like dopamine agonist', 'CRBP', 'cellular retinol -binding pr

ic| 263826966.py:79 in merge_ner()
    entity_in_sent: bcl-x(L),
    entity_name: bcl-x(L)
    ini: 96
    end: 105
    sent: RA repressed c-fos mRNA expression in control and irradiated SiHa cultures, but did not repress bcl-x(L), p53, GADD45, p21, bax, bcl-2, or mcl-1 mRNA expression.
ic| 263826966.py:79 in merge_ner()
    entity_in_sent: As(2)O(3)-
    entity_name: As(2)O(3)
    ini: 51
    end: 61
    sent: The effect of As(2)O(3) on NB cell number involved As(2)O(3)-induced apoptotic pathways (decreased expression of Bcl-2 and stimulation of caspase-3 activity) with no clear evidence of induced differentiation.
ic| 263826966.py:79 in merge_ner()
    entity_in_sent: T(4),
    entity_name: T(4)
    ini: 30
    end: 35
    sent: Thyroid hormone (l-thyroxine, T(4), or 3,5,3'-triiodo-l-thyronine, T(3)) treatment of human papillary and follicular thyroid cancer cell lines resulted in enhanced cell proliferation, measured by proliferating cell nuclear antigen (PCNA).
ic| 263826966.py:79 

1655


In [36]:
# Read the same file with default dtypes and with dtype=str, printing the frame
# before and after dropping rows that contain missing values.
# pandas converts the string 'nan' to np.nan on read, even with dtype=str.
for read_kwargs in ({}, {'dtype': str}):
    _df = pd.read_csv('test.csv', **read_kwargs)
    print(_df)
    _df = _df.dropna()
    print(_df)
_df.to_json('test.json', orient='records')

# A flat dict loaded with orient='index' becomes one row per key; reset_index()
# then moves the keys into an 'index' column.
d = {'a': 1, 'b': 22}
df1 = pd.DataFrame.from_dict(d, orient='index')
print(df1)
print(df1.reset_index())

     a   b
0  NaN   0
1  0.0   1
2  1.0   3
     a   b
1  0.0   1
2  1.0   3
     a   b
0  NaN   0
1    0   1
2    1   3
   a   b
1  0   1
2  1   3
    0
a   1
b  22
  index   0
0     a   1
1     b  22


In [6]:
def create_data(x):
    """Turn one sentence group into a single-turn NER conversation record.

    Args:
        x: DataFrame for one sentence with columns ``sentence``, ``uid`` and
            ``entity_info`` (a list of ``[ini, end, name, type]`` entries per row).

    Returns:
        dict with ``conversation_id`` (uid of the first row), ``category`` ("NER")
        and ``conversation``: one human/assistant pair. The human turn is the
        sentence followed by the extraction instruction; the assistant turn lists
        the entities as ``<ini, end, name, type>`` joined by ", ". If the group
        has several rows, the entities of the last row are the ones used.
    """
    # Earlier prompt variants (Herb extraction, name/type only, name/span/type) were
    # replaced by this index-based output format.
    prompt_template = (
       "{sentence}\n"
       "---------------\n"
       "please extract all Chemical and Gene in the above text, "
       "Gene includes gene or protein, excluding Limited variation, Genomic variation, Genomic factor, Haplotype."
       "Chemical includes chemical and drug, excluding disease."
       "The output format should be '<starting index in sentence, ending index in sentence, entity name, entity type>' ."
       )

    first_sentence = x.sentence.tolist()[0]
    conversation_id = x.uid.tolist()[0]
    question = prompt_template.format(sentence=first_sentence)

    for row_entities in x.entity_info:
        # Entries whose entity name is missing are left out of the answer.
        ans = [
            f"<{ent[0]}, {ent[1]}, {ent[2]}, {ent[3]}>"
            for ent in row_entities
            if not pd.isna(ent[-2])
        ]

    turn = {
        "human": question,
        "assistant": ", ".join(ans),
    }
    return {
        "conversation_id": conversation_id,
        "category": "NER",
        "conversation": [turn],
    }


# One conversation record per sentence, stored in the `llm_data` column.
df = (
    df_data_ner
    .groupby(by=["sentence"])
    .apply(create_data)
    .reset_index()
    .rename(columns={0: "llm_data"})
)

In [7]:
# Show which columns `df` ended up with.
print(df.columns.to_list())


['sentence', 'llm_data']


In [8]:
# Show the columns of `df_data_ner` and the `entity_info` value of its first row.
print(df_data_ner.columns.to_list())
print(df_data_ner.iloc[0].entity_info)


def check_ner(row):
    """Check that every entity span of a row reproduces its entity name.

    Args:
        row: object exposing ``sentence`` (str) and ``entity_info`` (an iterable of
            ``[ini, end, entity_name, entity_type]`` entries).

    Returns:
        False as soon as ``sentence[ini:end]`` differs from ``entity_name`` for some
        entry, True otherwise. Entries with a missing offset are skipped.
    """
    sentence = row.sentence
    for ini, end, entity_name, _entity_type in row.entity_info:
        # Test for missing offsets before converting: int(nan) raises ValueError,
        # so the guard has to come first to be reachable.
        if pd.isna(ini) or pd.isna(end):
            continue
        if sentence[int(ini):int(end)] != entity_name:
            return False
    return True


# Flag each row by whether all of its spans match, then export the failing rows.
df_data_ner["check"] = df_data_ner.apply(check_ner, axis=1)
ner_false = df_data_ner.loc[~df_data_ner["check"]]
ner_false.to_csv("ner_false.csv", index=False)
print(len(ner_false))

['sentence', 'entity_info', 'uid']
[[51, 68, 'phosphorylcholine', 'Chemical'], [116, 119, 'VH1', 'Gene'], [132, 143, 'S107 family', 'Gene']]
0


In [12]:
import json


# Write one JSON document per line, keeping non-ASCII characters unescaped.
jsonl_file = 'chem_gene_with_indexes.jsonl'
with open(jsonl_file, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in df["llm_data"].tolist()
    )

# Read the file back and show the first two raw lines plus the first parsed record.
with open(jsonl_file, 'r', encoding='utf-8') as f:
    data = list(f)
for raw_line in data[:2]:
    print(raw_line)
d = json.loads(data[0])
print(d)


# file = 'chem_gene_with_indexes.json'
# with open(file, 'w', encoding='utf-8') as f:
#     json.dump(df.llm_data.values.tolist(), f, ensure_ascii=False, indent=4)

# with open(file, 'r', encoding='utf-8') as f:
#     data = json.load(f)
# print(len(data))
# print(data[0])
# print(data[1])

{"conversation_id": "pgx_747", "category": "NER", "conversation": [{"human": "( NZB x NZW ) F1 mice respond to immunization with phosphorylcholine with a response that is largely encoded by the VH1 gene of the S107 family .\n---------------\nplease extract all Chemical and Gene in the above text, Gene includes gene or protein, excluding Limited variation, Genomic variation, Genomic factor, Haplotype.Chemical includes chemical and drug, excluding disease.The output format should be '<starting index in sentence, ending index in sentence, entity name, entity type>' .", "assistant": "<51, 68, phosphorylcholine, Chemical>, <116, 119, VH1, Gene>, <132, 143, S107 family, Gene>"}]}

{"conversation_id": "21145", "category": "NER", "conversation": [{"human": "(2) After treatment with ATRA, the fusion protein disappeared and PML protein resumed in NB4 cells, while in HL-60 and K562 cells there was no difference from control cells.\n---------------\nplease extract all Chemical and Gene in the abov