In [1]:
!python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /opt/conda/lib/python3.7/site-packages/spacy[0m

NAME              SPACY                 VERSION                            
en_core_web_sm    >=3.3.0.dev0,<3.4.0   [38;5;2m3.3.0[0m   [38;5;2m✔[0m
en_core_web_trf   >=3.3.0.dev0,<3.4.0   [38;5;2m3.3.0[0m   [38;5;2m✔[0m



In [2]:
import itertools
import json
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')

### Load Dataset from HuggingFace

In [3]:
from datasets import load_dataset

dataset_builder = load_dataset("lewtun/autoevaluate__ncbi_disease")

Downloading builder script:   0%|          | 0.00/5.83k [00:00<?, ?B/s]

Downloading and preparing dataset ncbi_disease/ncbi_disease to /path/to/dir/.cache/huggingface/datasets/lewtun___ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/284k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset ncbi_disease downloaded and prepared to /path/to/dir/.cache/huggingface/datasets/lewtun___ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
print(dataset_builder)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 924
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 941
    })
})


In [9]:
dataset_builder['train'][0]

{'id': '0',
 'tokens': ['Identification',
  'of',
  'APC2',
  ',',
  'a',
  'homologue',
  'of',
  'the',
  'adenomatous',
  'polyposis',
  'coli',
  'tumour',
  'suppressor',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]}

In [14]:
# Lookup from the integer ids found in `ner_tags` to their BIO label strings;
# a label's position in the tuple is its integer id.
ner_tagging_dict = dict(enumerate(('O', 'B-Disease', 'I-Disease')))

### 1. Prepare spaCy-Tokenized Data

In [15]:
def hf_dataset_2_token_tag_df(hf_dataset):
    """Flatten an NER dataset split into a token-level DataFrame re-tokenized with spaCy.

    Every example contributes one row per (token, tag) pair, followed by a
    '[SEP]' sentinel row that marks the end of that example. Each source token
    is then passed through the notebook-level `nlp` pipeline on its own, and
    the frame is exploded so that it holds one row per spaCy token.

    Parameters
    ----------
    hf_dataset :
        Object supporting ``len()`` and integer indexing, where each item is a
        mapping with the keys 'id', 'tokens' and 'ner_tags'.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sentence_Id``, ``Tokens``, ``Tags``, ``BIO_Tags`` and
        ``Spacy_Small_Tokens`` with a fresh 0..n-1 index.
        ``Tokens``/``Tags`` repeat the source token and its original tag id on
        every piece of a split token. ``BIO_Tags`` is kept BIO-valid: only the
        first piece of a split 'B-' token keeps 'B-', the rest become 'I-'.

    Notes
    -----
    Relies on the notebook-level names `nlp` and `ner_tagging_dict`.
    Prints the head of the intermediate frame and progress messages.
    """
    token_tag_tuple_list = []
    for row_idx in range(len(hf_dataset)):
        # Fetch the example once; the original re-indexed the dataset for every field.
        example = hf_dataset[row_idx]
        sentence_id = int(example['id'])
        token_tag_tuple_list.extend(
            (sentence_id, token, tag)
            for token, tag in zip(example['tokens'], example['ner_tags'])
        )
        # Sentinel row marking the end of this example.
        token_tag_tuple_list.append((sentence_id, '[SEP]', '[SEP]'))
    token_tag_df = pd.DataFrame(token_tag_tuple_list,
                                columns=['Sentence_Id', 'Tokens', 'Tags']
                               )

    print(token_tag_df.head())
    token_tag_df['BIO_Tags'] = token_tag_df['Tags'].apply(
        lambda x: ner_tagging_dict[x] if x != '[SEP]' else '[SEP]'
    )
    print("spacy processing started")
    spacy_small_tokens_list = []
    for i, doc in enumerate(nlp.pipe(token_tag_df['Tokens'],
                                     as_tuples=False,
                                     n_process=-1,
                                     batch_size=100
                                    )):
        if i % 1000 == 0:
            print(f"{i} tokens processed")
        if doc.text == '[SEP]':
            # Keep the sentinel as a single token regardless of how spaCy would split it.
            spacy_small_tokens_list.append(['[SEP]'])
        else:
            spacy_small_tokens_list.append([token.text for token in doc])
    token_tag_df['Spacy_Small_Tokens'] = spacy_small_tokens_list
    print("spacy processing over")

    token_tag_df = token_tag_df.explode('Spacy_Small_Tokens')
    # After explode, all pieces of one source token share that token's index
    # label, so every duplicate label after the first is a continuation piece.
    is_continuation_piece = token_tag_df.index.duplicated(keep='first')
    token_tag_df.reset_index(drop=True, inplace=True)

    # A 'B-' tag copied onto continuation pieces would start a new mention at
    # every piece; rewrite those pieces to the matching 'I-' tag.
    bio_tags = token_tag_df['BIO_Tags']
    needs_inside_tag = is_continuation_piece & bio_tags.str.startswith('B-').to_numpy(dtype=bool)
    token_tag_df['BIO_Tags'] = bio_tags.where(~needs_inside_tag, 'I-' + bio_tags.str[2:])
    return token_tag_df

In [16]:
%%time
train_data_df = hf_dataset_2_token_tag_df(dataset_builder['train'])

   Sentence_Id          Tokens Tags
0            0  Identification    0
1            0              of    0
2            0            APC2    0
3            0               ,    0
4            0               a    0
spacy processing started
0 tokens processed
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
9000 tokens processed
10000 tokens processed
11000 tokens processed
12000 tokens processed
13000 tokens processed
14000 tokens processed
15000 tokens processed
16000 tokens processed
17000 tokens processed
18000 tokens processed
19000 tokens processed
20000 tokens processed
21000 tokens processed
22000 tokens processed
23000 tokens processed
24000 tokens processed
25000 tokens processed
26000 tokens processed
27000 tokens processed
28000 tokens processed
29000 tokens processed
30000 tokens processed
31000 tokens processed
32000 tokens processed
33000 tokens 

In [73]:
train_data_df.head(25)

Unnamed: 0,Sentence_Id,Tokens,Tags,BIO_Tags,Spacy_Small_Tokens,New_Sentence_Id
0,0,Identification,0,O,Identification,tr_0
1,0,of,0,O,of,tr_0
2,0,APC2,0,O,APC2,tr_0
3,0,",",0,O,",",tr_0
4,0,a,0,O,a,tr_0
5,0,homologue,0,O,homologue,tr_0
6,0,of,0,O,of,tr_0
7,0,the,0,O,the,tr_0
8,0,adenomatous,1,B-Disease,adenomatous,tr_0
9,0,polyposis,2,I-Disease,polyposis,tr_0


In [18]:
train_data_df[train_data_df['BIO_Tags'].str.contains('Disease')].head(20)

Unnamed: 0,Sentence_Id,Tokens,Tags,BIO_Tags,Spacy_Small_Tokens
8,0,adenomatous,1,B-Disease,adenomatous
9,0,polyposis,2,I-Disease,polyposis
10,0,coli,2,I-Disease,coli
11,0,tumour,2,I-Disease,tumour
16,1,adenomatous,1,B-Disease,adenomatous
17,1,polyposis,2,I-Disease,polyposis
18,1,coli,2,I-Disease,coli
19,1,(,2,I-Disease,(
20,1,APC,2,I-Disease,APC
21,1,),2,I-Disease,)


In [19]:
%%time
val_data_df = hf_dataset_2_token_tag_df(dataset_builder['validation'])
test_data_df = hf_dataset_2_token_tag_df(dataset_builder['test'])

   Sentence_Id    Tokens Tags
0            0     BRCA1    0
1            0        is    0
2            0  secreted    0
3            0       and    0
4            0  exhibits    0
spacy processing started
0 tokens processed
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
9000 tokens processed
10000 tokens processed
11000 tokens processed
12000 tokens processed
13000 tokens processed
14000 tokens processed
15000 tokens processed
16000 tokens processed
17000 tokens processed
18000 tokens processed
19000 tokens processed
20000 tokens processed
21000 tokens processed
22000 tokens processed
23000 tokens processed
24000 tokens processed
spacy processing over
   Sentence_Id      Tokens Tags
0            0  Clustering    0
1            0          of    0
2            0    missense    0
3            0   mutations    0
4            0          in    0
spacy processing st

In [80]:
test_data_df.tail()

Unnamed: 0,Sentence_Id,Tokens,Tags,BIO_Tags,Spacy_Small_Tokens,New_Sentence_Id
25476,939,cancers,2,I-Disease,cancers,tes_939
25477,939,.,0,O,.,tes_939
25478,939,.,0,O,.,tes_939
25479,939,[SEP],[SEP],[SEP],[SEP],tes_939
25480,940,[SEP],[SEP],[SEP],[SEP],tes_940


In [81]:
# Stack the three splits into a single token-level frame.
# `New_Sentence_Id` is derived here (same "tr_"/"val_"/"tes_" prefixes used elsewhere
# in this notebook) so the cell also works when the split frames do not carry
# that column yet, e.g. on a top-to-bottom run of the notebook.
# The last row of each split is dropped (original note: "ignoring empty sentences").
final_token_level_annotated_data = pd.concat(
    [
        split_df.assign(New_Sentence_Id=id_prefix + split_df['Sentence_Id'].astype(str))[:-1]
        for id_prefix, split_df in (("tr_", train_data_df),
                                    ("val_", val_data_df),
                                    ("tes_", test_data_df))
    ],
    axis=0
)

In [82]:
final_token_level_annotated_data.shape

(191977, 6)

In [83]:
final_token_level_annotated_data.tail()

Unnamed: 0,Sentence_Id,Tokens,Tags,BIO_Tags,Spacy_Small_Tokens,New_Sentence_Id
25475,939,breast,2,I-Disease,breast,tes_939
25476,939,cancers,2,I-Disease,cancers,tes_939
25477,939,.,0,O,.,tes_939
25478,939,.,0,O,.,tes_939
25479,939,[SEP],[SEP],[SEP],[SEP],tes_939


In [84]:
final_token_level_annotated_data = final_token_level_annotated_data[['New_Sentence_Id','Spacy_Small_Tokens','BIO_Tags']]

In [86]:
final_token_level_annotated_data.rename(columns={'BIO_Tags':'Human_Annotated_Tag_BIO'},inplace=True)

In [20]:
!mkdir -p ../data/diease_ner/conll_spacy_tokenized_ner_data/

In [25]:
SPACY_TOKENIZED_DIR = '../data/diease_ner/conll_spacy_tokenized_ner_data/'

In [87]:
final_token_level_annotated_data.to_csv(f'{SPACY_TOKENIZED_DIR}/token_level_annotated_data.csv',index=None)

In [21]:
# Build a split-qualified sentence id ("tr_<id>", "val_<id>", "tes_<id>") so that
# sentence ids stay distinguishable once the three splits are combined.
train_data_df['New_Sentence_Id'] = train_data_df['Sentence_Id'].astype(str).radd("tr_")
val_data_df['New_Sentence_Id'] = val_data_df['Sentence_Id'].astype(str).radd("val_")
test_data_df['New_Sentence_Id'] = test_data_df['Sentence_Id'].astype(str).radd("tes_")

In [22]:
train_data_df.head()

Unnamed: 0,Sentence_Id,Tokens,Tags,BIO_Tags,Spacy_Small_Tokens,New_Sentence_Id
0,0,Identification,0,O,Identification,tr_0
1,0,of,0,O,of,tr_0
2,0,APC2,0,O,APC2,tr_0
3,0,",",0,O,",",tr_0
4,0,a,0,O,a,tr_0


In [26]:
train_data_df.to_csv(f'{SPACY_TOKENIZED_DIR}/train_data.csv',index=None)
val_data_df.to_csv(f'{SPACY_TOKENIZED_DIR}/val_data.csv',index=None)
test_data_df.to_csv(f'{SPACY_TOKENIZED_DIR}/test_data.csv',index=None)

In [48]:
train_data_df[train_data_df['New_Sentence_Id']=='tr_0']

Unnamed: 0,Sentence_Id,Tokens,Tags,BIO_Tags,Spacy_Small_Tokens,New_Sentence_Id
0,0,Identification,0,O,Identification,tr_0
1,0,of,0,O,of,tr_0
2,0,APC2,0,O,APC2,tr_0
3,0,",",0,O,",",tr_0
4,0,a,0,O,a,tr_0
5,0,homologue,0,O,homologue,tr_0
6,0,of,0,O,of,tr_0
7,0,the,0,O,the,tr_0
8,0,adenomatous,1,B-Disease,adenomatous,tr_0
9,0,polyposis,2,I-Disease,polyposis,tr_0


### 2. Prepare Model Training Data 

In [27]:
CONLL_TRAINING_DIR = '../data/diease_ner/conll_compatible_ner_data/'

In [31]:
!mkdir -p $CONLL_TRAINING_DIR

In [28]:
def convert_token_df_2_conll_format(token_tag_df,
                                    token_column_name,
                                    tag_column_name
                                   ):
    """Serialize a token-level DataFrame into a CoNLL-style string.

    Every row becomes one ``<token>\\t<tag>\\n`` line, except rows whose token
    is the '[SEP]' sentinel, which become a blank line (sentence boundary).
    Rows are read in positional order, so the frame's index labels do not
    matter (filtered, concatenated or otherwise non-0..n-1 indexes are fine).

    Parameters
    ----------
    token_tag_df : pandas.DataFrame
        Frame holding one token per row.
    token_column_name : str
        Name of the column containing the token text.
    tag_column_name : str
        Name of the column containing the tag to emit next to each token.

    Returns
    -------
    str
        The concatenated lines; an empty string for an empty frame.
    """
    conll_lines = []
    for token_value, tag_value in zip(token_tag_df[token_column_name],
                                      token_tag_df[tag_column_name]):
        token_text = str(token_value)
        if token_text == '[SEP]':
            # Sentence boundary: a blank line separates sentences.
            conll_lines.append("\n")
        else:
            conll_lines.append(token_text + "\t" + str(tag_value) + "\n")
    # Single join instead of growing one string row by row.
    return "".join(conll_lines)

In [29]:
%%time
train_data_string = convert_token_df_2_conll_format(train_data_df,
                                             'Spacy_Small_Tokens',
                                             'BIO_Tags'
                                            )

CPU times: user 2.43 s, sys: 37.8 ms, total: 2.47 s
Wall time: 2.47 s


In [32]:
with open(f'{CONLL_TRAINING_DIR}/train_data.conll','w',encoding='utf-8') as f:
    f.write(train_data_string)

In [33]:
%%time
val_data_string = convert_token_df_2_conll_format(val_data_df,
                                             'Spacy_Small_Tokens',
                                             'BIO_Tags'
                                            )

with open(f'{CONLL_TRAINING_DIR}/val_data.conll','w',encoding='utf-8') as f:
    f.write(val_data_string)
    
    
test_data_string = convert_token_df_2_conll_format(test_data_df,
                                             'Spacy_Small_Tokens',
                                             'BIO_Tags'
                                            )

with open(f'{CONLL_TRAINING_DIR}/test_data.conll','w',encoding='utf-8') as f:
    f.write(test_data_string)

CPU times: user 878 ms, sys: 17.9 ms, total: 896 ms
Wall time: 897 ms


In [34]:
!head -n 25 $CONLL_TRAINING_DIR/test_data.conll

Clustering	O
of	O
missense	O
mutations	O
in	O
the	O
ataxia	B-Disease
-	I-Disease
telangiectasia	I-Disease
gene	O
in	O
a	O
sporadic	B-Disease
T	I-Disease
-	I-Disease
cell	I-Disease
leukaemia	I-Disease
.	O

Ataxia	B-Disease
-	I-Disease
telangiectasia	I-Disease
(	O
A	B-Disease
-	I-Disease


### 3. Prepare Unlabeled Sentences for Building a spaCy Rules Model

In [61]:
def _split_a_sequence(sequence, sep):
    chunk = []
    for val in sequence:
        if val[1] == sep:
            yield chunk
            chunk = []
        else:
            chunk.append(val)
    yield chunk
    

def token_tags_2_sentences(tokens_tags_df):
    """Rebuild one space-joined sentence string per `New_Sentence_Id`.

    Tokens are taken from the `Spacy_Small_Tokens` column in row order and
    joined with single spaces. '[SEP]' sentinel tokens are left out wherever
    they occur, so a sentence group that does not end with the sentinel no
    longer loses its last real token.

    Parameters
    ----------
    tokens_tags_df : pandas.DataFrame
        Token-level frame with the columns `New_Sentence_Id` and
        `Spacy_Small_Tokens`.

    Returns
    -------
    pandas.DataFrame
        Columns `New_Sentence_Id` and `Sentence`, one row per sentence id.
        Rows follow pandas' default groupby key sorting, i.e. the ids are in
        string order ("tr_0", "tr_1", "tr_10", ...), not numeric order.
    """
    tokens_per_sentence = (
        tokens_tags_df
        .groupby('New_Sentence_Id')['Spacy_Small_Tokens']
        .apply(list)
    )
    new_df = pd.DataFrame(tokens_per_sentence).reset_index()
    new_df['Sentence'] = new_df['Spacy_Small_Tokens'].apply(
        lambda tokens: " ".join(tok for tok in tokens if tok != '[SEP]')
    )
    new_df = new_df[['New_Sentence_Id', 'Sentence']]
    return new_df

In [62]:
UNLABELED_SENTENCE_DIR = '../data/diease_ner/unlabeled_sentences/'

In [63]:
!mkdir -p $UNLABELED_SENTENCE_DIR

In [64]:
train_data_df.shape

(141590, 6)

In [65]:
%%time
train_sentence_df = token_tags_2_sentences(train_data_df)
val_sentence_df = token_tags_2_sentences(val_data_df)
test_sentence_df = token_tags_2_sentences(test_data_df)

CPU times: user 167 ms, sys: 7.99 ms, total: 175 ms
Wall time: 174 ms


In [66]:
train_sentence_df.head()

Unnamed: 0,New_Sentence_Id,Sentence
0,tr_0,"Identification of APC2 , a homologue of the ad..."
1,tr_1,The adenomatous polyposis coli ( APC ) tumour ...
2,tr_10,A common MSH2 mutation in English and North Am...
3,tr_100,The positive control for DMT1 up - regulation ...
4,tr_1000,The history further indicated intrauterine gro...


In [67]:
train_sentence_df.loc[0,'Sentence']

'Identification of APC2 , a homologue of the adenomatous polyposis coli tumour suppressor .'

In [68]:
complete_sentence_level_data = pd.concat([train_sentence_df, 
                                          val_sentence_df,
                                          test_sentence_df
                                         ])

In [69]:
complete_sentence_level_data.reset_index(drop=True,inplace=True)

In [70]:
complete_sentence_level_data.head()

Unnamed: 0,New_Sentence_Id,Sentence
0,tr_0,"Identification of APC2 , a homologue of the ad..."
1,tr_1,The adenomatous polyposis coli ( APC ) tumour ...
2,tr_10,A common MSH2 mutation in English and North Am...
3,tr_100,The positive control for DMT1 up - regulation ...
4,tr_1000,The history further indicated intrauterine gro...


In [71]:
complete_sentence_level_data.to_csv(f'{UNLABELED_SENTENCE_DIR}/complete_data_for_rules_tagging.csv',index=None)

Training share of sentences (as a fraction of all sentences across the three splits)

In [45]:
len(train_sentence_df)/ (len(train_sentence_df) + 
                            len(val_sentence_df) +
                            len(test_sentence_df))

0.7444505343929844

Validation share of sentences (as a fraction of all sentences across the three splits)

In [46]:
len(val_sentence_df)/ (len(train_sentence_df) + 
                            len(val_sentence_df) +
                            len(test_sentence_df))

0.12661003014524527

Test share of sentences (as a fraction of all sentences across the three splits)

In [47]:
len(test_sentence_df)/ (len(train_sentence_df) + 
                            len(val_sentence_df) +
                            len(test_sentence_df))

0.12893943546177034

### Key Disease Words

In [50]:
# Source tokens of every training row whose BIO tag contains 'I-Disease'.
# NOTE(review): rows tagged 'B-Disease' (mention-initial tokens, including
# single-token mentions) are not matched by this filter — confirm that is intended.
disease_words = train_data_df.loc[
    train_data_df['BIO_Tags'].str.contains('I-Disease'), 'Tokens'
].tolist()

In [56]:
%%time
# Run each collected word through the spaCy pipeline on its own and keep the lemma
# of every resulting token whose `tag_` starts with 'N'.
# NOTE(review): this rebinds `disease_words`, so re-running the cell feeds the
# lemmas (not the original tokens) back through the pipeline.
disease_words = [token.lemma_ for word in disease_words for token in nlp(word) if token.tag_.startswith('N')]

CPU times: user 17.7 s, sys: 88.7 ms, total: 17.8 s
Wall time: 17.8 s


In [57]:
unique_disease_words = list(set(disease_words))

In [58]:
len(unique_disease_words)

503

In [59]:
unique_disease_words[0:20]

['CETP',
 'deficienty',
 'male',
 'retina',
 'secretion',
 'CYP27',
 'scalp',
 'peripheral',
 'PKU',
 'willi',
 'keratoderma',
 'homeostasis',
 'cyst',
 'EMD',
 'syndrome',
 'demyelination',
 'hemolytic',
 'sarcoma',
 'IIA',
 'EC']

In [60]:
"cancer" in unique_disease_words

True

In [None]:
"cancer" in unique_disease_words