In [15]:
!TOKENIZERS_PARALLELISM=false

In [16]:
!mkdir -p spacy_model_ner/

In [17]:
!pip install tabulate

Defaulting to user installation because normal site-packages is not writeable
[0m

In [18]:
import re
import itertools
import json
import pandas as pd

In [19]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [20]:
spacy.require_gpu()

True

#### Load sentences

In [21]:
UNLABELED_SENTENCE_DIR = '../data/diease_ner/unlabeled_sentences/'

In [22]:
sentences_df = pd.read_csv(f'{UNLABELED_SENTENCE_DIR}/complete_data_for_rules_tagging.csv',index_col=False)

In [23]:
sentences_df.head()

Unnamed: 0,New_Sentence_Id,Sentence
0,tr_0,"Identification of APC2 , a homologue of the ad..."
1,tr_1,The adenomatous polyposis coli ( APC ) tumour ...
2,tr_10,A common MSH2 mutation in English and North Am...
3,tr_100,The positive control for DMT1 up - regulation ...
4,tr_1000,The history further indicated intrauterine gro...


In [24]:
sentences_df.loc[0,'Sentence']

'Identification of APC2 , a homologue of the adenomatous polyposis coli tumour suppressor .'

In [25]:
sentences_df.loc[2,'Sentence']

'A common MSH2 mutation in English and North American HNPCC families : origin , phenotypic expression , and sex specific differences in colorectal cancer .'

In [26]:
sentences_df.shape

(7298, 2)

In [27]:
text = sentences_df.loc[0,'Sentence']

In [28]:
text

'Identification of APC2 , a homologue of the adenomatous polyposis coli tumour suppressor .'

In [29]:
doc = nlp(text)

[(token.text, token.tag_, token.dep_) for token in doc]

[('Identification', 'NN', 'nsubj'),
 ('of', 'IN', 'prep'),
 ('APC2', 'NNP', 'pobj'),
 (',', ',', 'punct'),
 ('a', 'DT', 'det'),
 ('homologue', 'NN', 'appos'),
 ('of', 'IN', 'prep'),
 ('the', 'DT', 'det'),
 ('adenomatous', 'JJ', 'amod'),
 ('polyposis', 'NN', 'pobj'),
 ('coli', 'VBZ', 'ROOT'),
 ('tumour', 'NN', 'compound'),
 ('suppressor', 'NN', 'dobj'),
 ('.', '.', 'punct')]

In [30]:
!mkdir -p ../data/diease_ner/diseases_words/

In [31]:
with open('../data/diease_ner/diseases_words/diseases_ner.json','r') as f:
    diseases_json = json.load(f)

In [32]:
!ls spacy_model_ner

diseases_ner.json
diseases_ner.sjon
split_of_classes_sentence_level.csv
split_of_classes_sentence_level_dev.csv
split_of_classes_sentence_level_test.csv
split_of_classes_sentence_level_train.csv
token_level_tags_on_one_unlabeled_sentence.csv
token_level_tags_on_one_unlabeled_sentence_2.csv


In [33]:
!cp ../data/diease_ner/diseases_words/diseases_ner.json spacy_model_ner/diseases_ner.json

Source for the disease words: <br>
> thanks to Shivanshu Gupta to have crawled the below source of diseases. We can use this as a base list to produce our rules patterns
> https://raw.githubusercontent.com/Shivanshu-Gupta/web-scrapers/master/medical_ner/medicinenet-diseases.json

In [34]:
diseases_json[0:5]

[{'disease': 'Hemophilia'},
 {'disease': 'A, Hemophilia'},
 {'disease': 'Hemophilia A'},
 {'disease': 'Hepatitis A'},
 {'disease': 'A, Hepatitis'}]

In [35]:
list_of_diseases = [each['disease'] for each in diseases_json if not re.search('[,]|test',each['disease'],re.I)]

In [36]:
print(len(diseases_json))

4969


In [37]:
print(len(list_of_diseases))

4283


In [38]:
list_of_diseases[0:5]

['Hemophilia',
 'Hemophilia A',
 'Hepatitis A',
 'Abdominal Aortic Aneurysm',
 'AAA']

In [39]:
list_of_diseases.extend(['tumor','tumour'])

In [40]:
def list_of_words_2_spacy_patterns(list_of_words,
                                   nlp_model,
                                   label_name
                                  ):
    spacy_patterns = []
    for each_word in list_of_words:
        sub_pattern_list = [] # [{"ORTH": user_text_entity_df.loc[each_pattern_index,'TEXT']}]
        for token in  nlp_model(each_word.lower()):
            if re.search('^\W{1,}$',token.text):
                sub_pattern_list.append({"ORTH": token.text,"OP":"*"})
            else:
                sub_pattern_list.append({"LOWER":token.text})
        temp_dict = {"label": label_name,
                     "pattern": sub_pattern_list}
        spacy_patterns.append(temp_dict)
    return spacy_patterns


def load_rules_nlp_model_from_spacy_patterns(spacy_patterns):
    rules_nlp = spacy.load('en_core_web_sm',disable=['ner'])
    rules_config = {
        "validate": True,
        "overwrite_ents": True,
    }

    disease_rules = rules_nlp.add_pipe("entity_ruler", # invoke entity_ruler pipe 
                                       "disease_rules", # give a name to the pipe
                                       config=rules_config)
    disease_rules.add_patterns(spacy_patterns)
    return rules_nlp

In [41]:
disease_spacy_rules_patterns = list_of_words_2_spacy_patterns(list_of_diseases,
                               nlp,
                               "DISEASE"
                              )

In [42]:
disease_ner_rules_nlp = load_rules_nlp_model_from_spacy_patterns(disease_spacy_rules_patterns)

In [43]:
disease_ner_rules_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'disease_rules']

In [44]:
disease_ner_rules_nlp.pipe_labels['disease_rules']

['DISEASE']

In [52]:
!mkdir -p ../data/model_weights/spacy_rules
!mkdir -p ../data/model_weights/spacy_rules_packaged_model/

In [50]:
disease_ner_rules_nlp.to_disk('../data/model_weights/spacy_rules')

In [47]:
!ls ../data/

conll_compatible_ner_data	diease_ner    ner_sentences_level_data
conll_spacy_tokenized_ner_data	model_config


In [53]:
!python -m spacy package ../data/model_weights/spacy_rules/ ../data/model_weights/spacy_rules_packaged_model/ --name disease_ner_rules

[38;5;4mℹ Building package artifacts: sdist[0m
[38;5;2m✔ Loaded meta.json from file[0m
../data/model_weights/spacy_rules/meta.json
[38;5;2m✔ Generated README.md from meta.json[0m
[38;5;2m✔ Successfully created package directory
'en_disease_ner_rules-3.3.0'[0m
../data/model_weights/spacy_rules_packaged_model/en_disease_ner_rules-3.3.0
running sdist
running egg_info
creating en_disease_ner_rules.egg-info
writing en_disease_ner_rules.egg-info/PKG-INFO
writing dependency_links to en_disease_ner_rules.egg-info/dependency_links.txt
writing entry points to en_disease_ner_rules.egg-info/entry_points.txt
writing requirements to en_disease_ner_rules.egg-info/requires.txt
writing top-level names to en_disease_ner_rules.egg-info/top_level.txt
writing manifest file 'en_disease_ner_rules.egg-info/SOURCES.txt'
reading manifest file 'en_disease_ner_rules.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'en_disease_ner_rules.egg-info/SOURCES.txt'
running check

In [68]:
sentences_df.tail()

Unnamed: 0,New_Sentence_Id,Sentence
7293,tes_95,Deletion mapping studies have unambiguously id...
7294,tes_96,"A new tumor suppressor gene , PTEN / MMAC1 , w..."
7295,tes_97,We screened 80 prostate tumors by microsatelli...
7296,tes_98,We then proceeded with sequence analysis of th...
7297,tes_99,The identification of the second mutational ev...


In [69]:
sentences_df.shape

(7298, 2)

In [71]:
sentences_df.isnull().sum()

New_Sentence_Id    0
Sentence           3
dtype: int64

In [72]:
sentences_df.dropna(subset=['Sentence'],axis=0,inplace=True)

In [73]:
sentences_df.reset_index(drop=True,inplace=True)
sentences_df.isnull().sum()

New_Sentence_Id    0
Sentence           0
dtype: int64

In [77]:
sentences_df.shape

(7295, 2)

### Prepare rules output NER

In [74]:
sentences_ids_tuples = list(zip(sentences_df['Sentence'],sentences_df['New_Sentence_Id']))

In [85]:
%%time
token_tag_tuples = []
for i, (doc, doc_id) in enumerate(disease_ner_rules_nlp.pipe(sentences_ids_tuples,
                                                           as_tuples=True,
                                                           batch_size=100,
                                                          )                                
                               ):    
    if i%100 == 0:
        print(f"Process {i} sentences")
    for token in doc:
        token_ent_type = 'O' if token.ent_type_ == '' else token.ent_type_
        token_tag_tuples.append((doc_id,token.text,token_ent_type))
    token_tag_tuples.append((doc_id, '[SEP]', '[SEP]'))

Process 0 sentences
Process 100 sentences
Process 200 sentences
Process 300 sentences
Process 400 sentences
Process 500 sentences
Process 600 sentences
Process 700 sentences
Process 800 sentences
Process 900 sentences
Process 1000 sentences
Process 1100 sentences
Process 1200 sentences
Process 1300 sentences
Process 1400 sentences
Process 1500 sentences
Process 1600 sentences
Process 1700 sentences
Process 1800 sentences
Process 1900 sentences
Process 2000 sentences
Process 2100 sentences
Process 2200 sentences
Process 2300 sentences
Process 2400 sentences
Process 2500 sentences
Process 2600 sentences
Process 2700 sentences
Process 2800 sentences
Process 2900 sentences
Process 3000 sentences
Process 3100 sentences
Process 3200 sentences
Process 3300 sentences
Process 3400 sentences
Process 3500 sentences
Process 3600 sentences
Process 3700 sentences
Process 3800 sentences
Process 3900 sentences
Process 4000 sentences
Process 4100 sentences
Process 4200 sentences
Process 4300 sentences


In [86]:
token_tag_df = pd.DataFrame(token_tag_tuples,columns=['New_Sentence_Id','Token','Rules_Tag'])

In [89]:
def convert_token_df_2_conll_format(token_tag_df,
                                    token_column_name,
                                    tag_column_name
                                   ):
    new_tag_column_name = tag_column_name + '_BIO'
    token_tag_df[new_tag_column_name] = ''
    # token_string = ''
    previous_tag_string = 'O'
    for each in range(len(token_tag_df)):
        if each %1000 == 0:
            print(f"{each} tokens processed")
        current_token_string = str(token_tag_df.loc[each,token_column_name])
        current_tag_string = str(token_tag_df.loc[each,tag_column_name])
        
        if current_token_string !='[SEP]':
            if previous_tag_string == 'O':
                if current_tag_string != 'O':
                    current_line = current_token_string + "\t" + "B-" + current_tag_string + "\n"
                else:
                    current_line = current_token_string + "\t" + current_tag_string + "\n"
            else:
                if current_tag_string == previous_tag_string:
                    current_line = current_token_string + "\t" + "I-" + current_tag_string + "\n"
                else:
                    if current_tag_string != 'O':
                        current_line = current_token_string + "\t" + "B-" + current_tag_string + "\n"
                    else:
                        current_line = current_token_string + "\t" + current_tag_string + "\n"
                        
            token_tag_df.loc[each,new_tag_column_name] = current_line.strip().split("\t")[1]
        else:
            current_line = "\n"
            token_tag_df.loc[each,new_tag_column_name] = '[SEP]'
        # token_string = token_string + current_line
        previous_tag_string = current_tag_string
        
    return token_tag_df

In [90]:
%%time 
token_tag_df_2 = convert_token_df_2_conll_format(token_tag_df,
                                                    'Token',
                                                    'Rules_Tag'
                                                   )

0 tokens processed
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
9000 tokens processed
10000 tokens processed
11000 tokens processed
12000 tokens processed
13000 tokens processed
14000 tokens processed
15000 tokens processed
16000 tokens processed
17000 tokens processed
18000 tokens processed
19000 tokens processed
20000 tokens processed
21000 tokens processed
22000 tokens processed
23000 tokens processed
24000 tokens processed
25000 tokens processed
26000 tokens processed
27000 tokens processed
28000 tokens processed
29000 tokens processed
30000 tokens processed
31000 tokens processed
32000 tokens processed
33000 tokens processed
34000 tokens processed
35000 tokens processed
36000 tokens processed
37000 tokens processed
38000 tokens processed
39000 tokens processed
40000 tokens processed
41000 tokens processed
42000 tokens processed
43000 tokens processed
4

In [93]:
token_tag_df_2.head()

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag,Rules_Tag_BIO
0,tr_0,Identification,O,O
1,tr_0,of,O,O
2,tr_0,APC2,O,O
3,tr_0,",",O,O
4,tr_0,a,O,O


In [97]:
SPACY_RULES_NER_DIR = '../data/diease_ner/rules_ner'
!mkdir -p $SPACY_RULES_NER_DIR

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [98]:
token_tag_df_2.to_csv(f'{SPACY_RULES_NER_DIR}/token_level_spacy_rules_ner_output.csv',index=None)

In [25]:
token_tag_df_2.head(25)

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag,Rules_Tag_BIO
0,tr_0,Identification,O,O
1,tr_0,of,O,O
2,tr_0,APC2,O,O
3,tr_0,",",O,O
4,tr_0,a,O,O
5,tr_0,homologue,O,O
6,tr_0,of,O,O
7,tr_0,the,O,O
8,tr_0,adenomatous,DISEASE,B-DISEASE
9,tr_0,polyposis,DISEASE,I-DISEASE


### Adding human annotated tags

In [99]:
SPACY_RULES_NER_DIR = '../data/diease_ner/rules_ner'

token_tag_df_2 = pd.read_csv(f'{SPACY_RULES_NER_DIR}/token_level_spacy_rules_ner_output.csv',index_col=False)

In [100]:
token_tag_df_2[token_tag_df_2['New_Sentence_Id'] == 'tr_1007']

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag,Rules_Tag_BIO
205,tr_1007,Chromosomal,O,O
206,tr_1007,analysis,O,O
207,tr_1007,was,O,O
208,tr_1007,normal,O,O
209,tr_1007,(,O,O
210,tr_1007,46,O,O
211,tr_1007,",",O,O
212,tr_1007,XY,O,O
213,tr_1007,),O,O
214,tr_1007,.,O,O


In [97]:
SPACY_TOKENIZED_DIR = '../data/diease_ner/conll_spacy_tokenized_ner_data/'
final_token_level_annotated_data = pd.read_csv(f'{SPACY_TOKENIZED_DIR}/token_level_annotated_data.csv',index_col=False)

In [98]:
final_token_level_annotated_data[final_token_level_annotated_data['New_Sentence_Id'] == 'tr_1007']

Unnamed: 0,New_Sentence_Id,Spacy_Small_Tokens,Human_Annotated_Tag_BIO
26281,tr_1007,Chromosomal,O
26282,tr_1007,analysis,O
26283,tr_1007,was,O
26284,tr_1007,normal,O
26285,tr_1007,(,O
26286,tr_1007,46,O
26287,tr_1007,",",O
26288,tr_1007,XY,O
26289,tr_1007,),O
26290,tr_1007,.,O


In [101]:
final_token_level_annotated_data.columns = ['New_Sentence_Id', 'Token','Human_Annotated_Tag_BIO']

In [106]:
final_token_level_annotated_data['New_index'] = [every for every in range(len(final_token_level_annotated_data))]
final_token_level_annotated_data['New_index'] = final_token_level_annotated_data['New_index'].astype(str)
final_token_level_annotated_data['New_Spacy_Tokens'] = final_token_level_annotated_data['New_index'] + final_token_level_annotated_data['Token']

In [107]:
final_token_level_annotated_data.head()

Unnamed: 0,New_Sentence_Id,Token,Human_Annotated_Tag_BIO,New_index,New_Spacy_Tokens
0,tr_0,Identification,O,0,0Identification
1,tr_0,of,O,1,1of
2,tr_0,APC2,O,2,2APC2
3,tr_0,",",O,3,3
4,tr_0,a,O,4,4a


In [108]:
final_token_level_annotated_data.sort_values(by=['New_Sentence_Id','New_Spacy_Tokens'],inplace=True)

In [111]:
final_token_level_annotated_data.reset_index(drop=True,inplace=True)

In [113]:
final_token_level_annotated_data[final_token_level_annotated_data['New_Sentence_Id'] == 'tr_1007']

Unnamed: 0,New_Sentence_Id,Token,Human_Annotated_Tag_BIO,New_index,New_Spacy_Tokens
25685,tr_1007,Chromosomal,O,26281,26281Chromosomal
25686,tr_1007,analysis,O,26282,26282analysis
25687,tr_1007,was,O,26283,26283was
25688,tr_1007,normal,O,26284,26284normal
25689,tr_1007,(,O,26285,26285(
25690,tr_1007,46,O,26286,2628646
25691,tr_1007,",",O,26287,26287
25692,tr_1007,XY,O,26288,26288XY
25693,tr_1007,),O,26289,26289)
25694,tr_1007,.,O,26290,26290.


In [115]:
token_tag_df_2['New_index'] = [every for every in range(len(token_tag_df_2))]
token_tag_df_2['New_index'] = token_tag_df_2['New_index'].astype(str)
token_tag_df_2['New_Spacy_Tokens'] = token_tag_df_2['New_index'] + token_tag_df_2['Token']
token_tag_df_2.sort_values(by=['New_Sentence_Id','New_Spacy_Tokens'],inplace=True)
token_tag_df_2.reset_index(drop=True,inplace=True)

In [116]:
token_tag_df_2[token_tag_df_2['New_Sentence_Id'] == 'tr_1007']

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag,Rules_Tag_BIO,New_index,New_Spacy_Tokens
25685,tr_1007,Chromosomal,O,O,205,205Chromosomal
25686,tr_1007,analysis,O,O,206,206analysis
25687,tr_1007,was,O,O,207,207was
25688,tr_1007,normal,O,O,208,208normal
25689,tr_1007,(,O,O,209,209(
25690,tr_1007,46,O,O,210,21046
25691,tr_1007,",",O,O,211,211
25692,tr_1007,XY,O,O,212,212XY
25693,tr_1007,),O,O,213,213)
25694,tr_1007,.,O,O,214,214.


In [117]:
final_token_tag_df = pd.concat([token_tag_df_2[['New_Sentence_Id','Token','Rules_Tag_BIO']], 
                                final_token_level_annotated_data[['Human_Annotated_Tag_BIO']]],axis=1)

In [118]:
final_token_tag_df[final_token_tag_df['New_Sentence_Id']== 'tr_1007']

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
25685,tr_1007,Chromosomal,O,O
25686,tr_1007,analysis,O,O
25687,tr_1007,was,O,O
25688,tr_1007,normal,O,O
25689,tr_1007,(,O,O
25690,tr_1007,46,O,O
25691,tr_1007,",",O,O
25692,tr_1007,XY,O,O
25693,tr_1007,),O,O
25694,tr_1007,.,O,O


In [119]:
final_token_tag_df.head(50)

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
0,tes_0,Clustering,O,O
1,tes_0,of,O,O
2,tes_0,missense,O,O
3,tes_0,mutations,O,O
4,tes_0,in,O,O
5,tes_0,the,O,O
6,tes_0,ataxia,O,B-Disease
7,tes_0,-,O,I-Disease
8,tes_0,telangiectasia,O,I-Disease
9,tes_0,gene,O,O


In [120]:
final_token_tag_df['Human_Annotated_Tag_BIO'] = final_token_tag_df['Human_Annotated_Tag_BIO'].str.upper()

In [121]:
!mkdir -p ../data/diease_ner/output/

In [122]:
final_token_tag_df.to_csv('../data/diease_ner/output/token_level_annotated_rules_data.csv',index=None)

In [123]:
final_token_tag_df.tail(35)

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
191942,val_99,was,O,O
191943,val_99,found,O,O
191944,val_99,in,O,O
191945,val_99,five,O,O
191946,val_99,families,O,O
191947,val_99,",",O,O
191948,val_99,the,O,O
191949,val_99,C,O,O
191950,val_99,1806,O,O
191951,val_99,T,O,O


### Aggregate the labels at sentence level

**Types of Sentences based on entities**

In [None]:
final_token_tag_df_2 = final_token_tag_df.copy()

final_token_tag_df_2 = final_token_tag_df_2[final_token_tag_df_2['Token'] != '[SEP]']

In [128]:
final_token_tag_df_2[final_token_tag_df_2['Human_Annotated_Tag_BIO']== '[SEP]']

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
25588,tr_100,levels,O,[SEP]
26509,tr_1035,ventilatory,O,[SEP]
35490,tr_1339,be,O,[SEP]
82607,tr_3,",",O,[SEP]
103257,tr_37,activity,O,[SEP]
107395,tr_3836,22,O,[SEP]
107507,tr_384,less,O,[SEP]
125489,tr_4481,to,O,[SEP]


In [133]:
final_token_tag_df_2['Human_Annotated_Tag_BIO'] = final_token_tag_df_2['Human_Annotated_Tag_BIO'].apply(lambda x: x if x!='[SEP]' else 'O')

In [134]:
final_token_tag_df_2[final_token_tag_df_2['Human_Annotated_Tag_BIO']== '[SEP]']

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO


In [135]:
%%time

sentence_level_df = final_token_tag_df_2.groupby('New_Sentence_Id')['Human_Annotated_Tag_BIO']\
.apply(lambda x: "|".join(
    list(set(sorted(x)))
))

N = len(sentence_level_df)

sentence_level_df = pd.DataFrame(sentence_level_df).reset_index()

sentence_level_df = sentence_level_df[sentence_level_df['New_Sentence_Id'] != '[SEP]']

sentence_level_df.reset_index(drop=True,inplace=True)

sentence_level_count_df = pd.DataFrame(sentence_level_df['Human_Annotated_Tag_BIO'].value_counts()).reset_index()
sentence_level_count_df.columns = ['Sentence_level_tags','Count']
sentence_level_count_df['%_contribution'] = round(sentence_level_count_df['Count']/len(sentence_level_df) * 100.0)


sentence_level_count_df

CPU times: user 146 ms, sys: 0 ns, total: 146 ms
Wall time: 145 ms


Unnamed: 0,Sentence_level_tags,Count,%_contribution
0,O,3337,46.0
1,I-DISEASE|B-DISEASE|O,2698,37.0
2,B-DISEASE|O,1260,17.0


In [127]:
sentence_level_df.Human_Annotated_Tag_BIO.value_counts()

O                              3333
I-DISEASE|B-DISEASE|O          2695
B-DISEASE|O                    1259
[SEP]|O                           4
I-DISEASE|B-DISEASE|[SEP]|O       3
B-DISEASE|[SEP]|O                 1
Name: Human_Annotated_Tag_BIO, dtype: int64

In [141]:
sentence_level_count_df.to_csv('spacy_model_ner/split_of_classes_sentence_level.csv',index=None)

In [142]:
sentence_level_df.shape

(7295, 2)

In [143]:
sentence_level_df.head()

Unnamed: 0,New_Sentence_Id,Human_Annotated_Tag_BIO
0,tes_0,B-DISEASE|I-DISEASE|O
1,tes_1,B-DISEASE|I-DISEASE|O
2,tes_10,B-DISEASE|I-DISEASE|O
3,tes_100,B-DISEASE|I-DISEASE|O
4,tes_101,B-DISEASE|I-DISEASE|O


Split the sentences into 80-10-10 train, dev and test datasets

In [136]:
from sklearn.model_selection import train_test_split

In [137]:
train_df,intermediate_df =  train_test_split(sentence_level_df,
                                             test_size=0.2, 
                                             stratify=sentence_level_df['Human_Annotated_Tag_BIO'],
                                             random_state=0)


dev_df, test_df = train_test_split(intermediate_df,
                                             test_size=0.5, 
                                             stratify=intermediate_df['Human_Annotated_Tag_BIO'],
                                             random_state=0)

In [138]:
train_df.head()

Unnamed: 0,New_Sentence_Id,Human_Annotated_Tag_BIO
5354,tr_4971,O
187,tes_267,O
3577,tr_3371,O
2366,tr_2281,I-DISEASE|B-DISEASE|O
655,tes_689,B-DISEASE|O


In [139]:
train_token_level_data = final_token_tag_df[final_token_tag_df['New_Sentence_Id'].isin(train_df['New_Sentence_Id'])]

In [140]:
final_token_tag_df.shape

(191977, 4)

In [141]:
train_token_level_data.shape

(154023, 4)

In [142]:
!mkdir -p ../data/diease_ner/train_dev_test_split/

In [143]:
train_token_level_data.to_csv('../data/diease_ner/train_dev_test_split/train_token_level_df.csv', index=None)

In [144]:
train_token_level_data.tail()

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
191972,val_99,in,O,O
191973,val_99,two,O,O
191974,val_99,families,O,O
191975,val_99,.,O,O
191976,val_99,[SEP],[SEP],[SEP]


In [145]:
dev_token_level_data = final_token_tag_df[final_token_tag_df['New_Sentence_Id'].isin(dev_df['New_Sentence_Id'])]
dev_token_level_data.to_csv('../data/diease_ner/train_dev_test_split/dev_token_level_df.csv', index=None)
dev_token_level_data.shape

(19203, 4)

In [146]:
test_token_level_data = final_token_tag_df[final_token_tag_df['New_Sentence_Id'].isin(test_df['New_Sentence_Id'])]
test_token_level_data.to_csv('../data/diease_ner/train_dev_test_split/test_token_level_df.csv', index=None)
test_token_level_data.shape

(18751, 4)

In [147]:
train_token_level_data.reset_index(drop=True,inplace=True)
dev_token_level_data.reset_index(drop=True,inplace=True)
test_token_level_data.reset_index(drop=True,inplace=True)

#### Spacy ML Compatible Data

In [148]:
!mkdir -p ../data/diease_ner/train_dev_test_split_conll_data/

In [8]:
train_token_level_data = pd.read_csv('../data/diease_ner/train_dev_test_split/train_token_level_df.csv',index_col=False)
dev_token_level_data = pd.read_csv('../data/diease_ner/train_dev_test_split/dev_token_level_df.csv',index_col=False)
test_token_level_data = pd.read_csv('../data/diease_ner/train_dev_test_split/test_token_level_df.csv',index_col=False)

In [4]:
train_token_level_data.head()

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
0,tes_0,Clustering,O,O
1,tes_0,of,O,O
2,tes_0,missense,O,O
3,tes_0,mutations,O,O
4,tes_0,in,O,O


In [5]:
train_token_level_data['Human_Annotated_Tag_BIO'].value_counts()

O            136106
I-DISEASE      6582
[SEP]          5836
B-DISEASE      5499
Name: Human_Annotated_Tag_BIO, dtype: int64

In [6]:
def convert_token_df_2_conll_string(token_tag_df,
                                    token_column_name,
                                    tag_column_name
                                   ):
    token_string = ''
    for each in range(len(token_tag_df)):
        if each %1000 == 0:
            print(f"{each} tokens processed")
        current_token_string = str(token_tag_df.loc[each,token_column_name])
        current_tag_string = str(token_tag_df.loc[each,tag_column_name])
        if current_tag_string == 'EP]':
            current_tag_string = 'O'
        if current_token_string !='[SEP]':
            current_line = current_token_string + "\t" + current_tag_string + "\n"
        else:
            current_line = "\n"
        token_string = token_string + current_line
    return token_string

In [9]:
%%time
test_conll_string = convert_token_df_2_conll_string(test_token_level_data,
                                'Token',
                                'Human_Annotated_Tag_BIO'
                               )

0 tokens processed
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
9000 tokens processed
10000 tokens processed
11000 tokens processed
12000 tokens processed
13000 tokens processed
14000 tokens processed
15000 tokens processed
16000 tokens processed
17000 tokens processed
18000 tokens processed
CPU times: user 324 ms, sys: 0 ns, total: 324 ms
Wall time: 323 ms


In [10]:
test_token_level_data.head()

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
0,tes_102,Limits,O,O
1,tes_102,to,O,O
2,tes_102,accuracy,O,O
3,tes_102,included,O,O
4,tes_102,recombination,O,O


In [11]:
with open('../data/diease_ner/train_dev_test_split_conll_data/test_data.conll','w',encoding='utf-8') as f:
    f.write(test_conll_string)

In [12]:
!tail -n 25 ../data/diease_ner/train_dev_test_split_conll_data/test_data.conll

investigate	O
the	O
rate	O
of	O
BRCA2	O
mutation	O
in	O
sporadic	B-DISEASE
breast	I-DISEASE
cancers	I-DISEASE
and	O
in	O
a	O
set	O
of	O
cell	O
lines	O
that	O
represent	O
twelve	O
other	O
tumour	B-DISEASE
types	O
.	O



In [156]:
!wc -l ../data/diease_ner/train_dev_test_split_conll_data/test_data.conll

18751 ../data/diease_ner/train_dev_test_split_conll_data/test_data.conll


In [13]:
%%time
dev_conll_string = convert_token_df_2_conll_string(dev_token_level_data,
                                'Token',
                                'Human_Annotated_Tag_BIO'
                               )


with open('../data/diease_ner/train_dev_test_split_conll_data/dev_data.conll','w',encoding='utf-8') as f:
    f.write(dev_conll_string)
    
!wc -l ../data/diease_ner/train_dev_test_split_conll_data/dev_data.conll

0 tokens processed
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
9000 tokens processed
10000 tokens processed
11000 tokens processed
12000 tokens processed
13000 tokens processed
14000 tokens processed
15000 tokens processed
16000 tokens processed
17000 tokens processed
18000 tokens processed
19000 tokens processed
19203 ../data/diease_ner/train_dev_test_split_conll_data/dev_data.conll
CPU times: user 341 ms, sys: 0 ns, total: 341 ms
Wall time: 622 ms


In [14]:
%%time
train_conll_string = convert_token_df_2_conll_string(train_token_level_data,
                                'Token',
                                'Human_Annotated_Tag_BIO'
                               )


with open('../data/diease_ner/train_dev_test_split_conll_data/train_data.conll','w',encoding='utf-8') as f:
    f.write(train_conll_string)
    
!wc -l ../data/diease_ner/train_dev_test_split_conll_data/train_data.conll

0 tokens processed
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
9000 tokens processed
10000 tokens processed
11000 tokens processed
12000 tokens processed
13000 tokens processed
14000 tokens processed
15000 tokens processed
16000 tokens processed
17000 tokens processed
18000 tokens processed
19000 tokens processed
20000 tokens processed
21000 tokens processed
22000 tokens processed
23000 tokens processed
24000 tokens processed
25000 tokens processed
26000 tokens processed
27000 tokens processed
28000 tokens processed
29000 tokens processed
30000 tokens processed
31000 tokens processed
32000 tokens processed
33000 tokens processed
34000 tokens processed
35000 tokens processed
36000 tokens processed
37000 tokens processed
38000 tokens processed
39000 tokens processed
40000 tokens processed
41000 tokens processed
42000 tokens processed
43000 tokens processed
4

In [158]:
def count_of_tags(df,
                  name_of_file
                 ):
    df = pd.DataFrame(df['Human_Annotated_Tag_BIO'].value_counts()).reset_index()
    df.columns = ['Sentence_level_tags','Count']
    df['%_contribution'] = round(df['Count']/sum(df['Count']) * 100.0)
    df.to_csv(f'spacy_model_ner/{name_of_file}',index=None)
    return df

In [159]:
train_count_df = count_of_tags(train_df,name_of_file='split_of_classes_sentence_level_train.csv')
dev_count_df = count_of_tags(dev_df,name_of_file='split_of_classes_sentence_level_dev.csv')
test_count_df = count_of_tags(test_df,name_of_file='split_of_classes_sentence_level_test.csv')

In [160]:
train_count_df

Unnamed: 0,Sentence_level_tags,Count,%_contribution
0,O,2670,46.0
1,I-DISEASE|B-DISEASE|O,2158,37.0
2,B-DISEASE|O,1008,17.0


In [161]:
dev_count_df

Unnamed: 0,Sentence_level_tags,Count,%_contribution
0,O,333,46.0
1,I-DISEASE|B-DISEASE|O,270,37.0
2,B-DISEASE|O,126,17.0


In [162]:
test_count_df

Unnamed: 0,Sentence_level_tags,Count,%_contribution
0,O,334,46.0
1,I-DISEASE|B-DISEASE|O,270,37.0
2,B-DISEASE|O,126,17.0


### Appendix

In [113]:
new_df = token_tag_df_2[token_tag_df_2['New_Sentence_Id']=='tr_0']

In [117]:
new_df.to_csv('spacy_model_ner/token_level_tags_on_one_unlabeled_sentence.csv',index=None)

In [131]:
new_df_2 = final_token_tag_df[final_token_tag_df['New_Sentence_Id']=='tr_0']
new_df_2.to_csv('spacy_model_ner/token_level_tags_on_one_unlabeled_sentence_2.csv',index=None)

In [132]:
new_df_2

Unnamed: 0,New_Sentence_Id,Token,Rules_Tag_BIO,Human_Annotated_Tag_BIO
0,tr_0,Identification,O,O
1,tr_0,of,O,O
2,tr_0,of,O,O
3,tr_0,of,O,O
4,tr_0,of,O,O
5,tr_0,APC2,O,O
6,tr_0,",",O,O
7,tr_0,a,O,O
8,tr_0,homologue,O,O
9,tr_0,the,O,O
