In [1]:
!pip install python-crfsuite



In [2]:
import pandas as pd
import pycrfsuite
from sklearn.metrics import classification_report

## Question 1

In [3]:
def seq_generate(fp): 
    data=[]
    with open(fp, 'r', encoding='utf-8') as tsvfile:

        s=0
        temp_tok=[]
        temp_tag=[]
        seq_tok=[]
        seq_tag=[]
        for row in tsvfile:

            data.append(row)
            if row=='\n':
                seq_tok.append(temp_tok)
                seq_tag.append(temp_tag)
                temp_tok=[]
                temp_tag=[]


            else:
                row=row.strip()
                row=row.split('\t')
                temp_tok.append(row[0])
                temp_tag.append(row[1])
    return seq_tok,seq_tag
            
            


In [4]:
train_tokens, train_tags = seq_generate('ncbi-disease-master/conll/train.tsv')
test_tokens, test_tags = seq_generate('ncbi-disease-master/conll/test.tsv')

print("number of sequence in train",len(train_tokens))
print("number of sequence in tag",len(test_tokens))

print("train tokens 1st sequence ")
print( train_tokens[0])
print("train tags 1st sequence ")
print( train_tags[0])


number of sequence in train 5432
number of sequence in tag 940
train tokens 1st sequence 
['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.']
train tags 1st sequence 
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'O']


## Question 2

In [5]:
b_count=0
i_count=0
o_count=0
for i in train_tags:
    for j in i:
        if(j=='I-Disease'):
            i_count=i_count+1
        if(j=='B-Disease'):
            b_count=b_count+1
        if(j=='O'):
            o_count=o_count+1
        

In [6]:
print("count of I-Disease ",i_count)
print("count of B-Disease ",b_count)
print("count of O         ",o_count)

count of I-Disease  6122
count of B-Disease  5145
count of O          124819


In [7]:
# freq_tag=pd.DataFrame(columns=['words','counts','tags'])
dict_tags={}
for i in range(0,len(train_tokens)):
    
    for j in range(len(train_tokens[i])):
        if(train_tags[i][j]=='B-Disease' or train_tags[i][j] == 'I-Disease'):
            if(train_tokens[i][j] not in dict_tags.keys()):
                dict_tags[train_tokens[i][j]]=1
            if(train_tokens[i][j] in dict_tags.keys()):
                dict_tags[train_tokens[i][j]]=dict_tags[train_tokens[i][j]]+1
        


In [8]:
decending_order_cnt=sorted(list(dict_tags.values()), reverse=True)

In [9]:
common_words=[]
for i in decending_order_cnt[:20]:
    for k in dict_tags.keys():
        if(dict_tags[k]==i):
            common_words.append(k)

In [10]:
print("the 20 common words are")
common_words

the 20 common words are


['-',
 'deficiency',
 'syndrome',
 'cancer',
 'disease',
 'of',
 'dystrophy',
 'breast',
 'ovarian',
 'X',
 'and',
 'DM',
 'and',
 'DM',
 'ALD',
 'DMD',
 'APC',
 'disorder',
 'muscular',
 'disorder',
 'muscular',
 'G6PD',
 'linked',
 'the']

In [11]:
import nltk
from nltk import word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\raghu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\raghu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Question 3

In [12]:
def feature_words(seq,pos):
    feature_vect=[]
    low=seq[pos].lower()
    
    feature_vect.append("w0.lower="+str(low))
    
    suff=seq[pos][-3:]
    feature_vect.append("w0.suffix3="+str(suff))
    if(pos==0):
        feature_vect.append("w-1.previous=BOS")
    else:
        feature_vect.append("w-1.previous="+seq[pos-1].lower())
    if(pos==len(seq)-1):
        feature_vect.append("w1.post=EOS")
    else:
        feature_vect.append("w1.post="+seq[pos+1].lower())
        
    tags = pos_tag([str(seq[pos])])
    feature_vect.append("w0.pos="+tags[0][1])
    feature_vect.append("w0.tit="+str(seq[pos].istitle()))
    return feature_vect

        
    
    

In [13]:
# !pip install textblob
# !pip install spacy

In [14]:
features_tok_train=[]
for i in train_tokens:
    temp=[]
    for j in range(0,len(i)):
        fun_return=feature_words(i,j)
        temp.append(fun_return)
    features_tok_train.append(temp)
features_tok_test=[]       
for i in test_tokens:
    temp=[]
    for j in range(0,len(i)):
        fun_return=feature_words(i,j)
        temp.append(fun_return)
    features_tok_test.append(temp) 


In [15]:
features_tok_train[0][:3]

[['w0.lower=identification',
  'w0.suffix3=ion',
  'w-1.previous=BOS',
  'w1.post=of',
  'w0.pos=NN',
  'w0.tit=True'],
 ['w0.lower=of',
  'w0.suffix3=of',
  'w-1.previous=identification',
  'w1.post=apc2',
  'w0.pos=IN',
  'w0.tit=False'],
 ['w0.lower=apc2',
  'w0.suffix3=PC2',
  'w-1.previous=of',
  'w1.post=,',
  'w0.pos=NN',
  'w0.tit=False']]

## Question 4

In [16]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(features_tok_train, train_tags):
    trainer.append(xseq, yseq)

In [18]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [19]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [20]:
trainer.train('conll2002-esp.crfsuite')

In [21]:
!dir /B .\conll2002-esp.crfsuite

conll2002-esp.crfsuite


In [22]:

trainer.logparser.last_iteration

{'num': 50,
 'scores': {},
 'loss': 5634.827547,
 'feature_norm': 69.356852,
 'error_norm': 403.67243,
 'active_features': 3147,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.1}

In [23]:
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

50 {'num': 50, 'scores': {}, 'loss': 5634.827547, 'feature_norm': 69.356852, 'error_norm': 403.67243, 'active_features': 3147, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.1}


In [24]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x297bf7f26d0>

In [25]:
y_pred = [tagger.tag(xseq) for xseq in features_tok_test]

In [26]:
true_labels = [label for labels in test_tags for label in labels]
predicted_labels_flat = [label for labels in y_pred for label in labels]

In [27]:
len(true_labels)

24497

In [28]:
report = classification_report(true_labels, predicted_labels_flat, labels=["B-Disease", "I-Disease", "O"])


print(report)

              precision    recall  f1-score   support

   B-Disease       0.85      0.72      0.78       960
   I-Disease       0.82      0.78      0.80      1087
           O       0.98      0.99      0.99     22450

    accuracy                           0.97     24497
   macro avg       0.88      0.83      0.85     24497
weighted avg       0.97      0.97      0.97     24497



## Question 5

In [29]:
from collections import Counter


In [30]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x297bf7f2580>

In [31]:
transitions = tagger.info().transitions


In [32]:
transitions

{('O', 'O'): 2.464039,
 ('O', 'B-Disease'): 0.140019,
 ('O', 'I-Disease'): -8.10381,
 ('B-Disease', 'O'): -1.633622,
 ('B-Disease', 'B-Disease'): -5.725083,
 ('B-Disease', 'I-Disease'): 1.438976,
 ('I-Disease', 'O'): -1.475686,
 ('I-Disease', 'B-Disease'): -4.160078,
 ('I-Disease', 'I-Disease'): 1.804314}

In [33]:
state_features = tagger.info().state_features

In [34]:
state_features

{('w0.suffix3=ion', 'B-Disease'): -1.173761,
 ('w0.suffix3=ion', 'I-Disease'): 0.012461,
 ('w-1.previous=BOS', 'O'): 3.811297,
 ('w-1.previous=BOS', 'B-Disease'): 2.73649,
 ('w1.post=of', 'O'): 0.926477,
 ('w1.post=of', 'B-Disease'): -0.206796,
 ('w1.post=of', 'I-Disease'): -1.726665,
 ('w0.pos=NN', 'O'): -1.282814,
 ('w0.pos=NN', 'B-Disease'): 0.809206,
 ('w0.pos=NN', 'I-Disease'): 0.04371,
 ('w0.tit=True', 'O'): 0.571671,
 ('w0.tit=True', 'B-Disease'): -0.698097,
 ('w0.tit=True', 'I-Disease'): -0.874791,
 ('w0.lower=of', 'O'): 0.076663,
 ('w0.lower=of', 'I-Disease'): 0.723994,
 ('w0.suffix3=of', 'O'): 0.058899,
 ('w0.suffix3=of', 'I-Disease'): 0.734596,
 ('w0.pos=IN', 'O'): 1.115929,
 ('w0.pos=IN', 'B-Disease'): -1.902654,
 ('w0.pos=IN', 'I-Disease'): -0.337155,
 ('w0.tit=False', 'O'): 0.606712,
 ('w0.tit=False', 'B-Disease'): -0.721301,
 ('w0.tit=False', 'I-Disease'): -0.077017,
 ('w-1.previous=of', 'O'): -1.740149,
 ('w-1.previous=of', 'B-Disease'): -0.166082,
 ('w-1.previous=of', 

In [35]:
for i in state_features.keys():
    if(("w0.pos" in i[0])or ("w0.tit" in i[0] )):
        print(i)
    

('w0.pos=NN', 'O')
('w0.pos=NN', 'B-Disease')
('w0.pos=NN', 'I-Disease')
('w0.tit=True', 'O')
('w0.tit=True', 'B-Disease')
('w0.tit=True', 'I-Disease')
('w0.pos=IN', 'O')
('w0.pos=IN', 'B-Disease')
('w0.pos=IN', 'I-Disease')
('w0.tit=False', 'O')
('w0.tit=False', 'B-Disease')
('w0.tit=False', 'I-Disease')
('w0.pos=,', 'O')
('w0.pos=,', 'I-Disease')
('w0.pos=DT', 'O')
('w0.pos=DT', 'B-Disease')
('w0.pos=DT', 'I-Disease')
('w0.pos=JJ', 'O')
('w0.pos=JJ', 'B-Disease')
('w0.pos=JJ', 'I-Disease')
('w0.pos=NNS', 'O')
('w0.pos=NNS', 'B-Disease')
('w0.pos=NNS', 'I-Disease')
('w0.pos=.', 'O')
('w0.pos=.', 'I-Disease')
('w0.pos=(', 'O')
('w0.pos=(', 'I-Disease')
('w0.pos=)', 'O')
('w0.pos=)', 'I-Disease')
('w0.pos=:', 'O')
('w0.pos=:', 'I-Disease')
('w0.pos=VBG', 'O')
('w0.pos=VBG', 'B-Disease')
('w0.pos=VBG', 'I-Disease')
('w0.pos=CD', 'O')
('w0.pos=CD', 'I-Disease')
('w0.pos=CC', 'O')
('w0.pos=CC', 'I-Disease')
('w0.pos=TO', 'O')
('w0.pos=TO', 'I-Disease')
('w0.pos=PRP', 'O')
('w0.pos=VBN', 'O

here the model did not discard the features that i have included
w0.tit and w0.pos is available in the state_features

## Question 6

In [36]:
def agg_tok_lvl(t):
    if("B-Disease" in t):
        return 1
    else:

        return 0
    
test_arr_gt=[]    
for i in test_tags:
    test_arr_gt.append(agg_tok_lvl(i))
    
test_arr_pred=[]    
for i in y_pred:
    test_arr_pred.append(agg_tok_lvl(i))

In [37]:
len(test_arr_pred)

940

In [38]:
from sklearn.metrics import precision_score,recall_score
precision = precision_score(test_arr_gt, test_arr_pred)
recall = recall_score(test_arr_gt, test_arr_pred)

In [39]:
print("the precision is",precision)

the precision is 0.963855421686747


In [40]:

print("the recall is",recall)

the recall is 0.8905380333951762
