# NER USING CRF

In [83]:
import nltk
import sklearn_crfsuite
import eli5
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from collections import Counter

In [84]:
raw_text = open('S21-gene-train.txt').readlines()

entries = {'index':[],
            'tokenized_sents':[],
            'label':[]}

current_index = []
current_sentence = []
current_label = []



for i, line in enumerate(raw_text) : 

        if line=="\n":
            entries['index'].append(current_index)
            entries['tokenized_sents'].append(current_sentence)
            entries['label'].append(current_label)
            
            current_index = []
            current_sentence = []
            current_label = []
        
        else : 

            index, word, label = line.split("\t")

            current_index.append(index)
            current_sentence.append(word)
            current_label.append(label.strip())

df = pd.DataFrame(entries)

# df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
# df = df.drop(columns = ['sentence'])
df

Unnamed: 0,index,tokenized_sents,label
0,"[1, 2, 3, 4, 5, 6, 7, 8, 9]","[Comparison, with, alkaline, phosphatases, and...","[O, O, B, I, O, B, I, I, O]"
1,"[1, 2, 3, 4, 5, 6]","[Pharmacologic, aspects, of, neonatal, hyperbi...","[O, O, O, O, O, O]"
2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[When, CSF, [, HCO3, -], is, shown, as, a, fun...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Flurazepam, thus, appears, to, be, an, effect...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[1, 2, 3, 4]","[Beta, blocking, agents, .]","[O, O, O, O]"
...,...,...,...
13790,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Expression, of, SREBP, -, 1a, stimulated, StA...","[O, O, B, I, I, O, B, I, O, O, O, O, O, O, O, ..."
13791,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Because, the, high, -, density, lipoprotein, ...","[O, O, B, I, I, I, I, O, B, I, I, O, O, O, O, ..."
13792,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[IFN, -, stimulated, gene, factor, -, 3, and, ...","[B, I, I, I, I, I, I, O, B, I, O, O, O, O, B, ..."
13793,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[We, have, therefore, studied, the, molecular,...","[O, O, O, O, O, O, O, O, B, I, I, O, O, B, O, ..."


In [85]:
np.random.seed(10)
train_df, test_df = train_test_split(df, test_size=0.2)

def format_data(dataset):
    data = []
    for l1,l2 in zip(dataset['tokenized_sents'].tolist(), dataset['label'].tolist()):
        temp_list = []
        for ele1, ele2 in zip(l1, l2):
            temp_list.append((ele1, ele2))
        data.append(temp_list)
    return data



train_data = format_data(train_df)
test_data = format_data(test_df)

In [86]:
b_words = []
i_words = []
o_words = []

all_words = []
all_labels = []

for i, row in df.iterrows():
    for k, ele in enumerate(row['label']):
        all_words.append(row['tokenized_sents'][k])
        all_labels.append(ele)
        if ele == 'B':
            b_words.append(row['tokenized_sents'][k])
        elif ele == 'I':
            i_words.append(row['tokenized_sents'][k])
        elif ele == 'O':
            o_words.append(row['tokenized_sents'][k])    
            
i_most_common = set([ele[0] for ele in Counter(i_words).most_common(20)])
b_most_common = set([ele[0] for ele in Counter(b_words).most_common(20)])
o_most_common = set([ele[0] for ele in Counter(o_words).most_common(20)])

def find_diff(i, ii, iii):
    diff = i.difference(ii)
    diff = diff.difference(iii)
    return diff


In [87]:
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

def build_features(sent, i, test = False):
    word = sent[i][0]
    
    if test:
        word = sent[i]
    # stop_words = set(stopwords.words('english'))
    # def is_stopword(word):
    #     if word in stop_words:
    #         return True
    
    def check_ase(word):
        if word[-3:] == 'ase':
            return True
        return False

    def check_ein(word):
        if word[-3:] == 'ein':
            return True
        return False

    def check_hyphen(word):
        if '-' in word:
            return True
        return False

    def check_ene(word):
        if word[-3:] == 'ene':
            return True
        return False
    
        

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.upper()': word.upper(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.islower()': word.islower(),
        # 'word.is_stopword()' : is_stopword(word),
        'word.is_dash()' : check_hyphen(word),
        'word.i_most_common()' : word in find_diff(i_most_common, b_most_common, o_most_common),
        'word.b_most_common()' : word in find_diff(b_most_common, i_most_common, o_most_common),
        'word.o_most_common()' : word in find_diff(o_most_common, i_most_common, b_most_common),
        'word.ene()' : check_ene(word),
        'word.ein()' : check_ein(word),
        'word.ase()': check_ase(word)
    }
    if i > 0:
        word1 = sent[i-1][0]
        if test == True:
            word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word.isdigit(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        if test == True:
            word1 = sent[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isdigit()': word.isdigit(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [build_features(sent, i) for i in range(len(sent))]

def sent2features_test(sent):
    return [build_features(sent, i, test=True) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]


X_train = [sent2features(s) for s in train_data]
y_train = [sent2labels(s) for s in train_data]

X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]

In [89]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=30)



From \ To,B,I,O
B,0.0,4.045,-5.493
I,0.0,5.61,-4.887
O,8.733,0.0,2.982

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+10.332,BOS,
+4.076,-1:word.lower():scid,
+3.952,word[-3:]:trp,
+3.467,word[-3:]:ip1,
+3.432,+1:word.lower():acetylase,
+3.421,word[-3:]:p53,
+3.418,+1:word.lower():umud,
+3.357,word[-3:]:ac1,
+3.206,word[-3:]:p85,
+3.118,word[-3:]:p60,

Weight?,Feature
+10.332,BOS
+4.076,-1:word.lower():scid
+3.952,word[-3:]:trp
+3.467,word[-3:]:ip1
+3.432,+1:word.lower():acetylase
+3.421,word[-3:]:p53
+3.418,+1:word.lower():umud
+3.357,word[-3:]:ac1
+3.206,word[-3:]:p85
+3.118,word[-3:]:p60

Weight?,Feature
+4.043,+1:word.lower():::
+3.641,-1:word.lower():gcn3
+3.450,-1:word.lower():uncb
+3.302,-1:word.lower():il
+3.150,-1:word.lower():bvg
+3.070,-1:word.lower():small
+3.054,-1:word.lower():hrp
+3.027,+1:word.lower():truncations
+2.977,-1:word.lower():anti
+2.965,-1:word.lower():homeotic

Weight?,Feature
+6.681,EOS
+4.595,word[-3:]:tes
+4.469,word[-3:]:sed
+4.364,word[-3:]:ell
+4.320,-1:word.lower():gag
+4.162,-1:word.lower():spc1
+4.161,BOS
+4.142,-1:word.lower():snf1
+4.020,word[-3:]:tly
+3.981,word.lower():release


In [90]:
# Evaluation of individual labels

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
preds = crf.predict(X_test)

bin_preds = mlb.fit_transform(preds)
bin_test = mlb.fit_transform(y_test)

from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(bin_test, bin_preds)

scores = pd.DataFrame()
scores['labels'] = ['B', 'I', 'O']
scores['precision'] = precision
scores['recall'] = recall 
scores['fscore'] = fscore 
scores['support'] = support

scores.style.format({
    'precision': '{:,.2%}'.format,
    'recall': '{:,.2%}'.format,
    'fscore': '{:,.2f}'.format,
})


Unnamed: 0,labels,precision,recall,fscore,support
0,B,90.44%,90.90%,0.91,1395
1,I,88.39%,86.88%,0.88,1113
2,O,100.00%,100.00%,1.0,2758


In [91]:
def write_to_file(filename, df, label_col):
    sentences = df.tokenized_sents.tolist()
    labels = df[label_col].tolist()
    
    with open(filename, 'w') as f:
        for k,ele in enumerate(sentences):
            for i,val in enumerate(zip(ele, labels[k])):
                # print(ele, labels[k])
                f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
            f.write("\n")

In [92]:
test_df['preds'] = preds
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['preds'] = preds


Unnamed: 0,index,tokenized_sents,label,preds
9700,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[The, goal, of, this, study, was, to, identify...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, ..."
5160,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[A, new, Onchocerca, species, ,, a, parasite, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O]"
9449,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[The, second, primary, mutant, contained, a, p...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2853,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[The, human, immunodeficiency, virus, type, 1,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, ..."
10721,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[The, aim, of, the, study, was, to, correlate,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...
5211,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Five, control, subjects, with, COPD, ,, who, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4219,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Effects, of, temperature, and, moulting, cycl...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O]"
5612,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[The, patients, undergoing, VTLB, had, signifi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
11230,"[1, 2, 3, 4, 5, 6, 7]","[Improving, the, evidence, base, for, anaesthe...","[O, O, O, O, O, O, O]","[O, O, O, O, O, O, O]"


In [93]:
write_to_file('goldstandardfile.txt', test_df, 'label')
write_to_file('yoursystemoutput.txt', test_df, 'preds')

# Test data - 11/29/21

In [94]:
raw_text = open('F21-gene-test.txt').readlines()

entries = {'index':[],
            'tokenized_sents':[]}

current_index = []
current_sentence = []



for i, line in enumerate(raw_text) : 

        if line=="\n":
            entries['index'].append(current_index)
            entries['tokenized_sents'].append(current_sentence)
            
            current_index = []
            current_sentence = []

        else : 

            index, word = line.split("\t")

            current_index.append(index)
            current_sentence.append(word.strip())

test_df = pd.DataFrame(entries)

# df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
# df = df.drop(columns = ['sentence'])
test_df

Unnamed: 0,index,tokenized_sents
0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[BACKGROUND, :, Ischemic, heart, disease, is, ..."
1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[More, importantly, ,, this, fusion, converted..."
2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Reverse, transcription, -, PCR, analysis, of,..."
3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Using, the, postural, and, force, data, as, i..."
4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Sequence, analysis, revealed, significant, di..."
...,...,...
503,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Nrf2, regulates, expression, of, genes, encod..."
504,"[1, 2, 3, 4, 5, 6, 7]","[Workload, ,, UAPs, ,, and, you, .]"
505,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[RESULTS, :, At, latest, examination, ,, mean,..."
506,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[Perfusion, technique, for, perfusion, -, assi..."


In [95]:
def format_data(dataset):
    data = []
    for l1 in dataset['tokenized_sents'].tolist():
        temp_list = []
        for ele1 in l1:
            temp_list.append(ele1)
        data.append(temp_list)
    return data


test_final = [sent2features_test(s) for s in format_data(test_df)]

fin_preds = crf.predict(test_final)

test_df['preds'] = fin_preds 
test_df

Unnamed: 0,index,tokenized_sents,preds
0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[BACKGROUND, :, Ischemic, heart, disease, is, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[More, importantly, ,, this, fusion, converted...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Reverse, transcription, -, PCR, analysis, of,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Using, the, postural, and, force, data, as, i...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Sequence, analysis, revealed, significant, di...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...
503,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Nrf2, regulates, expression, of, genes, encod...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
504,"[1, 2, 3, 4, 5, 6, 7]","[Workload, ,, UAPs, ,, and, you, .]","[O, O, B, O, O, O, O]"
505,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[RESULTS, :, At, latest, examination, ,, mean,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
506,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[Perfusion, technique, for, perfusion, -, assi...","[O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [96]:
write_to_file('labeled_test_data.txt', test_df, 'preds')