# Building a rule-based system to extract term candidates

In [1]:
import os #readfile
import textract #pdftotext


In [2]:
def convertpdfstotext(path):

    filelist = []
    
    for fp in os.listdir(path):
        allfiles = filelist.append(os.path.join(path,fp))
        
    for f in filelist:
        doc = textract.process(f)
    return doc

In [23]:
path = r'C:\Users\User\Desktop\Terminology project\Data'  
doc = convertpdfstotext(path)
print(doc)

b'Neural Text Generation from Structured Data\r\nwith Application to the Biography Domain\r\nRe\xcc\x81mi Lebret\xe2\x88\x97\r\nEPFL, Switzerland\r\n\r\nDavid Grangier\r\nFacebook AI Research\r\n\r\nAbstract\r\nThis paper introduces a neural model for\r\nconcept-to-text generation that scales to large,\r\nrich domains. It generates biographical sentences from fact tables on a new dataset of\r\nbiographies from Wikipedia. This set is an\r\norder of magnitude larger than existing resources with over 700k samples and a 400k\r\nvocabulary. Our model builds on conditional\r\nneural language models for text generation.\r\nTo deal with the large vocabulary, we extend these models to mix a fixed vocabulary\r\nwith copy actions that transfer sample-specific\r\nwords from the input database to the generated output sentence. To deal with structured\r\ndata, we allow the model to embed words\r\ndifferently depending on the data fields in\r\nwhich they occur. Our neural model significantly outperfo

In [24]:
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords


def preprocessing(convertpdftotext):
    text = doc.decode('utf8') #convert to byte
    removing = text.replace('\n','') #remove the production of textract
    removing = text.replace('\r','') #remove the production of extract
    sentence=str(removing)
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    referenceremoval = text.partition("References")[0]
    return " ".join(filtered_words)

cleandata = preprocessing(doc)
print(cleandata)


Neural Text Generation Structured Data Application Biography Domain Lebret EPFL Switzerland David Grangier Facebook Research Abstract This paper introduces neural model concept text generation scales large rich domains generates biographical sentences fact tables new dataset biographies Wikipedia This set order magnitude larger existing resources samples vocabulary Our model builds conditional neural language models text generation deal large vocabulary extend models mix fixed vocabulary copy actions transfer sample specific words input database generated output sentence deal structured data allow model embed words differently depending data fields occur Our neural model significantly outperforms Templated Kneser Ney language model nearly BLEU Introduction Concept text generation renders structured records natural language Reiter typical application generate weather forecast based set structured meteorological measurements contrast previous work scale large diverse problem generating b

In [25]:
import spacy #import library
nlp = spacy.load('en_core_web_sm') #load model which is sm small model and en means english
data = nlp(cleandata) 
print(data)
# print(type(data))

Neural Text Generation Structured Data Application Biography Domain Lebret EPFL Switzerland David Grangier Facebook Research Abstract This paper introduces neural model concept text generation scales large rich domains generates biographical sentences fact tables new dataset biographies Wikipedia This set order magnitude larger existing resources samples vocabulary Our model builds conditional neural language models text generation deal large vocabulary extend models mix fixed vocabulary copy actions transfer sample specific words input database generated output sentence deal structured data allow model embed words differently depending data fields occur Our neural model significantly outperforms Templated Kneser Ney language model nearly BLEU Introduction Concept text generation renders structured records natural language Reiter typical application generate weather forecast based set structured meteorological measurements contrast previous work scale large diverse problem generating b

Build a rule-based system for extracting candidates for the first time (before manual filtering)

In [26]:
import collections
from collections import Counter
from spacy.matcher import Matcher #import matcher


matcher = Matcher(nlp.vocab) # Initialize the matcher with the shared vocab

#define rules
pattern1 = [{'POS': 'ADJ'},{'POS': 'NOUN'}]
pattern2 = [{'POS': 'NOUN'}, {'POS': 'NOUN'}]
pattern3 = [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'NOUN'} ]
pattern4 = [{'POS': 'NOUN'}]
pattern5 = [{'POS': 'ADJ'},{'POS': 'NOUN'}, {'POS': 'NOUN'}]

#add rules to matcher 
matcher.add('ADJ+N', [pattern1]) 
matcher.add('N+N', [pattern2]) 
matcher.add('N+N+N', [pattern3]) 
matcher.add('N', [pattern4]) 
matcher.add('ADJ+N+N', [pattern5]) 

matches = matcher(data)

d=[]
for match_id, start, end in matches:
    rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
    span = data[start : end]  # get the matched slice of the doc
    d.append((rule_id, span.text))
    keyterm = span.text
print("\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items()))
    

N paper (2)
ADJ+N neural model (2)
N model (57)
ADJ+N+N neural model concept (1)
N+N model concept (2)
N concept (3)
N+N+N model concept text (2)
N+N concept text (3)
N text (10)
N+N+N concept text generation (3)
N+N text generation (7)
N generation (34)
ADJ+N rich domains (1)
N domains (2)
ADJ+N biographical sentences (1)
N sentences (10)
ADJ+N+N biographical sentences fact (1)
N+N sentences fact (1)
N fact (7)
ADJ+N dataset biographies (1)
N biographies (6)
N order (3)
N+N order magnitude (2)
N magnitude (3)
N resources (1)
N+N resources samples (1)
N samples (2)
ADJ+N neural language (11)
N language (42)
ADJ+N+N neural language models (6)
N+N language models (10)
N models (28)
N+N+N language models text (1)
N+N models text (1)
N+N+N models text generation (1)
ADJ+N vocabulary extend (1)
N extend (1)
ADJ+N+N vocabulary extend models (1)
N+N extend models (1)
ADJ+N vocabulary copy (1)
N copy (9)
ADJ+N+N vocabulary copy actions (1)
N+N copy actions (8)
N actions (10)
N sample (1)
ADJ+N

rank the extracted key term and do manual filtering based on the list below \
note: most frequent words don't always refer that they are terms

In [27]:
total_terms = Counter(d)
rank_term = total_terms.most_common() #list all
# total_terms.most_common()[:200] #top 200

In [28]:
import pandas as pd

df = pd.DataFrame(rank_term, columns=['terms', 'frequency'])

print(df)

                                    terms  frequency
0                              (N, model)         57
1                              (N, table)         55
2                              (N, field)         51
3                           (N, language)         42
4                              (N, words)         36
...                                   ...        ...
1700  (N+N+N, network conversation model)          1
1701            (N+N, conversation model)          1
1702    (N+N+N, conversation model arXiv)          1
1703                   (N+N, model arXiv)          1
1704        (N+N+N, model arXiv preprint)          1

[1705 rows x 2 columns]


In [9]:
df.to_csv('first_time_keyterm_extraction.csv', encoding='utf-8')

After this, we need to do manual filtering to extract a list of terms (real terms) used to adapt a new rule based system with IOB tagging according to the list of terms derived from manual annotation (almost the same like the above system except that we need to modify to get exacly the keyterm we want to have ex: string matching , lemmatization, junker and so on). then apply this modified rule based system on all the same documents (pdf files) to get IOB annotation.

With the new rule based system, we will get a set of silver corpora to train our model. 

In [29]:
p1 = [{'LOWER': 'encoder'}, {'IS_PUNCT': True, 'OP':'?'}, {'LOWER': 'decoder'}]
p2 = [{"POS": {"IN": ["NOUN", "ADJ"]}}, {'LOWER': 'encoder'}]
p3 = [{"POS": {"IN": ["NOUN", "ADJ"]}}, {'LOWER': 'decoder'}]
p4 = [{'LOWER': 'encoder'}, {'IS_PUNCT': True}, {'LOWER': 'decoder'},{"POS": {"IN": ["NOUN"]}}]
p5 = [{'LOWER': 'encoder'}]
p6 = [{'LOWER': 'decoder'}]
p7 = [{'LEMMA': 'encoder'}]
p8 = [{'LEMMA': 'decoder'}]
p9 = [{'LOWER': 'token'}]
p10 = [{'LEMMA': 'token'}]
p11 = [{'LOWER': 'infobox'}]
p12 = [{'LOWER': 'info'}, {'IS_PUNCT': True, 'OP':'?'} , {'LOWER': 'box'}]
p13 = [{'LOWER': 'machine'}, {'LOWER': 'translation'},{'POS': 'NOUN','OP':'?'}]
p14 = [{"POS": {"IN": ["NOUN", "ADJ"]}}, {'LOWER': 'machine'}, {'LOWER': 'translation'}]
p15 = [{'LOWER': 'nlg'}]
p16 = [{'LOWER': 'natural'},{'IS_PUNCT': True}, {'LOWER': 'language'},{'IS_PUNCT': True}, {'LOWER': 'generation'}]
p17 = [{'LOWER': 'neural'}, {'LOWER': 'network'},{'POS': 'NOUN','OP':'?'}]
p18 = [{"POS": {"IN": ["NOUN", "ADJ"]}}, {'LOWER': 'neural'}, {'LOWER': 'network'}]
p19 = [{"POS": {"IN": ["NOUN", "ADJ"]}}, {'LOWER': 'embedding'}]
p20 = [{"POS": {"IN": ["NOUN", "ADJ"]}}, {'LEMMA': 'embedding'}]
p21 = [{'LOWER': 'beam'}, {'POS': 'NOUN'}]
p22 = [{'LOWER': 'ngram'}]
p23 = [{'LOWER': 'n'}, {'LOWER': 'gram'}]
p24 = [{'LOWER': 'n'},{'IS_PUNCT': True, 'OP':'?'}, {'LOWER': 'gram'}]
p25 = [{'LOWER':'generator'}]
p26 = [{'LOWER':'parser'}]
p27 = [{'LOWER':'ontology'}]
p28 = [{'LOWER':'softmax'}]
p29 = [{'LEMMA':'ontology'}]
p30 = [{'LOWER':'backprop'}]
p31 = [{'LOWER':'token'}]
p32 = [{'LOWER':'vector'}]
p33 = [{'LOWER':'array'}]
p34 = [{'LOWER':'copy'}, {'LEMMA':'action'},{'POS': 'NOUN','OP':'?'} ]
p35 = [{'LOWER':'copy'}, {'LOWER':'action'},{'POS': 'NOUN','OP':'?'} ]
p36 = [{'LOWER':'sochastic','OP':'?'}, {'LOWER':'gradient'}, {'LOWER':'descent'}]
p37 = [{'LOWER':'attention'}, {'POS': 'NOUN','OP':'?'}]
p38 = [{"POS": {"IN": ["NOUN","ADJ"]}}, {'LOWER':'attention'}]
p39 = [{"POS": {"IN": ["NOUN","ADJ","PROPN"]}},{'POS': 'NOUN','OP':'?'}, {'LOWER': 'function'}]
p40 = [{"POS": {"IN": ["NOUN","ADJ","PROPN"]}},{'POS': 'NOUN','OP':'?'}, {'LEMMA': 'function'}]
p41 = [{"POS": {"IN": ["NOUN","ADJ","PROPN"]}}, {'LOWER': 'layer'}]
p42 = [{"POS": {"IN": ["NOUN","ADJ","PROPN"]}}, {'LEMMA': 'layer'}]
p43 = [{"POS": {"IN": ["NOUN","ADJ"]}}, {'LOWER': 'search'}]
p44 = [{'POS': 'NOUN','OP':'?'}, {'LOWER':'vector'}]
p45 = [{'POS': 'NOUN','OP':'?'}, {'LEMMA':'vector'}]
p46 = [{'POS': 'COMP','OP':'?'}, {'LOWER':'knowledge'},{'IS_PUNCT': True, 'OP':'?'}, {'LOWER':'base'}]
p47 = [{'LOWER':'knowledge'},{'IS_PUNCT': True, 'OP':'?'}, {'LOWER':'bases'}]
p48 = [{'LOWER':'dialog'}, {'LOWER':'system'}, {'POS': 'NOUN','OP':'?'} ]
p49 = [{'LOWER':'dialog'}, {'LEMMA':'system'}, {'POS': 'NOUN','OP':'?'} ]
p50 = [{'LOWER':'local'}, {'LOWER':'field'}]
p51 = [{'LOWER':'global'}, {'LOWER':'field'}]
p52 = [{'LOWER':'dot'}, {'LOWER':'product'}]
p53 = [{'LOWER':'statistical'}, {'LOWER':'generation'}, {'LOWER':'model'}]
p54 = [{'LOWER':'alignment'}, {'LEMMA':'tree'}]
p55 = [{'LOWER':'lstm'}, {'POS': 'NOUN','OP':'?'}]
p56 = [{'LOWER':'linear'}, {'LOWER': 'transformation'}]
p57 = [{'POS': 'NOUN','OP':'?'}, {'LOWER':'likelihood'}, ]
p58 = [{'LOWER':'computational'}, {'LEMMA': 'linguistic'}]
p59 = [{'LOWER':'bleu'}, {'LEMMA': 'score'}]
p60 = [{'LOWER':'hyperparameter'}]
p61 = [{'LOWER':'corenlp'}]
p62 = [{'LEMMA':'hyperparameter'}]
p63 = [{'LOWER':'hyper'}, {'IS_PUNCT': True, 'OP':'?'}, {'lemma':'parameter'} ]
p64 = [{'LOWER':'content'}, {'LOWER':'selection'}, {'POS': 'NOUN','OP':'?'} ]
p65 = [{'LOWER':'sentence'}, {'LOWER':'planning'}, {'POS': 'NOUN','OP':'?'} ]
p66 = [{'LOWER':'factorization'}, {'POS': 'NOUN','OP':'?'} ]
p67 = [{'LEMMA':'weight'}, {'POS': 'NOUN','OP':'?'} ]
p68 = [{'LOWER':'space'}, {'LOWER': 'latent'}]
p69 = [{"POS": {"IN": ["NOUN","ADJ"]}},{'POS': 'NOUN','OP':'?'}, {'LEMMA':'model'}]
p70 = [{'LOWER':'surface'}, {'LOWER':'realization'}, {'POS': 'NOUN','OP':'?'} ]
p71 = [{'LOWER':'model'}, {'LOWER':'conditioning'}]

string_matcher = Matcher(nlp.vocab)

string_matcher.add('rulebased', [p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,p26,p27,
                        p28,p29,p30,p31,p32,p33,p34,p35,p36,p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49,p50,p51,p52,
                        p53,p54,p55,p56,p57,p58,p59,p60,p61,p62,p63,p64,p65,p66,p67,p68,p69,p70,p71])




string_matches = string_matcher(data)

# print(string_matches)

d1=[]
for match_id, start, end in string_matches:
    rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
    span = data[start : end]  # get the matched slice of the doc
    d1.append(span.text)
    keyterm = span.text
print(d1)



['neural model', 'language models', 'neural language models', 'extend models', 'vocabulary extend models', 'copy actions', 'neural model', 'language model', 'infobox', 'knowledge base', 'statistical generation model', 'generation model', 'infobox', 'language model', 'neural language model', 'factorization', 'alignment tree', 'tokens', 'infobox', 'sentence planning', 'surface realization', 'content selection', 'content selection model', 'selection model', 'generative models', 'Sentence planning', 'sentence planning', 'sentence planning surface', 'surface realization', 'content selection', 'content selection surface', 'surface realization', 'language models', 'neural language models', 'machine translation', 'use encoder', 'encoder', 'encoder decoder', 'decoder', 'style neural network', 'neural network', 'neural network model', 'network model', 'LSTM', 'LSTM units', 'units attention', 'attention', 'attention mechanism', 'infobox', 'language models', 'Conditional language models', 'languag

In [30]:
total_terms = Counter(d1)
rank_term = total_terms.most_common() #list all
# total_terms.most_common()[:200] #top 200
rank_term

[('language model', 19),
 ('tokens', 16),
 ('token', 12),
 ('language models', 10),
 ('vectors', 10),
 ('copy actions', 9),
 ('infobox', 7),
 ('machine translation', 7),
 ('vector', 7),
 ('neural language models', 6),
 ('neural language model', 5),
 ('attention', 5),
 ('encoder', 4),
 ('neural network', 4),
 ('Local field', 4),
 ('Global field', 4),
 ('surface realization', 3),
 ('content selection', 3),
 ('dialog systems', 3),
 ('word embeddings', 3),
 ('neural models', 3),
 ('neural model', 2),
 ('factorization', 2),
 ('sentence planning', 2),
 ('decoder', 2),
 ('neural network model', 2),
 ('network model', 2),
 ('Language model', 2),
 ('weight', 2),
 ('neural machine translation', 2),
 ('softmax', 2),
 ('field embedding', 2),
 ('dot product', 2),
 ('CoreNLP', 2),
 ('beam size', 2),
 ('Attention', 2),
 ('loss function', 2),
 ('statistical machine translation', 2),
 ('extend models', 1),
 ('vocabulary extend models', 1),
 ('knowledge base', 1),
 ('statistical generation model', 1),
 

In [31]:
import pandas as pd

df = pd.DataFrame(rank_term, columns=['terms', 'frequency'])

print(df)

                           terms  frequency
0                 language model         19
1                         tokens         16
2                          token         12
3                language models         10
4                        vectors         10
..                           ...        ...
171          Attention intention          1
172     intention neural network          1
173  neural network conversation          1
174           conversation model          1
175   network conversation model          1

[176 rows x 2 columns]


Define iob tagging rule-based system

In [20]:
from nltk import word_tokenize
def iob_rule(all_corpora,silver_corpus):


    tokens = word_tokenize(all_corpora)
    offset = 0
    entities = []
    i = 0
    while i in  range(len(tokens)):
        offset = all_corpora[0].find(tokens[i], offset)
        if i < len(tokens) - 3 and " ".join(tokens[i:i+3]) in silver_corpus:
            entities.append((offset,offset+len(tokens[i]),'B'))
            entities.append((offset+len(tokens[i])+1,offset+len(tokens[i])+len(tokens[i+1])+1,'I'))
            entities.append((offset+len(tokens[i])+len(tokens[i+1])+1,offset+len(tokens[i])+len(tokens[i+1])+len(tokens[i+2])+2,'I'))
            #offset = offset+len(tokens[i])+len(tokens[i+1])
            i = i+3
        elif i < len(tokens) - 2 and " ".join(tokens[i:i+2]) in silver_corpus:
            entities.append((offset,offset+len(tokens[i]),'B'))
            entities.append((offset+len(tokens[i])+1,offset+len(tokens[i])+len(tokens[i+1])+1,'I'))
            i = i+2 
        elif i < len(tokens) - 1 and " ".join(tokens[i:i+1]) in silver_corpus:
            entities.append((offset,offset+len(tokens[i]),'B'))
            i = i+1
        else:
            entities.append((offset,offset+len(tokens[i]),'O'))
            i = i+1
    return(tokens,entities)

In [21]:
all_corpora = data.text
silver_corpus = d1

In [22]:
iob_rule(all_corpora,silver_corpus)

(['Neural',
  'Text',
  'Generation',
  'Structured',
  'Data',
  'Application',
  'Biography',
  'Domain',
  'Lebret',
  'EPFL',
  'Switzerland',
  'David',
  'Grangier',
  'Facebook',
  'Research',
  'Abstract',
  'This',
  'paper',
  'introduces',
  'neural',
  'model',
  'concept',
  'text',
  'generation',
  'scales',
  'large',
  'rich',
  'domains',
  'generates',
  'biographical',
  'sentences',
  'fact',
  'tables',
  'new',
  'dataset',
  'biographies',
  'Wikipedia',
  'This',
  'set',
  'order',
  'magnitude',
  'larger',
  'existing',
  'resources',
  'samples',
  'vocabulary',
  'Our',
  'model',
  'builds',
  'conditional',
  'neural',
  'language',
  'models',
  'text',
  'generation',
  'deal',
  'large',
  'vocabulary',
  'extend',
  'models',
  'mix',
  'fixed',
  'vocabulary',
  'copy',
  'actions',
  'transfer',
  'sample',
  'specific',
  'words',
  'input',
  'database',
  'generated',
  'output',
  'sentence',
  'deal',
  'structured',
  'data',
  'allow',
  'mo

Getting gold data from a new text (unseen text which is different form the above 20 texts)

In [32]:
path1 = r'C:\Users\User\Desktop\Terminology project\evaluation'  
doc = convertpdfstotext(path1)
# print(doc)
cleandata = preprocessing(doc)

In [33]:
import collections
from collections import Counter
from spacy.matcher import Matcher #import matcher
import spacy #import library
nlp = spacy.load('en_core_web_sm') #load model which is sm small model and en means english
data = nlp(cleandata) 



matcher = Matcher(nlp.vocab) # Initialize the matcher with the shared vocab

#define rules
pattern1 = [{'POS': 'ADJ'},{'POS': 'NOUN'}]
pattern2 = [{'POS': 'NOUN'}, {'POS': 'NOUN'}]
pattern3 = [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'NOUN'} ]
pattern4 = [{'POS': 'NOUN'}]
pattern5 = [{'POS': 'ADJ'},{'POS': 'NOUN'}, {'POS': 'NOUN'}]

#add rules to matcher 
matcher.add('ADJ+N', [pattern1]) 
matcher.add('N+N', [pattern2]) 
matcher.add('N+N+N', [pattern3]) 
matcher.add('N', [pattern4]) 
matcher.add('ADJ+N+N', [pattern5]) 

matches = matcher(data)

d=[]
for match_id, start, end in matches:
    rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
    span = data[start : end]  # get the matched slice of the doc
    d.append((rule_id, span.text))
    keyterm = span.text
print("\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items()))
    

N version (1)
N hal (1)
ADJ+N open access (1)
N access (1)
ADJ+N+N open access archive (1)
N+N access archive (1)
N archive (1)
N+N+N access archive deposit (1)
N+N archive deposit (1)
N deposit (1)
N+N+N archive deposit dissemination (1)
N+N deposit dissemination (1)
N dissemination (1)
ADJ+N scientific research (1)
N research (7)
ADJ+N+N scientific research documents (1)
N+N research documents (1)
N documents (3)
N+N research institutions (1)
N institutions (1)
ADJ+N private research (1)
ADJ+N+N private research centers (1)
N+N research centers (1)
N centers (1)
ADJ+N archive ouverte (1)
N ouverte (1)
ADJ+N+N archive ouverte pluridisciplinaire (1)
N+N ouverte pluridisciplinaire (1)
N pluridisciplinaire (1)
ADJ+N destinée dépôt (1)
N dépôt (1)
ADJ+N+N destinée dépôt diffusion (1)
N+N dépôt diffusion (1)
N diffusion (1)
N+N+N dépôt diffusion documents (1)
N+N diffusion documents (1)
N publics (1)
N+N publics privés (1)
N privés (1)
N CNRS (1)
N LORIA (1)
N MANS (1)
ADJ+N equal contribu

In [14]:
total_terms = Counter(d)
rank_term = total_terms.most_common() #list all
# total_terms.most_common()[:200] #top 200
rank_term

[(('N', 'speech'), 76),
 (('N', 'speaker'), 41),
 (('N', 'data'), 39),
 (('N', 'emotion'), 28),
 (('N', 'recognition'), 26),
 (('N', 'anonymization'), 24),
 (('N', 'system'), 24),
 (('N+N', 'emotion recognition'), 17),
 (('N', 'information'), 16),
 (('N', 'voice'), 15),
 (('N', 'privacy'), 14),
 (('N', 'model'), 13),
 (('N', 'features'), 13),
 (('N', 'vector'), 13),
 (('N', 'performance'), 12),
 (('N', 'transformation'), 12),
 (('N', 'attacker'), 12),
 (('N', 'baseline'), 11),
 (('N', 'results'), 11),
 (('N', 'degradation'), 11),
 (('N', 'evaluation'), 10),
 (('N+N', 'speech data'), 9),
 (('N', 'attack'), 9),
 (('N', 'values'), 9),
 (('N', 'identity'), 8),
 (('N', 'source'), 8),
 (('N', 'VPC'), 8),
 (('N+N', 'baseline system'), 8),
 (('N', 'emotions'), 8),
 (('ADJ+N', 'original speech'), 8),
 (('N', 'input'), 8),
 (('N', 'pseudo'), 8),
 (('N+N', 'pseudo speaker'), 8),
 (('N', 'scenario'), 8),
 (('N', 'research'), 7),
 (('N+N', 'speaker anonymization'), 7),
 (('N', 'paper'), 7),
 (('N+N

In [34]:
import pandas as pd

df = pd.DataFrame(rank_term, columns=['terms', 'frequency'])

print(df)

                           terms  frequency
0                 language model         19
1                         tokens         16
2                          token         12
3                language models         10
4                        vectors         10
..                           ...        ...
171          Attention intention          1
172     intention neural network          1
173  neural network conversation          1
174           conversation model          1
175   network conversation model          1

[176 rows x 2 columns]


In [16]:
df.to_csv('evaluation_data_before_filtering.csv', encoding='utf-8')

# Building IOB-tagging Model