In [None]:
import pandas as pd
import spacy
from spacy import displacy
import nltk
import numpy as np
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
#stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Colab-only step: mount Google Drive at /content/gdrive; the dataset paths read
# later in this notebook all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Helper functions

In [None]:
def get_root(phrase):
  """Return the first token of `phrase` whose `dep_` is 'ROOT'.

  Returns None when no token carries the 'ROOT' dependency label.
  """
  return next((candidate for candidate in phrase if candidate.dep_ == 'ROOT'), None)

#takes a phrase as input in the tokenized format
#breadth-first search of the dependency tree of the given phrase
#returns a dictionary with the length of the path to the root for each token in the sentence

def join_punctuation(seq, characters='.,;?!'):
    """Yield the items of `seq`, gluing punctuation items onto the item before them.

    Args:
        seq: iterable of strings (words and punctuation marks).
        characters: the items that count as punctuation; an item is glued to
            its predecessor only when it is exactly one of these.

    Yields:
        Each word with any directly following punctuation items appended.
        Yields nothing for an empty `seq`.
    """
    characters = set(characters)
    seq = iter(seq)
    try:
        current = next(seq)
    except StopIteration:
        # Empty input: a bare next() would surface as RuntimeError inside a
        # generator (PEP 479), so end the generator cleanly instead.
        return

    for nxt in seq:
        if nxt in characters:
            current += nxt
        else:
            yield current
            current = nxt

    yield current

#takes the dataset, phrase_id and chapter_id
#returns the phrase as a string by merging the words
#used to iterate over the training and testing sets for extracting phrases

def get_text(table, id,chapter):
  """Return the words of one phrase joined into a single string.

  Rows of `table` are selected where 'phrase_id' equals `id` and 'chapter_id'
  equals `chapter`; their 'word' values are passed through join_punctuation
  and then joined with single spaces.
  """
  in_phrase = (table['phrase_id'] == id) & (table['chapter_id'] == chapter)
  words = table.loc[in_phrase, 'word'].values
  return ' '.join(join_punctuation(words))

# def get_text(table, id,chapter):
#   phrase_table = table[(table['phrase_id'] == id) & (table['chapter_id'] == chapter)]
#   return ' '.join(phrase_table['word'].values)

#returns a list with the labels of a phrase identified with the ch_id, and phr_id
def get_labels(table, id,chapter):
  """Return the 'label' values of the rows matching the given phrase and chapter.

  Rows are selected where 'phrase_id' equals `id` and 'chapter_id' equals
  `chapter`; the result is the `.values` array of their 'label' column.
  """
  same_phrase = table['phrase_id'] == id
  same_chapter = table['chapter_id'] == chapter
  return table.loc[same_phrase & same_chapter, 'label'].values

#adjusted lemmatization for nltk library
#offers the POS as a parameter to lemmatization function to make it more precise
def get_wordnet_pos(word):
    """Map the POS tag of a single word to the constant lemmatize() accepts.

    The word is tagged on its own with nltk.pos_tag and the upper-cased first
    letter of the tag picks the WordNet constant; any other letter falls back
    to wordnet.NOUN.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    wordnet_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_by_letter.get(first_letter, wordnet.NOUN)

#used to match the length of the tokenization with the length of the filtered table 
def tokenize(arg, ch = 'baskervilles03', ph = 21):
  """Merge tokens of a parsed phrase in place and return the same object.

  Args:
    arg: the parsed phrase. It is iterated for tokens exposing `.text`, sliced,
      and asked for `.retokenize()`; callers in this notebook pass `nlp(...)`.
    ch: chapter id of the phrase, used only to select hard-coded special cases.
    ph: phrase id of the phrase, used only to select hard-coded special cases.

  Returns:
    `arg` itself, after the merges have been applied to it.
  """

  # Hard-coded fix for one phrase: merge tokens 0-1 and 4-6, then stop.
  if (ph, ch) == (436, 'wisteria02'):
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[0:2])
      retokenizer.merge(arg[4:7])
    return arg

  # Hard-coded fix for two phrases: merge tokens 2-3 and 0-1, then stop.
  if (ph, ch) in [(450, 'cardboard'),(457, 'cardboard')]:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[2:4])
      retokenizer.merge(arg[0:2])
    return arg


  # (chapter, phrase) pairs for which the "No" + "." merge in step 3 is skipped.
  no_exc = [('baskervilles03', 16), ('baskervilles03', 20), ('baskervilles11', 45), ('baskervilles12', 283), ('baskervilles13', 271), ('baskervilles14', 55)]
  retok1_pos = []# positions of '-' tokens (step 1)
  retok2_pos = []# positions of the second of two consecutive '`' tokens (step 2)

  #1 - collect the position of every '-' token
  shift = 0
  cr_pos = 0
  for token in arg:
    if token.text == '-':
      retok1_pos.append(cr_pos)
    cr_pos+=1
    prev_char = token.text  # not read in this step; it is reset before step 2
  
  # Merge each '-' with the token before and after it (span pos-1 .. pos+1).
  # Positions were recorded before any merge, and merging a 3-token span into
  # one shortens the doc by two, hence shift grows by 2 per merge.
  for pos in retok1_pos:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[pos-1-shift:pos+2-shift])
      shift += 2 

  #2 - collect positions where a '`' token directly follows another '`' token
  shift = 0
  cr_pos = 0
  prev_char = 0
  for token in arg:
    if token.text =='`' and prev_char == '`':
      retok2_pos.append(cr_pos)
    cr_pos+=1
    prev_char = token.text
 
  # Merge each recorded token with its predecessor; each merge shortens the doc by one.
  for pos in retok2_pos:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[pos-shift-1:pos+1-shift])
      shift += 1

  #3 - collect tokens to glue onto their predecessor:
  #    a token from `suf` preceded by "'", or a '.' preceded by "No".
  retok2_pos = []
  suf = ['66', '86','ve','m']
  shift = 0
  cr_pos = 0
  prev_char = 0
  for token in arg:
    # `and` binds tighter than `or`: the no_exc exclusion applies only to the
    # "No" + "." alternative, never to the apostrophe-suffix alternative.
    if token.text in suf and prev_char == "'" or token.text == '.' and prev_char == "No" and (ch,ph) not in no_exc:
      retok2_pos.append(cr_pos)
    cr_pos+=1
    prev_char = token.text
 
  # Merge each recorded token with its predecessor; each merge shortens the doc by one.
  for pos in retok2_pos:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[pos-shift-1:pos+1-shift])
      shift += 1

  # For phrase id 0 (int or string), additionally merge tokens 2-3.
  # NOTE(review): presumably this handles a chapter-title line - confirm against the data.
  if ph in [0,'0']:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[2:4])

  
  return arg

## [Not needed anymore] Testing attributes on individual pre-set phrase before automatically adding to the table dataset

In [None]:
#testing if tokenize and data from table have the same length for each phrase
#collects every (chapter, phrase) pair whose token count differs from its row count
all_ch_ids = train_data['chapter_id'].unique()
mismatch=[]
for ch in all_ch_ids:
  filter_ch = train_data[train_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch[filter_ch['phrase_id'] == ph]
    # get_text takes the table as its first argument; calling it with only
    # (ph, ch) raises TypeError.
    phrase = get_text(train_data, ph, ch)
    phr_doc = nlp(phrase)
    tok = tokenize(phr_doc, ch, ph)
    if len(filter_ph) != len(tok):
      mismatch.append((ch,ph))
print(len(mismatch))

0


In [None]:
#same check as the previous cell: counts the phrases whose token count differs
#from their row count in train_data (no negation test is performed here)
all_ch_ids = train_data['chapter_id'].unique()
mismatch=[]
for ch in all_ch_ids:
  filter_ch = train_data[train_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch[filter_ch['phrase_id'] == ph]
    # get_text takes the table as its first argument; calling it with only
    # (ph, ch) raises TypeError.
    phrase = get_text(train_data, ph, ch)
    phr_doc = nlp(phrase)
    tok = tokenize(phr_doc, ch, ph)
    if len(filter_ph) != len(tok):
      mismatch.append((ch,ph))
print(len(mismatch))

In [None]:
#errors
#print the raw spaCy tokens of one phrase that needs a hand-written merge in tokenize()
#get_text requires the table as first argument; 'wisteria02' is one of the
#chapter ids listed for train_data
phrase = get_text(train_data, 436, 'wisteria02')
phr_doc = nlp(phrase)
#toks = tokenize(phr_doc,436)
for tok in phr_doc:
  print(tok,'\n')

# flag = 0
# if(all(x in abc for x in mismatch)):
#     flag = 1
# print(flag)

In [None]:
#nbor(d) - neighbour in the initial sentence at distance d -/+ -> to left/right
# Three sample sentences; only phr2 is parsed below.
phr1 = "He is interested in learning Natural Language Processing."
phr2 = "I stood upon the hearth-rug and picked up the stick which our visitor had left behind him the night before."
phr3 = "Gus Proto is a Python developer currently working for a London-based Fintech company"

phr_doc = nlp(phr2)
# NOTE(review): len_path_root is not defined in the visible part of this notebook;
# the result is indexed by token below - confirm where it is defined.
res = len_path_root(phr_doc)

# Print each token's text next to its entry in `res`.
for token in phr_doc:
  print(token.text, res[token], "\n")

# Draw the dependency parse inline.
displacy.render(phr_doc, style="dep", jupyter=True)

In [None]:
# Only the last assignment to phr4 is used; the earlier ones are alternative samples.
phr4 = "Mr. Sherlock Holmes , who was usually very late in the mornings , save upon those not infrequent occasions when he was up all night , was seated at the breakfast table ."
phr4 = "guru99 is a totally new kind of learning experience."
phr4 = "The striped bats are hanging on their feet for best"

#phr4 = sent_tokenize(phr4)
words_list = nltk.word_tokenize(phr4)
# tokenize() reads token.text and calls arg.retokenize(), so it needs the
# output of nlp(...); the NLTK list of plain strings raises AttributeError.
print(tokenize(nlp(phr4)))
print(words_list)
#adjusted lemma
#lemmatizer = WordNetLemmatizer()
#print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words_list])

#POS 1,2
# fine_tags = nltk.pos_tag(words_list)
# coarse_tags = nltk.pos_tag(words_list, tagset='universal')
# print(fine_tags)
# print(coarse_tags)

In [None]:
# `ph` instead of `id`, so the builtin id() is not shadowed for the rest of the session.
ph = 15
ch = 'baskervilles01'
# get_text requires the table as its first argument.
txt = get_text(train_data, ph, ch)
# The dataset phrase is immediately replaced by a hand-written sample sentence.
txt = "I wouldn't do that"
phr_doc = nlp(txt)

#print(len(list(phr_doc)))

# with phr_doc.retokenize() as retokenizer:
#     retokenizer.merge(phr_doc[21:23])

#phr_doc = re_tokenize(phr_doc)
for token in phr_doc:
  print(token)

#displacy.render(phr_doc, style="dep", jupyter=True)

## [Not needed anymore] Clean CRF

In [None]:
def word2features(sent, i):
    """Build the feature dict for the i-th item of `sent`.

    Each item of `sent` is indexed as item[0] (the word) and item[1] (its POS
    tag). The result holds features of the word itself, features of the
    previous word (or 'BOS' when i is the first index) and features of the
    next word (or 'EOS' when i is the last index).
    """
    def neighbour_features(prefix, other_word, other_tag):
        # Features describing an adjacent word; prefix is '-1' or '+1'.
        return {
            prefix + ':word.lower()': other_word.lower(),
            prefix + ':word.istitle()': other_word.istitle(),
            prefix + ':word.isupper()': other_word.isupper(),
            prefix + ':postag': other_tag,
            prefix + ':postag[:2]': other_tag[:2],
        }

    word, postag = sent[i][0], sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    if i > 0:
        features.update(neighbour_features('-1', sent[i-1][0], sent[i-1][1]))
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        features.update(neighbour_features('+1', sent[i+1][0], sent[i+1][1]))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return the word2features dict of every position in `sent`, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the third element (label) of every (token, postag, label) item in `sent`."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the first element (token) of every (token, postag, label) item in `sent`."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [None]:
# Fetch the CoNLL-2002 corpus that the following cell reads through nltk.corpus.conll2002.
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


True

In [None]:
# The fileids() result is discarded: it is not the last expression of the cell,
# so nothing is displayed for it.
nltk.corpus.conll2002.fileids()
# Materialise the 'esp.train' sentences as a list; sent2labels/sent2tokens
# unpack each sentence item as (token, postag, label).
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))

In [None]:
# Show the third loaded sentence as a sample of the corpus item layout.
print(train_sents[2])

In [None]:
# Label sequence of every CoNLL training sentence.
# NOTE: the name y_train is reassigned later by process_data(train_data).
y_train = [sent2labels(s) for s in train_sents]
# Preview only the first few sequences; printing the whole list floods the
# notebook output.
print(y_train[:5])

[['B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O'], ['O'], ['O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O'], ['O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O'

## Merging datasets

In [None]:
# All SEM-2012 files are tab-separated without a header row and share one column layout.
DATA_DIR = '/content/gdrive/MyDrive/Colab Notebooks/ATM/'
COLUMN_NAMES = {0: 'chapter_id', 1: 'phrase_id', 2: 'word_id', 3: 'word', 4: 'label'}

#merging test datasets
test_card = pd.read_csv(DATA_DIR + 'SEM-2012-test-cardboard.txt', sep="\t", header = None)
test_circ = pd.read_csv(DATA_DIR + 'SEM-2012-test-circle.txt', sep="\t", header = None)

frames = [test_card, test_circ]
# rename() returns a new frame, so re-running this cell is idempotent (no inplace mutation).
test_data = pd.concat(frames).rename(columns=COLUMN_NAMES)
#print(test_data.head(40))

#train & dev
train = pd.read_csv(DATA_DIR + 'SEM-2012-training.txt', sep="\t", header = None).rename(columns=COLUMN_NAMES)
dev = pd.read_csv(DATA_DIR + 'SEM-2012-dev.txt', sep="\t", header = None).rename(columns=COLUMN_NAMES)

# Later cells read `train_data`, which no visible cell assigns, so the notebook
# fails on a fresh kernel.
# NOTE(review): the chapter ids later printed for train_data include the
# wisteria chapters that dev.head() shows, which suggests train_data is
# training + dev - confirm.
train_data = pd.concat([train, dev])

#merged datasets in test_data and train_data; train and dev are also kept separate
print(dev.head(10))

   chapter_id  phrase_id  word_id        word label
0  wisteria01          0        0          1.     O
1  wisteria01          0        1         The     O
2  wisteria01          0        2    Singular     O
3  wisteria01          0        3  Experience     O
4  wisteria01          0        4          of     O
5  wisteria01          0        5         Mr.     O
6  wisteria01          0        6        John     O
7  wisteria01          0        7       Scott     O
8  wisteria01          0        8      Eccles     O
9  wisteria01          1        0           I     O


## Dataset exploration

In [None]:
# Distinct chapter ids of the training table, in order of first appearance.
all_ch_ids = train_data['chapter_id'].unique()
print(all_ch_ids)

['baskervilles01' 'baskervilles02' 'baskervilles03' 'baskervilles04'
 'baskervilles05' 'baskervilles06' 'baskervilles07' 'baskervilles08'
 'baskervilles09' 'baskervilles10' 'baskervilles11' 'baskervilles12'
 'baskervilles13' 'baskervilles14' 'wisteria01' 'wisteria02']


In [None]:
# Number of distinct phrase ids in each training chapter, one line per chapter.
all_ch_ids = train_data['chapter_id'].unique()
all_features, all_labels = [], []
for ch in all_ch_ids:
  filter_ch = train_data.loc[train_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  print(len(all_ph_ids))

156
165
249
287
280
200
342
134
399
251
295
316
299
271
347
440


In [None]:
# Phrase-length statistics for the training table:
# longest, shortest and mean number of rows per (chapter, phrase).
phrase_lengths = []
all_features, all_labels = [], []

all_ch_ids = train_data['chapter_id'].unique()
for ch in all_ch_ids:
  filter_ch = train_data.loc[train_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch.loc[filter_ch['phrase_id'] == ph]
    phrase_lengths.append(len(filter_ph))

print(
  max(phrase_lengths),
  min(phrase_lengths),
  sum(phrase_lengths) / len(phrase_lengths),
  sep='\n',
)

83
2
17.83299480929813


In [None]:
# Row counts: training table first, test table second.
print(len(train_data), len(test_data), sep='\n')

79018
19216


In [None]:
# Distinct chapter ids of the test table, in order of first appearance.
all_ch_ids = test_data['chapter_id'].unique()
print(all_ch_ids)

['cardboard' 'circle01' 'circle02']


In [None]:
# Number of distinct phrase ids in each test chapter, one line per chapter.
all_ch_ids = test_data['chapter_id'].unique()
all_features, all_labels = [], []
for ch in all_ch_ids:
  filter_ch = test_data.loc[test_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  print(len(all_ph_ids))

496
371
222


In [None]:
# Phrase-length statistics for the test table:
# longest, shortest and mean number of rows per (chapter, phrase).
phrase_lengths = []
all_features, all_labels = [], []

all_ch_ids = test_data['chapter_id'].unique()
for ch in all_ch_ids:
  filter_ch = test_data.loc[test_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch.loc[filter_ch['phrase_id'] == ph]
    phrase_lengths.append(len(filter_ph))

print(
  max(phrase_lengths),
  min(phrase_lengths),
  sum(phrase_lengths) / len(phrase_lengths),
  sep='\n',
)

68
2
17.6455463728191


In [None]:
# Label frequencies, printed in this order:
# train 'B-NEG', train 'O', test 'B-NEG', test 'O'.
print(
  (train_data['label'] == 'B-NEG').sum(),
  (train_data['label'] == 'O').sum(),
  (test_data['label'] == 'B-NEG').sum(),
  (test_data['label'] == 'O').sum(),
  sep='\n',
)

1163
77836
269
18942


## CRF functions

In [None]:
def word2feature(token):
  """Return the CRF feature dict of one parsed token.

  Args:
    token: a token exposing `lemma_`, `pos_`, `tag_`, `dep_` and `head`
      (a spaCy Token in this notebook).

  Returns:
    dict with the keys 'lemma', 'fine_pos', 'coarse_pos', 'dependency' and
    'head'; every value is a plain string.
  """
  features = {
    'lemma': token.lemma_,
    # spaCy: tag_ is the fine-grained tag and pos_ the coarse-grained one;
    # the two were previously stored under each other's key.
    'fine_pos': token.tag_,
    'coarse_pos': token.pos_,
    'dependency': token.dep_,
    # Store the head's text rather than the Token object, so the feature is a
    # plain string like the others.
    'head': token.head.text,
  }

  return features

In [None]:
#takes a token as input
#returns True if token should be kept, or False if it is filtered
#could be changed depending on performance

# Words that are always kept, even when they are stop words.
neg_list = ['nor', 'neither', 'without', 'nobody', 'none', 'nothing', 'never', 'not', 'no', 'nowhere', 'non', "n't", "rather", "than", 'for', 'the']
def keep(tok):
  """Decide whether a token is kept as a CRF input.

  Args:
    tok: a token exposing `text`, `is_punct` and `is_stop`.

  Returns:
    True for a token whose text is in `neg_list`; otherwise False for
    punctuation and stop words, and True for every other token.
  """
  # NOTE(review): the match is case-sensitive, so a capitalised "No"/"Not"
  # is not protected by neg_list - confirm this is intended.
  if tok.text in neg_list:
    return True
  if tok.is_punct or tok.is_stop:
    return False
  # Ordinary content token. Without this return the function yielded None,
  # which the caller treats as "filter out".
  return True

In [None]:
#use this

#takes as input text of a sentence
#returns a list of dictionaries with the features of its tokens
def sent2feature(sentence, labels, ch = 'baskervilles03', ph = 21):
  """Build the features of the kept tokens of a sentence and drop the labels of the others.

  Args:
    sentence: text of the phrase; it is parsed with `nlp` and re-merged with `tokenize`.
    labels: one label per token of the re-merged phrase, in token order.
    ch: chapter id, forwarded to `tokenize`.
    ph: phrase id, forwarded to `tokenize`.

  Returns:
    (sent_feat, labels): the feature dicts of the tokens accepted by `keep`,
    and `labels` with the entries of the rejected tokens removed.
  """
  sent_doc = nlp(sentence)
  tokens = tokenize(sent_doc, ch, ph)
  # NOTE(review): len_path_root is not defined in the visible part of this
  # notebook; its result is indexed by token below - confirm where it lives.
  lengths = len_path_root(tokens)

  sent_feat = []
  dropped = []  # positions of the tokens rejected by keep()
  for position, tok in enumerate(tokens):
    if keep(tok):
      features = word2feature(tok)
      features['len_path_root'] = lengths[tok]
      sent_feat.append(features)
    else:
      dropped.append(position)

  # One np.delete over the original positions replaces the former
  # delete-per-token loop with its running shift.
  return sent_feat, np.delete(labels, dropped)

#takes the table as an input
#returns the list of lists of dictionaries with the features
#text->phrase->words

def process_data(table):
  """Extract filtered features and labels for every phrase of `table`.

  Args:
    table: DataFrame with 'chapter_id', 'phrase_id', 'word' and 'label' columns.

  Returns:
    (all_features, all_labels): two parallel lists with one entry per
    (chapter, phrase), in order of first appearance; each entry is what
    sent2feature returned for that phrase.
  """
  all_features = []
  all_labels = []
  for ch in table['chapter_id'].unique():
    filter_ch = table[table['chapter_id'] == ch]
    for ph in filter_ch['phrase_id'].unique():
      # Look the phrase up in the chapter slice: same rows as searching the
      # whole table, without re-scanning every chapter for each phrase.
      phrase = get_text(filter_ch, ph, ch)
      labels = get_labels(filter_ch, ph, ch)
      #filtered
      print(ch,ph)
      filt_features, filt_labels = sent2feature(phrase, labels, ch, ph)
      all_features.append(filt_features)
      all_labels.append(filt_labels)

  return all_features, all_labels

In [None]:
#backup
def sent2feature(sentence, ch = 'baskervilles03', ph = 21):
  """Backup variant: build the features of every token of a sentence, with no filtering.

  NOTE(review): this redefines sent2feature with a different signature (no
  `labels` parameter) than the filtering version defined in the preceding
  cell; whichever cell runs last is the one in effect.

  Args:
    sentence: text of the phrase; it is parsed with `nlp` and re-merged with `tokenize`.
    ch: chapter id, forwarded to `tokenize`.
    ph: phrase id, forwarded to `tokenize`.

  Returns:
    list with one feature dict per token, in token order.
  """
  sent_doc = nlp(sentence)
  tokens = tokenize(sent_doc, ch, ph)
  # NOTE(review): len_path_root is not defined in the visible part of this
  # notebook - confirm where it lives.
  lengths = len_path_root(tokens)

  sent_feat = []
  for tok in tokens:
    features = word2feature(tok)
    features['len_path_root'] = lengths[tok]
    sent_feat.append(features)

  return sent_feat

def process_data(table):
  """Backup variant: extract unfiltered features and labels for every phrase of `table`.

  NOTE(review): this redefines process_data from the preceding cell;
  whichever cell runs last is the one in effect.

  Args:
    table: DataFrame with 'chapter_id', 'phrase_id', 'word' and 'label' columns.

  Returns:
    (all_features, all_labels): two parallel lists with one entry per
    (chapter, phrase), in order of first appearance.
  """
  all_features = []
  all_labels = []
  for ch in table['chapter_id'].unique():
    filter_ch = table[table['chapter_id'] == ch]
    for ph in filter_ch['phrase_id'].unique():
      # Look the phrase up in the chapter slice: same rows as searching the
      # whole table, without re-scanning every chapter for each phrase.
      phrase = get_text(filter_ch, ph, ch)
      labels = get_labels(filter_ch, ph, ch)
      all_features.append(sent2feature(phrase, ch, ph))
      all_labels.append(labels)

  return all_features, all_labels

In [None]:
# Build the per-phrase feature lists and label arrays for both splits.
# Uses whichever process_data definition was executed last.
x_train, y_train = process_data(train_data)
x_test, y_test = process_data(test_data)

In [None]:
# `y_train[]` is a SyntaxError; preview the labels of the first few phrases instead.
print(y_train[:5])

In [None]:
# #some test code
# doc = nlp("I don't like apples and pasta.")
# for tok in doc:
#   print(tok.text, tok.is_stop, '\n')

# Print the merged tokens of one test phrase (tokenize runs with its default ch/ph).
phr1 = get_text(test_data, 52, 'circle01')
doc = tokenize(nlp(phr1))
for tok in doc:
  print(tok)
