In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.parse.stanford import StanfordDependencyParser

In [2]:
path_train = "/Users/lauraalvarez/Documents/GitHub/ATM/A2/SEM-2012-SharedTask-CD-SCO-simple.v2/SEM-2012-SharedTask-CD-SCO-training-simple.v2.txt"
data = pd.read_csv(path_train, sep="\t", header=None, names=['annotator', 'sentence_id', 'token_id', 'token', 'label'])
data

Unnamed: 0,annotator,sentence_id,token_id,token,label
0,baskervilles01,0,0,Chapter,O
1,baskervilles01,0,1,1.,O
2,baskervilles01,0,2,Mr.,O
3,baskervilles01,0,3,Sherlock,O
4,baskervilles01,0,4,Holmes,O
...,...,...,...,...,...
65446,baskervilles14,270,58,slopes,O
65447,baskervilles14,270,59,of,O
65448,baskervilles14,270,60,the,O
65449,baskervilles14,270,61,moor,O


## Extract features

### Lemma, POS tag, Dependency head, Dependency relationship

In [3]:
from nltk.corpus import wordnet

# Function to get the wordnet POS, it fixes compatibility issues with the nltk POS
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

### Read file containing list of multi-negation expressions

In [4]:
multiple_neg_lfile = open("multiple-neg-list.txt", "r")
content = multiple_neg_lfile.read().lower()
multiple_neg_list = content.split("\n")
multiple_neg_list

['by no means',
 'on the contrary',
 'not for the world',
 'nothing at all',
 'rather than',
 'no more',
 'no longer']

In [5]:
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
import os 

# How to set up depencency parser: https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
# Command line instruction to start server
    #java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
    # -preload tokenize,ssplit,pos,lemma,ner,parse,depparse \
    # -status_port 9000 -port 9000 -timeout 15000 & 

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
lemmatizer = WordNetLemmatizer()

column_values = data[['annotator']].values.ravel()
annotator_ids = pd.unique(column_values)

pos_tags = []
heads = []
dep_rels = []
lemmas = []

for annotator in annotator_ids:
    annotator_data = data[data['annotator'] == annotator]
    column_values = annotator_data[['sentence_id']].values.ravel()
    sentence_ids = pd.unique(column_values)

    for sent_id in sentence_ids:
        sentence = annotator_data.loc[annotator_data['sentence_id'] == sent_id, 'token']
        parse, = dep_parser.parse(sentence)
        conll = parse.to_conll(4) # get the conll format
        df = pd.DataFrame([x.split('\t') for x in conll.split('\n')[:-1]], columns=['word', 'pos', 'head', 'deprel'])
        df['head'] = df['head'].astype(int)
        head = list(df['head'].values)
        dep_rel = list(df['deprel'].values)

        for p, h, d in zip(nltk.pos_tag(sentence), head, dep_rel):
            pos_tags.append(p[1])
            heads.append(h)
            dep_rels.append(d)
            if get_wordnet_pos(p[1]): lemma = lemmatizer.lemmatize(p[0], pos=get_wordnet_pos(p[1]))
            else: lemma = lemmatizer.lemmatize(p[0])
            lemmas.append(lemma)
                
                        
data['pos-tag'] = pos_tags
data['head'] = heads
data['dep-rel'] = dep_rels
data['lemma'] = lemmas
data['isPartOfNeg'] = 0
data

# Error example
# data.loc[ data['sentence_id'] == 12]
# data.loc[(data['annotator'] == 'baskervilles01') & (data['sentence_id'] == 12)]



Unnamed: 0,annotator,sentence_id,token_id,token,label,pos-tag,head,dep-rel,lemma,isPartOfNeg
0,baskervilles01,0,0,Chapter,O,NN,0,ROOT,Chapter,0
1,baskervilles01,0,1,1.,O,CD,1,nummod,1.,0
2,baskervilles01,0,2,Mr.,O,NNP,1,punct,Mr.,0
3,baskervilles01,0,3,Sherlock,O,NNP,5,compound,Sherlock,0
4,baskervilles01,0,4,Holmes,O,NNP,3,root,Holmes,0
...,...,...,...,...,...,...,...,...,...,...
65446,baskervilles14,270,58,slopes,O,NNS,55,nmod,slope,0
65447,baskervilles14,270,59,of,O,IN,62,case,of,0
65448,baskervilles14,270,60,the,O,DT,62,det,the,0
65449,baskervilles14,270,61,moor,O,NN,59,nmod,moor,0


### IsPartOfNegation

In [6]:
data.token = data.token.str.lower()

def find_sub_list(sl,l):
    results=[]
    sll=len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            results.append((ind,ind+sll-1))

    return results

tokens = list(data.token.values)
for exp in multiple_neg_list:
    exp = exp.split(' ')
    index = find_sub_list(exp, tokens)
    for i in index:
        data.loc[data.index[i[0]], 'isPartOfNeg'] = 1
        data.loc[data.index[i[1]], 'isPartOfNeg'] = 1
        if i[1] - i[0] >1:
            data.loc[data.index[i[1]-1], 'isPartOfNeg'] = 1

In [7]:
data.to_csv('temp_data.csv')