In [1]:
import pandas as pd
import csv
from collections import Counter

### Explore the training and development data provided for the experiment

In [2]:
path_train = '../data/SEM-2012-SharedTask-CD-SCO-training-simple.v2.txt'
path_dev = '../data/SEM-2012-SharedTask-CD-SCO-dev-simple.v2.txt'

In [3]:
# use quotechar='\\' because we don't have \ in our data
# use skip_blank_lines=False because we want to keep blank lines signifying eos
# use keep_default_na=False because we want to keep data as is

train = pd.read_csv(path_train, encoding='utf-8', sep='\t', header=None, 
                    keep_default_na=False, quotechar='\\', skip_blank_lines=False)
dev = pd.read_csv(path_dev, encoding='utf-8', sep='\t', header=None, 
                  keep_default_na=False, quotechar='\\', skip_blank_lines=False)

In [4]:
train.head(10)

Unnamed: 0,0,1,2,3,4
0,baskervilles01,0.0,0.0,Chapter,O
1,baskervilles01,0.0,1.0,1.,O
2,baskervilles01,0.0,2.0,Mr.,O
3,baskervilles01,0.0,3.0,Sherlock,O
4,baskervilles01,0.0,4.0,Holmes,O
5,,,,,
6,baskervilles01,1.0,0.0,Mr.,O
7,baskervilles01,1.0,1.0,Sherlock,O
8,baskervilles01,1.0,2.0,Holmes,O
9,baskervilles01,1.0,3.0,",",O


In [5]:
dev.head(10)

Unnamed: 0,0,1,2,3,4
0,wisteria01,0.0,0.0,1.,O
1,wisteria01,0.0,1.0,The,O
2,wisteria01,0.0,2.0,Singular,O
3,wisteria01,0.0,3.0,Experience,O
4,wisteria01,0.0,4.0,of,O
5,wisteria01,0.0,5.0,Mr.,O
6,wisteria01,0.0,6.0,John,O
7,wisteria01,0.0,7.0,Scott,O
8,wisteria01,0.0,8.0,Eccles,O
9,,,,,


In [6]:
# number of values per column

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69094 entries, 0 to 69093
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       69094 non-null  object
 1   1       69094 non-null  object
 2   2       69094 non-null  object
 3   3       69094 non-null  object
 4   4       69094 non-null  object
dtypes: object(5)
memory usage: 2.6+ MB


In [7]:
dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14353 entries, 0 to 14352
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       14353 non-null  object
 1   1       14353 non-null  object
 2   2       14353 non-null  object
 3   3       14353 non-null  object
 4   4       14353 non-null  object
dtypes: object(5)
memory usage: 560.8+ KB


In [8]:
# number of unique values per column

train.nunique()

0      15
1     400
2      84
3    5780
4       4
dtype: int64

In [9]:
dev.nunique()

0       3
1     441
2      64
3    2433
4       4
dtype: int64

In [10]:
# unique labels

train[4].unique()

array(['O', '', 'B-NEG', 'I-NEG'], dtype=object)

In [11]:
# number of values per label

train[4].value_counts()

O        64448
          3643
B-NEG      987
I-NEG       16
Name: 4, dtype: int64

In [12]:
dev[4].value_counts()

O        13388
           786
B-NEG      176
I-NEG        3
Name: 4, dtype: int64

In [13]:
# initialize an empty df to collect some statistics

stats = pd.DataFrame(columns=['train', 'dev'])

In [14]:
# calculate the number of sentences: it's the number of empty rows + 1 for the last sentence

train_sents = int(train[4][train[4] == ''].value_counts() + 1)
dev_sents = int(dev[4][dev[4] == ''].value_counts() + 1)

In [15]:
# add the number of sentences to the table

stats.loc['# sentences'] = [train_sents, dev_sents]
stats.loc['# sentences']

train    3644
dev       787
Name: # sentences, dtype: object

In [16]:
# calculate the number of tokens: it's the total number of rows - number of empty rows

train_tokens = int(train.shape[0] - train[train[4] == ''][4].value_counts())
dev_tokens = int(dev.shape[0] - dev[dev[4] == ''][4].value_counts())

In [17]:
stats.loc['# tokens'] = [train_tokens, dev_tokens]
stats.loc['# tokens']

train    65451
dev      13567
Name: # tokens, dtype: object

In [18]:
# calculate the number of unique tokens: it's the unique values from column 3 - 1 for 
# value '' for empty row

train_tokens_unique = int(train[3].nunique() - 1)
dev_tokens_unique = int(dev[3].nunique() - 1)

In [19]:
stats.loc['# tokens, unique'] = [train_tokens_unique, dev_tokens_unique]
stats.loc['# tokens, unique']

train    5779
dev      2432
Name: # tokens, unique, dtype: object

In [20]:
# calculate the number of unique tokens if we lowercase them all
unique_train = set()
for token in train[3].unique():
    unique_train.add(token.lower())
unique_train - {''}
len(unique_train)

5445

In [21]:
unique_dev = set()
for token in dev[3].unique():
    unique_dev.add(token.lower())
unique_dev - {''}
len(unique_dev)

2296

In [22]:
stats.loc['# tokens lowercased, unique'] = [len(unique_train), len(unique_dev)]
stats.loc['# tokens lowercased, unique']

train    5445
dev      2296
Name: # tokens lowercased, unique, dtype: object

In [23]:
# calculate the number of negation cues: elements labeled with 'B-NEG' 

train_ncs = int(train[train[4] == 'B-NEG'][4].value_counts())
dev_ncs = int(dev[dev[4] == 'B-NEG'][4].value_counts())

In [24]:
stats.loc['# negation cues'] = [train_ncs, dev_ncs]
stats.loc['# negation cues']

train    987
dev      176
Name: # negation cues, dtype: object

In [25]:
# also interested to know the number of unique negation cues: try looking at unique tokens 
# labeled B-NEG

train[train[4] == 'B-NEG'][3].unique()

array(['save', 'not', 'infrequent', 'no', 'unfortunate', 'without',
       'never', 'nothing', 'Not', 'No', 'by', 'unknown', 'none',
       'unambitious', 'incredulously', "n't", 'unpractical',
       'inadvertently', 'unimaginative', 'nor', 'godless', 'unable',
       'unhappy', 'Nor', 'infinite', 'untimely', 'breathlessness',
       'unusual', 'untenanted', 'impenetrable', 'None', 'impatient',
       'useless', 'unsafe', 'inconceivable', 'immaterial', 'intolerable',
       'nobody', 'On', 'incredible', 'lifeless', 'rather', 'unlikely',
       'Nothing', 'uneducated', 'carelessness', 'careless', 'fail',
       'indiscreet', 'powerless', 'unfortunately', 'unpleasant',
       'unmarried', 'impossible', 'purposeless', 'inexplicable',
       'disconnected', 'imprudent', 'Never', 'undeniable', 'By', 'Except',
       'motionless', 'restlessly', 'unmistakable', 'uncontrollable',
       'hopeless', 'inadequate', 'invisible', 'unjustifiable',
       'impassable', 'indescribably', 'uncanny', 'i

In [26]:
# we can see that those are not all unique: some are the same token in a capitalized / not
# capitalized versions

# make a set of unique negation cues

train_ncs_unique = set()
for token in train[train[4] == 'B-NEG'][3].unique():
    train_ncs_unique.add(token.lower())
dev_ncs_unique = set()
for token in dev[dev[4] == 'B-NEG'][3].unique():
    dev_ncs_unique.add(token.lower())

In [27]:
train_ncs_unique

{'absence',
 'breathless',
 'breathlessness',
 'by',
 'careless',
 'carelessness',
 'colourless',
 'disapprobation',
 'disconnected',
 'disfavour',
 'displeasure',
 'distasteful',
 'except',
 'fail',
 'godless',
 'harmless',
 'helpless',
 'helplessly',
 'hopeless',
 'immaterial',
 'immutable',
 'impassable',
 'impatient',
 'impatiently',
 'impenetrable',
 'impossible',
 'imprudent',
 'inadequate',
 'inadvertently',
 'inconceivable',
 'inconclusive',
 'inconvenient',
 'incredible',
 'incredulously',
 'indescribably',
 'indiscreet',
 'inexplicable',
 'infinite',
 'infrequent',
 'inhospitable',
 'inscrutable',
 'insensible',
 'interminable',
 'intolerable',
 'invisible',
 'irregular',
 'irrelevant',
 'irresolute',
 'irretrievably',
 'irrevocable',
 'lifeless',
 'motionless',
 "n't",
 'neglected',
 'neither',
 'never',
 'no',
 'nobody',
 'noiselessly',
 'none',
 'nor',
 'not',
 'nothing',
 'nowhere',
 'on',
 'powerless',
 'prevent',
 'purposeless',
 'rather',
 'refused',
 'restlessly',
 's

In [28]:
# we can see that both single word negation cues are included, and first tokens of multi_word
# negation cues, for example 'by'
# let's explore the multi-word ncs we have in our corpus: they have element I-NEG

train[train[4] == 'I-NEG']

Unnamed: 0,0,1,2,3,4
930,baskervilles01,47,13,no,I-NEG
931,baskervilles01,47,14,means,I-NEG
4608,baskervilles02,59,6,no,I-NEG
4609,baskervilles02,59,7,means,I-NEG
10254,baskervilles03,182,2,the,I-NEG
10255,baskervilles03,182,3,contrary,I-NEG
11133,baskervilles03,236,24,than,I-NEG
11246,baskervilles03,243,1,the,I-NEG
11247,baskervilles03,243,2,contrary,I-NEG
15055,baskervilles04,197,2,for,I-NEG


In [29]:
dev[dev[4] == 'I-NEG']

Unnamed: 0,0,1,2,3,4
4257,wisteria01,248,5,no,I-NEG
4258,wisteria01,248,6,means,I-NEG
13769,wisteria02,414,7,more,I-NEG


In [30]:
# we can see there are 8 in total in training, and 2 in dev

stats.loc['# multi-word negation cues'] = [8, 2]

In [31]:
# now we can also calculate the number of single word ncs

stats.loc['# single word negation cues'] = [stats.loc['# negation cues']['train'] - stats.loc['# multi-word negation cues']['train'], 
                                           stats.loc['# negation cues']['dev'] - stats.loc['# multi-word negation cues']['dev']]
stats.loc['# single word negation cues']

train    979
dev      174
Name: # single word negation cues, dtype: object

In [32]:
# let's also look at the number of unique multi and single word negation cues and create
# sets containing them

In [33]:
train.iloc[930-1:931+1]

Unnamed: 0,0,1,2,3,4
929,baskervilles01,47,12,by,B-NEG
930,baskervilles01,47,13,no,I-NEG
931,baskervilles01,47,14,means,I-NEG


In [34]:
# add 'by no means' to a list of negation cues
train_ncs_unique_multi = set()
train_ncs_unique_multi.add('by no means')

In [35]:
train[((train[3] == 'by') | (train[3] == 'By')) & (train[4] == 'B-NEG')]

Unnamed: 0,0,1,2,3,4
929,baskervilles01,47,12,by,B-NEG
4607,baskervilles02,59,5,by,B-NEG
21427,baskervilles06,11,1,By,B-NEG


In [36]:
# 'by' never occurs as a single negation cue
train_ncs_unique_single = train_ncs_unique - {'by'}

In [37]:
train.iloc[10254-1:10255+1]

Unnamed: 0,0,1,2,3,4
10253,baskervilles03,182,1,On,B-NEG
10254,baskervilles03,182,2,the,I-NEG
10255,baskervilles03,182,3,contrary,I-NEG


In [38]:
train_ncs_unique_multi.add('on the contrary')

In [39]:
train[((train[3] == 'on') |(train[3] == 'On')) & (train[4] == 'B-NEG')]

Unnamed: 0,0,1,2,3,4
10253,baskervilles03,182,1,On,B-NEG
11245,baskervilles03,243,0,On,B-NEG


In [40]:
# 'on' never occurs as a single negation cue
train_ncs_unique_single = train_ncs_unique_single - {'on'}

In [41]:
train.iloc[11133-1:11133+1]

Unnamed: 0,0,1,2,3,4
11132,baskervilles03,236,23,rather,B-NEG
11133,baskervilles03,236,24,than,I-NEG


In [42]:
train_ncs_unique_multi.add('rather than')

In [43]:
train[((train[3] == 'rather') |(train[3] == 'Rather')) & (train[4] == 'B-NEG')]

Unnamed: 0,0,1,2,3,4
11132,baskervilles03,236,23,rather,B-NEG


In [44]:
# 'rather' never occurs as a single negation cue
train_ncs_unique_single = train_ncs_unique_single - {'rather'}

In [45]:
train.iloc[15055-1:15057+1]

Unnamed: 0,0,1,2,3,4
15054,baskervilles04,197,1,Not,B-NEG
15055,baskervilles04,197,2,for,I-NEG
15056,baskervilles04,197,3,the,I-NEG
15057,baskervilles04,197,4,world,I-NEG


In [46]:
train_ncs_unique_multi.add('not for the world')

In [47]:
train.iloc[39479-1:39480+1]

Unnamed: 0,0,1,2,3,4
39478,baskervilles09,200,11,nothing,B-NEG
39479,baskervilles09,200,12,at,I-NEG
39480,baskervilles09,200,13,all,I-NEG


In [48]:
train_ncs_unique_multi.add('nothing at all')

In [49]:
dev[dev[4] == 'I-NEG']

Unnamed: 0,0,1,2,3,4
4257,wisteria01,248,5,no,I-NEG
4258,wisteria01,248,6,means,I-NEG
13769,wisteria02,414,7,more,I-NEG


In [50]:
dev.iloc[4257-1:4258+1]

Unnamed: 0,0,1,2,3,4
4256,wisteria01,248,4,by,B-NEG
4257,wisteria01,248,5,no,I-NEG
4258,wisteria01,248,6,means,I-NEG


In [51]:
dev_ncs_unique_multi = set()
dev_ncs_unique_multi.add('by no means')

In [52]:
dev[((dev[3] == 'by') | (dev[3] == 'By')) & (dev[4] == 'B-NEG')]

Unnamed: 0,0,1,2,3,4
4256,wisteria01,248,4,by,B-NEG


In [53]:
# 'by' never occurs as a single negation cue
dev_ncs_unique_single = dev_ncs_unique - {'by'}

In [54]:
dev.iloc[13769-1:13769+1]

Unnamed: 0,0,1,2,3,4
13768,wisteria02,414,6,no,B-NEG
13769,wisteria02,414,7,more,I-NEG


In [55]:
dev_ncs_unique_multi.add('no more')

In [56]:
stats.loc['# multi-word negation cues, unique'] = [len(train_ncs_unique_multi), 
                                                   len(dev_ncs_unique_multi)]
stats.loc['# single word negation cues, unique'] = [len(train_ncs_unique_single), 
                                                    len(dev_ncs_unique_single)]

In [57]:
# I also want to know how many sentences have negation cues in the first place, and how many
# negation cues per sentence there are

def count_sentences_with_negation_cues(file_path):
    sents_with_one_nc, sents_with_multi_ncs = 0, 0
    counter = 0
    
    with open(file_path, 'r', encoding='utf8') as infile:
        filereader = csv.reader(infile, delimiter='\t', quotechar='\\')
        for row in filereader:
            if row:
                label = row[-1]
                if label == 'B-NEG':
                    counter += 1
            else:   # empty line           
                if counter == 1:
                    sents_with_one_nc += 1
                if counter > 1:
                    sents_with_multi_ncs += 1
                counter = 0
    
    if counter == 1:  # account for the last sentence
        sents_with_one_nc += 1
    if counter > 1:
        sents_with_multi_ncs += 1
    
    return sents_with_one_nc, sents_with_multi_ncs

In [58]:
train_sents_with_one_nc, train_sents_with_multi_ncs = count_sentences_with_negation_cues(path_train)

In [59]:
dev_sents_with_one_nc, dev_sents_with_multi_ncs = count_sentences_with_negation_cues(path_dev)

In [60]:
stats.loc['# sentences with one negation cue'] = [train_sents_with_one_nc, dev_sents_with_one_nc]
stats.loc['# sentences with more than one negation cue'] = [train_sents_with_multi_ncs, dev_sents_with_multi_ncs]

In [61]:
stats

Unnamed: 0,train,dev
# sentences,3644,787
# tokens,65451,13567
"# tokens, unique",5779,2432
"# tokens lowercased, unique",5445,2296
# negation cues,987,176
# multi-word negation cues,8,2
# single word negation cues,979,174
"# multi-word negation cues, unique",5,2
"# single word negation cues, unique",125,36
# sentences with one negation cue,732,115


In [62]:
# finally, let's see how frequent different single word negation cues are

train_freqs = Counter()
tokens = []
for token in train[train[4] == 'B-NEG'][3]:
    tokens.append(token.lower())

In [63]:
# build a list of multi-word negation cues so we can remove the first word from the single 
# negation cues list

train_ncs_multi = list(train_ncs_unique_multi)
train_ncs_multi.append('by no means')
train_ncs_multi.append('by no means')
train_ncs_multi.append('on the contrary')

for e in train_ncs_multi:
    tokens.remove(e.split(' ')[0])

In [64]:
train_freqs.update(tokens)

In [65]:
train_freqs.most_common(5)

[('not', 358), ('no', 226), ("n't", 65), ('never', 59), ('nothing', 55)]

In [66]:
# do the same for the development dataset

dev_freqs = Counter()
tokens = []
for token in dev[dev[4] == 'B-NEG'][3]:
    tokens.append(token.lower())

# no multi-word nc appears more than one, so we can directly use the set we constructed previously

for e in dev_ncs_unique_multi:
    tokens.remove(e.split(' ')[0])

dev_freqs.update(tokens)
dev_freqs.most_common(5)

[('not', 42), ('no', 33), ("n't", 20), ('nothing', 16), ('never', 11)]

In [67]:
# check out the lists of unique negation cues we have in our datasets
# from here, they can easily be saved to file if we want to use them in the process of 
# feature extraction
train_ncs_unique_multi

{'by no means',
 'not for the world',
 'nothing at all',
 'on the contrary',
 'rather than'}

In [68]:
dev_ncs_unique_multi

{'by no means', 'no more'}

In [69]:
train_ncs_unique_single

{'absence',
 'breathless',
 'breathlessness',
 'careless',
 'carelessness',
 'colourless',
 'disapprobation',
 'disconnected',
 'disfavour',
 'displeasure',
 'distasteful',
 'except',
 'fail',
 'godless',
 'harmless',
 'helpless',
 'helplessly',
 'hopeless',
 'immaterial',
 'immutable',
 'impassable',
 'impatient',
 'impatiently',
 'impenetrable',
 'impossible',
 'imprudent',
 'inadequate',
 'inadvertently',
 'inconceivable',
 'inconclusive',
 'inconvenient',
 'incredible',
 'incredulously',
 'indescribably',
 'indiscreet',
 'inexplicable',
 'infinite',
 'infrequent',
 'inhospitable',
 'inscrutable',
 'insensible',
 'interminable',
 'intolerable',
 'invisible',
 'irregular',
 'irrelevant',
 'irresolute',
 'irretrievably',
 'irrevocable',
 'lifeless',
 'motionless',
 "n't",
 'neglected',
 'neither',
 'never',
 'no',
 'nobody',
 'noiselessly',
 'none',
 'nor',
 'not',
 'nothing',
 'nowhere',
 'powerless',
 'prevent',
 'purposeless',
 'refused',
 'restlessly',
 'save',
 'shelterless',
 'u

In [70]:
dev_ncs_unique_single

{'dislike',
 'dissatisfied',
 'fearless',
 'impatience',
 'impossible',
 'improper',
 'inadmissable',
 'inexplicable',
 'insensibly',
 'insufferable',
 'invisible',
 'irreproachable',
 "n't",
 'needless',
 'neither',
 'never',
 'no',
 'nobody',
 'nor',
 'not',
 'nothing',
 'sapless',
 'save',
 'unbrushed',
 'unburned',
 'unclean',
 'uncommonly',
 'unconventional',
 'undoubtedly',
 'unkempt',
 'unknown',
 'unmistakable',
 'unnatural',
 'unpleasant',
 'unshaven',
 'without'}

In [72]:
outputfile = '../code/single_neg_cues_train.txt'

neg_prefix = {"dis", "im", "in", "ir", "un"}
neg_suffix = {"less", "lessness", "lessly"}

final_ncs = set()

for token in train_ncs_unique_single:
    affixal_negation = False  # a flag to check if the token has negation affix/suffix
    
    for prefix in neg_prefix:
        if token.startswith(prefix):
            affixal_negation = True
            
    if not affixal_negation:  # if it doesn't have the prefix, check also for suffix
        for suffix in neg_suffix:
            if token.endswith(suffix):
                affixal_negation = True
                
    if not affixal_negation:  # if it doesn't have any, add it to the list
        final_ncs.add(token) 

print(sorted(final_ncs))
      
with open(outputfile, 'w', encoding='utf8') as outfile:
    for token in sorted(final_ncs):
        outfile.write(token + '\n')

['absence', 'except', 'fail', "n't", 'neglected', 'neither', 'never', 'no', 'nobody', 'none', 'nor', 'not', 'nothing', 'nowhere', 'prevent', 'refused', 'save', 'without']
