In [1]:
from collections import Counter
from collections import defaultdict
import numpy as np
import pandas as pd
import spacy
from nltk import word_tokenize
import string

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant

In [2]:
# load data
# The corpus is a tab-separated file without a header row, so the five
# column names are supplied to the reader directly.
data = pd.read_csv(
    'SEM-2012-SharedTask-CD-SCO-training-simple.v2.txt',
    sep="\t",
    header=None,
    names=['annotator', 'sentence_id', 'token_id', 'token', 'label'],
)

In [3]:
# spaCy pipeline used by the counting cell for tokens, tags, POS and lemmas.
nlp = spacy.load(name="en_core_web_sm")

In [4]:
# Characters that get blanked out of the sentences: all of
# string.punctuation except the apostrophe and the backtick.
punctuations = "".join(ch for ch in string.punctuation if ch not in "'`")

In [5]:
# Rebuild every sentence as a single cleaned string.
#
# TOTAL_SENTENCES ends up with one entry per (annotator, sentence_id) pair,
# ordered by annotator and then by first appearance of the sentence id within
# that annotator's rows.
# Cleaning: every character in `punctuations` is replaced by a space
# (apostrophes and backticks are not in that set), then whitespace runs are
# collapsed to single spaces.
punct_to_space = str.maketrans({ch: " " for ch in punctuations})

annotator_ids = data["annotator"].unique()
TOTAL_SENTENCES = []
for annotator in annotator_ids:
    annotator_rows = data.loc[data["annotator"] == annotator]
    # One grouping pass per annotator instead of a boolean scan of all of the
    # annotator's rows for each sentence; sort=False keeps the sentences in
    # the order they occur in the file.
    for _, sentence_rows in annotator_rows.groupby("sentence_id", sort=False):
        tokens = " ".join(sentence_rows["token"])
        tokens = " ".join(tokens.translate(punct_to_space).split())
        TOTAL_SENTENCES.append(tokens)


In [6]:
# Corpus size: rebuilt sentences vs. rows (one token per row) in the raw frame.
sentence_count = len(TOTAL_SENTENCES)
token_count = len(data['token'])
print('Number of Sentences:', sentence_count)
print('Number of tokens:', token_count)

Number of Sentences: 3644
Number of tokens: 65451


In [10]:
# Corpus statistics gathered by running spaCy over each cleaned sentence:
#   word_frequencies     - occurrences of each non-punctuation token text
#   total_length         - summed character length of those tokens
#   num_tokens           - every token spaCy produced, punctuation included
#   pos_tokencounts_dict - (token.tag_, token.pos_) -> Counter of token texts
word_frequencies = Counter()
total_length = 0
num_tokens = 0
pos_tokencounts_dict = defaultdict(Counter)

# Every character in `punctuations` becomes a space; whitespace runs are
# collapsed when the sentence text is rebuilt below.
punct_to_space = str.maketrans({ch: " " for ch in punctuations})

annotator_ids = data["annotator"].unique()

for annotator in annotator_ids:
    annotator_rows = data.loc[data["annotator"] == annotator]

    # Visit only the sentence ids that occur in this annotator's rows. Ids
    # taken from the whole frame would select no rows here and merely push
    # empty strings through spaCy. sort=False keeps file order.
    for _, sentence_rows in annotator_rows.groupby("sentence_id", sort=False):
        text = " ".join(sentence_rows["token"])
        text = " ".join(text.translate(punct_to_space).split())
        doc = nlp(text)
        num_tokens += len(doc)

        words = []
        for token in doc:
            # Apostrophes/backticks survive the cleaning step, so spaCy can
            # still emit punctuation tokens; they are excluded from word stats.
            if token.is_punct:
                continue
            total_length += len(token)
            pos_tokencounts_dict[(token.tag_, token.pos_)][token.text] += 1
            words.append(token.text)
        word_frequencies.update(words)


In [11]:
# Summary statistics derived from `word_frequencies`, `total_length` and
# `TOTAL_SENTENCES`.

# Total word occurrences: the sum of all per-word counts
num_words = sum(count for count in word_frequencies.values())
print('Number of words:', num_words)

# Vocabulary size: number of distinct word forms
num_types = len(word_frequencies)
print('Number of word types:', num_types)

# Mean number of words per sentence
sentence_total = len(TOTAL_SENTENCES)
avg_words_sen = num_words / sentence_total
print('Average words per sentence:', avg_words_sen)

# Mean number of characters per word
avg_word_length = total_length / num_words
print('Average word length', avg_word_length)

Number of words: 58577
Number of word types: 5673
Average words per sentence: 16.074917672886937
Average word length 3.942929818870888


In [9]:
# Show the nested counter: each key is a (token.tag_, token.pos_) pair and each
# value counts the token texts seen with that pair.
# NOTE(review): this cell's execution count (9) is lower than the counting
# cell's (10), so the recorded output may be stale - re-run after that cell.
pos_tokencounts_dict

defaultdict(collections.Counter,
            {('NN',
              'NOUN'): Counter({'Chapter': 14,
                      'night': 70,
                      'breakfast': 11,
                      'table': 10,
                      'hearth': 2,
                      'rug': 1,
                      'stick': 13,
                      'visitor': 10,
                      'piece': 3,
                      'wood': 4,
                      'sort': 8,
                      'lawyer': 2,
                      'head': 30,
                      'silver': 6,
                      'band': 2,
                      'inch': 4,
                      'date': 12,
                      'family': 34,
                      'practitioner': 6,
                      'back': 8,
                      'sign': 9,
                      'occupation': 1,
                      'coffee': 5,
                      'pot': 1,
                      'front': 23,
                      'notion': 1,
                      'errand