In [10]:
import nltk
nltk.download('wordnet')
from itertools import chain
from nltk.corpus import wordnet


[nltk_data] Downloading package wordnet to /Users/roksana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:

def get_synonyms(word):
    synonyms = wordnet.synsets(word)
    lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
    # only keep synonyms with the same pos tag
    lemmas = set([lemma for lemma in lemmas if wordnet.synsets(lemma) and wordnet.synsets(lemma)[0].pos() == wordnet.synsets(word)[0].pos()])
    # remove the original word
    lemmas.discard(word)
    return lemmas


In [12]:
import re
path = '/Users/roksana/Projects/definitions-1/semeval2020_ulscd_eng/corpus2/lemma/ccoha2.txt/ccoha2.txt'

with open(path, 'r') as file:
    lines = file.readlines()

# if there are _ in the lines, remove them and the following pos tag
def remove_pos_tags(sentence):
    return re.sub(r'(\b\w+?)_\w+\b', r'\1', sentence)
lines = [remove_pos_tags(line) for line in lines]

print(len(lines))

353692


In [13]:
import nltk
from nltk.corpus import brown
from nltk import FreqDist, pos_tag
import random

# Download necessary NLTK data files
nltk.download('brown')
nltk.download('universal_tagset')

# Step 1: Extract words from the Brown Corpus
words = [word.lower() for word in brown.words() if word.isalpha()]

# Step 2: Calculate word frequencies
freq_dist = FreqDist(words)

# Step 3: Set frequency thresholds
MIN_FREQ = 50     # Minimum frequency threshold
MAX_FREQ = 2000   # Maximum frequency threshold

# Step 4: Filter words by frequency
filtered_words = [word for word in set(words)
                  if MIN_FREQ <= freq_dist[word] <= MAX_FREQ]

# Step 5: Perform POS tagging
tagged_words = pos_tag(filtered_words, tagset='universal')

# Step 6: Filter words by desired POS (nouns, verbs, adjectives)
desired_tags = {'NOUN', 'VERB', 'ADJ'}
filtered_tagged_words = [(word, tag) for word, tag in tagged_words if tag in desired_tags]

# Step 7: Separate words by POS
nouns = [word for word, tag in filtered_tagged_words if tag == 'NOUN']
verbs = [word for word, tag in filtered_tagged_words if tag == 'VERB']
adjectives = [word for word, tag in filtered_tagged_words if tag == 'ADJ']

# Step 8: Decide the number of words from each POS
num_words = 200
num_per_pos = num_words // 3

# Step 9: Randomly select words from each POS category
random.seed(42)  # For reproducibility
selected_nouns = random.sample(nouns, min(len(nouns), num_per_pos))
selected_verbs = random.sample(verbs, min(len(verbs), num_per_pos))
selected_adjectives = random.sample(adjectives, min(len(adjectives), num_per_pos))

# Step 10: Combine and shuffle the final list
final_words = selected_nouns + selected_verbs + selected_adjectives
random.shuffle(final_words)

# Step 11: Fill in if fewer than 100 words
if len(final_words) < num_words:
    additional_words_needed = num_words - len(final_words)
    remaining_words = [word for word in filtered_words if word not in final_words]
    random.shuffle(remaining_words)
    final_words.extend(remaining_words[:additional_words_needed])

# Step 12: Output the list of words
print("List of 100 somewhat common words with varied verbs, nouns, and adjectives:\n")
for word in final_words[:200]:
    print(word)
print(len(final_words))


[nltk_data] Downloading package brown to /Users/roksana/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/roksana/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


List of 100 somewhat common words with varied verbs, nouns, and adjectives:

political
regard
aspects
thick
sweet
sun
hot
coming
obtained
london
medical
sharp
conventional
forest
device
formed
expected
late
official
stood
becoming
forth
issues
broken
understanding
health
paper
asking
being
young
party
technique
neighborhood
russian
wage
la
completed
unique
united
central
affairs
writer
sitting
unable
catholic
train
methods
move
joseph
suggest
military
single
planned
reasonable
seem
project
america
group
club
career
given
du
improved
horses
compared
answered
truth
open
added
child
friday
communication
literary
organized
waiting
company
rule
considered
personal
present
fundamental
authority
mean
believe
concerning
developed
declared
discussion
miss
victory
grew
represented
instance
march
pain
high
financial
hundred
contact
prove
heat
christ
independent
analysis
warm
appropriate
article
produced
wonderful
love
fourth
works
directed
faculty
assume
stream
said
grounds
man
flesh
established


In [14]:
# for each word, select 200 sentences that contain it
word_sentences = {word: [] for word in final_words}
for line in lines:
    for word in final_words:
        if word in line:
            word_sentences[word].append(line)

# remove words that have less than 30 sentences and limit the number of sentences to 100
for word in list(word_sentences.keys()):
    sentences = word_sentences[word]
    if len(sentences) < 30:
        word_sentences.pop(word)
        final_words.remove(word)
    else:
        word_sentences[word] = sentences[:100]


word_synonyms = {word: set() for word in final_words}
for word in final_words:
    word_synonyms[word] = get_synonyms(word)


synonym_counts = [len(synonyms) for synonyms in word_synonyms.values()]
# Remove words with less than 2 synonyms
for word in list(word_synonyms.keys()):
    synonyms = word_synonyms[word]
    if len(synonyms) < 2:
        print(f"'{word}': {synonyms}")
        word_synonyms.pop(word)
        word_sentences.pop(word, None)
        final_words.remove(word)

print(len(word_synonyms), len(word_sentences), len(final_words))

'political': set()
'hot': set()
'conventional': {'ceremonious'}
'late': set()
'official': {'functionary'}
'health': {'wellness'}
'technique': {'proficiency'}
'russian': {'Russian'}
'writer': {'author'}
'catholic': {'Catholic'}
'seem': {'appear'}
'du': set()
'communication': {'communicating'}
'literary': set()
'personal': set()
'victory': {'triumph'}
'financial': {'fiscal'}
'warm': {'warm_up'}
'article': {'clause'}
'moral': {'lesson'}
'opportunity': {'chance'}
'southern': set()
'towards': set()
'orchestra': set()
'commercial': {'commercial_message'}
'sir': {'Sir'}
'foreign': {'strange'}
136 136 136


In [15]:
# print the len of sentences for each word in word_sentences
for word, sentences in word_sentences.items():
    print(f"{word}: {len(sentences)}")

regard: 100
thick: 100
sweet: 100
sun: 100
coming: 100
london: 100
medical: 100
sharp: 100
forest: 100
device: 100
formed: 100
expected: 100
forth: 100
broken: 100
understanding: 100
paper: 100
being: 100
young: 100
party: 100
neighborhood: 100
wage: 100
la: 100
unique: 100
united: 100
central: 100
unable: 100
train: 100
move: 100
joseph: 100
suggest: 100
military: 100
single: 100
planned: 72
reasonable: 100
project: 100
america: 100
group: 100
club: 100
career: 100
given: 60
horses: 31
truth: 100
open: 100
added: 43
child: 100
friday: 100
company: 100
rule: 100
present: 100
fundamental: 100
authority: 100
mean: 100
believe: 100
concerning: 100
developed: 48
discussion: 100
miss: 100
instance: 100
march: 100
pain: 100
high: 100
hundred: 100
contact: 100
prove: 100
heat: 100
christ: 100
independent: 100
analysis: 100
appropriate: 100
wonderful: 100
love: 100
fourth: 100
works: 100
faculty: 100
assume: 100
stream: 100
said: 46
man: 100
flesh: 100
approval: 100
feet: 33
responsibility: 10

In [16]:
# for each word, select up to 5 synonyms that aren't the word itself
# word_synonyms = {word: random.sample(synonyms, min(5, len(synonyms))) for word, synonyms in word_synonyms.items()}

# for each synonym, select up to 100 sentences that contain it
word_synonym_sentences = {}
for word in word_synonyms.keys():
    word_synonym_sentences[word] = {}
    for synonym in word_synonyms[word]:
        word_synonym_sentences[word][synonym] = []
        for line in lines:
            if synonym in line:
                word_synonym_sentences[word][synonym].append(line)


In [17]:
# # remove senses that have less than 30 sentences and limit the number of sentences to 100
# for word in list(word_synonym_sentences.keys()):
#     senses = word_synonym_sentences[word]
#     for sense, sentences in senses.items():
#         if len(sentences) < 30:
#             word_synonym_sentences[word].pop(sense)
#             print(f"Removed sense {sense} from word {word}")
#         else:
#             word_synonym_sentences[word][sense] = sentences[:100]
            
# # remove any words that have less than 2 synonyms
# for word in list(word_synonym_sentences.keys()):
#     synonyms = word_synonyms[word]
#     if len(synonyms) < 2:
#         word_synonym_sentences.pop(word)
#         word_synonyms.pop(word)
#         final_words.remove(word)
#         print(f"Removed {word} with synonyms {synonyms}")
        
# Remove senses that have less than 30 sentences and limit the number of sentences to 100
for word in list(word_synonym_sentences.keys()):
    senses = word_synonym_sentences[word]
    senses_to_remove = []
    for sense in list(senses.keys()):
        sentences = senses[sense]
        if len(sentences) < 30:
            senses_to_remove.append(sense)
            print(f"Removed sense {sense} from word {word}")
        else:
            senses[sense] = sentences[:100]  # Limit to 100 sentences
    # Remove senses after iteration
    for sense in senses_to_remove:
        senses.pop(sense)

# Remove any words that have less than 2 synonyms
words_to_remove = []
for word in list(word_synonym_sentences.keys()):
    synonyms = word_synonym_sentences[word]
    if len(synonyms) < 2:
        words_to_remove.append(word)
        word_synonyms.pop(word)
        final_words.remove(word)
        # word_synonym_sentences.pop(word)
        word_sentences.pop(word, None)
        print(f"Removed {word} with synonyms {synonyms.keys()}")

print(f"Removed {len(words_to_remove)} words")
# Remove words after iteration
for word in words_to_remove:
    word_synonym_sentences.pop(word)

    


Removed sense paying_attention from word regard
Removed sense attentiveness from word regard
Removed sense compliments from word regard
Removed sense Henry_Sweet from word sweet
Removed sense sugariness from word sweet
Removed sense afters from word sweet
Removed sense Sweet from word sweet
Removed sense confection from word sweet
Removed sense Dominicus from word sun
Removed sense Sun from word sun
Removed sense Lord's_Day from word sun
Removed sense Sunday from word sun
Removed sense approaching from word coming
Removed sense sexual_climax from word coming
Removed sense capital_of_the_United_Kingdom from word london
Removed sense Jack_London from word london
Removed sense Greater_London from word london
Removed sense John_Griffith_Chaney from word london
Removed sense British_capital from word london
Removed sense London from word london
Removed sense medical_checkup from word medical
Removed sense checkup from word medical
Removed sense medical_exam from word medical
Removed sense h

In [18]:
print(len(word_synonym_sentences), len(word_synonyms), len(final_words), len(word_sentences))


102 102 102 102


In [19]:
# import pandas as pd
# import re
# import nltk
# from nltk.tokenize import TreebankWordTokenizer
# from nltk import pos_tag
# from nltk.corpus import wordnet as wn
# from collections import Counter

# # Download NLTK data files if not already downloaded
# nltk.download('punkt', quiet=True)
# nltk.download('averaged_perceptron_tagger', quiet=True)

# # Initialize the tokenizer
# tokenizer = TreebankWordTokenizer()

# # Assume word_synonym_sentences and word_sentences are already defined
# # Example structures:
# # word_synonym_sentences = {
# #     'word1': {'synonym1': ['sentence1', 'sentence2'], 'synonym2': ['sentence3']},
# #     'word2': {'synonym3': ['sentence4'], 'synonym4': ['sentence5']}
# # }
# # word_sentences = {'word1': ['sentence6', 'sentence7'], 'word2': ['sentence8']}

# # Initialize the data list
# data = []

# # Function to get the POS tag of the word in context
# def get_pos_tag(word, sentence, start, end):
#     word_in_sentence = sentence[start:end]
#     tokens = tokenizer.tokenize(sentence)
#     pos_tags = pos_tag(tokens)
#     # Reconstruct the sentence from tokens to get spans
#     current_pos = 0
#     for idx, token in enumerate(tokens):
#         # Find the start and end positions of each token
#         token_start = sentence.find(token, current_pos)
#         token_end = token_start + len(token)
#         current_pos = token_end
#         if token_start == start and token_end == end:
#             return pos_tags[idx][1]
#     return 'UNKNOWN'



# # Process each word
# for word in word_synonym_sentences:
#     synonyms_dict = word_synonym_sentences[word]
#     synonyms_list = list(synonyms_dict.keys())  # List of synonyms
#     # Map synonyms to sense numbers starting from 1
#     synonym_sense_mapping = {synonym: idx+1 for idx, synonym in enumerate(synonyms_list)}
#     # Process sentences from synonyms
#     for synonym in synonyms_list:
#         sense = synonym_sense_mapping[synonym]
#         sentences = synonyms_dict[synonym]
#         for sentence in sentences:
#             # Replace the synonym with the word in the sentence
#             pattern = r'\b{}\b'.format(re.escape(synonym))
#             replaced_sentence = re.sub(pattern, word, sentence)
#             # Tokenize the replaced sentence and get spans
#             tokens = tokenizer.tokenize(replaced_sentence)
#             spans = list(tokenizer.span_tokenize(replaced_sentence))
#             pos_tags = pos_tag(tokens)
#             # Find tokens that match the word
#             for idx, (token, (start, end)) in enumerate(zip(tokens, spans)):
#                 if token.lower() == word.lower():
#                     pos_tag_token = pos_tags[idx][1]
#                     data.append({
#                         'word': word,
#                         'start': start,
#                         'end': end,
#                         'sentence': replaced_sentence,
#                         'sense': sense,
#                         'pos_tag': pos_tag_token
#                     })
#     # Process sentences from word_sentences[word] with sense 0
#     sense = 0
#     sentences = word_sentences.get(word, [])
#     for sentence in sentences:
#         # Tokenize the sentence and get spans
#         tokens = tokenizer.tokenize(sentence)
#         spans = list(tokenizer.span_tokenize(sentence))
#         pos_tags = pos_tag(tokens)
#         # Find tokens that match the word
#         for idx, (token, (start, end)) in enumerate(zip(tokens, spans)):
#             if token.lower() == word.lower():
#                 pos_tag_token = pos_tags[idx][1]
#                 data.append({
#                     'word': word,
#                     'start': start,
#                     'end': end,
#                     'sentence': sentence,
#                     'sense': sense,
#                     'pos_tag': pos_tag_token
#                 })

# # Create the DataFrame
# df = pd.DataFrame(data, columns=['word', 'start', 'end', 'sentence', 'sense', 'pos_tag'])

# # Display the DataFrame
# print(df)


import pandas as pd
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag

# Download NLTK data files if not already downloaded
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Initialize the tokenizer
tokenizer = TreebankWordTokenizer()

# Assume word_synonym_sentences and word_sentences are already defined
# Example structures:
# word_synonym_sentences = {
#     'word1': {'synonym1': ['sentence1', 'sentence2'], 'synonym2': ['sentence3']},
#     'word2': {'synonym3': ['sentence4'], 'synonym4': ['sentence5']}
# }
# word_sentences = {'word1': ['sentence6', 'sentence7'], 'word2': ['sentence8']}

# Initialize the data list
data = []

# Process each word
for word in word_synonym_sentences:
    synonyms_dict = word_synonym_sentences[word]
    synonyms_list = list(synonyms_dict.keys())  # List of synonyms
    # Map synonyms to sense numbers starting from 1
    synonym_sense_mapping = {synonym: idx+1 for idx, synonym in enumerate(synonyms_list)}
    # Process sentences from synonyms
    for synonym in synonyms_list:
        sense = synonym_sense_mapping[synonym]
        sentences = synonyms_dict[synonym]
        for sentence in sentences:
            # Replace the synonym with the word in the sentence
            pattern = r'\b{}\b'.format(re.escape(synonym))
            replaced_sentence = re.sub(pattern, word, sentence)
            # Tokenize the replaced sentence and get spans
            tokens = tokenizer.tokenize(replaced_sentence)
            spans = list(tokenizer.span_tokenize(replaced_sentence))
            pos_tags = pos_tag(tokens)
            # Find tokens that match the word
            for idx, (token, (start, end)) in enumerate(zip(tokens, spans)):
                if token.lower() == word.lower():
                    pos_tag_token = pos_tags[idx][1]
                    data.append({
                        'word': word,
                        'start': start,
                        'end': end,
                        'sentence': replaced_sentence,
                        'sense': sense,
                        'pos_tag': pos_tag_token,
                        'original_sense': synonym  
                    })
    # Process sentences from word_sentences[word] with sense 0
    sense = 0
    sentences = word_sentences.get(word, [])
    for sentence in sentences:
        # Tokenize the sentence and get spans
        tokens = tokenizer.tokenize(sentence)
        spans = list(tokenizer.span_tokenize(sentence))
        pos_tags = pos_tag(tokens)
        # Find tokens that match the word
        for idx, (token, (start, end)) in enumerate(zip(tokens, spans)):
            if token.lower() == word.lower():
                pos_tag_token = pos_tags[idx][1]
                data.append({
                    'word': word,
                    'start': start,
                    'end': end,
                    'sentence': sentence,
                    'sense': sense,
                    'pos_tag': pos_tag_token,
                    'original_sense': word  
                })

# Create the DataFrame
df = pd.DataFrame(data, columns=['word', 'start', 'end', 'sentence', 'sense', 'pos_tag', 'original_sense'])

# Display the DataFrame
print(df)


         word  start  end                                           sentence  \
0      regard    156  162  go ahead and make your point ' about the mello...   
1      regard    238  244  laughlin 's category encompass blind including...   
2      regard     15   21  but if he have regard the advice of never let ...   
3      regard     32   38  i be unaware that the e be must regard court o...   
4      regard      0    6  regard this advice strauss quickly set up a sm...   
...       ...    ...  ...                                                ...   
50965    pair     28   32  the view and vino-a perfect pair on the deck a...   
50966    pair    189  193  england do not release ' natural born killer '...   
50967    pair     27   31  in a happily time move the pair raise master g...   
50968    pair    297  301  in answer jaina increase the magnification on ...   
50969    pair     88   92  the arrangement suit neither and each be busy ...   

       sense pos_tag original_sense  
0

In [20]:
# check how many rows have start -1, should be 0
print(len(df[df['start'] == -1]))

# check the distribution of senses
print(df['sense'].value_counts())

# check the distribution of pos tags
print(df['pos_tag'].value_counts())

# check the distribution of words
print(df['word'].nunique())

# check how many words don't have sense 0
all_words = set(df['word'])
words_with_sense0 = set(df[df['sense'] == 0]['word'])
words_without_sense0 = all_words - words_with_sense0
print(f"Number of words without sense 0: {len(words_without_sense0)}")



0
sense
2     6844
1     6606
0     6583
3     5735
4     5090
5     3927
6     2963
7     2179
8     1691
9     1379
10    1227
13     918
11     913
12     904
14     377
21     374
20     321
23     308
19     306
18     290
17     273
16     267
25     207
22     199
26     159
15     153
27     145
24     144
28     116
29     101
31      93
35      78
33      38
32      34
36      20
34       8
Name: count, dtype: int64
pos_tag
NN     22702
JJ      6826
VB      4809
VBN     4623
VBG     3321
VBD     2617
NNS     2177
VBP     1624
VBZ     1395
IN       229
RP       216
RB       154
JJS      140
CD        75
PDT       16
FW        10
CC         7
$          6
JJR        5
SYM        4
MD         3
RBR        2
DT         2
PRP        2
NNP        2
RBS        1
WP         1
UH         1
Name: count, dtype: int64
102
Number of words without sense 0: 14


In [21]:
keep_df = df
keep_df

Unnamed: 0,word,start,end,sentence,sense,pos_tag,original_sense
0,regard,156,162,go ahead and make your point ' about the mello...,1,VB,heed
1,regard,238,244,laughlin 's category encompass blind including...,1,NN,heed
2,regard,15,21,but if he have regard the advice of never let ...,1,VBN,heed
3,regard,32,38,i be unaware that the e be must regard court o...,1,VB,heed
4,regard,0,6,regard this advice strauss quickly set up a sm...,1,NN,heed
...,...,...,...,...,...,...,...
50965,pair,28,32,the view and vino-a perfect pair on the deck a...,0,NN,pair
50966,pair,189,193,england do not release ' natural born killer '...,0,NN,pair
50967,pair,27,31,in a happily time move the pair raise master g...,0,NN,pair
50968,pair,297,301,in answer jaina increase the magnification on ...,0,NN,pair


In [40]:
df = keep_df
print(len(df))
# # if for each word any sense has less than 30 sentences, print the word and sense
# for word in df.word.unique():
#     senses = df[df['word'] == word]['sense'].unique()
#     # if sense 0 has less than 30 sentences, remove the word
#     sense0_count = len(df[(df['word'] == word) & (df['sense'] == 0)])
#     if sense0_count < 30:
#         print(f"Word '{word}' has less than 30 sentences for sense 0: {sense0_count}")
#         df = df[df['word'] != word]

#     else:

#         # if any of the senses have less than 30 sentences, print the word and sense and remove the sense
#         for sense in senses:
#             sense_count = len(df[(df['word'] == word) & (df['sense'] == sense)])
#             if sense_count < 30:
#                 print(f"Word '{word}' has less than 30 sentences for sense {sense}: {sense_count}")
#                 df = df[(df['word'] != word) | df['sense'] != sense]

#         # if any word has less than 2 senses, remove the word
#         if len(senses) < 2:
#             df = df[df['word'] != word]

#         # for words that have more than 5 senses, remove all senses except for the first 5
#         elif len(senses) > 5:
#             senses_to_keep = [0] 
#             for sense in senses:
#                 if sense != 0:
#                     senses_to_keep.append(sense)
#                 if len(senses_to_keep) == 5:
#                     break
#             print(f"Word '{word}' has more than 5 senses: {len(senses)}")
#             df = df[(df['word'] != word) | (df['sense'].isin(senses_to_keep))]
#             # Verify the senses after removal
#             senses = df[df['word'] == word]['sense'].unique()
# Keep track of rows to remove
rows_to_remove = set()

# if for each word any sense has less than 30 sentences, print the word and sense
for word in df.word.unique():
    senses = df[df['word'] == word]['sense'].unique()
    
    # if sense 0 has less than 30 sentences, mark the word for removal
    sense0_count = len(df[(df['word'] == word) & (df['sense'] == 0)])
    if sense0_count < 30:
        print(f"Word '{word}' has less than 30 sentences for sense 0: {sense0_count}")
        rows_to_remove.update(df[df['word'] == word].index)
    else:
        # if any of the senses have less than 30 sentences, print the word and sense and mark the sense for removal
        for sense in senses:
            sense_count = len(df[(df['word'] == word) & (df['sense'] == sense)])
            if sense_count < 30:
                print(f"Word '{word}' has less than 30 sentences for sense {sense}: {sense_count}")
                rows_to_remove.update(df[(df['word'] == word) & (df['sense'] == sense)].index)
        
        # if any word has less than 2 senses, mark the word for removal
        if len(senses) < 2:
            rows_to_remove.update(df[df['word'] == word].index)
        
        # for words that have more than 5 senses, mark all senses except for the first 5 for removal
        elif len(senses) > 5:
            senses_to_keep = [0]
            for sense in senses:
                if sense != 0:
                    senses_to_keep.append(sense)
                if len(senses_to_keep) == 5:
                    break
            print(f"Word '{word}' has more than 5 senses: {len(senses)}")
            rows_to_remove.update(df[(df['word'] == word) & ~(df['sense'].isin(senses_to_keep))].index)

# Remove all marked rows from the DataFrame
df = df.drop(rows_to_remove)

print(len(df))

50970
Word 'regard' has more than 5 senses: 9
Word 'coming' has less than 30 sentences for sense 0: 4
Word 'forest' has less than 30 sentences for sense 3: 1
Word 'formed' has less than 30 sentences for sense 0: 0
Word 'expected' has less than 30 sentences for sense 0: 0
Word 'broken' has less than 30 sentences for sense 7: 28
Word 'broken' has more than 5 senses: 13
Word 'understanding' has less than 30 sentences for sense 2: 13
Word 'understanding' has less than 30 sentences for sense 4: 25
Word 'understanding' has more than 5 senses: 9
Word 'being' has more than 5 senses: 6
Word 'united' has less than 30 sentences for sense 1: 8
Word 'united' has less than 30 sentences for sense 2: 29
Word 'train' has less than 30 sentences for sense 4: 28
Word 'train' has more than 5 senses: 14
Word 'move' has less than 30 sentences for sense 5: 28
Word 'move' has less than 30 sentences for sense 8: 8
Word 'move' has more than 5 senses: 11
Word 'single' has less than 30 sentences for sense 3: 3
Wor

In [35]:
# get a distribution of the senses
print(df['sense'].value_counts())
df
print(df['word'].nunique())

sense
0    6422
2    5554
1    4984
3    4184
4    3744
5     154
Name: count, dtype: int64
80


In [41]:
from collections import Counter

# for each word, show how many sentences there are for each sense
word_sense_counts = {word: Counter(df[df['word'] == word]['sense']) for word in word_synonym_sentences.keys()}
print(word_sense_counts)

rows_to_remove = set()
# for each word, for each sense keep only 100 sentences
for word in df.word.unique():
    senses = df[df['word'] == word]['sense'].unique()
    for sense in senses:
        sense_count = len(df[(df['word'] == word) & (df['sense'] == sense)])
        if sense_count > 100:
            rows_to_remove.update(df[(df['word'] == word) & (df['sense'] == sense)].index[100:])
            print(f"Removed {sense_count - 100} sentences for word '{word}' and sense {sense}")

# Remove all marked rows from the DataFrame
df = df.drop(rows_to_remove)
# check how many sentences there are for each sense
word_sense_counts = {word: Counter(df[df['word'] == word]['sense']) for word in word_synonym_sentences.keys()}
print(word_sense_counts)
        

{'regard': Counter({2: 93, 3: 67, 0: 60, 1: 41, 4: 31}), 'thick': Counter({3: 82, 1: 73, 0: 73, 2: 60}), 'sweet': Counter({2: 108, 0: 65, 1: 42}), 'sun': Counter({2: 102, 1: 86, 0: 43}), 'coming': Counter(), 'sharp': Counter({2: 72, 4: 69, 3: 61, 0: 40}), 'forest': Counter({0: 91, 1: 78, 2: 41}), 'formed': Counter(), 'expected': Counter(), 'broken': Counter({1: 99, 0: 86, 4: 72, 2: 70, 3: 68}), 'understanding': Counter({0: 91, 1: 86, 3: 30}), 'paper': Counter({2: 102, 3: 100, 4: 89, 1: 88, 0: 61}), 'being': Counter({2: 117, 3: 93, 4: 89, 0: 73, 1: 52}), 'young': Counter({2: 104, 0: 96, 1: 64}), 'neighborhood': Counter({0: 104, 2: 67, 1: 43, 3: 34}), 'wage': Counter({2: 102, 3: 101, 1: 74, 0: 73}), 'united': Counter({4: 101, 0: 101, 3: 32}), 'central': Counter({3: 102, 4: 102, 0: 92, 1: 60, 2: 48}), 'train': Counter({2: 94, 3: 88, 1: 73, 0: 41}), 'move': Counter({1: 96, 2: 82, 0: 78, 4: 69, 3: 42}), 'suggest': Counter({3: 101, 4: 101, 1: 83, 0: 77, 2: 51}), 'single': Counter({1: 97, 0: 

In [42]:
# check if any words don't have sense 0 or have less than 30 sentences for sense 0
for word in df['word'].unique():
    sense0_count = len(df[(df['word'] == word) & (df['sense'] == 0)])
    if sense0_count < 30:
        print(f"Word '{word}' has less than 30 sentences for sense 0: {sense0_count}")

    # remove those words from the df
    # df = df.drop(df[(df['word'] == word) & (df['sense'] == 0)].index)
    

In [43]:
# for each word check what senses have less than 30 sentences and remove them
rows_to_remove = set()
for word in df['word'].unique():
    senses = df[df['word'] == word]['sense'].unique()
    for sense in senses:
        sense_count = len(df[(df['word'] == word) & (df['sense'] == sense)])
        if sense_count < 30:
            print(f"Word '{word}' has less than 30 sentences for sense {sense}: {sense_count}")
            rows_to_remove.update(df[(df['word'] == word) & (df['sense'] == sense)].index)
            
df = df.drop(rows_to_remove)

# if the word has less than 2 senses, remove it
rows_to_remove = set()
for word in df['word'].unique():
    senses = df[df['word'] == word]['sense'].unique()
    if len(senses) < 2:
        print(f"Word '{word}' has less than 2 senses")
        rows_to_remove.update(df[df['word'] == word].index)

df = df.drop(rows_to_remove)



Word 'agree' has less than 2 senses


In [44]:
# check how many words remain in df
print(len(df['word'].unique()))


80


In [45]:
df['word'].value_counts()

word
content        478
instance       477
determine      455
practice       446
paper          438
              ... 
approval       175
independent    169
trial          162
place          141
sleep          108
Name: count, Length: 80, dtype: int64

In [46]:
# remove the \n from the end of the sentences
df['sentence'] = df['sentence'].str.replace('\n', '')



In [47]:
for word in df['word'].unique():
    # print the sense value counts for each word
    print(f"{word}: {df[df['word'] == word]['sense'].value_counts()}")

regard: sense
2    93
3    67
0    60
1    41
4    31
Name: count, dtype: int64
thick: sense
3    82
1    73
0    73
2    60
Name: count, dtype: int64
sweet: sense
2    100
0     65
1     42
Name: count, dtype: int64
sun: sense
2    100
1     86
0     43
Name: count, dtype: int64
sharp: sense
2    72
4    69
3    61
0    40
Name: count, dtype: int64
forest: sense
0    91
1    78
2    41
Name: count, dtype: int64
broken: sense
1    99
0    86
4    72
2    70
3    68
Name: count, dtype: int64
understanding: sense
0    91
1    86
3    30
Name: count, dtype: int64
paper: sense
2    100
3    100
4     89
1     88
0     61
Name: count, dtype: int64
being: sense
2    100
3     93
4     89
0     73
1     52
Name: count, dtype: int64
young: sense
2    100
0     96
1     64
Name: count, dtype: int64
neighborhood: sense
0    100
2     67
1     43
3     34
Name: count, dtype: int64
wage: sense
2    100
3    100
1     74
0     73
Name: count, dtype: int64
united: sense
4    100
0    100
3     32
Na

In [48]:
df.to_csv('English_pseudo_polysemy_wsd_corpus_2.csv', index=False)


In [49]:
import pandas as pd

df = pd.read_csv('English_pseudo_polysemy_wsd_corpus_2.csv')
print(len(df))

# for each word keep only 30 sentences from 2 senses (sense 0 and the next sense) and remove the rest
rows_to_remove = set()
for word in df['word'].unique():
    senses = df[df['word'] == word]['sense'].unique()
    if len(senses) > 2:
        for sense in senses[2:]:
            rows_to_remove.update(df[(df['word'] == word) & (df['sense'] == sense)].index)
    for sense in senses[:2]:
        rows_to_remove.update(df[(df['word'] == word) & (df['sense'] == sense)][30:].index)
df = df.drop(rows_to_remove)

print(len(df), df['word'].nunique() * 60)

24862
4800


In [51]:
# print each word and their sense value counts
for word in df['word'].unique():
    print(word)

regard
thick
sweet
sun
sharp
forest
broken
understanding
paper
being
young
neighborhood
wage
united
central
train
move
suggest
single
reasonable
project
group
club
open
child
company
rule
present
fundamental
authority
mean
believe
concerning
discussion
instance
march
hundred
contact
prove
heat
independent
appropriate
wonderful
love
faculty
assume
stream
man
flesh
approval
responsibility
lady
down
middle
research
little
signal
beginning
knife
role
particular
review
allow
trial
sleep
following
contract
add
bottom
content
friend
let
practice
determine
basis
second
resolution
place
produce
pair


In [52]:
df.to_csv('English_pseudo_polysemy_30_sents_each.csv', index=False)