**Extracting Nouns from the text file using Super Sense Tagging**

In [None]:
!pip install booknlp
!python -m spacy download en_core_web_lg

In [None]:
from booknlp.booknlp import BookNLP

# The full BookNLP pipeline would be "entity,quote,supersense,event,coref".
# Only entity and supersense tagging are requested here, because the notebook
# goes on to use just the .supersense output.
model_params = dict(pipeline="entity,supersense", model="big")

booknlp = BookNLP("en", model_params)


using device cpu
{'pipeline': 'entity,supersense', 'model': 'big'}
downloading entities_google_bert_uncased_L-6_H-768_A-12-v1.0.model
downloading coref_google_bert_uncased_L-12_H-768_A-12-v1.0.model
downloading speaker_google_bert_uncased_L-12_H-768_A-12-v1.0.1.model


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/270M [00:00<?, ?B/s]

--- startup: 43.512 seconds ---


In [None]:

# Plain-text input file that BookNLP will tag (path relative to the working directory)
input_file="train.txt"

# Output directory to store resulting files in
output_directory="output_dir/bartleby/"

# Files within this directory will be named ${book_id}.entities, ${book_id}.tokens, etc.
book_id="bartleby"

# Run BookNLP on the input file; results are written under output_directory
booknlp.process(input_file, output_directory, book_id)

--- spacy: 15.908 seconds ---
--- entities: 284.568 seconds ---
--- quotes: 0.125 seconds ---
--- name coref: 4.524 seconds ---
--- TOTAL (excl. startup): 310.733 seconds ---, 62348 words


**Now open output_dir/bartleby/bartleby.supersense file and copy all content.**
1. Open an excel file and paste all copied content to that excel file.
2. Perform pre-processing on data.

Pre-processing includes
1. Splitting Nouns only from all SST based senses.
2. Categorical division of data for each sense type.
3. Removing in-column duplication from each sense type.
4. Filtering out the multiple-sense problem, i.e., nouns that appear under more than one SST-based sense.

**removing duplicate values in cells.**

In [None]:
import pandas as pd
import numpy as np

# Read in the XLSX file as a dataframe
df = pd.read_excel('test.xlsx')

# Flatten the grid row by row so that every cell has one position in a single sequence
cells = pd.Series(df.to_numpy(dtype=object).ravel())

# Remember which cells were empty BEFORE converting to text; otherwise missing
# values would turn into the literal string "nan" and be treated as a noun
is_missing = cells.isna()
as_text = cells.astype(str)

# A cell is a repeat when the same text already occurred earlier in the sequence.
# (The previous version used indices into the array of unique values as if they
# were cell positions, which blanked unrelated cells.)
is_repeat = as_text.duplicated(keep='first')

# Keep the first occurrence of every value; blank out repeats and empty cells
values = as_text.mask(is_repeat | is_missing, "").to_numpy()

# Reshape the flat values back into the original dataframe shape
new_df = pd.DataFrame(values.reshape(df.shape), columns=df.columns)

# Write the modified dataframe to a new XLSX file
new_df.to_excel('modified.xlsx', index=False)


**converting all cell values to lower-case.**

In [None]:
import openpyxl

# Load the Excel file
workbook = openpyxl.load_workbook('modified.xlsx')

# Get the active worksheet
worksheet = workbook.active

# Lower-case every text cell in the worksheet
for row in worksheet.iter_rows():
    for cell in row:
        # Only text can be lower-cased. Leaving other cells untouched keeps empty
        # cells empty instead of turning None into the string 'none'.
        if isinstance(cell.value, str):
            cell.value = cell.value.lower()

# Save the modified Excel file
workbook.save('modified1.xlsx')


removing multi-words cells.

In [None]:
import openpyxl

# Load the lower-cased workbook written by the previous step.
# (Loading 'modified.xlsx' here would silently discard the lower-casing.)
workbook = openpyxl.load_workbook('modified1.xlsx')

# Get the active worksheet
worksheet = workbook.active

# Loop through all cells in the worksheet
for row in worksheet.iter_rows():
    for cell in row:
        # A text cell that splits into more than one whitespace-separated word is multi-word
        if isinstance(cell.value, str) and len(cell.value.split()) > 1:
            # Multi-word entries are dropped by replacing them with an empty cell
            cell.value = ''

# Save the modified Excel file
workbook.save('modified2.xlsx')


replacing all 'nan' with '' empty cells in data.

In [None]:
import openpyxl

# Text that stands for "no value": 'nan' comes from pandas converting missing
# cells to strings, 'none' from str(None)
PLACEHOLDER_TEXT = {'nan', 'none'}

# Load the Excel file
workbook = openpyxl.load_workbook('modified2.xlsx')

# Get the active worksheet
worksheet = workbook.active

# Loop through all cells in the worksheet
for row in worksheet.iter_rows():
    for cell in row:
        # Empty cells and placeholder text (case-insensitive, surrounding spaces ignored)
        if cell.value is None or str(cell.value).strip().lower() in PLACEHOLDER_TEXT:
            # None marks the cell as empty, which is what the next step tests for
            cell.value = None

# Save the modified Excel file
workbook.save('modified3.xlsx')


**removing empty cells inside a column.** (if any)

In [None]:
import openpyxl

# Open the workbook produced by the previous step
workbook = openpyxl.load_workbook('modified3.xlsx')

# Sheet to compact (replace Sheet1 with your sheet name)
worksheet = workbook['Sheet1']

# Columns are independent of each other, so each one is compacted on its own:
# the non-empty values move to the top and the remaining cells are cleared.
for column_cells in worksheet.iter_cols():
    kept_values = [cell.value for cell in column_cells if cell.value is not None]
    padding = [None] * (len(column_cells) - len(kept_values))
    for cell, value in zip(column_cells, kept_values + padding):
        cell.value = value

# Store the compacted sheet under a new name
workbook.save('modified4.xlsx')


In [None]:
!mv modified4.xlsx nouns.xlsx

In [None]:
!rm modified*

**The code below performs the following operations**
1. reads excel file containing categorical SST based nouns.
2. reads text file containing sentences for dataset.
3. pass each sentence through spaCy and extract nouns.
4. finds that noun in the excel file.
5. replace that noun with another Randomly selected noun by keeping the SST category.
6. replace the modified noun in the original sentence.
7. writes the modified sentences to a new file.
8. writes the dictionary of actual and replaced noun as well to update DRS file.

Code works well on small dataset portions. Splitting dataset into files having 400 examples each.

In [None]:
import os
import math

# Set the input file paths
sentences_file = 'train.txt.raw'
drs_file = 'train.txt'

# Set the output directory and file prefix
output_dir = 'dataset'
output_prefix = ''

# Set the number of sentences per output file
sentences_per_file = 400

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Read in the sentences (one per line)
with open(sentences_file, 'r') as f:
    sentences = [line.strip() for line in f]

# Read in the DRSs (separated by blank lines). Empty blocks are dropped so that a
# trailing newline at the end of the file does not create a phantom DRS.
with open(drs_file, 'r') as f:
    drs = [block.strip('\n') for block in f.read().split('\n\n') if block.strip()]

# zip() below stops at the shorter list, so make a count mismatch visible
if len(sentences) != len(drs):
    print(f"Warning: {len(sentences)} sentences but {len(drs)} DRSs; "
          f"only the first {min(len(sentences), len(drs))} pairs are written.")

# Combine the sentences and DRSs into tuples
data = list(zip(sentences, drs))

# Split the data into chunks of sentences_per_file pairs
chunks = [data[i:i + sentences_per_file] for i in range(0, len(data), sentences_per_file)]

# Write each chunk to <n>.txt (sentences) and <n>_drs.txt (DRSs), numbered from 1
for i, chunk in enumerate(chunks):
    filepath = os.path.join(output_dir, output_prefix + str(i + 1) + '.txt')
    dr_filepath = os.path.join(output_dir, output_prefix + str(i + 1) + '_drs.txt')
    with open(filepath, 'w') as f:
        for sentence, dr in chunk:
            f.write(sentence + '\n')
    with open(dr_filepath, 'w') as f:
        for sentence, dr in chunk:
            # Each DRS is followed by a blank line
            f.write(dr + '\n\n')


**working with DRS and Nouns at the same time.**

working with large spaCy model.

In [None]:
!python -m spacy download en_core_web_lg


In [None]:
import os
import random
import re

import openpyxl
import spacy

# Load the spacy model
nlp = spacy.load("en_core_web_lg")

# Load the XLSX file
wb = openpyxl.load_workbook("nouns.xlsx")
ws = wb.active

# Read every column of the sheet once: (cell texts, lower-cased lookup set).
# Scanning the worksheet again for every noun would repeat this work needlessly.
sense_columns = []
for col in ws.iter_cols(values_only=True):
    cells = [str(cell) for cell in col
             if cell is not None and str(cell).strip() and str(cell) != "None"]
    sense_columns.append((cells, {cell.lower() for cell in cells}))

# Cache of noun -> replacement (or None when the sheet has no match), so a noun
# is always replaced by the same word within one run
replacement_noun_cache = {}


def get_replacement_noun(noun):
    """Return a random noun from the same sheet column as `noun`, or None.

    The noun is split on '_', '-' and '~'; the first column containing any of
    the parts (case-insensitive) supplies the candidates. The parts themselves
    are excluded so the result is never a no-op. The outcome is cached.
    """
    if noun in replacement_noun_cache:
        return replacement_noun_cache[noun]

    noun_parts = [part.lower() for part in re.split(r"[_\-~]+", noun) if part]

    replacement_noun = None
    for cells, lookup in sense_columns:
        if any(part in lookup for part in noun_parts):
            candidate_nouns = [cell for cell in cells if cell.lower() not in noun_parts]
            if candidate_nouns:
                replacement_noun = random.choice(candidate_nouns)
                break

    replacement_noun_cache[noun] = replacement_noun
    return replacement_noun


def replace_words(text, mapping, ignore_case=False):
    """Replace whole-word occurrences of the (lower-case) keys of `mapping` in `text`.

    All keys are substituted in a single pass, so a freshly inserted replacement
    is never replaced again, and "cat" does not match inside "category". With
    ignore_case=True a capitalised match keeps its initial capital.
    """
    if not mapping:
        return text
    alternatives = "|".join(re.escape(word) for word in sorted(mapping, key=len, reverse=True))
    pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)",
                         re.IGNORECASE if ignore_case else 0)

    def _swap(match):
        found = match.group(0)
        new_word = mapping.get(found.lower())
        if new_word is None:
            return found
        if found[:1].isupper():
            new_word = new_word[:1].upper() + new_word[1:]
        return new_word

    return pattern.sub(_swap, text)


# Number of the dataset chunk to process
chunk_id = 17

# Define the paths to the input files
drs_path = f"/content/dataset/{chunk_id}_drs.txt"
text_path = f"/content/dataset/{chunk_id}.txt"

# Define the paths to the output files
updated_drs_path = f"/content/noun_aug/{chunk_id}_drs.txt"
updated_text_path = f"/content/noun_aug/{chunk_id}.txt"

# Make sure the output directories exist before opening files for writing
os.makedirs(os.path.dirname(updated_drs_path), exist_ok=True)
os.makedirs(os.path.dirname(updated_text_path), exist_ok=True)

# One sentence per line; DRSs are separated by blank lines
with open(text_path, "r") as text_file:
    sentences = text_file.readlines()
with open(drs_path, "r") as drs_file:
    drs_blocks = [block.strip("\n")
                  for block in re.split(r"\n[ \t]*\n", drs_file.read())
                  if block.strip()]

# When the i-th DRS belongs to the i-th sentence, a sentence's replacements are
# applied to its own DRS only. Otherwise fall back to replacing across all DRSs.
aligned = len(drs_blocks) == len(sentences)
if not aligned:
    print(f"Warning: {len(sentences)} sentences but {len(drs_blocks)} DRSs in chunk "
          f"{chunk_id}; replacements are applied to every DRS.")

updated_sentences = []
for index, sentence in enumerate(sentences):
    # Parse the sentence with spacy
    doc = nlp(sentence)

    # Collect noun -> replacement for the nouns of this sentence
    mapping = {}
    for token in doc:
        if token.pos_ == "NOUN":
            noun = token.text.lower()
            if noun not in mapping:
                replacement_noun = get_replacement_noun(noun)
                if replacement_noun:
                    mapping[noun] = replacement_noun

    # Replace the nouns in the sentence and in the DRS
    updated_sentences.append(replace_words(sentence, mapping, ignore_case=True))
    if aligned:
        drs_blocks[index] = replace_words(drs_blocks[index], mapping)
    else:
        drs_blocks = [replace_words(block, mapping) for block in drs_blocks]

# Write the updated sentences and DRSs
with open(updated_text_path, "w") as updated_text_file:
    updated_text_file.writelines(updated_sentences)
with open(updated_drs_path, "w") as updated_drs_file:
    for block in drs_blocks:
        updated_drs_file.write(block + "\n\n")


combining all the split dataset files

In [None]:
cd noun_aug

for text files

In [None]:
!for i in {1..17}; do cat "$i.txt" >> output.txt; done

for DRS files

In [None]:
!for i in {1..17}_drs.txt; do cat "$i" >> drs_output.txt; done


moving augmented files outside the sub-folders

In [None]:
!mv drs_output.txt ..
!mv output.txt ..

In [None]:
cd ..

/content


re-naming augmented files.

In [None]:
!mv drs_output.txt noun_aug_drs.txt
!mv output.txt noun_aug_text.txt

In [None]:
ls

[0m[01;34mdataset[0m/   noun_aug_drs.txt   nouns.xlsx  train.txt.raw
[01;34mnoun_aug[0m/  noun_aug_text.txt  train.txt


**Adopting an alternative way (dictionary based) to augment nouns.**

getting nouns in each sentence

In [None]:
import spacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

# The raw file holds one sentence per line, so keep that segmentation rather
# than letting spaCy re-split the whole file into sentences
with open("train.txt.raw") as file:
    lines = [line.strip() for line in file if line.strip()]

# Parse the sentences in batches and print the nouns of each one
for doc in nlp.pipe(lines):
    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
    print("Nouns in the sentence '{}': {}".format(doc.text, nouns))


**reading SST based xlsx file to get all nouns in text file.**

In [None]:
import pandas as pd

# Load the Excel file into a pandas DataFrame
df = pd.read_excel("train.xlsx")

# NOTE(review): read_excel already uses the first sheet row as column names, so
# iloc[1:] skips one further row - confirm the sheet really has a second tag row.
cells = df.iloc[1:].values.flatten().tolist()

# Keep the non-empty cells as stripped text. str() makes this safe for numeric
# cells, and cells that are blank after stripping are dropped.
words = [str(cell).strip() for cell in cells if pd.notna(cell)]
words = [word for word in words if word]

# Save all the words to a single text file, one per line
with open("nouns.txt", "w") as f:
    f.write("\n".join(words))


getting nouns in a text file. one noun per line.

In [None]:
import spacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

# Parse the complete raw text as one document
with open("train.txt.raw") as file:
    text = file.read()
doc = nlp(text)

# Write every NOUN token, in document order, on its own line
with open("nouns.txt", "w") as file:
    file.writelines(
        token.text + "\n"
        for sent in doc.sents
        for token in sent
        if token.pos_ == "NOUN"
    )


converting to lower case

In [None]:
# Copy nouns.txt to lower_nouns.txt with all text lower-cased.
# NOTE(review): the following cell de-duplicates nouns.txt, not lower_nouns.txt.
with open('nouns.txt', 'r') as source, open('lower_nouns.txt', 'w') as target:
    text = source.read()
    lower_text = text.lower()
    target.write(lower_text)


getting unique nouns only.

In [None]:
!sort nouns.txt | uniq > unique_nouns.txt

getting randomly selected nouns from the same SST based column. Saving the replaceable nouns to a new file.

In [None]:
import pandas as pd
import random
import numpy as np

# Sense table: one column per SST category
df = pd.read_excel('train.xlsx')

# Nouns to replace, one per line
with open('unique_nouns.txt', 'r') as f:
    words = [line.strip() for line in f]

# Materialise each column once: all cells (for the membership test) and the
# non-NaN cells (the replacement candidates)
column_cells = {col: df[col].tolist() for col in df.columns}
column_nouns = {col: df[col].dropna().tolist() for col in df.columns}

# word -> randomly chosen other noun of a column that contains the word
updated_words = {}
for word in words:
    for col in df.columns:
        if word not in column_cells[col]:
            continue
        # Candidates are the column's nouns minus one occurrence of the word itself
        others = list(column_nouns[col])
        others.remove(word)
        if others:
            # A later matching column overwrites the choice of an earlier one
            updated_words[word] = random.choice(others)

# One output line per input word: the replacement if there is one, else the word
with open('modified_nouns.txt', 'w') as f:
    f.writelines(updated_words.get(word, word) + '\n' for word in words)


**The problem with random selection is that it only updates about half of the nouns. Therefore, I take the next value in the column instead of a random one.**

In [None]:
import pandas as pd

# Read the xlsx file without a header so that columns are addressed by position
df = pd.read_excel('train.xlsx', header=None)

# Read the text file and store the words in a list
with open('unique_nouns.txt', 'r') as f:
    words = [line.strip() for line in f]

# The first sheet row holds the sense tags, not nouns, so it is always skipped.
# (The earlier test `df.iloc[0, col] in df.columns` compared a cell with the
# integer column labels and therefore never skipped it.)
column_nouns = {col: df[col].iloc[1:].dropna().tolist() for col in df.columns}

# Create a dictionary to store the updated words
updated_words = {}

# Replace each word by the noun that follows it in its column
for word in words:
    for col, nouns in column_nouns.items():
        if word in nouns:
            # The last noun of a column wraps around to the first one
            successor = nouns[(nouns.index(word) + 1) % len(nouns)]
            updated_words[word] = successor

# Write the original word or its replacement to a new text file, one per line
with open('modified_nouns.txt', 'w') as f:
    for word in words:
        f.write(f"{updated_words.get(word, word)}\n")


calculating changed nouns

In [None]:
# Original nouns, one per line
with open('unique_nouns.txt', 'r') as f1:
    words1 = [line.strip() for line in f1.readlines()]

# Replacement nouns, line-aligned with the originals
with open('modified_nouns.txt', 'r') as f2:
    words2 = [line.strip() for line in f2.readlines()]

# Positions at which the replacement differs from the original
diff_lines = [i for i, word in enumerate(words1) if word != words2[i]]

# Print the number of lines with different words
print(f"There are {len(diff_lines)} lines with different words.")


There are 1000 lines with different words.


generating dictionary of nouns and replaceable nouns.

In [None]:
# Map every unique noun to its replacement.
# modified_nouns.txt is written line-by-line from unique_nouns.txt, so those two
# files are the ones that line up; pairing it with nouns.txt mismatches the words.
nouns_dict = {}

with open('unique_nouns.txt', 'r') as f1, open('modified_nouns.txt', 'r') as f2:
    for line1, line2 in zip(f1, f2):
        word1 = line1.strip()
        word2 = line2.strip()
        nouns_dict[word1] = word2

# Store the mapping as lines of the form 'noun' : 'replacement',
with open('nouns_dict.txt', 'w') as f:
    for key, value in nouns_dict.items():
        f.write(f"'{key}' : '{value}',\n")


In [None]:
ls

lower_nouns.txt     nouns_dict.txt  train.txt.raw  unique_nouns.txt
modified_nouns.txt  nouns.txt       train.xlsx


**getting nouns with same sense but not belonging to the same data. i.e., outside dataset but with same sense.**

reading excel data in text file

In [None]:
import pandas as pd

# Sense table to export
excel_file_path = 'train.xlsx'
df = pd.read_excel(excel_file_path)

# NOTE(review): 'train.txt' is also the name used for the BookNLP input and the
# DRS file in earlier cells; this cell overwrites it with a word list.
text_file_path = 'train.txt'
with open(text_file_path, 'w') as f:
    # ravel() walks the table row by row, i.e. in the same order as visiting
    # every cell of every row
    for cell in df.to_numpy().ravel():
        if pd.notna(cell):
            # One stripped cell value per line
            f.write(str(cell).strip() + '\n')


In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

code to use xlsx file for replacement and dictionary generation.

In [None]:
from nltk.corpus import wordnet as wn
import pandas as pd

# Sense table whose words are rewritten in place
df = pd.read_excel('train.xlsx')

# actual word -> replacement
word_dict = {}

for col in df.columns:
    for i, cell in enumerate(df[col]):
        # Only text cells are processed
        if not isinstance(cell, str):
            continue
        for word in cell.split():
            synsets = wn.synsets(word)
            # Words without any synset stay as they are
            if not synsets:
                continue
            # First lemma of the first synset
            new_word = synsets[0].lemmas()[0].name()
            word_dict[word] = new_word
            df.at[i, col] = df.at[i, col].replace(word, new_word)

# Save updated Excel file
df.to_excel('output.xlsx', index=False)

# Save the word dictionary as "word: replacement" lines
with open('word_dict.txt', 'w') as f:
    f.writelines(f"{key}: {value}\n" for key, value in word_dict.items())


code to read processed text file for noun replacement and dictionary generation.

In [None]:
from nltk.corpus import wordnet

# Load WordNet
import nltk
nltk.download('wordnet')

# One word per line
with open('train.txt', 'r') as file:
    lines = file.readlines()

# original word -> replacement
word_dict = {}

for line in lines:
    original_word = line.strip()
    synsets = wordnet.synsets(original_word)

    # Words without any synset map to themselves
    if not synsets:
        word_dict[original_word] = original_word
        continue

    # First lemma of the first synset that is spelled differently from the word;
    # when there is none, the word gets no entry at all
    alternatives = (
        lemma.name() for lemma in synsets[0].lemmas() if lemma.name() != original_word
    )
    similar_word = next(alternatives, None)
    if similar_word is not None:
        word_dict[original_word] = similar_word

# Write the dictionary as "word: replacement" lines
with open('word_dict.txt', 'w') as file:
    file.writelines(
        f"{original_word}: {similar_word}\n"
        for original_word, similar_word in word_dict.items()
    )


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**1. using word sense disambiguation for contextual sense based replacement. Getting 1500 contextual synonyms.**

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.corpus import wordnet
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

# Load WordNet
import nltk
nltk.download('wordnet')

# Read words from the text file (one word per line)
with open('train.txt', 'r') as file:
    lines = file.readlines()

# Create a dictionary to store original words and their replaced words
word_dict = {}

# Replace words with their sense-disambiguated synonyms
for line in lines:
    original_word = line.strip()

    # Words unknown to WordNet are kept unchanged
    if not wordnet.synsets(original_word):
        word_dict[original_word] = original_word
        continue

    # Entries that the tokenizer splits into several tokens get no dictionary entry
    tokens = word_tokenize(original_word)
    if original_word not in tokens:
        continue

    # NOTE(review): the only context handed to Lesk is the word itself, which
    # gives it very little to disambiguate with - consider passing the sentence
    # the word came from.
    synset = lesk(tokens, original_word)
    if not synset:
        continue

    # Try the lemmas of the chosen sense from highest to lowest count() and
    # take the first one that differs from the word
    lemmas = sorted(synset.lemmas(), key=lambda lemma: lemma.count(), reverse=True)
    for lemma in lemmas:
        similar_word = lemma.name()
        if similar_word != original_word:
            word_dict[original_word] = similar_word
            break

# Write the dictionary to a text file as 'word': 'replacement', lines
with open('word_dict.txt', 'w') as file:
    for original_word, similar_word in word_dict.items():
        file.write(f"'{original_word}': '{similar_word}',\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**code below will do Noun Augmentation outside data without SS**

code below will split dictionary based data into words and their corresponding replacement.

In [None]:
def _unquote(text):
    """Strip surrounding spaces and one pair of enclosing single quotes, if present."""
    text = text.strip()
    if len(text) >= 2 and text[0] == "'" and text[-1] == "'":
        return text[1:-1]
    return text


# Open the text file for reading
with open('word_dict.txt', 'r') as file:
    lines = file.readlines()

words = []
replacements = []

# Each line looks like  'word': 'replacement',  (quotes and trailing comma optional)
for line in lines:
    entry = line.strip().rstrip(',')
    # Split on the first ':' only; skip blank or malformed lines instead of crashing
    word, separator, replacement = entry.partition(':')
    if not separator:
        continue
    # Spaces are removed BEFORE the quotes, otherwise the space after ':' shields
    # the opening quote of the replacement from being stripped
    words.append(_unquote(word))
    replacements.append(_unquote(replacement))

# Save words into a separate file
with open('words.txt', 'w') as file:
    file.write('\n'.join(words))

# Save replacements into a separate file
with open('replacements.txt', 'w') as file:
    file.write('\n'.join(replacements))


code below will replace words with similar new word using wordNet.

In [None]:
import nltk
from nltk.corpus import wordnet

# Initialize NLTK's WordNet
nltk.download('wordnet')

# One replacement word per line
with open('replacements.txt', 'r') as file:
    replacements = file.readlines()

similar_words = []

for word in replacements:
    word = word.strip()
    synsets = wordnet.synsets(word)
    # First lemma of the first synset when WordNet knows the word, else the word itself
    similar_words.append(synsets[0].lemmas()[0].name() if synsets else word)

# Save similar words into a separate file
with open('similar_words.txt', 'w') as file:
    file.write('\n'.join(similar_words))


[nltk_data] Downloading package wordnet to /root/nltk_data...


saving the new dictionary

In [None]:
# Original words, stripped of surrounding whitespace
with open('words.txt', 'r') as file:
    words = [word.strip() for word in file]

# Their similar words, line-aligned with words.txt
with open('similar_words.txt', 'r') as file:
    similar_words = [word.strip() for word in file]

# Pair the two lists up; a repeated word keeps its last pairing
dictionary = {}
for word, similar_word in zip(words, similar_words):
    dictionary[word] = similar_word

# Save the dictionary into dic.txt as 'word':'similar word', lines
with open('dic.txt', 'w') as file:
    file.writelines(
        f"'{word}':'{similar_word}',\n" for word, similar_word in dictionary.items()
    )


**Noun Augmentation on PMB-5.0.0**

**Applying WSD algorithm to get similar nouns**

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.corpus import wordnet
from nltk.wsd import lesk  # Word Sense Disambiguation
import re

def find_similar_noun(noun):
    """Return the first lemma name of the Lesk-selected sense that differs from `noun`.

    Lesk is called with `noun.split()` as its context. Returns None when no
    sense is found or when every lemma of the sense is spelled like `noun`.
    """
    sense = lesk(noun.split(), noun)
    if not sense:
        return None

    for lemma in sense.lemmas():
        candidate = lemma.name()
        if candidate != noun:
            return candidate

    return None

# Read the nouns, one per line, from nouns.txt
with open("nouns.txt", "r") as file:
    unique_nouns = [line.strip() for line in file]

# For each noun use the WSD-based synonym when there is one, else the noun itself
replaced_nouns = [find_similar_noun(noun) or noun for noun in unique_nouns]

# Save the result line-aligned with the input
output_file = "replaced_nouns.txt"
with open(output_file, "w") as file:
    file.writelines(noun + "\n" for noun in replaced_nouns)

print(f"Replaced nouns saved to {output_file}")


Replaced nouns saved to replaced_nouns.txt


In [None]:
from nltk.corpus import wordnet
from nltk.wsd import lesk  # Word Sense Disambiguation
from nltk.corpus import wordnet_ic  # Information Content
import re

def find_similar_noun(noun):
    """Pick a synonym of `noun` from the sense that Lesk selects for it.

    The context given to Lesk is `noun.split()`. The result is the first lemma
    name of that sense that is not identical to `noun`, or None when Lesk finds
    no sense or the sense offers no other spelling.
    """
    sense = lesk(noun.split(), noun)
    if sense:
        names = [lemma.name() for lemma in sense.lemmas()]
        return next((name for name in names if name != noun), None)
    return None

def find_synonyms(noun):
    """Return the lemma names of all WordNet synsets of `noun`, in synset order.

    Names are not de-duplicated and may include `noun` itself. The list is
    empty when WordNet has no synset for the noun.
    """
    return [lemma.name() for syn in wordnet.synsets(noun) for lemma in syn.lemmas()]

# Read the nouns, one per line, from nouns.txt
with open("nouns.txt", "r") as file:
    unique_nouns = [line.strip() for line in file]

# Initialize a list to store replaced nouns
replaced_nouns = []

# Apply WSD logic to extract similar words for each noun
for noun in unique_nouns:
    similar_word = find_similar_noun(noun)
    if not similar_word:
        # WSD gave nothing: fall back to the first WordNet synonym that actually
        # differs from the noun (the very first synonym can be the noun itself).
        # If there is none, keep the original noun.
        similar_word = next(
            (synonym for synonym in find_synonyms(noun) if synonym != noun), noun
        )
    replaced_nouns.append(similar_word)

# Save the replaced nouns to a new text file, line-aligned with the input
output_file = "replaced_nouns2.txt"
with open(output_file, "w") as file:
    for noun in replaced_nouns:
        file.write(noun + "\n")

print(f"Replaced nouns saved to {output_file}")


Replaced nouns saved to replaced_nouns2.txt


**extracting plural nouns**

In [None]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import words

# Download the NLTK words dataset if not already downloaded
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def extract_plural_nouns(word_list):
    """Return the words of `word_list` that end in 's' and occur in the NLTK `words` corpus.

    Input order and duplicates are preserved. The corpus is read once per call
    and turned into a set, instead of re-reading the whole word list for every
    candidate.
    """
    vocabulary = set(words.words())
    return [word for word in word_list if word.endswith('s') and word in vocabulary]

# Where the nouns come from and where the plural nouns go
input_file_path = 'nouns.txt'
output_file_path = 'plural_nouns.txt'

# Load the noun list as one text
with open(input_file_path, 'r') as f:
    text = f.read()

# Tokenize and POS-tag the text
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)

# Keep the tokens tagged NNS, then filter them with extract_plural_nouns
nns_words = []
for word, tag in tagged_tokens:
    if tag == 'NNS':
        nns_words.append(word)
plural_nouns = extract_plural_nouns(nns_words)

# One plural noun per line
with open(output_file_path, 'w') as f:
    f.writelines(noun + '\n' for noun in plural_nouns)

print(f"Extracted {len(plural_nouns)} plural nouns and saved them to '{output_file_path}'.")


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Extracted 41 plural nouns and saved them to 'plural_nouns.txt'.


**extracting multi-word nouns due to their different representation in SBN.**

In [None]:
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def extract_multi_word_nouns(text):
    """Return the compound nouns found in `text`, one string per head noun.

    A compound noun is a NOUN head together with its direct children whose
    dependency label is 'compound', joined in sentence order. Other children of
    the head (determiners, adjectives, ...) are left out, and a head with
    several compound modifiers yields a single phrase rather than one copy per
    modifier.
    """
    doc = nlp(text)
    multi_word_nouns = []
    seen_heads = set()
    for token in doc:
        if token.dep_ == 'compound' and token.head.pos_ == 'NOUN':
            head = token.head
            # Emit each head only once, however many compound modifiers it has
            if head.i in seen_heads:
                continue
            seen_heads.add(head.i)
            parts = [child for child in head.children if child.dep_ == 'compound']
            parts.append(head)
            parts.sort(key=lambda part: part.i)
            multi_word_nouns.append(' '.join(part.text for part in parts))
    return multi_word_nouns

# Source text and destination for the extracted phrases
input_file_path = 'nouns.txt'
output_file_path = 'multi_word_nouns.txt'

# Pull the entire input file into memory
with open(input_file_path, 'r') as f:
    text = f.read()

# Run the compound-noun extraction over the loaded text
multi_word_nouns = extract_multi_word_nouns(text)

# Store every phrase on its own line
with open(output_file_path, 'w') as f:
    for noun_phrase in multi_word_nouns:
        print(noun_phrase, file=f)

print(f"Extracted {len(multi_word_nouns)} multi-word nouns and saved them to '{output_file_path}'.")


Extracted 1787 multi-word nouns and saved them to 'multi_word_nouns.txt'.


**Dealing with constraints in noun extraction.**
1- singular / plural
2- upper case and lower case
3- multi-word nouns

In [None]:
!pip install inflect

In [None]:
import re
import inflect

# Initialize the inflect engine for singularization and pluralization
p = inflect.engine()

# Given data examples
data_examples = [
    "It took him three months to learn to ride a bicycle.	time.n.08 TPR now male.n.02 quantity.n.01 EQU 3 month.n.02 Quantity -1 learn.v.01 Time -4 Agent -3 Duration -1 Topic +1 ride.v.02 Agent -4 Theme +1 bicycle.n.01",
    "He is ill.	male.n.02 time.n.08 EQU now ill.a.01 AttributeOf -2 Time -1",
    "They will be jealous.	person.n.01 time.n.08 TSU now jealous.a.01 Experiencer -2 Time -1",
    "I'm running out of ideas.	person.n.01 EQU speaker time.n.08 EQU now run_out.v.06 Source -2 Time -1 Theme +1 idea.n.01",
    "The girl is deeply attached to her aunt.	girl.n.01 time.n.08 EQU now deeply.r.01 attached.a.04 Experiencer -3 Time -2 Degree -1 Stimulus +2 female.n.02 person.n.01 Role +1 aunt.n.01 Of -2",
]

# Define a function to extract nouns from the logical representation
def extract_nouns_from_logical(logical_representation):
    """Return the lemma part of every noun synset token in a logical form.

    The representation is split on whitespace. A token counts as a noun when
    it begins with at least one character followed by ``.n.`` and digits
    (letter case is ignored). Lemmas come back in order of appearance,
    duplicates included.
    """
    noun_token = re.compile(r'(.+)\.n\.(\d+)', re.IGNORECASE)
    hits = (noun_token.match(piece) for piece in logical_representation.split())
    return [hit.group(1) for hit in hits if hit]

# Define a function to match logical nouns with words in the text
def match_logical_nouns_with_text(logical_nouns, text):
    """Find the words of ``text`` that realise one of the logical nouns.

    A word matches a noun when, ignoring case and surrounding punctuation, it
    equals the noun, equals the singular/plural form produced by the
    module-level ``inflect`` engine ``p``, or contains the noun as a whole word.

    Args:
        logical_nouns: Noun lemmas taken from the logical representation.
        text: Sentence to scan; it is split on whitespace.

    Returns:
        list: Matched words in sentence order with leading/trailing punctuation
        removed. A word is reported once even when several nouns match it.
    """
    # Prepare every noun once instead of re-inflecting it for each word.
    prepared = []
    for noun in logical_nouns:
        forms = {noun.lower()}
        for inflected in (p.singular_noun(noun), p.plural_noun(noun)):
            if inflected:  # singular_noun() yields False when nothing was singularised
                forms.add(inflected.lower())
        # Escape the noun so regex metacharacters in a lemma are taken literally.
        whole_word = re.compile(rf'\b{re.escape(noun)}\b', re.IGNORECASE)
        prepared.append((forms, whole_word))

    matched_nouns = []
    for word in text.split():
        # "ideas." has to compare equal to "ideas": drop punctuation at the edges.
        bare_word = re.sub(r'^\W+|\W+$', '', word)
        if not bare_word:
            continue
        lowered = bare_word.lower()
        if any(lowered in forms or whole_word.search(bare_word) for forms, whole_word in prepared):
            matched_nouns.append(bare_word)
    return matched_nouns

# Run the extraction and matching steps over every example and show the result
for example in data_examples:
    # Each example holds the sentence and its logical form separated by a tab
    parts = example.split('\t')
    text, logical_representation = parts[0], parts[1]

    # Noun lemmas named by the logical representation
    logical_nouns = extract_nouns_from_logical(logical_representation)

    # Sentence words that correspond to those lemmas
    matched_nouns = match_logical_nouns_with_text(logical_nouns, text)

    print(f"Text: {text}\nMatched Nouns: {matched_nouns}\n")


Text: It took him three months to learn to ride a bicycle.
Matched Nouns: ['months', 'bicycle.']

Text: He is ill.
Matched Nouns: []

Text: They will be jealous.
Matched Nouns: []

Text: I'm running out of ideas.
Matched Nouns: []

Text: The girl is deeply attached to her aunt.
Matched Nouns: ['girl', 'aunt.']



In [None]:
import re
import inflect

# Initialize the inflect engine for singularization and pluralization
p = inflect.engine()

# Define a function to extract nouns from the logical representation
def extract_nouns_from_logical(logical_representation):
    """Return the lemma part of every noun synset token in a logical form.

    Whitespace separates the tokens. A token is treated as a noun when it
    begins with at least one character followed by ``.n.`` and digits, with
    letter case ignored. The lemmas keep their order of appearance and
    repeated nouns are returned repeatedly.
    """
    lemmas = []
    noun_shape = re.compile(r'(.+)\.n\.(\d+)', re.IGNORECASE)
    for piece in logical_representation.split():
        found = noun_shape.match(piece)
        if found is not None:
            lemmas.append(found.group(1))
    return lemmas

# Define a function to match logical nouns with words in the text
def match_logical_nouns_with_text(logical_nouns, text):
    """Find the words of ``text`` that realise one of the logical nouns.

    A word matches a noun when, ignoring case and surrounding punctuation, it
    equals the noun, equals the singular/plural form produced by the
    module-level ``inflect`` engine ``p``, or contains the noun as a whole word.

    Args:
        logical_nouns: Noun lemmas taken from the logical representation.
        text: Sentence to scan; it is split on whitespace.

    Returns:
        list: Matched words in sentence order with leading/trailing punctuation
        removed. A word is reported once even when several nouns match it.
    """
    # Inflect and compile per noun up front; the word loop then only does lookups.
    prepared = []
    for noun in logical_nouns:
        forms = {noun.lower()}
        for inflected in (p.singular_noun(noun), p.plural_noun(noun)):
            if inflected:  # singular_noun() yields False when nothing was singularised
                forms.add(inflected.lower())
        # re.escape keeps lemmas containing regex metacharacters from breaking the pattern.
        whole_word = re.compile(rf'\b{re.escape(noun)}\b', re.IGNORECASE)
        prepared.append((forms, whole_word))

    matched_nouns = []
    for word in text.split():
        # Sentence-final tokens such as "ideas." must still match "ideas".
        bare_word = re.sub(r'^\W+|\W+$', '', word)
        if not bare_word:
            continue
        lowered = bare_word.lower()
        if any(lowered in forms or whole_word.search(bare_word) for forms, whole_word in prepared):
            matched_nouns.append(bare_word)
    return matched_nouns

# Read data examples from a text file
with open("train.txt", "r") as file:
    data_examples = file.readlines()

# Create a file to save the extracted nouns
with open("extracted_nouns.txt", "w") as noun_file:
    for example in data_examples:
        example = example.strip()  # Remove leading/trailing whitespace
        parts = example.split('\t')
        # A usable line holds the sentence and its logical form separated by a
        # tab. Blank or tab-less lines used to stop the run with an IndexError
        # on parts[1]; they are skipped instead.
        if len(parts) < 2:
            continue
        text = parts[0]
        logical_representation = parts[1]

        # Extract nouns from logical representation
        logical_nouns = extract_nouns_from_logical(logical_representation)

        # Match logical nouns with words in the text
        matched_nouns = match_logical_nouns_with_text(logical_nouns, text)

        # Save the extracted nouns in the noun file with tab separation
        noun_file.write(f"{text}\t{', '.join(matched_nouns)}\n")

print("Nouns extracted and saved in 'extracted_nouns.txt'.")


Nouns extracted and saved in 'extracted_nouns.txt'.


In [None]:
# Read the contents of the input file
with open('extracted_nouns.txt', 'r') as file:
    lines = file.readlines()

# Initialize an empty list to store nouns
nouns = []

# Extract nouns from each line and add them to the list
for line in lines:
    line = line.strip()
    if '\t' in line:
        # Split on the first tab only, so a stray extra tab cannot break the
        # two-value unpacking.
        _, nouns_in_line = line.split('\t', 1)
        if nouns_in_line:
            nouns.extend(nouns_in_line.split(', '))

# Filter out empty entries and remove duplicate nouns while keeping first-seen
# order. list(set(...)) produced a different order on every interpreter run,
# which made nouns.txt (and every file derived from it) non-reproducible.
nouns = list(dict.fromkeys(filter(None, nouns)))

# Write the extracted nouns to the "nouns.txt" file
with open('nouns.txt', 'w') as output_file:
    output_file.write('\n'.join(nouns))

print("Nouns extracted and saved to 'nouns.txt'")


Nouns extracted and saved to 'nouns.txt'


In [None]:
import string

# Function to remove punctuation marks from a word, except for hyphens
def remove_punctuation_except_hyphen(word):
    """Return ``word`` without ASCII punctuation, keeping any '-' characters."""
    unwanted = string.punctuation.replace('-', '')
    return word.translate(str.maketrans('', '', unwanted))

# Load the raw noun list, one entry per line
with open('nouns.txt', 'r') as file:
    nouns = file.readlines()

# Trim surrounding whitespace, then drop punctuation (hyphens survive)
cleaned_nouns = [remove_punctuation_except_hyphen(entry) for entry in map(str.strip, nouns)]

# Store the cleaned entries, newline-separated, in a separate file
with open('cleaned_nouns.txt', 'w') as output_file:
    print(*cleaned_nouns, sep='\n', end='', file=output_file)

print("Punctuation removed (except hyphens), and cleaned nouns saved to 'cleaned_nouns.txt'")


Punctuation removed (except hyphens), and cleaned nouns saved to 'cleaned_nouns.txt'


In [None]:
!sort cleaned_nouns.txt | uniq > unique_nouns.txt

**Extracting Entities that are the fundamental of SBN notation.**

In [None]:
import re

# Regular expression pattern to match generic entities
# (tokens shaped like "<lemma>.n.<digits>", "<lemma>.a.<digits>" or "<lemma>.v.<digits>").
# The lemma part accepts any run of non-space, non-dot characters so that
# multi-word or hyphenated synsets such as "run_out.v.06" are recognised as
# generic; the former "[a-z]+" let them through as entities.
# NOTE(review): adverb synsets ("deeply.r.01") are still reported as entities — confirm that is intended.
generic_entity_pattern = re.compile(r'^[^\s.]+\.[nav]\.\d+$', re.IGNORECASE)

# Function to extract entities excluding generic patterns
def extract_entities(sbn):
    """Return the whitespace-separated tokens of ``sbn`` that are not generic synsets.

    Args:
        sbn: One line of logical-form text.

    Returns:
        list: Tokens that do not match ``generic_entity_pattern``, in their
        original order (duplicates kept).
    """
    return [word for word in sbn.split() if not generic_entity_pattern.match(word)]

# Load the logical-form lines from sbn.txt
with open('sbn.txt', 'r') as file:
    train_data = file.readlines()

# Write every extracted entity to entity.txt, one per line
with open('entity.txt', 'w') as entity_file:
    for sbn in train_data:
        entities = extract_entities(sbn)
        for entity in entities:
            print(entity, file=entity_file)

print("Entities extracted and saved to entity.txt (one entity per line).")


Entities extracted and saved to entity.txt (one entity per line).


In [None]:
!sort entity.txt | uniq > unique_entity.txt

Converting to lower case.

In [None]:
# Read the capitalised plural list and derive its lower-case version
with open('cap_noun_plurals.txt', 'r') as file:
    text = file.read()
lower_text = text.lower()

# Save the lower-cased content
with open('lower_nouns_plurals.txt', 'w') as file:
    file.write(lower_text)


converting to cap-case.

In [None]:
# Read a file of nouns and write a copy in which the first letter of every
# word is capitalised.
#
# Both files are opened in a `with` block so they are closed as soon as the
# loop ends. The earlier version never closed them, so the buffered output was
# not guaranteed to be on disk when a later cell reads 'replacable_nouns_cap.txt'.
with open('replacable_nouns_lower.txt', 'r') as file_gfg, \
        open('replacable_nouns_cap.txt', 'w') as cap_name:
    # Traverse the input line by line
    for line in file_gfg:
        # str.title() upper-cases the first letter of every word in the line
        output = line.title()
        cap_name.write(output)

making plurals of nouns

In [None]:
import inflect

# Initialize the inflect engine
p = inflect.engine()

# Read the list of words from a text file
# just change the name for lower or upper case file.
with open('replacable_nouns_cap.txt', 'r') as file:
    words = [line.strip() for line in file]

# Generate the plural forms. A blank input line stays blank, so the output
# keeps one line per input line without handing an empty string to inflect.
plural_forms = [p.plural(word) if word else '' for word in words]

# Save the plural forms
plurals_path = 'rep_cap_nouns_plurals.txt'
with open(plurals_path, 'w') as output_file:
    for plural in plural_forms:
        output_file.write(plural + '\n')

# Report the file that was actually written; the old message named
# 'lower_nouns_plurals.txt', which this cell does not touch.
print(f"Plural forms saved in '{plurals_path}'.")


Plural forms saved in 'lower_nouns_plurals.txt'.


**Shuffling Nouns to make inside-context-wo-SS dictionary**

In [None]:
import random


# Function to shuffle the content of a text file
def shuffle_file_content(input_file, output_file, seed=None):
    """Write the lines of ``input_file`` to ``output_file`` in random order.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to create or overwrite.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the order is reproducible; when ``None`` the module-level
            ``random`` state is used, as before.
    """
    # Read data from the input file
    with open(input_file, 'r') as file:
        data = file.readlines()

    # A final line without a trailing newline would be glued to its successor
    # once the shuffle moves it away from the end of the file.
    if data and not data[-1].endswith('\n'):
        data[-1] += '\n'

    # Shuffle the data randomly
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(data)

    # Write the shuffled data to the output file
    with open(output_file, 'w') as file:
        file.writelines(data)


# Source list and destination for its shuffled copy
input_file_name = 'unique_nouns.txt'  # Replace with your input file name
output_file_name = 'rep_unique_nouns.txt'  # Replace with your desired output file name

# Produce the shuffled copy of the input file
shuffle_file_content(input_file_name, output_file_name)

print("Shuffled content saved to " + output_file_name)


Shuffled content saved to rep_unique_nouns.txt


**Data Pre-Processing for Common Noun Augmentation**

1.   Converting extracted nouns from singular + plural --> all singular.
2.   Removing duplication. (lower-case singular data)
3. Converting into cap-case. (cap-case singular data)
4. Converting singular into plurals for both lower-case and cap-case nouns.



**1- Converting singular + plural nouns into singular nouns**

In [None]:
!pip install inflect

In [None]:
import inflect

# Create an inflect engine
p = inflect.engine()

# Read the list of nouns from the file, ignoring blank lines
with open("nouns.txt", "r") as file:
    unique_nouns = [line.strip() for line in file if line.strip()]

# Create a list to store the modified nouns
modified_nouns = []

# Create a dictionary mapping each converted plural noun to its singular form
modified_dict = {}

# Iterate through the nouns while preserving order
for noun in unique_nouns:
    # singular_noun() is called once per noun; a falsy result means the noun
    # was not converted and is kept as it is.
    singular_noun = p.singular_noun(noun)
    if singular_noun:
        modified_dict[noun] = singular_noun
        modified_nouns.append(singular_noun)
    else:
        # The former "noun not in modified_dict" test was always true here:
        # only nouns that take the branch above are ever stored as keys.
        modified_nouns.append(noun)

# Write the modified nouns to a new file
singular_nouns_path = "lower_singular_nouns.txt"
with open(singular_nouns_path, "w") as output_file:
    for noun in modified_nouns:
        output_file.write(noun + "\n")

# Report the file that was actually written (the old message said 'singular_nouns.txt')
print(f"Modified nouns have been saved to '{singular_nouns_path}'")


Modified nouns have been saved to 'singular_nouns.txt'


**2- Removing duplicate nouns without changing the order**

In [None]:
# Read the list of nouns from a file (one noun per line)
with open("lower_singular_nouns.txt", "r") as file:
    nouns = [line.strip() for line in file]

# Dictionaries keep insertion order, so building one from the nouns drops
# duplicates while preserving the position of each first occurrence.
unique_nouns_dict = dict.fromkeys(nouns, True)
unique_nouns = list(unique_nouns_dict)

# Write the unique nouns to a text file
unique_nouns_path = "unique_lower_singular_nouns.txt"
with open(unique_nouns_path, "w") as output_file:
    for noun in unique_nouns:
        output_file.write(noun + "\n")

# Report the file that was actually written. The old message named
# 'unique_nouns.txt', which is a different file produced by an earlier cell.
print(f"Unique nouns have been saved to '{unique_nouns_path}'")


Unique nouns have been saved to 'unique_nouns.txt'


**working with Noun Augmentation based-on WordNet Synset and Hypernym based noun replacements**.

*   Dataset is PMB-5.0.0
*   Noun Augmentation



In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

# Sample list of nouns and the noun to replace
noun_list = ["animal", "fruit", "vehicle"]
original_noun = "animal"

# Function to replace a noun with a hyponym
def replace_with_hyponym(text, original, hyponym):
    """Replace every whole-word occurrence of ``original`` in ``text``.

    Plain ``str.replace`` also rewrote the noun inside longer words
    ("cat" in "category"). Only occurrences that are not directly preceded or
    followed by a word character are replaced now, so inflected forms such as
    "cats" are left untouched as well.

    Args:
        text: Sentence to rewrite.
        original: Noun to look for (case-sensitive).
        hyponym: Replacement text, inserted literally.

    Returns:
        str: The rewritten sentence; ``text`` unchanged when ``original`` is empty.
    """
    import re  # local import: this cell does not import re itself

    if not original:
        return text
    whole_word = re.compile(rf'(?<!\w){re.escape(original)}(?!\w)')
    # A function replacement keeps backslashes in ``hyponym`` literal.
    return whole_word.sub(lambda _match: hyponym, text)

# Find hyponyms for the original noun
# The previous loop collected the lemma names of the noun's own synsets, so the
# candidate list contained the noun itself and the "replacement" could leave
# the sentence unchanged. Collect the lemmas of the hyponym synsets instead.
hyponyms = []
for synset in wordnet.synsets(original_noun, pos='n'):
    for hyponym_synset in synset.hyponyms():
        for lemma in hyponym_synset.lemmas():
            # Multi-word lemma names use "_" where running text uses a space
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != original_noun.lower() and candidate not in hyponyms:
                hyponyms.append(candidate)

# Choose a replacement (e.g., randomly)
import random
# random.choice() raises IndexError on an empty list; keep the noun when no hyponym exists
replacement = random.choice(hyponyms) if hyponyms else original_noun

# Sample text
text = "The quick brown animal jumps over the lazy dog."

# Replace the noun in the text
new_text = replace_with_hyponym(text, original_noun, replacement)
print(new_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...


The quick brown animal jumps over the lazy dog.


**code for common noun augmentation on dataset file**

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to check if a word is a common noun and exists in various forms in the text
def is_common_noun_in_text(word, text):
    """Tell whether ``word`` occurs in ``text`` as a whole token.

    The lower-cased word, its lemmatised variants (module-level ``lemmatizer``)
    and its "+s" plural are compared with the lower-cased tokens of ``text``
    and with the lemmas of those tokens. The earlier substring test also
    accepted "man" inside "woman", and its upper-case forms could never match
    the lower-cased text.

    Args:
        word: Noun lemma to look for.
        text: Sentence to search.

    Returns:
        bool: True when at least one form equals a token or a token lemma.
    """
    import re  # local import: this cell does not import re itself

    base = word.lower()
    forms = {
        base,                                  # Lower-case
        lemmatizer.lemmatize(base),            # Singular
        lemmatizer.lemmatize(base, 'n'),       # Singular (explicitly as noun)
        lemmatizer.lemmatize(base, 'v'),       # Base form (verb)
        base + 's',                            # Plural
        lemmatizer.lemmatize(base + 's'),      # Plural (lemmatized)
    }

    tokens = set()
    for raw in text.lower().split():
        # Keep the whole word without edge punctuation ("t-shirt.") ...
        tokens.add(re.sub(r'^\W+|\W+$', '', raw))
        # ... and its alphanumeric pieces ("don't" -> "don", "t").
        tokens.update(re.findall(r'\w+', raw))
    tokens.discard('')
    # Lemmatising the tokens lets plural surface forms match their singular lemma.
    tokens |= {lemmatizer.lemmatize(token) for token in tokens}

    return not forms.isdisjoint(tokens)

# Function to replace common nouns with hypernyms, considering case sensitivity and singular/plural
def replace_common_nouns_with_hypernyms(sentence, logical_representation):
    """Swap each noun of a sentence/logical-form pair for its first WordNet hypernym.

    For every ``<lemma>.n.<NN>`` token of the logical representation whose
    lemma is found in the sentence (``is_common_noun_in_text``):

    * the noun synset is taken from ``wordnet.synsets(lemma, pos='n')`` at the
      position given by the sense number, falling back to the first noun
      synset when that number is out of range
      (NOTE(review): assumes the sense numbers follow the installed WordNet — confirm);
    * whole-word occurrences of the lemma in the sentence, optionally followed
      by "s", are replaced by the hypernym lemma; a leading capital and the
      "s" are carried over;
    * the synset token itself is replaced by the hypernym's full synset name,
      so lemma and sense number stay consistent.

    The logical form is changed only when the sentence was changed too, which
    keeps both sides of the pair aligned. Earlier substring replacement also
    rewrote unrelated words and tokens and left the old sense number attached
    to the new lemma.

    Args:
        sentence: Natural-language sentence.
        logical_representation: Whitespace-separated logical form of the sentence.

    Returns:
        tuple: ``(replaced_sentence, replaced_logical_representation)``.
    """
    import re  # local import: this cell does not import re itself

    replaced_sentence = sentence
    processed = set()
    for token in logical_representation.split():
        parsed = re.fullmatch(r'(.+)\.n\.(\d+)', token)
        if not parsed or token in processed:
            continue
        processed.add(token)
        word, sense = parsed.group(1), int(parsed.group(2))
        if not is_common_noun_in_text(word, sentence):
            continue

        noun_synsets = wordnet.synsets(word, pos='n')
        if not noun_synsets:
            continue
        synset = noun_synsets[sense - 1] if 1 <= sense <= len(noun_synsets) else noun_synsets[0]
        hypernyms = synset.hypernyms()
        if not hypernyms:
            continue
        hypernym_name = hypernyms[0].name()  # Choose the first hypernym for simplicity
        # Multi-word lemma names use "_" where running text uses a space
        surface = hypernym_name.split('.')[0].replace('_', ' ')

        def _swap(match, surface=surface):
            # Carry the capitalisation and the plural "s" of the matched word over.
            head = surface[:1].upper() + surface[1:] if match.group(1)[:1].isupper() else surface
            return head + match.group(2)

        pattern = re.compile(rf'(?<!\w)({re.escape(word)})(s?)(?!\w)', re.IGNORECASE)
        candidate_sentence, hits = pattern.subn(_swap, replaced_sentence)
        if not hits:
            continue
        replaced_sentence = candidate_sentence
        logical_representation = re.sub(
            rf'(?<!\S){re.escape(token)}(?!\S)',
            lambda _match, name=hypernym_name: name,
            logical_representation,
        )

    return replaced_sentence, logical_representation

# Read the dataset from "gold.sbn"
input_filename = "gold.sbn"
output_filename = "output.sbn"

with open(input_filename, "r") as infile:
    lines = infile.readlines()

# Perform common noun replacement and save the output
replaced_dataset = []

for line in lines:
    stripped = line.strip()
    # Blank lines and lines without a tab carry no (sentence, logical form)
    # pair. Unpacking them used to raise ValueError and stop the whole run.
    if '\t' not in stripped:
        continue
    # Split on the first tab only, so extra tabs stay in the logical form.
    sentence, logical_representation = stripped.split('\t', 1)
    replaced_sentence, replaced_logical_representation = replace_common_nouns_with_hypernyms(sentence, logical_representation)
    replaced_dataset.append((replaced_sentence, replaced_logical_representation))

# Save the output to "output.sbn"
with open(output_filename, "w") as outfile:
    for sentence, logical_representation in replaced_dataset:
        outfile.write(f"{sentence}\t{logical_representation}\n")


In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import random  # Import the random module

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to check if a word is a common noun and exists in various forms in the text
def is_common_noun_in_text(word, text):
    """Tell whether ``word`` occurs in ``text`` as a whole token.

    The lower-cased word, its lemmatised variants (module-level ``lemmatizer``)
    and its "+s" plural are compared with the lower-cased tokens of ``text``
    and with the lemmas of those tokens. The earlier substring test also
    accepted "man" inside "woman", and its upper-case forms could never match
    the lower-cased text.

    Args:
        word: Noun lemma to look for.
        text: Sentence to search.

    Returns:
        bool: True when at least one form equals a token or a token lemma.
    """
    import re  # local import: this cell does not import re itself

    base = word.lower()
    forms = {
        base,                                  # Lower-case
        lemmatizer.lemmatize(base),            # Singular
        lemmatizer.lemmatize(base, 'n'),       # Singular (explicitly as noun)
        lemmatizer.lemmatize(base, 'v'),       # Base form (verb)
        base + 's',                            # Plural
        lemmatizer.lemmatize(base + 's'),      # Plural (lemmatized)
    }

    tokens = set()
    for raw in text.lower().split():
        # Whole word without edge punctuation, plus its alphanumeric pieces
        tokens.add(re.sub(r'^\W+|\W+$', '', raw))
        tokens.update(re.findall(r'\w+', raw))
    tokens.discard('')
    # Lemmatising the tokens lets plural surface forms match their singular lemma.
    tokens |= {lemmatizer.lemmatize(token) for token in tokens}

    return not forms.isdisjoint(tokens)

# Function to replace common nouns with random hyponyms
def replace_common_nouns_with_hyponyms(sentence, logical_representation):
    """Swap each noun of a sentence/logical-form pair for a random WordNet hyponym.

    For every ``<lemma>.n.<NN>`` token of the logical representation whose
    lemma is found in the sentence (``is_common_noun_in_text``):

    * the noun synset is taken from ``wordnet.synsets(lemma, pos='n')`` at the
      position given by the sense number, falling back to the first noun
      synset when that number is out of range
      (NOTE(review): assumes the sense numbers follow the installed WordNet — confirm);
    * one hyponym of that synset is picked with ``random.choice``; hyponyms of
      the lemma's other senses are no longer pooled in;
    * whole-word occurrences of the lemma in the sentence, optionally followed
      by "s", are replaced by the hyponym lemma; a leading capital and the
      "s" are carried over;
    * the synset token itself is replaced by the hyponym's full synset name,
      so lemma and sense number stay consistent.

    The logical form is changed only when the sentence was changed too.

    Args:
        sentence: Natural-language sentence.
        logical_representation: Whitespace-separated logical form of the sentence.

    Returns:
        tuple: ``(replaced_sentence, replaced_logical_representation)``.
    """
    import re  # local import: this cell does not import re itself

    replaced_sentence = sentence
    processed = set()
    for token in logical_representation.split():
        parsed = re.fullmatch(r'(.+)\.n\.(\d+)', token)
        if not parsed or token in processed:
            continue
        processed.add(token)
        word, sense = parsed.group(1), int(parsed.group(2))
        if not is_common_noun_in_text(word, sentence):
            continue

        noun_synsets = wordnet.synsets(word, pos='n')
        if not noun_synsets:
            continue
        synset = noun_synsets[sense - 1] if 1 <= sense <= len(noun_synsets) else noun_synsets[0]
        hyponyms = synset.hyponyms()
        if not hyponyms:
            continue
        # Choose a random hyponym from the list
        hyponym_name = random.choice(hyponyms).name()
        # Multi-word lemma names use "_" where running text uses a space
        surface = hyponym_name.split('.')[0].replace('_', ' ')

        def _swap(match, surface=surface):
            # Carry the capitalisation and the plural "s" of the matched word over.
            head = surface[:1].upper() + surface[1:] if match.group(1)[:1].isupper() else surface
            return head + match.group(2)

        pattern = re.compile(rf'(?<!\w)({re.escape(word)})(s?)(?!\w)', re.IGNORECASE)
        candidate_sentence, hits = pattern.subn(_swap, replaced_sentence)
        if not hits:
            continue
        replaced_sentence = candidate_sentence
        logical_representation = re.sub(
            rf'(?<!\S){re.escape(token)}(?!\S)',
            lambda _match, name=hyponym_name: name,
            logical_representation,
        )

    return replaced_sentence, logical_representation

    # for word in logical_nouns:
    #     if is_common_noun_in_text(word, sentence):
    #         synsets = wordnet.synsets(word)
    #         if synsets:
    #             hypernyms = synsets[0].hypernyms()  # Get a list of hypernyms
    #             if hypernyms:
    #                 # Choose a random hypernym from the list
    #                 random_hypernym = random.choice(hypernyms)
    #                 hypernym_word = random_hypernym.name().split('.')[0]
    #                 # Replace nouns in both case-sensitive and case-insensitive forms
    #                 replaced_sentence = replaced_sentence.replace(word, hypernym_word)
    #                 replaced_sentence = replaced_sentence.replace(word.capitalize(), hypernym_word.capitalize())
    #                 logical_representation = logical_representation.replace(word, hypernym_word)
    #
    # return replaced_sentence, logical_representation

# Read the dataset from "gold.sbn"
input_filename = "gold.sbn"
output_filename = "noun_replacement_through_random_hyponyms.sbn"

with open(input_filename, "r") as infile:
    lines = infile.readlines()

# Perform common noun replacement and save the output
replaced_dataset = []

for line in lines:
    stripped = line.strip()
    # Blank lines and lines without a tab carry no (sentence, logical form)
    # pair. Unpacking them used to raise ValueError and stop the whole run.
    if '\t' not in stripped:
        continue
    # Split on the first tab only, so extra tabs stay in the logical form.
    sentence, logical_representation = stripped.split('\t', 1)
    replaced_sentence, replaced_logical_representation = replace_common_nouns_with_hyponyms(sentence,
                                                                                             logical_representation)
    replaced_dataset.append((replaced_sentence, replaced_logical_representation))

# Save the output to the file named by output_filename
with open(output_filename, "w") as outfile:
    for sentence, logical_representation in replaced_dataset:
        outfile.write(f"{sentence}\t{logical_representation}\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
