Modifying the placeholders according to the NER model's tag set:
1. all names --> PERSON
2. all cities, states, and countries --> GPE
**start from here**

**Use the large spaCy model to extract the named entities from the text, replace each entity with its tag, and save the modified text and the named entities to new files.**

The code below produces:
1. an updated text file in which the named entities are replaced by their tags.
2. a dictionary of the named entities and their tags.

In [None]:
# One-time setup: download the large English spaCy model (en_core_web_lg)
# that the NER and noun-replacement cells load.
!python -m spacy download en_core_web_lg 


In [None]:
import re

import spacy

nlp = spacy.load('en_core_web_lg')

# Read the whole input file into memory
with open('text.txt', 'r') as file:
    text = file.read()

# Run the spaCy pipeline over the full text
doc = nlp(text)

# Entity labels whose mentions are replaced by the label itself
replace_labels = {'PERSON', 'GPE'}

# Maps the surface text of every replaced entity to its label.
# If the same text is tagged with different labels, the last label wins.
ner_dict = {}
for ent in doc.ents:
    if ent.label_ in replace_labels:
        ner_dict[ent.text] = ent.label_

# Replace all entities in ONE regex pass instead of repeated str.replace calls:
#   * the look-arounds reject matches glued to other word characters, so
#     "Tom" is no longer rewritten inside "Tomorrow";
#   * longer entities come first in the alternation, so "New York" is not
#     broken up by a shorter entity such as "York";
#   * tags that were already inserted are never rescanned, so one entity
#     cannot corrupt the tag written for another.
# Every whole-word occurrence of a recognised entity is still replaced, even
# where spaCy missed that particular mention.
if ner_dict:
    alternatives = sorted(ner_dict, key=len, reverse=True)
    entity_pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(name) for name in alternatives) + r')(?!\w)'
    )
    text = entity_pattern.sub(lambda match: ner_dict[match.group(0)], text)

# Write the modified text to a file
with open('modified_text.txt', 'w') as file:
    file.write(text)

# Write the named entity dictionary to a file, one 'entity':'TAG', entry per line
with open('named_entities.txt', 'w') as file:
    for key, value in ner_dict.items():
        file.write(f"'{key}':'{value}',\n")


**Applying unlexicalization on NOUNS.**

In [None]:
import spacy

nlp = spacy.load('en_core_web_lg')

# Read the whole input file into memory
with open('text.txt', 'r') as file:
    text = file.read()

# Run the spaCy pipeline over the full text
doc = nlp(text)

# Part-of-speech tags whose tokens are replaced by the tag itself
noun_labels = {'NOUN'}

# Maps every replaced noun to its part-of-speech tag
ner_dict = {}

# Rebuild the text token by token instead of calling text.replace():
# a global string replace also rewrites the noun wherever it appears as a
# substring (e.g. "car" inside "careful") and wherever the same spelling
# was NOT tagged as a noun. Each token's trailing whitespace is kept so the
# rest of the text is reproduced exactly.
pieces = []
for token in doc:
    if token.pos_ in noun_labels:
        ner_dict[token.text] = token.pos_
        pieces.append(token.pos_ + token.whitespace_)
    else:
        pieces.append(token.text_with_ws)
text = ''.join(pieces)

# Write the modified text to a file
with open('modified_text.txt', 'w') as file:
    file.write(text)

# Write the noun dictionary to a file, one 'noun':'NOUN', entry per line
with open('named_entities.txt', 'w') as file:
    for key, value in ner_dict.items():
        file.write(f"'{key}':'{value}',\n")


dictionary processing for DRS.

converting Brad Pitt --> Brad~Pitt

In [None]:
# Join the whitespace-separated words of every dictionary line with "~"
# (so 'Brad Pitt' becomes 'Brad~Pitt') and store the result in dictionary.txt.
with open('named_entities.txt', 'r') as source:
    entries = source.readlines()

# str.split() with no argument also drops the trailing newline, so it is
# added back explicitly for every line.
joined_entries = ['~'.join(entry.split()) + '\n' for entry in entries]

with open('dictionary.txt', 'w') as target:
    target.writelines(joined_entries)


In DRS, all names are lowercase, so the dictionary is converted to lowercase.

In [None]:
# Lower-case dictionary.txt in place. The whole file content is lower-cased,
# which includes the tags as well as the entity names.
with open('dictionary.txt', 'r') as source:
    lowered = source.read().lower()

with open('dictionary.txt', 'w') as target:
    target.write(lowered)


converting tag into UPPER CASE.

In [None]:
# Restore the upper-case PERSON tag in dictionary.txt (edited in place).
# sed replaces every occurrence of "person" on a line, not only the tag.
!sed -i 's/person/PERSON/g' dictionary.txt

In [None]:
# Restore the upper-case GPE tag; every occurrence of "gpe" is replaced.
!sed -i 's/gpe/GPE/g' dictionary.txt

**All of the code above pre-processes the original text to unlexicalize the named entities.**

The general command is `sed -i 's/placeholder-name/spaCy-tag/g' file-name`

In [None]:
# Example: replace every "Country_2" placeholder in unlex1.txt with the GPE tag (in place).
!sed -i 's/Country_2/GPE/g' unlex1.txt

**Post-Processing of Named Entities**

In [None]:
import spacy
import pandas as pd

# Small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Example sentence A and its parsed form
sentence_a = "Tom and Mary buy a box from the supermarket in Boston."
doc_a = nlp(sentence_a)

# (text, label) pairs for every named entity found in sentence A
entities = []
for ent in doc_a.ents:
    entities.append((ent.text, ent.label_))

# Surface text of every token tagged as a common noun
nouns = []
for token in doc_a:
    if token.pos_ == "NOUN":
        nouns.append(token.text)

# One table holding the entities first, followed by the nouns labelled "NOUN"
noun_rows = [(noun, "NOUN") for noun in nouns]
df = pd.DataFrame(entities + noun_rows, columns=["text", "label"])

# Show the table
print(df)


          text   label
0          Tom  PERSON
1         Mary  PERSON
2       Boston     GPE
3          box    NOUN
4  supermarket    NOUN


In [None]:
# Template sentence in which PERSON, NOUN and GPE act as placeholders
sentence_b = "PERSON and PERSON buy a NOUN from NOUN at GPE."

# Split the table built in the previous cell into entities and nouns
entities_df = df[df["label"].isin(["PERSON", "GPE"])]
nouns_df = df[df["label"] == "NOUN"]

# Fill the placeholders left to right: the n-th entity of a label replaces the
# n-th remaining occurrence of that label. Looping over the whole group avoids
# the hard-coded iloc[0]/iloc[1], which raised IndexError when fewer than two
# PERSON entities were found and ignored any entity beyond the second.
for label, group in entities_df.groupby("label"):
    for entity_text in group["text"]:
        sentence_b = sentence_b.replace(label, entity_text, 1)
for noun_text in nouns_df["text"]:
    sentence_b = sentence_b.replace("NOUN", noun_text, 1)

print(sentence_b)


Tom and Mary buy a box from supermarket at Boston.


applying above logic on text files.


In [None]:
import spacy
import pandas as pd

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Read the lexicalized sentence (a.txt) and the placeholder template (b.txt)
with open("a.txt", "r") as f:
    sentence_a = f.read().strip()
with open("b.txt", "r") as f:
    sentence_b_template = f.read().strip()

# Accept both bracketed ([PERSON]) and bare (PERSON) placeholders
sentence_b = sentence_b_template.replace("[PERSON]", "PERSON").replace("[NOUN]", "NOUN").replace("[GPE]", "GPE")

# Apply spaCy to sentence A to find the entities and nouns to substitute back
doc = nlp(sentence_a)
entities_df = pd.DataFrame([(ent.text, ent.label_) for ent in doc.ents], columns=["text", "label"])
nouns_df = pd.DataFrame([(token.text, token.pos_) for token in doc if token.pos_ == "NOUN"], columns=["text", "pos"])

# Fill the placeholders left to right: the n-th entity of a label replaces the
# n-th remaining occurrence of that label. Every entity is used; previously
# only two PERSON entities and one entity of any other label were substituted,
# which left further placeholders (e.g. a second GPE) unfilled.
for label, group in entities_df.groupby("label"):
    for entity_text in group["text"]:
        sentence_b = sentence_b.replace(label, entity_text, 1)
for noun_text in nouns_df["text"]:
    sentence_b = sentence_b.replace("NOUN", noun_text, 1)

# Print the final sentence B
print(sentence_b)


3 times 5 is 15.


both files having multiple sentences.

In [None]:
import spacy
import pandas as pd

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Read the lexicalized sentences and the placeholder templates, one per line
with open("lex.txt", "r") as f:
    sentences_a = [line.strip() for line in f.readlines()]
with open("unlex.txt", "r") as f:
    sentences_b_template = [line.strip() for line in f.readlines()]

# Accept both bracketed ([PERSON]) and bare (PERSON) placeholders.
# The normalization is applied to every template line; previously it was
# applied to a joined string that the loop below never used, so bracketed
# placeholders reached the substitution step unchanged.
sentences_b_template = [
    template.replace("[PERSON]", "PERSON").replace("[NOUN]", "NOUN").replace("[GPE]", "GPE")
    for template in sentences_b_template
]

# Pair line i of lex.txt with line i of unlex.txt. zip() stops at the shorter
# file, so surplus lines in the longer file are not processed.
for sentence_a, sentence_b in zip(sentences_a, sentences_b_template):
    # Apply spaCy to sentence A to find the entities and nouns to substitute back
    doc = nlp(sentence_a)
    entities_df = pd.DataFrame([(ent.text, ent.label_) for ent in doc.ents], columns=["text", "label"])
    nouns_df = pd.DataFrame([(token.text, token.pos_) for token in doc if token.pos_ == "NOUN"], columns=["text", "pos"])

    # Fill the placeholders left to right: the n-th entity of a label replaces
    # the n-th remaining occurrence of that label. Every entity is used;
    # previously only two PERSON entities and one entity of any other label
    # were substituted.
    for label, group in entities_df.groupby("label"):
        for entity_text in group["text"]:
            sentence_b = sentence_b.replace(label, entity_text, 1)
    for noun_text in nouns_df["text"]:
        sentence_b = sentence_b.replace("NOUN", noun_text, 1)

    # Print the final sentence B for the current sentence A
    print(sentence_b)
    


**The code above substitutes the named entities and nouns back into the text; it is the main back-substitution code.**

Before back-substitution, the placeholders in the unlexicalized file need some post-processing:
1. all names --> PERSON
2. all locations --> GPE
3. all nouns --> NOUN

**processing char.txt file.**

In [None]:
# The cells below rewrite unlex.txt in place (sed -i). Each one replaces every
# occurrence (g flag) of one placeholder with its spaCy tag. Only the indices
# 1 and 2 are covered, in capitalised and lower-case spelling.
!sed -i 's/Name_1/PERSON/g' unlex.txt

In [None]:
!sed -i 's/Name_2/PERSON/g' unlex.txt

In [None]:
!sed -i 's/City_1/GPE/g' unlex.txt

In [None]:
!sed -i 's/City_2/GPE/g' unlex.txt

In [None]:
!sed -i 's/State_1/GPE/g' unlex.txt

In [None]:
!sed -i 's/State_2/GPE/g' unlex.txt

In [None]:
!sed -i 's/Country_1/GPE/g' unlex.txt

In [None]:
!sed -i 's/Country_2/GPE/g' unlex.txt

In [None]:
!sed -i 's/name_1/PERSON/g' unlex.txt

In [None]:
!sed -i 's/name_2/PERSON/g' unlex.txt

In [None]:
!sed -i 's/city_1/GPE/g' unlex.txt

In [None]:
!sed -i 's/city_2/GPE/g' unlex.txt

In [None]:
!sed -i 's/state_1/GPE/g' unlex.txt

In [None]:
!sed -i 's/state_2/GPE/g' unlex.txt

In [None]:
!sed -i 's/country_1/GPE/g' unlex.txt

In [None]:
!sed -i 's/country_2/GPE/g' unlex.txt

In [None]:
# Matches the bare substring "noun": it is also upper-cased inside longer
# words and inside noun_<x> placeholders (which become NOUN_<x>).
!sed -i 's/noun/NOUN/g' unlex.txt

processing SST based file.

In [None]:
import re

# Pattern for whole words that start with "noun_" (e.g. noun_1, noun_dog)
regex = re.compile(r"\bnoun_\w+\b")

# Preview only: every line of unlex.txt is printed with the matches replaced
# by "NOUN"; the file itself is opened read-only and is not modified.
with open("unlex.txt", "r") as input_file:
    rewritten_lines = [regex.sub("NOUN", line) for line in input_file]

# Lines keep their own newline characters, so nothing is added when printing
for output_line in rewritten_lines:
    print(output_line, end="")


**processing word.txt files.**

In [None]:
# The cells below rewrite unlex.txt in place (sed -i). Each one replaces every
# occurrence (g flag) of one space-separated placeholder ("Name _ 1") with its
# spaCy tag. Only the indices 1 and 2 are covered, in capitalised and
# lower-case spelling.
!sed -i 's/Name _ 1/PERSON/g' unlex.txt

In [None]:
!sed -i 's/Name _ 2/PERSON/g' unlex.txt

In [None]:
!sed -i 's/City _ 1/GPE/g' unlex.txt

In [None]:
!sed -i 's/City _ 2/GPE/g' unlex.txt

In [None]:
!sed -i 's/State _ 1/GPE/g' unlex.txt

In [None]:
!sed -i 's/State _ 2/GPE/g' unlex.txt

In [None]:
!sed -i 's/Country _ 1/GPE/g' unlex.txt

In [None]:
!sed -i 's/Country _ 2/GPE/g' unlex.txt

In [None]:
!sed -i 's/name _ 1/PERSON/g' unlex.txt

In [None]:
!sed -i 's/name _ 2/PERSON/g' unlex.txt

In [None]:
!sed -i 's/city _ 1/GPE/g' unlex.txt

In [None]:
!sed -i 's/city _ 2/GPE/g' unlex.txt

In [None]:
!sed -i 's/state _ 1/GPE/g' unlex.txt

In [None]:
!sed -i 's/state _ 2/GPE/g' unlex.txt

In [None]:
!sed -i 's/country _ 1/GPE/g' unlex.txt

In [None]:
!sed -i 's/country _ 2/GPE/g' unlex.txt

In [None]:
# Matches the bare substring "noun": it is also upper-cased inside longer
# words and inside "noun _ <x>" placeholders (which become "NOUN _ <x>").
!sed -i 's/noun/NOUN/g' unlex.txt

processing SST based file.

In [None]:
import re

# Pattern for the space-separated placeholder form "noun _ <word>"
regex = re.compile(r"\bnoun _ \w+\b")

# Preview only: unlex.txt is opened read-only and printed with every match
# replaced by "NOUN"; the file itself is not modified.
with open("unlex.txt", "r") as input_file:
    line = input_file.readline()
    while line:
        # Lines keep their own newline, so print must not add another one
        print(regex.sub("NOUN", line), end="")
        line = input_file.readline()


In [None]:
import spacy
import pandas as pd

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Read the lexicalized sentences and the placeholder templates, one per line
with open("lex.txt", "r") as f:
    sentences_a = [line.strip() for line in f.readlines()]
with open("unlex.txt", "r") as f:
    sentences_b_template = [line.strip() for line in f.readlines()]

# Accept both bracketed ([PERSON]) and bare (PERSON) placeholders.
# The normalization is applied to every template line; previously it was
# applied to a joined string that the loop below never used, so bracketed
# placeholders reached the substitution step unchanged.
sentences_b_template = [
    template.replace("[PERSON]", "PERSON").replace("[NOUN]", "NOUN").replace("[GPE]", "GPE")
    for template in sentences_b_template
]

# Write one re-lexicalized sentence per line to re_lex.txt
with open("re_lex.txt", "w") as f:
    # Pair line i of lex.txt with line i of unlex.txt. zip() stops at the
    # shorter file, so surplus lines in the longer file are not processed.
    for sentence_a, sentence_b in zip(sentences_a, sentences_b_template):
        # Apply spaCy to sentence A to find the entities and nouns to substitute back
        doc = nlp(sentence_a)
        entities_df = pd.DataFrame([(ent.text, ent.label_) for ent in doc.ents], columns=["text", "label"])
        nouns_df = pd.DataFrame([(token.text, token.pos_) for token in doc if token.pos_ == "NOUN"], columns=["text", "pos"])

        # Fill the placeholders left to right: the n-th entity of a label
        # replaces the n-th remaining occurrence of that label. Every entity
        # is used; previously only two PERSON entities and one entity of any
        # other label were substituted.
        for label, group in entities_df.groupby("label"):
            for entity_text in group["text"]:
                sentence_b = sentence_b.replace(label, entity_text, 1)
        for noun_text in nouns_df["text"]:
            sentence_b = sentence_b.replace("NOUN", noun_text, 1)

        # Write the final sentence B for the current sentence A to the file
        f.write(sentence_b + "\n")


In [None]:
ls

lex.txt  re_lex.txt  sample_data/  unlex.txt
