* NAME: Rishabh Patil
* SAP: 60009200056
* BATCH: D12

#**Lab 4: Implement Information Retrieval for extracting Text from Webpages and Image.**

#**1. On Sentences**

In [None]:
!git clone https://github.com/thunlp/OpenNRE.git
!cp -r OpenNRE/opennre opennre
!pip install transformers

Successfully installed huggingface-hub-0.17.3 safetensors-0.4.0 tokenizers-0.14.1 transformers-4.34.1


In [None]:
import re
import os
import nltk
import math
import opennre
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import utils
from sklearn import metrics
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from keras.models import Sequential
from nltk.chunk.regexp import RegexpParser
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, SimpleRNN, Dense
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk, RegexpTokenizer, PorterStemmer, WordNetLemmatizer

In [None]:
nltk.download('all')

In [None]:
text = "Dr. APJ Abdul Kalam was born in a humble and poor household in Tamil Nadu. He started off by selling newspapers as a boy. His hardwork and perseverance propelled him to become one of India's greatest rocket scientists, instrumental in the foundation of ISRO and DRDO. He went on to become the President of India."

**Sentence Tokenization**

In [None]:
sentences = sent_tokenize(text)

**Word Tokenization, POS Tagging and Named Entity Recognition**

In [None]:
for sentence in sentences:
    # Word Tokeniation
    words = word_tokenize(sentence)

    # POS Tagging
    pos_tags = pos_tag(words)

    # Named Entity Recognition (NER)
    named_entities = ne_chunk(pos_tags)

    # Extracting and printing named entities along with their types
    for entity in named_entities:
        if isinstance(entity, nltk.Tree):
            entity_name = " ".join([word for word, tag in entity.leaves()])
            entity_type = entity.label()
            print(f"Entity: {entity_name}, Type: {entity_type}")

Entity: Abdul Kalam, Type: PERSON
Entity: Tamil Nadu, Type: GPE
Entity: India, Type: GPE
Entity: ISRO, Type: ORGANIZATION
Entity: DRDO, Type: ORGANIZATION
Entity: India, Type: GPE


**Relationship Extraction**

In [None]:
# Sample data: (sentence, relation)
data = [
    ("Dev Patel is late to college.", "is_late"),
    ("Bhadreshbhai had five apples.", "had_apples"),
]

In [None]:
# Separate sentences and labels
sentences = [sentence for sentence, _ in data]
labels = [label for _, label in data]

In [None]:
# Create a tokenizer and fit on sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

In [None]:
# Convert sentences to sequences of word indices
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Padding sequences for consistent input length
max_sequence_length = max([len(sequence) for sequence in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

In [None]:
# Create labels dictionary
label_indices = {"is_late": 0, "had_apples": 1}
labels_encoded = [label_indices[label] for label in labels]

In [None]:
# Convert labels to one-hot encoded vectors
labels_one_hot = tf.keras.utils.to_categorical(labels_encoded)

In [None]:
# Build RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_sequence_length))
model.add(SimpleRNN(64, return_sequences=False))
model.add(Dense(len(label_indices), activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded_sequences, labels_one_hot, epochs=20, batch_size=31)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7be8ad1dba60>

**Sample text for prediction**

In [None]:
# Sample sentence for prediction
sample_sentence = "Bhadressbhai was in possession of sixteen apples in Mehsana."

In [None]:
# Tokenize and pad the sample sentence
sample_sequence = tokenizer.texts_to_sequences([sample_sentence])
sample_padded_sequence = pad_sequences(sample_sequence, maxlen=max_sequence_length, padding='post')

In [None]:
# Predict the relation for the sample sentence
predicted_label_index = np.argmax(model.predict(sample_padded_sequence))
predicted_label = list(label_indices.keys())[predicted_label_index]

print(f"Predicted relation: {predicted_label}")

Predicted relation: had_apples


**Template for NLTK using Information Extraction**

In [None]:
# Sample text
text = "Reliance Inc. was founded by Dhirubhai Ambani in Gujarat, in 1958."

In [None]:
# Tokenize the text into sentences and words
sentences = sent_tokenize(text)
words = [word_tokenize(sentence) for sentence in sentences]

In [None]:
# Part-of-speech tagging
pos_tags = [nltk.pos_tag(sentence) for sentence in words]

In [None]:
# Initialize variables to hold template placeholders
organization = ""
person = ""
location = ""
year = ""

In [None]:
# Iterate through the tagged words to extract information
for pos_tagged_sentence in pos_tags:
    for (word, pos) in pos_tagged_sentence:
        if pos == "NNP":  # Proper noun (likely an entity)
            if not organization:
                organization = word
            elif not person:
                person = word
            else:
                # Handle multiple proper nouns by assuming the last one is a person
                person = word
        elif pos == "IN" and not location:
            location = word
        elif pos == "CD" and not year:
            year = word

In [None]:
# Filter out stopwords
stop_words = set(stopwords.words("english"))
organization = organization.lower() if organization.lower() not in stop_words else ""
person = person.lower() if person.lower() not in stop_words else ""
location = location.lower() if location.lower() not in stop_words else ""

In [None]:
# Print the extracted information
print(f"Organization: {organization}")
print(f"Person: {person}")
print(f"Location: {location}")
print(f"Year: {year}")

Organization: reliance
Person: gujarat
Location: 
Year: 1958


In [None]:
# Define a template
template = "{organization} was founded by {person} in {location}, in {year}."
print(template)
# Fill the template
filled_template = template.format(
    organization=organization,
    person=person,
    location=location,
    year=year
)

{organization} was founded by {person} in {location}, in {year}.


In [None]:
# Print the filled template
print(filled_template)

reliance was founded by gujarat in , in 1958.


#**2. On Text Dataset**

**Using Spacy**

In [None]:
! pip install spacy
! python -m spacy download en_core_web_sm

In [None]:
import spacy

In [None]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

In [None]:
with open('/content/hlt-naacl08-data lab3 acl.txt', 'r') as file:
    text = file.read()

In [None]:
text

'<p1> Google </p1> assimilates <p2> YouTube </p2> !\nfinally <p1> google </p1> bought <p2> youtube </p2>\n<p1> Google </p1> actually bought <p2> YouTube </p2> .\n<p1> Google </p1> Closes <p2> YouTube </p2> Acquisition\nnever even saw this coming , but apparently <p1> Adobe </p1> www.msacromedia.com is about to buy <p2> Macromedia </p2> .\nThe story was first seen at Techcrunch , the picked up by the Wall Street Journal and has since been the subject of much talk , posts and thoughts over the past few days and finally it has been confirmed that <p1> Google </p1> have purchased <p2> Youtube </p2> for $ 1.65 billion in an official statement .\n<p1> Google </p1> confirms <p2> YouTube </p2> aquisition - BBC News\n<p1> Novartis </p1> acquired <p2> Eon Labs </p2> and Hexal to create the world s leading manufacturer of generic drugs .\n<p1> Novartis </p1> buys generic drugmakers <p2> Eon </p2> , Hexal\n<p1> Google </p1> announced Tuesday that it bought <p2> YouTube </p2> for $ 1.65 billion .\n

In [None]:
doc = nlp(text)

In [None]:
doc

In [None]:
# Extract named entities
entities = [(entity.text, entity.label_) for entity in doc.ents]

In [None]:
# Print the named entities and their labels
for entity, label in entities:
    print(f"Entity: {entity}, Label: {label}")

Entity: Macromedia, Label: ORG
Entity: Techcrunch, Label: ORG
Entity: the Wall Street Journal, Label: ORG
Entity: the past few days, Label: DATE
Entity: $ 1.65 billion, Label: MONEY
Entity: BBC News, Label: ORG
Entity: Eon Labs, Label: PERSON
Entity: Hexal, Label: NORP
Entity: Hexal
<p1>, Label: PERSON
Entity: Tuesday, Label: DATE
Entity: $ 1.65 billion, Label: MONEY
Entity: Macromedia, Label: ORG
Entity: July 11 , 2005, Label: DATE
Entity: today, Label: DATE
Entity: Request for Additional Information and Documentary Materials, Label: ORG
Entity: second, Label: ORDINAL
Entity: the Department of Justice, Label: ORG
Entity: Macromedia, Label: ORG
Entity: $ 1.65 Billion, Label: MONEY
Entity: Macromedia, Label: ORG
Entity: BASEL, Label: GPE
Entity: Switzerland, Label: GPE
Entity: AP, Label: ORG
Entity: Eon Labs, Label: PERSON
Entity: the United States, Label: GPE
Entity: Hexal, Label: NORP
Entity: Germany, Label: GPE
Entity: more than $ 8 billion, Label: MONEY
Entity: This week, Label: DAT

In [None]:
nlp = spacy.load("en_core_web_sm")
def info_retrieval(document):
  doc = nlp(document)
  organizations = []
  persons = []
  locations = []
  years = []
  for ent in doc.ents:
    if ent.label_ == "ORG":
        organizations.append(ent.text)
    elif ent.label_ == "PERSON":
        persons.append(ent.text)
    elif ent.label_ == "GPE":
        locations.append(ent.text)
    elif ent.label_ == "DATE":
        years.append(ent.text)
  return [("Organizations:", organizations), ("Persons:", persons), ("Locations:", locations), ("Years:", years)]

In [None]:
info_retrieval("Reliance Inc. was founded by Dhirubhai Ambani in Gujarat, in 1958.")

[('Organizations:', ['Reliance Inc.']),
 ('Persons:', ['Dhirubhai Ambani']),
 ('Locations:', ['Gujarat']),
 ('Years:', ['1958'])]

In [None]:
model = opennre.get_model('wiki80_cnn_softmax')

def relation_extraction_opennre_from_xml(document):

    p1_text = re.findall(r'<p1>(.*?)</p1>', document)
    p2_text = re.findall(r'<p2>(.*?)</p2>', document)
    relationships = []
    for p1, p2 in zip(p1_text, p2_text):
        relation_info = model.infer({'text': f"{p1} {p2}", 'h': {'pos': (0, len(p1))}, 't': {'pos': (len(p1) + 1, len(p1) + len(p2) + 1)}})
        predicted_relation = relation_info[0]
        relationships.append((p1, predicted_relation, p2))

    return relationships

In [None]:
with open('/content/hlt-naacl08-data lab3 acl.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [None]:
data = []
in_line =[]
p1_list = []
p2_list = []
ner_list = []
relationship_list = []

In [None]:
p1 = ""
p2 = ""

for line in lines:
    in_line.append(line)
    p1_match = re.search(r'<p1>(.*?)</p1>', line)

    if p1_match:
        p1 = p1_match.group(1)

    p2_match = re.search(r'<p2>(.*?)</p2>', line)
    if p2_match:
        p2 = p2_match.group(1)
    relationship_list.append(relation_extraction_opennre_from_xml(line))
    line = re.sub(r'<.*?>', '', line)

    ner_list.append(info_retrieval(line))
    in_line.append(line)
    p1_list.append(p1)
    p2_list.append(p2)

In [None]:
df = pd.DataFrame({'Input':lines, 'p1': p1_list, 'p2': p2_list, 'NER': ner_list,'Relationship Extraction': relationship_list})
df.head()

Unnamed: 0,Input,p1,p2,NER,Relationship Extraction
0,<p1> Google </p1> assimilates <p2> YouTube </p...,Google,YouTube,"[(Organizations:, [Google, YouTube]), (Persons...","[( Google , headquarters location, YouTube )]"
1,finally <p1> google </p1> bought <p2> youtube ...,google,youtube,"[(Organizations:, []), (Persons:, []), (Locati...","[( google , headquarters location, youtube )]"
2,<p1> Google </p1> actually bought <p2> YouTube...,Google,YouTube,"[(Organizations:, [Google, YouTube]), (Persons...","[( Google , headquarters location, YouTube )]"
3,<p1> Google </p1> Closes <p2> YouTube </p2> Ac...,Google,YouTube,"[(Organizations:, [Google Closes YouTube Ac...","[( Google , headquarters location, YouTube )]"
4,"never even saw this coming , but apparently <p...",Adobe,Macromedia,"[(Organizations:, [Macromedia]), (Persons:, []...","[( Adobe , developer, Macromedia )]"


In [None]:
!jupyter nbconvert --to html "/content/60009200040_ACL_D11_Lab4.ipynb"

[NbConvertApp] Converting notebook /content/60009200040_ACL_D11_Lab4.ipynb to html
[NbConvertApp] Writing 775504 bytes to /content/60009200040_ACL_D11_Lab4.html
