In [2]:
from transformers import BertTokenizer
import torch
import torch.nn as nn
from transformers import BertModel
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
# from crfseg import CRF
import torch.nn.functional as F
import tqdm as tqdm

In [92]:
import re
import nltk
from nltk import word_tokenize, pos_tag

In [195]:
import nltk
from nltk import word_tokenize, pos_tag

# Function to extract noun phrases from POS tags
# def extract_noun_phrases(pos_tags):
#     # Define patterns for noun phrases
#     patterns = [
#         [['NN'], ['NNS'], ['NNP'], ['NNPS']],
#         [['DT', 'NN'], ['DT', 'NNS'], ['DT', 'NNP'], ['DT', 'NNPS'], ['JJ', 'NN'], ['JJ', 'NNS'], ['JJ', 'NNP'], ['JJ', 'NNPS']],
#         [['DT', 'JJ', 'NN'], ['DT', 'JJ', 'NNS'], ['DT', 'JJ', 'NNP'], ['DT', 'JJ', 'NNPS']],
#     ]

#     noun_phrases = []

#     # Iterate through the POS tags
#     for i in range(len(pos_tags)):
#         # Initialize flag to check if a longer keyword is found
#         longer_keyword_found = False

#         # Iterate through all possible lengths (up to trigrams)
#         for length in range(3, 0, -1):  # Start from length 3 and move to length 1
#             if i + length <= len(pos_tags):
#                 current_sequence = [tag for word, tag in pos_tags[i:i+length]]
#                 current_keyword = ' '.join(word for word, tag in pos_tags[i:i+length])

#                 # Check if current sequence matches any pattern
#                 for pattern in patterns[length - 1]:  # Adjust index for patterns
#                     if current_sequence == pattern:
#                         noun_phrases.append(current_keyword)
#                         # longer_keyword_found = True
#                         # break

#                 # if longer_keyword_found:
#                 #     break  # Break loop if a longer keyword is found
    
#     # Sort noun phrases based on their length in reverse order
#     noun_phrases.sort(key=len, reverse=True)
#     return noun_phrases

import re

def extract_noun_phrases(pos_tags):
    # Define patterns for noun phrases
    patterns = [
        r'NN.? VB.? NN.?',
        r'NN.? IN NN.?',
        r'DT JJ NN.?',
        r'JJ NN.?',
        r'DT NN.?',
        r'NN.?'
    ]

    noun_phrases = []

    # Convert the list of tuples to a space-separated string
    pos_string = ' '.join(tag for word, tag in pos_tags)

    # Check if the POS string matches any pattern
    for pattern in patterns:
        for match in re.finditer(pattern, pos_string):
            # Get the start and end indices of the match
            start, end = match.span()

            # Convert the indices to word indices
            start = pos_string[:start].count(' ')
            end = pos_string[:end].count(' ')

            # Extract the corresponding words
            noun_phrase = ' '.join(word for word, tag in pos_tags[start:end])
            
            # Check if the noun phrase is not empty
            if noun_phrase.strip():
                noun_phrases.append(noun_phrase)

    # Remove empty strings from the list
    noun_phrases = [phrase for phrase in noun_phrases if phrase]

    return noun_phrases


In [196]:
import nltk
import os
from nltk import word_tokenize, pos_tag
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import BertTokenizer

class MyDataset(Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        
        # iterate through the files in the data directory
        self.txtfiles = []
        self.annfiles = []

        for file in os.listdir(data_dir):
            if file.endswith(".txt"):
                self.txtfiles.append(file)
        
        self.tokeniser = BertTokenizer.from_pretrained('bert-base-uncased')
    
    def tokenise(self , text):
        tokens = []  # List to store tokens
        starting_offsets = []  # List to store starting offsets
        current_token = ''  # Variable to store current token
        offset = 0  # Starting offset

        for char in text:
            if char == ' ':
                if current_token:  # If token is not empty
                    tokens.append(current_token.lower())  # Append token in lowercase
                    starting_offsets.append(offset - len(current_token))  # Store starting offset
                    current_token = ''  # Reset current token
                offset += 1  # Move offset to next character
            else:
                current_token += char  # Append character to current token
                offset += 1  # Move offset to next character

        # Handling the last token if it exists after the loop ends
        if current_token:
            tokens.append(current_token.lower())  # Append token in lowercase
            starting_offsets.append(offset - len(current_token))  # Store starting offset

        return starting_offsets , tokens

    def __len__(self):
        return len(self.txtfiles)

    def __getitem__(self, index):
        txtfile = self.txtfiles[index]
        sampleid = txtfile.split(".")[0]
        
        # read the text file
        with open(os.path.join(self.data_dir, txtfile), 'r') as file:
            txt = file.read()
        
        # read the annotation file
        annfilename = sampleid + ".ann"
        with open(os.path.join(self.data_dir, annfilename), 'r') as file:
            ann = file.read()
        
        offsets , tokenisedtxt = self.tokenise(txt)
        tagslist = np.zeros(len(tokenisedtxt))
        # now iterate through the ann file , in each line , divide into spaces and get the last word 
        # make tagslist[i] = 1 if the word is in the tokenisedtxt
        for line in ann.split('\n'):
            if line == '':
                continue
            words = line.split()
            if words[0][0] != 'T':
                continue

            ssofset = words[2]
            endoffset = words[3]

            # add a 1 to each index of tagslist for indexes where offset is between ssofset and endoffset (including both)
            for i in range(len(offsets)):
                if offsets[i] >= int(ssofset) and offsets[i] <= int(endoffset):
                    tagslist[i] = 1
        
        # Convert tokens to IDs using BERT tokenizer
        tokenisedids = self.tokeniser.convert_tokens_to_ids(tokenisedtxt)
        
        # Perform POS tagging
        pos_tags = nltk.pos_tag(tokenisedtxt)
        
        # Extract noun phrases from POS tags
        noun_phrases = extract_noun_phrases(pos_tags)
        
        return torch.tensor(tokenisedids), torch.tensor(tagslist), noun_phrases
    
    def collate_fn(self , batch):
        # batch is a list of tuples
        # each tuple has 3 tensors , one for tokenisedids, one for tagslist, and one for noun_phrases
        # we need to return a tensor of tokenisedids, a tensor of tagslist, and a list of lists for noun_phrases
        tokenisedids = []
        tagslist = []
        noun_phrases = []
        for tup in batch:
            tokenisedids.append(torch.tensor(tup[0]))
            tagslist.append(torch.tensor(tup[1]))
            noun_phrases.append(tup[2])
        
        tokenisedids = torch.nn.utils.rnn.pad_sequence(tokenisedids , batch_first=True , padding_value=0) 
        tagslist = torch.nn.utils.rnn.pad_sequence(tagslist , batch_first=True , padding_value=0)

        tokenisedids = tokenisedids.type(torch.LongTensor)
        tagslist = tagslist.type(torch.LongTensor)
        
        return tokenisedids , tagslist, noun_phrases

In [197]:
train_dataset = MyDataset('/Users/ashnadua/Desktop/INLP-project/scienceie2017_train/train2')
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True , collate_fn=train_dataset.collate_fn)

In [198]:
def load_gold_keywords(ann_file):
    gold_keywords = set()
    with open(ann_file, 'r') as file:
        for line in file:
            if line.startswith('T'):
                parts = line.split()
                keyword = ' '.join(parts[4:])
                gold_keywords.add(keyword.lower())
    return gold_keywords

In [199]:
def calculate_accuracy(extracted_keywords, gold_keywords):
    correctly_extracted = len(extracted_keywords.intersection(gold_keywords))
    total_gold_keywords = len(gold_keywords)
    accuracy = correctly_extracted / total_gold_keywords if total_gold_keywords > 0 else 0
    return accuracy

In [231]:
import os

data_dir = '/Users/ashnadua/Desktop/INLP-project/scienceie2017_dev/dev'
total_accuracy = 0
total_files = 0

for file in os.listdir(data_dir):
    if file.endswith(".txt"):
        txt_path = os.path.join(data_dir, file)
        ann_path = os.path.join(data_dir, file[:-3] + "ann")

        with open(txt_path, 'r') as file:
            text = file.read()

        tokens = word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
        noun_phrases = extract_noun_phrases(pos_tags)
        noun_phrases = set(noun_phrases)  # Convert to set

        gold_keywords = load_gold_keywords(ann_path)

        accuracy = calculate_accuracy(noun_phrases, gold_keywords)
        total_accuracy += accuracy
        total_files += 1

# Calculate the average accuracy
average_accuracy = total_accuracy / total_files
print(f"Average accuracy over the entire dataset: {average_accuracy}")


Average accuracy over the entire dataset: 0.1905072174591904


In [201]:
text = "Poor oxidation behavior is the major barrier to the increased use of Ti-based alloys in high-temperature structural applications. The demand to increase the service temperature of these alloys beyond 550°C (the typical temperature limit) requires careful study to understand the role that composition has on the oxidation behavior of Ti-based alloys [1–3]. The attempt to overcome this limitation in Ti-based alloys has led to the production of alloys with substantially improved oxidation resistance such as β-21S and also development of coatings and pre-oxidation techniques [1,4–6]. While it is tempting to extrapolate the oxidation behavior (e.g. oxidation rate law, depth of oxygen ingress and scale thickness) observed for a limited number of compositions under a certain oxidation condition to a broader compositional range, there are numerous examples in the literature where deviations from the expected relations are observed [7,8]."
text = text.lower()
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
noun_phrases = extract_noun_phrases(pos_tags)
print(noun_phrases)

['production of', 'development of', 'depth of oxygen', 'number of', 'the major barrier', 'the typical temperature', 'a limited number', 'a certain oxidation', 'the expected', 'poor oxidation', 'major barrier', 'ti-based', 'structural', 'typical temperature', 'careful study', 'ti-based', 'ti-based alloys', 'scale thickness', 'limited number', 'certain oxidation', 'compositional range', 'numerous', 'expected', 'the demand', 'the service', 'these', 'the role', 'that composition', 'the oxidation', 'the attempt', 'this limitation', 'the production', 'the oxidation', 'the literature', 'oxidation', 'behavior', 'barrier', 'use', 'demand', 'service', 'temperature', 'temperature', 'limit', 'study', 'role', 'composition', 'oxidation', 'behavior', 'attempt', 'limitation', 'alloys', 'production', 'oxidation', 'resistance', 'development', 'pre-oxidation', 'oxidation', 'behavior', 'oxidation', 'rate', 'law', 'depth', 'oxygen', 'ingress', 'thickness', 'number', 'oxidation', 'condition', 'range', 'lite

In [202]:
noun_phrases = set(noun_phrases)

In [203]:
ann_file = '/Users/ashnadua/Desktop/INLP-project/scienceie2017_train/train2/S0010938X1500195X.ann'  # Path to your annotation file
gold_keywords = load_gold_keywords(ann_file)

accuracy = calculate_accuracy(noun_phrases, gold_keywords)
print("Accuracy:", accuracy)

Accuracy: 0.4


In [216]:
predicted = list(noun_phrases)

In [224]:
for key in noun_phrases:
    print(key)

poor oxidation
the expected
certain oxidation
oxygen
scale thickness
typical temperature
production
the attempt
role
depth
thickness
expected
the service
barrier
limited number
this limitation
number of
careful study
the demand
alloys
resistance
ingress
the production
a certain oxidation
production of
depth of oxygen
structural
numerous
range
limitation
pre-oxidation
the role
behavior
limit
composition
compositional range
a limited number
use
oxidation
attempt
literature
development
law
development of
]
these
temperature
condition
number
the literature
ti-based
the major barrier
service
ti-based alloys
major barrier
the oxidation
rate
demand
the typical temperature
study
that composition


In [222]:
keys_per_column = 10

# Calculate the number of columns needed
num_columns = (len(predicted) + keys_per_column - 1) // keys_per_column

# Print keys in side-by-side columns
for i in range(keys_per_column):
    for j in range(num_columns):
        idx = j * keys_per_column + i
        if idx < len(predicted):
            print(f"{predicted[idx]:<20}", end="")
    print()

poor oxidation      thickness           resistance          pre-oxidation       literature          ti-based            that composition    
the expected        expected            ingress             the role            development         the major barrier   
certain oxidation   the service         the production      behavior            law                 service             
oxygen              barrier             a certain oxidation limit               development of      ti-based alloys     
scale thickness     limited number      production of       composition         ]                   major barrier       
typical temperature this limitation     depth of oxygen     compositional range these               the oxidation       
production          number of           structural          a limited number    temperature         rate                
the attempt         careful study       numerous            use                 condition           demand              
role        

In [145]:
# gold_keywords

In [146]:
# noun_phrases

In [147]:
# for i in range(20):
#     tokenisedids, tagslist, noun_phrases = train_dataset[i]
#     print(f"Sample {i+1}:")
#     print("Tokenized Text:", tokenisedids)
#     print("Tags List:", tagslist)
#     print("Noun Phrases:", noun_phrases)
#     print()


In [None]:
# import spacy

# def extract_noun_phrases_spacy(text):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(text)
#     noun_phrases = [chunk.text for chunk in doc.noun_chunks]
#     return noun_phrases

# # Example usage:
# text = "This is an example sentence with keywords like beautiful flowers and green grass."
# noun_phrases_spacy = extract_noun_phrases_spacy(text)
# print(noun_phrases_spacy)
