# Exp003: Search for constituents in Corpora

In [2]:
import pytorch
import nltk
import benepar

Download the Brown Corpus, which provides part-of-speech-tagged English sentences, together with the universal tagset mapping.

In [3]:
# Corpus reader used by the later cells to iterate over tagged sentences.
from nltk.corpus import brown
# Fetch the corpus data into the local nltk_data directory.
nltk.download('brown')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /Users/dglandorf/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/dglandorf/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

Download the Berkeley Neural Parser model (`benepar_en3`) and create the parser instance.

In [4]:
# Download the pretrained 'benepar_en3' model before instantiating the parser.
benepar.download('benepar_en3')
# Notebook-level parser instance; parse_sentence() reads this global.
parser = benepar.Parser("benepar_en3")

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /Users/dglandorf/nltk_data...
[nltk_data]   Unzipping models/benepar_en3.zip.
  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Define a function to parse sentences with the Berkeley Neural Parser

In [5]:
def parse_sentence(words, tags):
    """
    Run the notebook-level Berkeley Neural Parser on one pre-tokenized sentence.

    Args:
        words (list): Tokens of the sentence, in order.
        tags (list): One tag per token, aligned with ``words``.

    Returns:
        object: Whatever ``parser.parse`` returns for the sentence.

    """
    # No detokenization information is available here, so every token is
    # marked as being followed by a space.
    trailing_space_flags = [True for _ in range(len(words))]
    sentence = benepar.InputSentence(
        words=words,
        tags=tags,
        escaped_words=words,  # tokens are handed over unchanged as their escaped form
        space_after=trailing_space_flags,
    )
    parse_result = parser.parse(sentence)
    return parse_result

Define search patterns

In [55]:
def search_adj_list(tree):
    """
    Searches for a subtree in the given tree that represents an adjective phrase (ADJP)
    with at least two adjective (JJ) nodes and at least one comma (,) node.

    Only the direct children of each ADJP are inspected. Children that do not
    expose a ``label()`` method (e.g. bare string leaves sitting directly under
    the phrase) are skipped instead of raising ``AttributeError``.

    Args:
        tree (Tree): The tree to search in. It must provide ``subtrees()`` and
            ``label()``, and iterating a subtree must yield its children.

    Returns:
        bool: True if a matching subtree is found, False otherwise.
    """
    for subtree in tree.subtrees():
        if subtree.label() != 'ADJP':
            continue
        # Collect labels of labelled children only; plain leaves have no label().
        child_labels = [
            child.label()
            for child in subtree
            if callable(getattr(child, 'label', None))
        ]
        if child_labels.count('JJ') >= 2 and ',' in child_labels:
            return True
    return False

In [66]:
def find_adj_before_noun(tree, adjectives):
    """
    Finds adjectives that appear directly before a noun inside noun phrases of a tree.

    Only adjacent direct children of each NP are compared. A pair is ignored when
    either member does not expose a ``label()`` method (e.g. a bare string leaf
    sitting directly under the NP), instead of raising ``AttributeError``.
    The adjective comparison is exact (case-sensitive membership in ``adjectives``).

    Parameters:
    - tree (nltk.tree.Tree): The tree to search for adjectives and nouns.
    - adjectives (list): A list of adjectives to search for.

    Returns:
    - matches (list): A list of tuples containing the adjective and the noun it appears before.
    """
    matches = []
    for np_subtree in tree.subtrees(lambda t: t.label() == 'NP'):  # Looking within Noun Phrases
        children = list(np_subtree)
        # Walk over adjacent (current, next) child pairs of the NP.
        for current_node, next_node in zip(children, children[1:]):
            current_label = getattr(current_node, 'label', None)
            next_label = getattr(next_node, 'label', None)
            if not (callable(current_label) and callable(next_label)):
                continue  # at least one of the two is a plain leaf without a label
            # The current node must be one of the requested adjectives and the
            # next node must carry a noun tag (any label starting with 'NN').
            if current_label() == 'JJ' and current_node[0] in adjectives \
               and next_label().startswith('NN'):
                matches.append((current_node[0], ' '.join(next_node.leaves())))
    return matches

Iterate over the first 5,000 tagged sentences of the corpus and apply both search patterns to each parsed sentence.

In [68]:
# Scan only the first 5000 tagged sentences of the Brown corpus; each one is parsed individually.
for sent in brown.tagged_sents()[:5000]:
    # Each sentence is a sequence of (word, tag) pairs; split it into two parallel tuples.
    words, nltk_tags = zip(*sent)
    # NOTE(review): tagged_sents() is called without a tagset argument, so these are
    # presumably the corpus's own tags rather than the universal tagset — confirm
    # this is what the parser should receive.
    tree = parse_sentence(words, nltk_tags)
    # The two checks are independent, so a sentence can be reported under both patterns.
    if search_adj_list(tree):
        print("Adjective list pattern in sentence:", ' '.join(tree.leaves()))
    # A non-empty match list is truthy; the matched (adjective, noun) pairs themselves are not printed.
    if find_adj_before_noun(tree, ['main', 'only']):
        print("Adj/noun pattern in sentence:", ' '.join(tree.leaves()))

Adjective list pattern in sentence: The Soviet Union and other members of the Communist bloc are rapidly expanding their economic , technical and military assistance to the uncommitted nations .
Adjective list pattern in sentence: Proceeds will be used by the section to further its program in science , education and social action on local , national and international levels .
Adjective list pattern in sentence: a collection of English , French and German coins , valued at $500 ; ;
Adjective list pattern in sentence: Though there has been some avant garde indication that contemporary furniture might go back to the boxy look of the '20's and '40's , two manufacturers chose to take the approach of the sophisticated , but warm look in contemporary .
Adjective list pattern in sentence: Colorful , bright Eastman Chromspun fabrics , with the magenta , pink and white tones predominating as well as golden shades are used with Composite .
Adjective list pattern in sentence: I am prepared to demo