# Quantifiers
## Quantifier Phrases
- characterized by occurrence before the descriptive adjectives in a noun phrase [1]
- sometimes qualified by adjectives or relative clauses [2]
- determiners like "every" are binary quantifiers $Q(A, B)$ which connect A and B to form a sentence 

## Sentence construction from quantifier phrases
- quantifier phrases may in turn combine with predicates [2]


1: https://www.merriam-webster.com/dictionary/quantifier
2: https://plato.stanford.edu/entries/quantification/
3: https://en.wikipedia.org/wiki/List_of_English_determiners

In [134]:
from dataclasses import dataclass
from typing import List, Any, Tuple

@dataclass
class Quantifier:
    name: str
    left_rising: bool
    right_rising: bool
    tags: List[List[str]]

        
all_quantifiers = [
    Quantifier("a few", True, True, [["DT", "JJ"]]),
    Quantifier("a large number of", True, True, [["DT", "JJ", "NN", "IN"]]),
    Quantifier("a little", True, True, [["DT", "JJ"]]),
    Quantifier("a number of", True, True, [["DT", "NN", "IN"]]),
    Quantifier("a small number of", True, True, [["DT", "JJ", "NN", "IN"]]),
    Quantifier("all", False, True, [["DT"]]),
    Quantifier("any", False, True, [["DT"]]),
    Quantifier("enough", True, True, [["DT"]]),
    Quantifier("each", False, True, [["DT"]]),
    Quantifier("every", True, True, [["DT"]]),
    Quantifier("few", False, False, [["DT"]]),
    Quantifier("fewer", False, False, [["DT"]]),
    Quantifier("less", False, False, [["DT"], ["RB"], ["IN"], ["JJR"]]), # Also adverb and preposition
    Quantifier("lots of", True, True, [["RB", "IN"], ["NNS", "IN"]]), # Idiom: adverb + preposition
    Quantifier("many", True, True, [["DT"], ["JJ"]]),
    Quantifier("most", False, True, [["DT"]]),
    Quantifier("most of", False, True, [["JJS", "IN"]]),
    Quantifier("much", True, True, [["DT"]]),
    Quantifier("much of", True, True, [["NN", "IN"]]),
    Quantifier("no", False, False, [["DT"]]),
    Quantifier("none of", False, False, [["NN", "IN"]]),
    Quantifier("not many", False, False, [["RB", "JJ"]]),
    Quantifier("not much", False, False, [["RB", "JJ"]]),
    Quantifier("numerous", True, True, [["JJ"]]), # Adjective
    Quantifier("plenty of", True, True, [["NN", "IN"]]), # Idiom: Pronoun + preposition
    Quantifier("several", True, True, [["DT"], ["JJ"]]), # Also pronoun
    Quantifier("some", True, True, [["DT"]]),
    Quantifier("whole", False, True, [["RB"], ["JJ"]]), # Adverb
    Quantifier("many of", True, True, [["NN", "IN"]]), # Noun + preposition
]

In [81]:
import os
import subprocess

from lxml import etree
from dataclasses import dataclass
CANDC_PATH = "../scripts/quantifier_monotonicity/candc-1.00"
SED_PATH = "../scripts/quantifier_monotonicity/tokenizer.sed"

def print_xml(xml: "XML"):
    print(etree.tostring(xml, pretty_print=True).decode("utf-8"))

def parse(sentence: str) -> "XML":
    sentence = sentence.replace("\"", "\\\"")
    ps = subprocess.run(
        f"echo \"{sentence}\" | sed -f {SED_PATH} | {CANDC_PATH}/bin/candc --models {CANDC_PATH}/models/ --candc-printer xml", 
        stdout=subprocess.PIPE,
        shell=True
        )
    xml = etree.fromstring(ps.stdout)
    return xml


In [145]:
import re
@dataclass
class QuantifierMatch:
    quantifier: Quantifier

@dataclass
class TokenMatch:
    index: int
    quantifier: Quantifier
        
def token_list(xml: "XML") -> List[Tuple[str, str]]:
    return etree.XPath("//lf")(xml)
        

def match_token(tokens: List[Tuple[str, str]], quantifier):
    """
    Assume that a specific quantifier only appears once per sentence
    """
    start_index = 0
    q_tokens = quantifier.name.split(" ")
    q_token_count = len(q_tokens)
    match = None
    for index, token in enumerate(tokens):
        q_index = index - start_index
        if q_tokens[q_index] != token.attrib["word"]:
            start_index = index+1
            continue
        if q_index == q_token_count - 1:
            match = TokenMatch(start_index, quantifier)
            break
    if match is None:
        return None
    for tags in quantifier.tags:
        for index, pair in enumerate(zip(tags, [lf.attrib["pos"] for lf in tokens][match.index:])):
            if pair[0] != pair[1]:
                break
            if index == q_token_count -1:
                return match
    return None


def get_all_ancestors(elem):
    ret = []
    while elem.getparent() is not None:
        ret.append(elem.getparent())
        elem = elem.getparent()
    return ret


def lowest_common_ancestor(elem1, elem2):
    elem1a = get_all_ancestors(elem1)
    elem2a = get_all_ancestors(elem2)
    return next(e1 for e1, e2 in zip(elem1a, elem2a) if e1 == e2)

VP_REGEX = r"^S[[\]a-z]*[\\/]NP$"
NP_REGEX = r"^NP$"
SOMETHING_REGEX = r"^\(S[[\]a-z]*[\\/]NP\)[\\/]\(S[[\]a-z]*[\\/]NP\)$"

def is_vp(node):
    return re.search(VP_REGEX, node.attrib["cat"]) is not None
def is_np(node):
    return re.search(NP_REGEX, node.attrib["cat"]) is not None
def is_something(node):
    return re.search(SOMETHING_REGEX, node.attrib["cat"]) is not None

def find_direction(direction, check, element):
    element = direction(element)
    if element is None:
        return None
    if check(element):
        return element
    elif is_something(element):
        return find_direction(direction, check, element)
    else:
        return None

def previous_node(node):
    return node.getprevious()

def next_node(node):
    return node.getnext()
    
def find_sibling_npvp(element):
    if is_np(element):
        print(element.tag, element.attrib["cat"])
        maybe_vp = find_direction(next_node, is_vp, element)
        if maybe_vp is None:
            maybe_vp = find_direction(previous_node, is_vp, element)
        if maybe_vp is not None:
            return (element, maybe_vp)
    elif is_vp(element):
        print(element.tag, element.attrib["cat"])
        maybe_np = find_direction(next_node, is_np, element)
        if maybe_np is None:
            maybe_np = find_direction(previous_node, is_np, element)
        if maybe_np is not None:
            return (maybe_np, element)
    elif is_something(element):
        maybe_np = find_direction(next_node, is_np, element)
        if maybe_np is None:
            maybe_np = find_direction(previous_node, is_np, element)
        maybe_vp = find_direction(next_node, is_vp, element)
        if maybe_vp is None:
            maybe_vp = find_direction(previous_node, is_vp, element)
        if maybe_vp is not None and maybe_np is not None:
            return (maybe_np, maybe_vp)
    else:
        parent = element.getparent()
        if parent is None:
            return None
        else:
            return find_sibling_npvp(parent)
        
        

def find_npvp(token_list) -> "XML":
    """Find the first NP rule common ancestor of first and last LF"""
    first = token_list[0]
    last = token_list[-1]
    common_ancestor = lowest_common_ancestor(first, last)
    return find_sibling_npvp(common_ancestor)
        
    
def match_quantifier(xml: "XML") -> List[QuantifierMatch]:
    tokens = token_list(xml)
    token_matches = list(filter(None, [match_token(tokens, quantifier) for quantifier in all_quantifiers]))
    matched_tokens = [
        tokens[match.index:match.index+len(match.quantifier.tags[0])]
        for match in token_matches
    ]
    npvp = [find_npvp(token_list) for token_list in matched_tokens]
    return token_matches, npvp
    
    
    

In [146]:
import re
re.search(VP_REGEX, r"(S[dcl]\NP)/NP")

  


In [150]:
xml = parse("Jindal : GOP must compete for every vote , reject identity politics , be more smart")
print_xml(xml)
result = match_quantifier(xml)
nps = result[1]
nps = [(etree.tostring(np), etree.tostring(vp)) for np, vp in nps]
result[0], nps

<candc>
<ccg>
 <rule type="ba" cat="NP">
  <rule type="lex" cat="NP">
   <lf start="0" span="1" word="Jindal" lemma="Jindal" pos="NNP" chunk="I-NP" entity="O" cat="N"/>
  </rule>
  <rule type="fa" cat="NP\NP">
   <lf start="1" span="1" word=":" lemma=":" pos="IN" chunk="O" entity="O" cat="(NP\NP)/S[dcl]"/>
   <rule type="ba" cat="S[dcl]">
    <rule type="lex" cat="NP">
     <lf start="2" span="1" word="GOP" lemma="GOP" pos="NNP" chunk="I-NP" entity="I-ORG" cat="N"/>
    </rule>
    <rule type="fa" cat="S[dcl]\NP">
     <lf start="3" span="1" word="must" lemma="must" pos="MD" chunk="I-VP" entity="O" cat="(S[dcl]\NP)/(S[b]\NP)"/>
     <rule type="ba" cat="S[b]\NP">
      <rule type="ba" cat="S[b]\NP">
       <lf start="4" span="1" word="compete" lemma="compete" pos="VB" chunk="I-VP" entity="O" cat="S[b]\NP"/>
       <rule type="fa" cat="(S[X]\NP)\(S[X]\NP)">
        <lf start="5" span="1" word="for" lemma="for" pos="IN" chunk="I-PP" entity="O" cat="((S\NP)\(S\NP))/NP"/>
        <rule typ

# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 7.14362 406 554


TypeError: cannot unpack non-iterable NoneType object

In [148]:
sentences = [
        "Bush says most of Congress \" acting like a teenager with a new credit card \"", # 0
        "Shaun White is the most successful snowboarder", # 1
        "O'Neill : no judge on Supreme Court now has legislative background", # 2
        "three - time winner Boris Becker believes any of the top four players could triumph", # 3
        "Arun Kundnani : some urge a U.S. government program aimed at extreme Muslim views", # 4
        "the owner of this estate is no ordinary Lord of the Manor -- it 's Russian tycoon Max", # 5
        "\" this wonderful couple is a danger to no one , \" writes Bourdain", # 6
        "while in prison , Mandela became most significant black leader in South Africa", # 7
        "new : Winfrey : \" we need a president who can bring us all together \"", # 8
        "Brazile : Norquist is the man most responsible for GOP gridlock in Washington", # 9
        "Earl Jr. believes Tiger has no one to keep him on the right path", # 10
        "world no. 3 Lee Westwood agrees with the move saying phones are key for business", # 11
        "there 's no substitute for American leadership in this critical region , he says", # 12
        "rising tide of Taliban and threat of violence has some residents worried", # 13
        "Ban : \" i can not find any other better suited leader \"", # 14
"the 24 - year - old is one of Australia 's most popular Olympic athletes", # 15
        "ruling : \" Manuel Noriega fails to provide any evidence of harm to his reputation \"", # 16
        "the Bay area and Detroit have the most arrests and child rescues", # 17
        "no specific threat is indicated against the U.S.", # 18
        "the U.S. Navy has announced it will no longer communicate in all - caps", # 19
        "44 million U.S. smoke , and a third of all cancer deaths caused by tobacco use", # 20
        "Vodafone 's HTC Magic handset will launch in western Europe in the next few months", # 21
        "moses shown on some Supreme Court friezes ; some founders wrote of Christian principles", # 22
        "some customers say UPS packages were declared \" delivered \" but were n't", # 23
        "Maathai of the West : \" nobody has a blueprint and nobody is a know - it - all \"", # 24
        "Antonieta Ledezma is joined by a few dozen protesters in New York 's Times Square", # 25
        "\" i have no doubt i 'll lose , \" says California lawyer who filed lawsuit", # 26
        "Greene : beyond that , it 's American justice carried out for us all ; important to bear witness", # 27
        "protesters , Israel soldiers clash every friday afternoon in two West Bank villages", # 28
        "the world no. 1 faces Andy Murray on sunday", # 29
        "Jindal : GOP must compete for every vote , reject identity politics , be more smart", # 30
        "Toshiba lost because it lacks retail presence in many key markets , analysts say", # 31
    ]

In [149]:
results = [match_quantifier(parse(s)) for s in sentences ]
results

# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 attempt nospan at B=0.075, K=20
1 attempt nospan at B=0.03, K=20
1 attempt nospan at B=0.01, K=20
1 attempt nospan at B=0.005, K=20
1 attempt nospan at B=0.001, K=150
1 failed no span at B=0.001, K=150
# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 2.07944 86 88
# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 3.4012 220 233


rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 4.57471 405 463


rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 4.27667 268 284


rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 9.56549 1095 1445


rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 7.96901 755 997


rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 5.24175 427 532
# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 attempt nospan at B=0.075, K=20
1 attempt nospan at B=0.03, K=20
1 attempt nospan at B=0.01, K=20
1 attempt nospan at B=0.005, K=20
1 attempt nospan at B=0.001, K=150
1 failed no span at B=0.001, K=150
# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 6.2106 426 509
# this file was generated by the following command(s):
#   ../script

rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 9.18502 1113 1543
# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 7.0282 744 924


rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 8.74576 733 946


rule NP


# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 attempt nospan at B=0.075, K=20
1 attempt nospan at B=0.03, K=20
1 attempt nospan at B=0.01, K=20
1 attempt nospan at B=0.005, K=20
1 attempt nospan at B=0.001, K=150
1 failed no span at B=0.001, K=150
# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 parsed at B=0.075, K=20
1 coverage 100%
1 stats 5.8861 428 467
# this file was generated by the following command(s):
#   ../scripts/quantifier_monotonicity/candc-1.00/bin/candc --models ../scripts/quantifier_monotonicity/candc-1.00/models/ --candc-printer xml

1 attempt nospan at B=0.075, K=20
1 attempt nospan at B=0.03, K=20
1 attempt nospan at B=0.01, K=20
1 attempt nospan at B=0.005, K=20


KeyError: 'cat'

In [152]:
nps = [b for a,b in results]
[[(n.attrib["cat"], v.attrib["cat"]) for n, v in np] for np in nps]

[[],
 [('NP[nb]/N', 'N')],
 [('NP', 'NP\\NP'), ('((S\\NP)\\(S\\NP))/N', 'N')],
 [('NP[nb]/N', 'N'), ('NP[nb]/N', 'N')],
 [],
 [],
 [('NP[nb]/N', 'N')]]