In [1]:
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import brown
from nltk.tree import *
from nltk import CFG
from nltk import word_tokenize
import os
import random
from collections import defaultdict

In [76]:
# Copula-less example sentences fed to nc() in the first driver loop below.
in_sents = [
    'he green with envy',
    'he an expert',
    'she liking me',
    'your mama a weightlifter',
    "if i'm right, she a professor",
    'unfortunately, he crazy',
]

In [77]:
# Example sentences fed to nc() in the second driver loop below; each starts
# with one of the subject words listed in `no_nc_subjs`.
bad_sents = [
    'i green with envy',
    'it in the garden',
    'that my bag',
    'it what it is',
]

In [4]:
# Build the CoreNLP server handle from the two jars located under `root`
# (a path relative to the notebook's working directory).
root = 'stanford-corenlp-4.4.0'
corenlp_jar = os.path.join(root, "stanford-corenlp-4.4.0.jar")
corenlp_models_jar = os.path.join(root, "stanford-corenlp-4.4.0-models.jar")
server = CoreNLPServer(corenlp_jar, corenlp_models_jar)

In [5]:
# Start the CoreNLP server built in the previous cell. nc_np() creates
# CoreNLPParser() clients with default settings, so it presumably talks to
# this server -- NOTE(review): confirm the default URL/port match.
server.start()



In [6]:
# Count, for every lower-cased Brown token, how often each tag is attached
# to it: fd[word][tag] -> occurrence count.
tagged = brown.tagged_words()
fd = defaultdict(lambda: defaultdict(int))
for word, tag in tqdm(tagged.iterate_from(0)):
    fd[word.lower()][tag] += 1
        

1161192it [00:03, 355170.29it/s]


In [7]:
# Turn the raw per-word tag counts in `fd` into relative frequencies, so the
# values stored under each word sum to 1.
# The loop variables are deliberately NOT named `pos`: this runs at module
# level, and a global named `pos` would overwrite the pos() function whenever
# this cell is re-run after that function has been defined.
for tag_counts in fd.values():
    total = sum(tag_counts.values())
    # Words with no recorded tags have nothing to iterate, so no 0-division.
    for tag in tag_counts:
        tag_counts[tag] = tag_counts[tag] / total

In [None]:
# Subject words for which a match in nc_pp / nc_adj / nc_np marks the
# sentence as invalid ('F').
no_nc_subjs = ['i', 'that', 'it', 'what']

# POS tags that the nc_* checks treat as candidate subjects.
pos_subjs = [
    'NN', 'NP', 'NN-TL', 'NNS', 'NNS-TL', 'NNP',
    'NNPS', 'PRP', 'PSS', 'PPS', 'PPSS', 'WP',
]

# Negation word forms (not referenced by the functions in this notebook).
negs = ["ain't", "didn't", "don't", "not", "doesn't"]

# POS tags treated as negation markers that may sit between subject and
# predicate. Each tag is listed once: a repeated entry makes the nc_* loops
# count (and print) the same match twice.
pos_negs = ["HVD*", 'BEZ*', 'DOD*', 'BER*', 'BEM*', 'HV*', "HVZ*", "*"]

In [8]:
def pos(word):
    """Return the highest-frequency tag stored for `word` in `fd`.

    Parameters
    ----------
    word : str
        Token to look up; `fd` is keyed by lower-cased tokens.

    Returns
    -------
    str or list
        The tag with the largest value in ``fd[word]``. When the word has no
        recorded tags an empty list is returned (callers only compare tags
        for equality / membership, so the empty list never matches a tag).
    """
    # Use .get() instead of fd[word]: indexing the defaultdict would silently
    # insert an empty entry for every unknown word that is looked up.
    entries = fd.get(word)
    if not entries:
        return []
    tags = list(entries)
    freqs = [entries[tag] for tag in tags]
    # argsort(...)[-1] is kept (rather than max/argmax) so that ties between
    # equally frequent tags resolve exactly as before.
    return tags[np.argsort(freqs)[-1]]


In [9]:
def make_sent(s):
    """Tag a whitespace-separated sentence.

    Parameters
    ----------
    s : str
        Sentence text. Punctuation is not separated from words.

    Returns
    -------
    list of (token, tag) tuples, with ``tag`` taken from ``pos(token)``.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks; split(' ') produced empty-string tokens there,
    # which were then tagged and passed on to the parser.
    return [(tok, pos(tok)) for tok in s.split()]

In [10]:
def pos_words_lists(sent):
    """Split (word, tag) pairs into two parallel lists.

    Returns a tuple ``(tags, words)`` -- note the tags come first.
    """
    tags = []
    words = []
    for pair in sent:
        tags.append(pair[1])
        words.append(pair[0])
    return tags, words

In [11]:
def pos_adjacent(sent, pos1, pos2):
    """Tell whether the first `pos1` token is directly next to the first `pos2` token.

    Only the first occurrence of each tag in the sentence is considered.

    Parameters
    ----------
    sent : list of (word, tag) pairs
    pos1, pos2 : tags to locate

    Returns
    -------
    int
        1 if `pos1` immediately precedes `pos2`,
        2 if `pos1` immediately follows `pos2`,
        0 otherwise -- including when either tag does not occur in `sent`.
    """
    tags = [pair[1] for pair in sent]
    # A tag that is absent cannot be adjacent to anything; report
    # "non-adjacent" instead of letting list.index() raise ValueError.
    if pos1 not in tags or pos2 not in tags:
        return 0

    offset = tags.index(pos2) - tags.index(pos1)
    if offset == 1:
        # pos1 before pos2
        return 1
    if offset == -1:
        # pos1 after pos2
        return 2
    # nonadjacent
    return 0

In [59]:
def nc_pp(s):
    """Check subject + preposition ('IN') adjacency in sentence `s`.

    The sentence is lower-cased and tagged with make_sent(). For every tag in
    `pos_subjs` that occurs in the sentence, a match is recorded when the
    subject token is immediately followed by the 'IN' token, or by a
    `pos_negs` token that is itself immediately followed by the 'IN' token.
    A match whose subject word is in `no_nc_subjs` marks the sentence 'F'.

    Prints the token list and a 'pp: ...' status line as a side effect.

    Returns
    -------
    str
        'T' or 'F'. 'T' is returned when no 'IN' tag is present, when nothing
        matched, or when at least one match had an acceptable subject.
        NOTE(review): an acceptable match overrides an earlier 'F' from
        another subject tag -- confirm that this is intended.
    """
    sent = make_sent(s.lower())
    pos_sent, words = pos_words_lists(sent)
    print(words)
    prep_tag = 'IN'

    # This does not depend on the subject tag, so test it once up front
    # instead of on every loop iteration.
    if prep_tag not in pos_sent:
        print('pp: valid')
        return 'T'

    yes = 0
    present = 0
    validity = 'T'

    for subj_tag in pos_subjs:
        if subj_tag not in pos_sent:
            continue
        subject = words[pos_sent.index(subj_tag)]

        if pos_adjacent(sent, subj_tag, prep_tag) == 1:
            present += 1
            if subject in no_nc_subjs:
                print('pp: Invalid')
                validity = 'F'
            else:
                yes += 1
            continue

        # Allow one negation tag between the subject and the preposition.
        for neg_tag in pos_negs:
            if neg_tag not in pos_sent:
                continue
            # Logical `and`, not bitwise `&`: `a == 1 & b == 1` parses as the
            # chained comparison `a == (1 & b) == 1`.
            if (pos_adjacent(sent, subj_tag, neg_tag) == 1
                    and pos_adjacent(sent, neg_tag, prep_tag) == 1):
                present += 1
                if subject in no_nc_subjs:
                    print('pp: Invalid')
                    validity = 'F'
                else:
                    yes += 1

    if yes != 0:
        print('pp: Valid')
        validity = 'T'
    if present == 0:
        print('pp: N/A')
        validity = 'T'
    return validity


In [60]:
def nc_adj(s):
    """Check subject + adjective ('JJ') adjacency in sentence `s`.

    The sentence is lower-cased and tagged with make_sent(). For every tag in
    `pos_subjs` that occurs in the sentence, a match is recorded when the
    subject token is immediately followed by the 'JJ' token, or by a
    `pos_negs` token that is itself immediately followed by the 'JJ' token.
    A match whose subject word is in `no_nc_subjs` marks the sentence 'F'.

    Prints an 'adj: ...' status line (plus the tags or tokens in some
    branches) as a side effect.

    Returns
    -------
    str
        'T' or 'F'. 'T' is returned when no 'JJ' tag is present, when nothing
        matched, or when at least one match had an acceptable subject.
        NOTE(review): an acceptable match overrides an earlier 'F' from
        another subject tag -- confirm that this is intended.
    """
    sent = make_sent(s.lower())
    pos_sent, words = pos_words_lists(sent)
    adj_tag = 'JJ'

    # This does not depend on the subject tag, so test it once up front
    # instead of on every loop iteration.
    if adj_tag not in pos_sent:
        print('adj: valid')
        return 'T'

    yes = 0
    present = 0
    validity = 'T'

    for subj_tag in pos_subjs:
        if subj_tag not in pos_sent:
            continue
        subject = words[pos_sent.index(subj_tag)]

        if pos_adjacent(sent, subj_tag, adj_tag) == 1:
            present += 1
            if subject in no_nc_subjs:
                print('adj: Invalid')
                validity = 'F'
            else:
                yes += 1
            continue

        # Allow one negation tag between the subject and the adjective.
        for neg_tag in pos_negs:
            if neg_tag not in pos_sent:
                continue
            # Logical `and`, not bitwise `&`: `a == 1 & b == 1` parses as the
            # chained comparison `a == (1 & b) == 1`.
            if (pos_adjacent(sent, subj_tag, neg_tag) == 1
                    and pos_adjacent(sent, neg_tag, adj_tag) == 1):
                present += 1
                if subject in no_nc_subjs:
                    print(pos_sent)
                    print('adj: Invalid')
                    validity = 'F'
                else:
                    yes += 1

    if yes != 0:
        print(words)
        print('adj: Valid')
        validity = 'T'
    if present == 0:
        print('adj: N/A')
        validity = 'T'
    return validity

    

                    
        

In [61]:
def nc_np(s):
    """Check subject + noun-phrase sequences in sentence `s` using CoreNLP parses.

    Steps, in order:
      1. Parse the whole (lower-cased) sentence; if it contains fewer than two
         'NP' subtrees, return 'T' immediately.
      2. For every tag in `pos_subjs` present in the sentence:
         - return 'T' at once if that subject is immediately followed by the
           'IN' or the 'JJ' token;
         - otherwise parse the words after the subject; if the constituent
           directly under the parse root is labelled 'NP', record a match;
         - if the token right after the subject carries a `pos_negs` tag,
           repeat the parse test on the words after that negation token.
      A match whose subject word is in `no_nc_subjs` marks the sentence 'F'.

    Prints an 'np: ...' status line as a side effect. Requires a reachable
    CoreNLP server for the CoreNLPParser() client.

    Returns
    -------
    str
        'T' or 'F'. As in the sibling checks, any acceptable match (or no
        match at all) yields 'T'.
    """
    sent = make_sent(s.lower())
    pos_sent, words = pos_words_lists(sent)
    yes = 0
    present = 0
    validity = 'T'
    target_label = 'NP'
    # One parser client is enough; it is reused for every parse below.
    parser = CoreNLPParser()

    def top_label(tokens):
        """Label of the constituent directly under the root of the parse of `tokens`."""
        parsed = next(parser.raw_parse(' '.join(tokens)))
        return ParentedTree.fromstring(str(parsed))[0].label()

    full_parse = next(parser.raw_parse(' '.join(words)))
    full_tree = ParentedTree.fromstring(str(full_parse))
    np_count = sum(
        1 for subtree in full_tree.subtrees() if subtree.label() == target_label
    )
    if np_count < 2:
        print('np: valid')
        return 'T'

    for subj_tag in pos_subjs:
        if subj_tag not in pos_sent:
            continue

        # Subject directly followed by a preposition or an adjective: this
        # check does not apply, so the sentence passes.
        if 'IN' in pos_sent and pos_adjacent(sent, subj_tag, 'IN') == 1:
            print('np: valid')
            return 'T'
        if 'JJ' in pos_sent and pos_adjacent(sent, subj_tag, 'JJ') == 1:
            print('np: valid')
            return 'T'

        idx = pos_sent.index(subj_tag)
        subject = words[idx]
        # Kept in its own local so the input parameter `s` is not overwritten.
        rest = words[idx + 1:]
        if not rest:
            continue

        if top_label(rest) == target_label:
            present += 1
            if subject in no_nc_subjs:
                print('np: Invalid')
                validity = 'F'
            else:
                yes += 1

        # Same test again, skipping a negation token right after the subject.
        if pos_sent[idx + 1] in pos_negs:
            after_neg = words[idx + 2:]
            if after_neg and top_label(after_neg) == target_label:
                present += 1
                if subject in no_nc_subjs:
                    print('np:Invalid')
                    validity = 'F'
                else:
                    yes += 1

    if yes != 0:
        print('np: Valid')
        validity = 'T'
    if present == 0:
        print('np: N/A')
        validity = 'T'
    return validity
        
        

In [67]:
def nc(s):
    """Run the three checks on `s` and report whether all of them pass.

    nc_pp, nc_adj and nc_np are always called, in that order (each prints
    its own status line). Returns True only when every one returned 'T'.
    """
    verdicts = [nc_pp(s), nc_adj(s), nc_np(s)]
    return ''.join(verdicts) == 'TTT'


In [82]:
# Run the combined check on every sentence in `in_sents`; the per-check
# status lines are printed by nc() before the sentence/result summary.
for sent in in_sents:
    accepted = nc(sent)
    print('\n', sent, '\n', accepted, '\n')

['he', 'green', 'with', 'envy']
pp: N/A
['he', 'green', 'with', 'envy']
adj: Valid
np: valid

 he green with envy 
 True 

['he', 'an', 'expert']
pp: valid
adj: valid
np: Valid

 he an expert 
 True 

['she', 'liking', 'me']
pp: valid
adj: valid
np: N/A

 she liking me 
 True 

['your', 'mama', 'a', 'weightlifter']
pp: valid
adj: valid
np: Valid

 your mama a weightlifter 
 True 

['if', "i'm", 'right,', 'she', 'a', 'professor']
pp: valid
adj: valid
np: Valid

 if i'm right, she a professor 
 True 



In [83]:
# Run the combined check on every sentence in `bad_sents`; the per-check
# status lines are printed by nc() before the sentence/result summary.
for sent in bad_sents:
    accepted = nc(sent)
    print('\n', sent, '\n', accepted, '\n')

['i', 'green', 'with', 'envy']
pp: N/A
adj: Invalid
np: valid

 i green with envy 
 False 

['it', 'in', 'the', 'garden']
pp: Invalid
adj: valid
np: valid

 it in the garden 
 False 



In [None]:
# Shut down the CoreNLP server that was started with server.start() above.
server.stop()