In [1]:
import json
from lemminflect import getAllInflections
import nltk
from nltk.corpus import framenet as fn
import numpy as np
import os
import pandas as pd
import pickle as pkl
import re
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
import spacy
from ipywidgets.widgets.widget_int import IntProgress

# Make sure the "framenet_v17" NLTK data package is installed before the
# `fn` corpus reader imported above is used; NLTK reports when the package
# is already up to date and the call returns True.
nltk.download("framenet_v17")

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


True

In [13]:
class LexicalUnitClassifier:
    """Find FrameNet lexical units (LUs) in sentences and pick a frame for each.

    LU metadata and annotated exemplar sentences are read from the NLTK
    FrameNet corpus reader (``fn``) when the object is constructed. LUs that
    belong to exactly one frame are resolved directly; LUs with several
    frames are resolved either by a probability table (``rules``) or by a
    per-LU scikit-learn pipeline (``models``), both of which are produced by
    :meth:`fit` and reloaded by :meth:`load_trained_models`.

    Attributes:
        nlp: spaCy pipeline used for tokenisation, lemmas, POS and dependencies.
        lu_data: ``{lu_name: {"frames": [...], "lexemes": {...}}}``.
        rules: ``{lu_name: {frame_name: probability}}`` or ``None``.
        models: ``{lu_name: fitted pipeline}`` or ``None``.
    """

    # spaCy coarse POS tag -> POS suffix used at the end of LU names ("run.v").
    POS_MAPPING = {
        'ADJ': 'a',       # Adjective
        'ADV': 'adv',     # Adverb
        'INTJ': 'intj',   # Interjection
        'NOUN': 'n',      # Noun
        'PROPN': 'n',     # Proper noun
        'VERB': 'v',      # Verb
        'ADP': 'prep',    # Adposition (preposition and postposition)
        'AUX': 'v',       # Auxiliary verb
        'CONJ': 'c',      # Conjunction
        'CCONJ': 'c',     # Coordinating conjunction
        'SCONJ': 'scon',  # Subordinating conjunction
        'DET': 'art',     # Determiner (article)
        'NUM': 'num',     # Numeral
        'PART': 'part',   # Particle
        'PRON': 'pron',   # Pronoun
    }

    # Column order shared by the training and the prediction feature frames.
    FEATURE_COLUMNS = ["location", "relation", "head"]

    # An LU gets a trained model only if every one of its frames has at least
    # this many annotated sentences; otherwise a probability rule is stored.
    MIN_SENTENCES_PER_FRAME = 10

    def __init__(self, pretrained = True, model_directory = "../models/lexical_units"):
        """Load FrameNet data and, optionally, previously trained models.

        Args:
            pretrained: when true, call :meth:`load_trained_models` right away.
            model_directory: directory holding ``rules.json`` and ``*.pkl`` files.
        """
        self.lu_data = None
        self.models = None
        self.rules = None
        self.load_framenet()

        if pretrained:
            self.load_trained_models(model_directory)

    def load_framenet(self):
        """Populate ``self.nlp`` and ``self.lu_data`` from the FrameNet corpus.

        Every LU name maps to its lexeme description (built once, from the
        first LU object seen with that name) and to one entry per frame the
        name occurs in, each carrying the exemplar sentences that have a
        labelled target span.
        """
        self.nlp = spacy.load("en_core_web_sm")
        lexical_units = fn.lus()
        lu_names = list(set(map(lambda x: x.name, lexical_units)))
        self.lu_data = { key: { "frames": [] } for key in lu_names }
        progress = IntProgress(0, 0, len(lexical_units))
        display(progress)
        for lu in lexical_units:
            entry = self.lu_data[lu.name]
            if "lexemes" not in entry:
                entry["lexemes"] = self._build_lexemes(lu.name, lu.lexemes)
            entry["frames"].append(self._collect_frame(lu))
            progress.value += 1

    def _build_lexemes(self, name, lexemes):
        """Return ``{"lemmas": [...], "consecutive": bool}`` for one LU name.

        When the name carries a bracketed part, e.g. ``"(can't) help.v"``, the
        words inside the brackets are lemmatised and expanded to all of their
        inflections (joined with ``/`` as alternatives). They are added after
        the lexeme lemmas when the closing bracket sits directly before the
        first ``.``, and in front of them otherwise.
        """
        info = {
            "lemmas": [ lexeme["name"].lower() for lexeme in lexemes ],
            "consecutive": all(lexeme["breakBefore"] == "false" for lexeme in lexemes)
        }
        if "(" not in name and "[" not in name:
            return info

        tmp_name = name.replace("[", "(").replace("]", ")")
        match = re.search(r'\(.*?\)', tmp_name)
        if match is None:
            # Unbalanced bracket: there is no parenthetical to expand.
            return info

        lemmas = []
        for token in self.nlp(match.group(0)):
            lemma = token.lemma_.lower()
            if lemma not in ["(", ")"]:
                forms = { form for vals in getAllInflections(lemma).values() for form in vals }
                forms.add(lemma)
                lemmas.append("/".join(sorted(forms)))

        # A bare "/" token separates two alternatives ("a / b"): merge its
        # neighbours into a single slot. A separator at either edge has
        # nothing to merge with and is simply dropped.
        if "/" in lemmas:
            index = lemmas.index("/")
            if 0 < index < len(lemmas) - 1:
                lemmas[index - 1] = "{}/{}".format(lemmas[index - 1], lemmas[index + 1])
                del lemmas[index:index + 2]
            else:
                lemmas.pop(index)

        if tmp_name.find(".") - tmp_name.index(")") == 1:
            info["lemmas"] = info["lemmas"] + lemmas
        else:
            info["lemmas"] = lemmas + info["lemmas"]
        return info

    @staticmethod
    def _collect_frame(lu):
        """Return ``{"name": frame name, "sentences": [...]}`` for one LU object.

        Each kept sentence records its text, the start/end of the first label
        on its "Target" layer and every "FE" label that has a ``start``.
        Exemplars without a target label are skipped.
        """
        frame = {
            "name": lu.frame.name,
            "sentences": []
        }
        for exemplar in lu.exemplars:
            record = {
                "text": exemplar.text,
                "fe": []
            }
            target_found = False
            for aset in exemplar.annotationSet:
                for layer in aset.layer:
                    if layer.name == "Target":
                        if len(layer.label) > 0:
                            label = layer.label[0]
                            record["start"] = label["start"]
                            record["end"] = label["end"]
                            target_found = True
                    elif layer.name == "FE":
                        for label in layer.label:
                            if "start" in label.keys():
                                record["fe"].append({
                                    "name": label["name"],
                                    "start": label["start"],
                                    "end": label["end"]
                                })
            if target_found:
                frame["sentences"].append(record)
        return frame

    def load_trained_models(self, directory = "../models/lexical_units"):
        """Load ``rules.json`` and every ``*.pkl`` pipeline from ``directory``.

        :meth:`fit` stores each model as ``<lu name with '.' -> '_'>.pkl``, so
        the LU name is recovered by mapping ``_`` back to ``.``; this assumes
        LU names themselves contain no underscore.

        Raises:
            FileNotFoundError: if ``directory`` or ``rules.json`` is missing.
        """
        self.models = {}
        with open("{}/rules.json".format(directory)) as rules_file:
            self.rules = json.load(rules_file)
        for filename in os.listdir(directory):
            stem, extension = os.path.splitext(filename)
            if extension == ".pkl":
                # NOTE(security): unpickling runs arbitrary code; only point
                # this at a directory written by fit() or otherwise trusted.
                with open(os.path.join(directory, filename), "rb") as model_file:
                    self.models[stem.replace("_", ".")] = pkl.load(model_file)

    def get_word_lu(self, word, pos):
        """Return the LU name for ``word``/``pos``, or ``None`` if unresolved.

        A single LU name starting with the lower-cased word is returned as is.
        With several candidates, ``word + pos`` is tried first (for callers
        that pass the suffix with its dot, e.g. ``".v"``), then the canonical
        ``"<word>.<pos>"`` form.
        """
        prefix = word.lower()
        possible_lus = [ name for name in self.lu_data.keys() if name.startswith(prefix) ]
        if len(possible_lus) == 0:
            return None
        if len(possible_lus) == 1:
            return possible_lus[0]
        for candidate in (word + pos, "{}.{}".format(prefix, pos)):
            if candidate in possible_lus:
                return candidate
        return None

    def pos_tag(self, sentence, doc = None):
        """Return ``(lower-cased lemma, LU-style POS)`` pairs for a sentence.

        Tokens whose spaCy POS tag is not in ``POS_MAPPING`` are left out.

        Args:
            sentence: raw text; only parsed when ``doc`` is not given.
            doc: an already parsed spaCy document for ``sentence``.
        """
        if doc is None:
            doc = self.nlp(sentence)
        return [
            (token.lemma_.lower(), self.POS_MAPPING[token.pos_])
            for token in doc
            if token.pos_ in self.POS_MAPPING
        ]

    def annotated_features(self, sentence):
        """Build the training features for one annotated exemplar sentence.

        Args:
            sentence: dict with ``"text"`` and the target's ``"start"`` offset.

        Returns:
            ``[relative token position, dependency label, head lemma]`` of the
            token that starts exactly at ``sentence["start"]``, or
            ``[None, None, None]`` when no token starts there.
        """
        # Ideas for further features: presence/absence of specific words,
        # dependency labels of the target's children, surrounding words, word
        # count, named entities, POS counts.
        doc = self.nlp(sentence["text"])
        target = next((token for token in doc if token.idx == sentence["start"]), None)
        if target is None:
            return [None, None, None]
        return [target.i / len(doc), target.dep_, target.head.lemma_]

    def prediction_features(self, sentence, tokens, doc = None):
        """Build the prediction features for a matched LU.

        The first token of the sentence whose lower-cased lemma equals
        ``tokens[0]`` supplies the features.

        Returns:
            ``[relative token position, dependency label, head lemma]``, or
            ``[None, None, None]`` when ``tokens`` is empty.

        Raises:
            Exception: if no token of the sentence has that lemma.
        """
        if len(tokens) == 0:
            return [None, None, None]

        if doc is None:
            doc = self.nlp(sentence)

        for token in doc:
            if token.lemma_.lower() == tokens[0]:
                return [token.i / len(doc), token.dep_, token.head.lemma_]

        raise Exception("Tokens '{}' not found in sentence '{}'".format(tokens, sentence))

    @staticmethod
    def _match_consecutive(lemmas, token_pos):
        """Find the first run of adjacent tokens matching ``lemmas`` in order.

        Every start position is tried, so a partial match early in the
        sentence does not hide a complete match later on.

        Returns:
            ``(matched lemmas, their POS tags)`` or ``None``.
        """
        width = len(lemmas)
        alternatives = [ entry.split("/") for entry in lemmas ]
        for start in range(len(token_pos) - width + 1):
            window = token_pos[start:start + width]
            if all(lemma in options for (lemma, _), options in zip(window, alternatives)):
                return [ lemma for lemma, _ in window ], [ pos for _, pos in window ]
        return None

    @staticmethod
    def _match_anywhere(lemmas, token_pos):
        """Match every entry of ``lemmas`` against any token, order ignored.

        Returns:
            ``(matched lemmas, their POS tags)`` or ``None`` as soon as one
            entry has no matching token.
        """
        tokens = []
        possible_pos = []
        for word in lemmas:
            options = word.split("/")
            for lemma, pos in token_pos:
                if lemma in options:
                    tokens.append(lemma)
                    possible_pos.append(pos)
                    break
            else:
                return None
        return tokens, possible_pos

    def find_lus(self, sentence, doc = None):
        """Return ``(lu_name, matched_lemmas)`` for every LU found in a sentence.

        Multi-word LUs flagged as consecutive must match adjacent entries of
        the POS-tagged token list; all other LUs only need each of their
        lemmas to occur somewhere. In both cases the POS suffix of the LU name
        must be the POS of at least one matched token.
        """
        token_pos = self.pos_tag(sentence, doc)
        possible_lus = []
        for lu, values in self.lu_data.items():
            lu_pos = lu.split(".")[-1]
            lemmas = values["lexemes"]["lemmas"]
            if values["lexemes"]["consecutive"] and len(lemmas) > 1:
                matched = self._match_consecutive(lemmas, token_pos)
            else:
                matched = self._match_anywhere(lemmas, token_pos)
            if matched is None:
                continue
            tokens, possible_pos = matched
            if lu_pos in possible_pos:
                possible_lus.append((lu, tokens))
        return possible_lus

    def predict_frame(self, sentence, lu, tokens, doc = None):
        """Predict the frame of one ambiguous LU occurrence.

        LUs with a rule get a frame sampled from the stored probabilities
        (unseeded ``np.random.choice``, so repeated calls may differ). LUs
        with a trained pipeline are classified from their features.

        Returns:
            the frame name, or ``None`` when the LU is unknown or its
            features could not be computed.
        """
        if self.rules is not None and lu in self.rules:
            probs = self.rules[lu]
            return np.random.choice(list(probs.keys()), p = list(probs.values()))

        if self.models is not None and lu in self.models:
            features = self.prediction_features(sentence, tokens, doc)
            if any(value is None for value in features):
                return None
            # One observation -> one row. Passing the flat feature list would
            # be read as three rows of one column and clash with `columns`.
            X = pd.DataFrame([features], columns = self.FEATURE_COLUMNS)
            return self.models[lu].predict(X)[0]

        return None

    @staticmethod
    def _build_pipeline(random_state):
        """Return an unfitted one-hot + decision-tree pipeline."""
        cat_pipeline = Pipeline([
            ("ohe", OneHotEncoder(handle_unknown = "ignore"))
        ])
        # NOTE(review): only "relation" and "head" are listed here, so the
        # "location" column is not forwarded to the tree - confirm intended.
        col_transformer = ColumnTransformer([
            ("cat", cat_pipeline, ["relation", "head"])
        ])
        return Pipeline([
            ("preprocessing", col_transformer),
            ("model", DecisionTreeClassifier(random_state = random_state))
        ])

    def fit(self, output_dir = "../models/lexical_units", random_state = None):
        """Train the frame disambiguators and persist them to ``output_dir``.

        Existing ``*.pkl`` files in ``output_dir`` are deleted first. For
        every LU with more than one frame: if any frame has fewer than
        ``MIN_SENTENCES_PER_FRAME`` exemplars, a probability rule proportional
        to the exemplar counts is stored (uniform when there are none);
        otherwise a pipeline is fitted on the exemplar features and pickled.
        All rules are written to ``rules.json``.

        Args:
            output_dir: target directory; created if it does not exist.
            random_state: forwarded to ``DecisionTreeClassifier``.
        """
        os.makedirs(output_dir, exist_ok = True)

        # Delete old pkl files in output_dir
        for file in os.listdir(output_dir):
            if file.endswith(".pkl"):
                os.remove(os.path.join(output_dir, file))

        if self.lu_data is None:
            self.load_framenet()

        self.rules = {}
        self.models = {}
        progress = IntProgress(0, 0, len(self.lu_data))
        display(progress)
        for lu, values in self.lu_data.items():
            frames = values["frames"]
            if len(frames) > 1:
                frame_counts = np.array([ len(frame["sentences"]) for frame in frames ])

                if frame_counts.min() < self.MIN_SENTENCES_PER_FRAME:
                    total = frame_counts.sum()
                    if total == 0:
                        probs = np.full(len(frame_counts), 1 / len(frame_counts))
                    else:
                        probs = frame_counts / total
                    self.rules[lu] = { frame["name"]: float(prob) for frame, prob in zip(frames, probs) }
                else:
                    examples = []
                    labels = []
                    for frame in frames:
                        for sentence in frame["sentences"]:
                            examples.append(sentence)
                            labels.append(frame["name"])
                    features = [ self.annotated_features(example) for example in examples ]
                    X = pd.DataFrame(features, columns = self.FEATURE_COLUMNS)

                    pipeline = self._build_pipeline(random_state)
                    pipeline.fit(X, labels)
                    model_path = "{}/{}.pkl".format(output_dir, lu.replace(".", "_"))
                    with open(model_path, "wb") as model_file:
                        pkl.dump(pipeline, model_file)
                    self.models[lu] = pipeline
            progress.value += 1

        with open("{}/rules.json".format(output_dir), "w") as rules_file:
            json.dump(self.rules, rules_file, indent = 4)

    def predict(self, sentences, model_dir = None):
        """Predict ``(lu_name, frame_name)`` pairs for each sentence.

        Trained models are loaded on demand (from ``model_dir`` or the
        default directory) when none are in memory yet.

        Args:
            sentences: iterable of raw sentence strings.
            model_dir: optional directory for the on-demand model load.

        Returns:
            one list of ``(lu_name, frame_name)`` tuples per input sentence;
            LUs whose frame cannot be predicted are omitted.
        """
        if getattr(self, "models", None) is None:
            if model_dir is None:
                self.load_trained_models()
            else:
                self.load_trained_models(model_dir)

        predictions = []
        for sentence in sentences:
            doc = self.nlp(sentence)
            curr = []
            for lu, tokens in self.find_lus(sentence, doc):
                possible_frames = self.lu_data[lu]["frames"]
                if len(possible_frames) == 1:
                    curr.append((lu, possible_frames[0]["name"]))
                else:
                    frame = self.predict_frame(sentence, lu, tokens, doc)
                    if frame is not None:
                        curr.append((lu, frame))
            predictions.append(curr)

        return predictions

In [14]:
# Build the classifier: loads FrameNet and the previously saved rules/models.
model = LexicalUnitClassifier(pretrained = True)

IntProgress(value=0, max=13572)

In [15]:
# Retrain every per-LU model and overwrite the saved files. A fixed seed is
# passed to the decision trees so the persisted models are the same on
# every "Restart & Run All".
model.fit(random_state = 42)

IntProgress(value=0, max=10462)

In [18]:
# Load the exemplar table (columns: Lexical Unit, Frame, Sentence, Sentence Count).
df = pd.read_csv("../datasets/lexical_unit_sentences.csv")

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() only rendered a stray "None".
df.info()
display(df.describe())
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204022 entries, 0 to 204021
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Lexical Unit    204022 non-null  object
 1   Frame           204022 non-null  object
 2   Sentence        200751 non-null  object
 3   Sentence Count  204022 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 6.2+ MB


None

Unnamed: 0,Sentence Count
count,204022.0
mean,47.642269
std,52.379088
min,0.0
25%,19.0
50%,33.0
75%,59.0
max,547.0


Unnamed: 0,Lexical Unit,Frame,Sentence,Sentence Count
0,(can't) help.v,Self_control,"` Not if I can help it . """,11
1,(can't) help.v,Self_control,"And now she took a better look at him , Folly ...",11
2,(can't) help.v,Self_control,"` I could n't help feeling that … well , in yo...",11
3,(can't) help.v,Self_control,"Yet , looking into those liquid dark eyes , Fr...",11
4,(can't) help.v,Self_control,She could n't help the tinge of pink that floo...,11
...,...,...,...,...
204017,zone.n,Locale,Dubai 10-28 ( FP ) - Dubai 's Crown Prince She...,32
204018,zone.n,Locale,A Turbo Cat ferry makes a one - hour trip ( 7 ...,32
204019,zone.n,Locale,"Macau , now the Chinese Special Economic Zone ...",32
204020,zonk out.v,Fall_asleep,,0


In [19]:
# Run the classifier on a single exemplar sentence (row label 1) and show
# the (lexical unit, frame) pairs it finds.
sentences = [df.loc[1, "Sentence"]]
predicted_frames = model.predict(sentences)

for text, frames in zip(sentences, predicted_frames):
    print(text)
    display(frames)
    print()

And now she took a better look at him , Folly could n't help noticing the strong , muscular lines of the broad back under that white shirt .


[('muscular.a', 'Body_description_holistic'),
 ('of.prep', 'Partitive'),
 ('look.n', 'Facial_expression'),
 ('under.prep', 'Non-gradable_proximity'),
 ('white.a', 'Color'),
 ('could.v', 'Possibility'),
 ("(can't) help.v", 'Self_control'),
 ('now.adv', 'Temporal_collocation'),
 ('broad.a', 'Measurable_attributes'),
 ('take.v', 'Removing'),
 ('help.v', 'Assistance'),
 ('at.prep', 'Locative_relation'),
 ('strong.a', 'Level_of_force_exertion'),
 ('line.n', 'Roadways'),
 ('shirt.n', 'Clothing'),
 ('notice.v', 'Becoming_aware')]


