In [1]:
import json
from lemminflect import getAllInflections, getAllLemmas
import nltk
from nltk.corpus import framenet as fn
import numpy as np
import os
import pandas as pd
import pickle as pkl
import re
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
import spacy
from tqdm import tqdm

nltk.download("framenet_v17")

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


True

In [2]:
class LexicalUnitClassifier:
    def __init__(self, reset_framenet = False, pretrained = True, model_directory = "../models"):
        self.lu_data = None
        self.models = None
        self.load_framenet(reset_framenet, model_directory)
            
        if pretrained:
            self.load_trained_models(model_directory)
    
    # Load relevant framenet data
    def load_framenet(self, reset = False, directory = "../models"):
        self.nlp = spacy.load("en_core_web_sm")
        
        # Load framenet from file is specified and file exists
        if not reset:
            filename = os.path.join(directory, "framenet.json")
            if os.path.isfile(filename):
                self.lu_data = json.load(open(filename))
                return
            else:
                print("Framenet file not found in directory `{}`. Resetting framenet...".format(directory))
                
        # Get all lexical units
        lexical_units = fn.lus()
        lu_names = list(set(map(lambda x: x.name.split(".")[0], lexical_units)))
        self.lu_data = { key: { "frames": [], "pos": [], "no_frame": [] } for key in lu_names }
        all_sentences = set()
        
        # Iterate through all lexical units
        for lu in tqdm(lexical_units):
            lu_split = lu.name.split(".")
            name = lu_split[0]
            pos = lu_split[1]
            if pos not in self.lu_data[name]["pos"]:
                self.lu_data[name]["pos"].append(pos)
            # Add lexemes if not already defined
            if "lexemes" not in self.lu_data[name].keys():
                self.lu_data[name]["lexemes"] = {
                    "consecutive": all(list(map(lambda x: x["breakBefore"] == "false", lu.lexemes)))
                }
                lu_lemmas = []
                for lexeme in lu.lexemes:
                    lex = lexeme["name"].lower()
                    lu_lemmas.append("/".join(list(set([ x for vals in getAllLemmas(lex).values() for x in vals ] + [ lex ]))))
                self.lu_data[name]["lexemes"]["lemmas"] = lu_lemmas
                # Add words in () or [] to lexemes
                if "(" in name or "[" in name:
                    # Extract substring in brackets
                    tmp_name = name.replace("[", "(").replace("]", ")")
                    substr = re.findall(r'\(.*?\)', tmp_name)[0]
                    # Get all lemmas in tokenized substring
                    lemmas = []
                    for token in self.nlp(substr):
                        lemma = token.lemma_.lower()
                        if lemma not in ["(", ")"]:
                            # Get all inflections of each lemma
                            lemmas.append("/".join(list(set([ x for vals in getAllInflections(lemma).values() for x in vals ] + [ lemma ]))))
                    # If lu contains a /, save words as the same lexeme
                    if "/" in lemmas:
                        index = lemmas.index("/")
                        lemmas[index - 1] = "{}/{}".format(lemmas[index - 1], lemmas[index + 1])
                        lemmas.pop(index)
                        lemmas.pop(index)
                    # Add lemmas to beginning or end based on where the close bracket is
                    if tmp_name.index(")") == len(tmp_name) - 1:
                        self.lu_data[name]["lexemes"]["lemmas"] = self.lu_data[name]["lexemes"]["lemmas"] + lemmas
                    else:
                        self.lu_data[name]["lexemes"]["lemmas"] = lemmas + self.lu_data[name]["lexemes"]["lemmas"]
                
            found = False    
            for i, frame in enumerate(self.lu_data[name]["frames"]):
                # If lu already has frames, add sentences to frame
                if frame["name"] == lu.frame.name:
                    found = True
                    # Iterate through all sentences that include lu
                    for sentence in lu.exemplars:
                        targetFound, curr2 = self.framenet_sentence(sentence)
                        # Add sentence if a target was found
                        if targetFound:
                            self.lu_data[name]["frames"][i]["sentences"].append(curr2)
                    break
                
            if not found:
                curr = {
                    "name": lu.frame.name,
                    "sentences": []
                }
                # Iterate through all sentences that include lu
                for sentence in lu.exemplars:
                    all_sentences.add(sentence.text)
                    targetFound, curr2 = self.framenet_sentence(sentence)
                    # Add sentence if a target was found
                    if targetFound:
                        curr["sentences"].append(curr2)
                # Store sentences
                self.lu_data[name]["frames"].append(curr)
           
        # Remove all LUs with fewer than 10 example sentences
        for lu in lu_names:
            frames = self.lu_data[lu]["frames"]
            # Get all example sentences and the number of sentences per frame
            frame_counts = np.array(list(map(lambda x: len(x["sentences"]), frames)))
            if max(frame_counts) < 10:
                del self.lu_data[lu]
                
        # Populate no frame sentences for remaining LUs 
        self.load_no_frames(list(all_sentences))
        # Save framenet data to file
        filename = os.path.join(directory, "framenet.json")
        json.dump(self.lu_data, open(filename, "w"), indent = 4)
        
    def load_no_frames(self, all_sentences):
        # Iterate through SpaCy parsed sentences
        for i, doc in enumerate(tqdm(self.nlp.pipe(all_sentences), total = len(all_sentences))):
            # Get LUs in sentence
            lus = self.find_lus(all_sentences[i], doc)
            # If LU is not tagged with a frame, add it to no_frame
            for lu, tokens in lus:
                if all(all_sentences[i] != example["text"] for frame in self.lu_data[lu]["frames"] for example in frame["sentences"]):
                    self.lu_data[lu]["no_frame"].append({ "text": all_sentences[i], "tokens": tokens })
        
    def framenet_sentence(self, sentence):
        curr2 = {
            "text": sentence.text,
            "fe": []
        }
        
        # Extract target and frame elements from sentence
        targetFound = False
        for aset in sentence.annotationSet:
            for layer in aset.layer:
                if layer.name == "Target":
                    if len(layer.label) > 0:
                        label = layer.label[0]
                        curr2["start"] = label["start"]
                        curr2["end"] = label["end"]
                        targetFound = True
                elif layer.name == "FE":
                    for label in layer.label:
                        if "start" in label.keys():
                            curr2["fe"].append({
                                "name": label["name"],
                                "start": label["start"],
                                "end": label["end"]
                            })
                            
        return targetFound, curr2
                
    # Load models and probabilities from files in a directory
    def load_trained_models(self, directory = "../models"):
        self.models = {}
        for filename in os.listdir(directory):
            comps = filename.split(".")
            if comps[1] == "pkl":
                self.models[comps[0]] = pkl.load(open(os.path.join(directory, filename), "rb"))
            
    def pos_tag(self, sentence, doc = None):
        # Create spacy parser if one has not already been created
        if doc is None:
            doc = self.nlp(sentence)
            
        # Translations from spacy POS tags to framenet suffixes
        pos_mapping = {
            'ADJ': 'a', # Adjective
            'ADV': 'adv', # Adverb
            'INTJ': 'intj', # Interjection
            'NOUN': 'n', # Noun
            'PROPN': 'n', # Proper noun
            'VERB': 'v', # Verb
            'ADP': 'prep', # Adposition (preposition and postposition)
            'AUX': 'v', # Auxiliary verb
            'CONJ': 'c', # Conjunction
            'CCONJ': 'c', # Coordinating conjunction
            'SCONJ': 'scon', # Subordinating conjunction
            'DET': 'art', # Determiner (article)
            'NUM': 'num', # Numeral
            'PART': 'part', # Particle
            'PRON': 'pron' # Pronoun
        }
        
        # Convert all spacy tags to framenet suffixes in the text
        results = []
        for token in doc:
            if token.pos_ in pos_mapping.keys():
                results.append((token.lemma_.lower(), pos_mapping[token.pos_]))
                
        return results
    
    # Get the features of an annotated sentence for training
    def annotated_features(self, sentence):
        doc = self.nlp(sentence["text"])
        
        if "start" in sentence.keys():
            tokens = []
            # Get target tokens
            for token in doc:
                if token.idx == sentence["start"]:
                    tokens.append(token)
                    break
            
            # Return features of target if found or None
            if len(tokens) == 0:
                features = [None, None, None]
            else:
                features = [
                    tokens[0].i / len(doc), tokens[0].dep_, tokens[0].head.lemma_
                ]
            
            return features
        else:
            return self.prediction_features(sentence["text"], sentence["tokens"], doc)
    
    # Get the features of an unannotated sentence for prediction
    def prediction_features(self, sentence, tokens, doc = None):
        # Return None if no tokens given
        if len(tokens) == 0:
            return [None, None, None]
        
        # Parse sentence with spacy
        if doc == None:
            doc = self.nlp(sentence)
        
        # Return features of tokens
        for token in doc:
            if token.lemma_.lower() == tokens[0]:
                return [token.i / len(doc), token.dep_, token.head.lemma_]
                
        raise Exception("Tokens '{}' not found in sentence '{}'".format(tokens, sentence))
    
    # Determine all of the lus that are in a sentence
    def find_lus(self, sentence, doc = None):
        # Get POS tags
        token_pos = self.pos_tag(sentence, doc)
        # Iterate through all lus
        possible_lus = []
        for lu, values in self.lu_data.items():
            # Check for multiple consecutive lexemes in the sentence
            if values["lexemes"]["consecutive"] and len(values["lexemes"]["lemmas"]) > 1:
                index = 0
                tokens = []
                # Iterate through tokens
                for lemma, _ in token_pos:
                    # If token is the next lexeme, increment index
                    if any(lemma == w for w in values["lexemes"]["lemmas"][index].split("/")):
                        index += 1
                        tokens.append(lemma)
                        # Stop if this was the last lexeme
                        if index == len(values["lexemes"]["lemmas"]):
                            break
                    # If first lexeme was found and this was not the next lexeme, stop
                    elif index > 0:
                        break
                # If all lexemes were found, add lu
                if index == len(values["lexemes"]["lemmas"]):
                    found = False
                    for i, (_, prev_tokens) in enumerate(possible_lus):
                        # Check if all lemmas in lu already in a lu (avoid duplicates)
                        if len(prev_tokens) > 1 and all(token in prev_tokens for token in tokens):
                            found = True
                            break
                        # Check if all lemmas in previous lu in lu (prev lu should be replaced)
                        elif len(tokens) > 1 and all(token in tokens for token in prev_tokens):
                            found = True
                            possible_lus[i] = (lu, tokens)
                            break
                    if not found:
                        possible_lus.append((lu, tokens))
            # Check for 1 or more nonconsecutive lexemes in the sentence
            else:
                matchCount = 0
                tokens = []
                # Iterate through lexemes and tokens
                for word in values["lexemes"]["lemmas"]:
                    for lemma, _ in token_pos:
                        # If lexeme is in the sentence, increment count and move on to next lexeme
                        if any(w == lemma for w in word.split("/")):
                            matchCount += 1
                            tokens.append(lemma)
                            break
                # If all lexemes were found, add lu
                if matchCount == len(values["lexemes"]["lemmas"]):
                    found = False
                    for i, (_, prev_tokens) in enumerate(possible_lus):
                        # Check if all lemmas in lu already in a lu (avoid duplicates)
                        if len(prev_tokens) > 1 and all(token in prev_tokens for token in tokens):
                            found = True
                            break
                        # Check if all lemmas in previous lu in lu (prev lu should be replaced)
                        elif len(tokens) > 1 and all(token in tokens for token in prev_tokens):
                            found = True
                            possible_lus[i] = (lu, tokens)
                            break
                    if not found:
                        possible_lus.append((lu, tokens))
        return possible_lus

    # Predict which of a lexical unit's possible frames is used in a given sentence
    def predict_frame(self, sentence, lu, tokens, doc = None):
        # If decision tree is defined, predict with tree
        if lu in self.models.keys():
            features = self.prediction_features(sentence, tokens, doc)
            if None in features:
                return None
            X = pd.DataFrame([features], columns = ["location", "relation", "head"])
            pred_frame = self.models[lu].predict(X)[0]
        # Else return no frame
        else:
            return None
        
        # Return None if no frame
        if pred_frame == "N/A":
            return None
        
        return pred_frame
    
    # Train models and probabilities on framenet example sentences
    def fit(self, output_dir = "../models", random_state = None):
        # Delete old pkl files in output_dir
        for file in os.listdir(output_dir):
            if file.endswith(".pkl"):
                os.remove(os.path.join(output_dir, file))
            
        self.models = {}
        # Iterate through lus
        for lu, values in tqdm(self.lu_data.items()):
            # If lu has more than 1 frame, we need to create probabilities or a model
            frames = values["frames"]
            # Get all example sentences and the number of sentences per frame
            frame_counts = np.array(list(map(lambda x: len(x["sentences"]), frames)))
            
            # Train a decision tree to predict the frame
            # Store sentences and frame labels to train on
            data = {
                "sentences": [],
                "labels": []
            }
            for frame in frames:
                for sentence in frame["sentences"]:
                    data["sentences"].append(sentence)
                    data["labels"].append(frame["name"])
            # Extract features from annotated example sentences
            features = list(map(lambda x: self.annotated_features(x), data["sentences"]))
            # Get no frame sentences
            no_frames = self.lu_data[lu]["no_frame"]
            if len(no_frames) > 0:
                # Undersample to match most common class
                selected_no_frames = np.random.choice(no_frames, size = max(frame_counts))
                # Extract features from no frame sentences
                for curr in selected_no_frames:
                    # Add no frame features to features
                    features.append(self.prediction_features(curr["text"], curr["tokens"]))
                    data["labels"].append("N/A")
            # Store features in data frame
            X = pd.DataFrame(features, columns = ["location", "relation", "head"])
        
            # Pipeline to one-hot-encode categorical features
            cat_pipeline = Pipeline([
                ("ohe", OneHotEncoder(handle_unknown = "ignore"))
            ])
            col_transformer = ColumnTransformer([
                ("cat", cat_pipeline, ["relation", "head"])
            ])
            pipeline = Pipeline([
                ("preprocessing", col_transformer),
                ("model", DecisionTreeClassifier(random_state = random_state))
            ])
            
            # Fit decision tree and store in pickle file
            pipeline.fit(X, data["labels"])
            pkl.dump(pipeline, open("{}/{}.pkl".format(output_dir, lu), "wb"))
            self.models[lu] = pipeline
    
    def predict(self, sentences, model_dir = None):
        # Load models if not already loaded
        if not hasattr(self, "models") or self.models is None:
            if model_dir is None:
                self.load_trained_models()
            else:
                self.load_trained_models(model_dir)
                
        predictions = []
        # Iterate through sentences to predict
        for i, doc in enumerate(tqdm(self.nlp.pipe(sentences), total = len(sentences))):
            sentence = sentences[i]
            # Identify all lexical units in this sentence
            possible_lus = self.find_lus(sentence, doc)
            curr = []
            # Iterate through lexical units
            for lu, tokens in possible_lus:
                # Get the possible frames for each lexical unit
                possible_frames = self.lu_data[lu]["frames"]
                # If there is only one frame, assign it to the sentence
                if len(possible_frames) == 1:
                    curr.append((lu, possible_frames[0]["name"]))
                # If there is more than one frame, predict which one to use
                else:
                    frame = self.predict_frame(sentence, lu, tokens, doc)
                    if frame is not None:
                        curr.append((lu, frame))
            # Store predicted frames with the target lexical units for this sentence
            predictions.append(curr)
            
        return predictions

In [3]:
model = LexicalUnitClassifier(True)

100%|██████████| 13572/13572 [02:31<00:00, 89.30it/s] 
100%|██████████| 162279/162279 [1:41:34<00:00, 26.63it/s]    


In [4]:
model.fit()

  0%|          | 6/4680 [00:02<35:34,  2.19it/s]

100%|██████████| 4680/4680 [1:11:47<00:00,  1.09it/s]


In [5]:
df = pd.read_csv("../datasets/lexical_unit_sentences.csv")

display(df.info())
display(df.describe())
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204022 entries, 0 to 204021
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Lexical Unit    204022 non-null  object
 1   Frame           204022 non-null  object
 2   Sentence        200751 non-null  object
 3   Sentence Count  204022 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 6.2+ MB


None

Unnamed: 0,Sentence Count
count,204022.0
mean,47.642269
std,52.379088
min,0.0
25%,19.0
50%,33.0
75%,59.0
max,547.0


Unnamed: 0,Lexical Unit,Frame,Sentence,Sentence Count
0,(can't) help.v,Self_control,"` Not if I can help it . """,11
1,(can't) help.v,Self_control,"And now she took a better look at him , Folly ...",11
2,(can't) help.v,Self_control,"` I could n't help feeling that … well , in yo...",11
3,(can't) help.v,Self_control,"Yet , looking into those liquid dark eyes , Fr...",11
4,(can't) help.v,Self_control,She could n't help the tinge of pink that floo...,11
...,...,...,...,...
204017,zone.n,Locale,Dubai 10-28 ( FP ) - Dubai 's Crown Prince She...,32
204018,zone.n,Locale,A Turbo Cat ferry makes a one - hour trip ( 7 ...,32
204019,zone.n,Locale,"Macau , now the Chinese Special Economic Zone ...",32
204020,zonk out.v,Fall_asleep,,0


In [6]:
sentences = [
    df["Sentence"][1],
    df[df["Lexical Unit"] == "yelp.v"].reset_index(drop = True)["Sentence"][0]
]
predicted_frames = model.predict(sentences)

for i in range(len(sentences)):
    print(sentences[i])
    display(predicted_frames[i])
    print()

100%|██████████| 2/2 [00:00<00:00,  6.01it/s]

And now she took a better look at him , Folly could n't help noticing the strong , muscular lines of the broad back under that white shirt .





[('look', 'Perception_active'),
 ('lined', 'Abounding_with'),
 ('now', 'Temporal_collocation'),
 ("(can't) help", 'Self_control'),
 ('shirt', 'Clothing'),
 ('line', 'Roadways'),
 ('take', 'Removing'),
 ('under', 'Non-gradable_proximity'),
 ('notice', 'Becoming_aware'),
 ('strong', 'Level_of_force_resistance'),
 ('lining', 'Part_inner_outer'),
 ('muscular', 'Body_description_holistic')]


Face red , chest puffed with indignation , young John would yelp : ` I assure you quite categorically that I never touched the ball . 


[('ball', 'Shapes'),
 ('young', 'Age'),
 ('never', 'Negation'),
 ('puff', 'Ingest_substance'),
 ('touch', 'Quantified_mass'),
 ('yelp', 'Communication_noise')]




In [7]:
print(df["Lexical Unit"][1], "|", df["Frame"][1])

(can't) help.v | Self_control


In [8]:
df[df["Lexical Unit"] == "yelp.v"].reset_index(drop = True)["Frame"][0]

'Communication_noise'

In [9]:
# Load example sentences
all_sentences = {}
for sentence in tqdm(fn.exemplars()):
    text = sentence.text
    if text in all_sentences.keys():
        all_sentences[text].append(( sentence.LU.name, sentence.frame.name ))
    else:
        all_sentences[text] = [( sentence.LU.name, sentence.frame.name )]

100%|██████████| 200751/200751 [00:06<00:00, 32496.74it/s]


In [10]:
all_text = list(all_sentences.keys())
all_predictions = model.predict(all_text)

100%|██████████| 171665/171665 [2:06:52<00:00, 22.55it/s]   


In [11]:
correct = 0
total = 0
found_total = 0
extra_predictions = 0
lu_results = {}
for i, (text, frames) in enumerate(all_sentences.items()):
    if i == len(all_predictions):
        break
    extra_predictions += len(list(filter(lambda x: x[1] not in list(map(lambda y: y[1], frames)), all_predictions[i])))
    for frame in frames:
        total += 1
        lu = frame[0].split(".")[0]
        if lu not in lu_results.keys():
            lu_results[lu] = {
                "total": 1,
                "predictions": [],
                "texts": []
            }
        else:
            lu_results[lu]["total"] += 1
        
        predicted_lu = list(filter(lambda x: x[0] == lu, all_predictions[i]))
        predicted_frame = list(filter(lambda x: x[1] == frame[1], all_predictions[i]))
        predicted = predicted_lu + predicted_frame
        if len(predicted) == 0:
            pred_frame = None
        else:
            pred_frame = predicted[0][1]
        lu_results[lu]["predictions"].append({ "actual": frame[1], "predicted": pred_frame })
        lu_results[lu]["texts"].append(all_text[i])
        
        if len(predicted_frame) > 0:
            correct += 1
            found_total += 1
        elif pred_frame != None:
            found_total += 1
       
print("{}/{} correct found predictions, Accuracy = {}".format(correct, found_total, correct / found_total)) 
print("{}/{} correct total predictions, Accuracy = {}".format(correct, total, correct / total))
print("{} extra predictions in {} sentences = {} extra predictions per sentence".format(extra_predictions, len(all_predictions), extra_predictions / len(all_predictions)))

169791/177608 correct found predictions, Accuracy = 0.9559873429124814
169791/200751 correct total predictions, Accuracy = 0.8457790994814471
871139 extra predictions in 171665 sentences = 5.07464538490665 extra predictions per sentence


In [12]:
df_acc = []
df_cnts = []
df_texts = []
for lu, results in lu_results.items():
    incorrect = list(filter(lambda x: x["actual"] != x["predicted"], results["predictions"]))
    total = results["total"]
    correct = total - len(incorrect)
    acc = correct / total
    df_acc.append(pd.DataFrame({
        "Lexical Unit": [lu],
        "Correct": [correct],
        "Total": [total],
        "Accuracy": [acc]
    }))
    combo_cnts = pd.Series.value_counts(results["predictions"])

    for i, (combo, count) in enumerate(combo_cnts.items()):
        df_cnts.append(pd.DataFrame({
            "Lexical Unit": [lu],
            "Actual Frame": [combo["actual"]],
            "Predicted Frame": [combo["predicted"]],
            "Count": [count]
        }))
    
    for i, combo in enumerate(results["predictions"]):
        df_texts.append(pd.DataFrame({
            "Lexical Unit": [lu],
            "Actual Frame": [combo["actual"]],
            "Predicted Frame": [combo["predicted"]],
            "Sentence": [results["texts"][i]]
        }))
    
df_acc = pd.concat(df_acc).sort_values(["Total"], ascending = False)
df_cnts = pd.concat(df_cnts).sort_values(["Lexical Unit", "Actual Frame", "Predicted Frame"])
df_texts = pd.concat(df_texts).sort_values(["Lexical Unit", "Actual Frame", "Predicted Frame"])

with pd.ExcelWriter("../output/Lexical Unit Predictions.xlsx") as writer:
    df_acc.to_excel(writer, sheet_name = "Accuracy", index = False)
    df_cnts.to_excel(writer, sheet_name = "Predictions", index = False)
    df_texts.to_excel(writer, sheet_name = "Sentences", index = False)

In [13]:
len(all_text)

171665