In [1]:
import copy
from difflib import get_close_matches
import json
from lemminflect import getAllLemmas, getAllInflections
import nltk
from nltk.corpus import framenet as fn
import numpy as np
import os
import pandas as pd
import pickle as pkl
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
import spacy

In [2]:
# Load the lexical-unit example sentences (one row per LU / frame / sentence).
df = pd.read_csv("../datasets/lexical_unit_sentences.csv")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.describe())
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204022 entries, 0 to 204021
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Lexical Unit    204022 non-null  object
 1   Frame           204022 non-null  object
 2   Sentence        200751 non-null  object
 3   Sentence Count  204022 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 6.2+ MB


None

Unnamed: 0,Sentence Count
count,204022.0
mean,47.642269
std,52.379088
min,0.0
25%,19.0
50%,33.0
75%,59.0
max,547.0


Unnamed: 0,Lexical Unit,Frame,Sentence,Sentence Count
0,(can't) help.v,Self_control,"` Not if I can help it . """,11
1,(can't) help.v,Self_control,"And now she took a better look at him , Folly ...",11
2,(can't) help.v,Self_control,"` I could n't help feeling that … well , in yo...",11
3,(can't) help.v,Self_control,"Yet , looking into those liquid dark eyes , Fr...",11
4,(can't) help.v,Self_control,She could n't help the tinge of pink that floo...,11
...,...,...,...,...
204017,zone.n,Locale,Dubai 10-28 ( FP ) - Dubai 's Crown Prince She...,32
204018,zone.n,Locale,A Turbo Cat ferry makes a one - hour trip ( 7 ...,32
204019,zone.n,Locale,"Macau , now the Chinese Special Economic Zone ...",32
204020,zonk out.v,Fall_asleep,,0


In [21]:
# Print each character of the first lexical unit's first exemplar text next to
# its index; only the first lexical unit is inspected.
for lu in fn.lus():
    first_text = lu.exemplars[0].text
    for i, char in enumerate(first_text):
        print(i, char)
    break

0 `
1  
2 N
3 o
4 t
5  
6 i
7 f
8  
9 I
10  
11 c
12 a
13 n
14  
15 h
16 e
17 l
18 p
19  
20 i
21 t
22  
23 .
24  
25 "


In [25]:
# For the first lexical unit only: print its name, its first exemplar, and every
# label found on the "Target" layer of each of that exemplar's annotation sets.
for lu in fn.lus():
    print(lu.name)
    exemplar = lu.exemplars[0]
    print(exemplar)
    for aset in exemplar.annotationSet:
        for layer in aset.layer:
            if layer.name != "Target":
                continue
            for l in layer.label:
                print(l)
        # blank line after each annotation set
        print()
    break

(can't) help.v
exemplar sentence (4166707):
[corpID] 111
[docID] 421
[paragNo] 1609
[sentNo] 1
[aPos] 78308214

[LU] (16601) (can't) help.v in Self_control

[frame] (2651) Self_control

[annotationSet] 2 annotation sets

[POS] 0 tags

[POS_tagset] PENN

[GF] 2 relations

[PT] 2 phrases

[text] + [Target] + [FE]

` Not if I can help it . "
         -     **** --
         A          Ev
 (A=Agent, Ev=Event)



[cBy] MJE
[end] 18
[start] 15
[name] Target




In [4]:
# The text after the last "." of a lexical unit name is its part-of-speech suffix
# (e.g. "zone.n" -> "n"). Count how often each suffix occurs in the dataset.
suffixes = [lu.split(".")[-1] for lu in df["Lexical Unit"]]

# Build a real Series before counting instead of calling the unbound
# Series.value_counts method with a plain list standing in for `self`.
pd.Series(suffixes).value_counts()

v       83906
n       79057
a       34355
prep     2991
adv      2150
scon      760
num       353
art       269
idio      124
c          51
intj        5
pron        1
Name: count, dtype: int64

In [None]:
# NOTE(review): this cell was left unfinished (it ended in a dangling `lu.`),
# which is a SyntaxError and stops "Restart & Run All". It now lists the fields
# available on the first lexical unit -- confirm this matches the original intent
# or delete the cell.
for lu in fn.lus():
    print(sorted(lu.keys()))
    break

In [3]:
class LexicalUnitClassifier:
    """Assign FrameNet frames to the lexical units (LUs) found in sentences.

    LU keys used throughout the class are FrameNet LU names with every "."
    replaced by "_" (e.g. "help.v" -> "help_v").

    Attributes set by the class:
        nlp: spaCy pipeline loaded in ``load_framenet``.
        lu_frames: dict mapping an LU key to the sorted list of frame names it
            can evoke according to FrameNet.
        df: training DataFrame (or None until loaded).
        rules: dict mapping an LU key to a {frame: probability} dict, used for
            LUs that do not have enough examples to train a model.
        models: dict mapping an LU key to a fitted scikit-learn pipeline.
    """

    # spaCy coarse POS tag -> LU suffix used in the LU keys.
    POS_MAPPING = {
        'ADJ': 'a',    # Adjective
        'ADV': 'adv',  # Adverb
        'INTJ': 'intj', # Interjection
        'NOUN': 'n',   # Noun
        'PROPN': 'n',  # Proper noun
        'VERB': 'v',   # Verb
        'ADP': 'prep', # Adposition (preposition and postposition)
        'AUX': 'v',  # Auxiliary verb
        'CONJ': 'c',   # Conjunction
        'CCONJ': 'c',  # Coordinating conjunction
        'SCONJ': 'scon', # Subordinating conjunction
        'DET': 'art',  # Determiner (article)
        'NUM': 'num',  # Numeral
        'PART': 'part', # Particle
        'PRON': 'pron', # Pronoun
    }

    def __init__(
        self, load_training = True, pretrained = True, 
        training_filename = "../datasets/lexical_unit_sentences.csv",
        model_directory = "../models/lexical_units"
    ):
        """Load FrameNet/spaCy and, optionally, the training data and saved models.

        Args:
            load_training: when true, read ``training_filename`` into ``self.df``.
            pretrained: when true, load rules and pickled models from
                ``model_directory``.
            training_filename: CSV file passed to ``load_training_data``.
            model_directory: directory passed to ``load_trained_models``.
        """
        self.df = None
        self.models = None
        self.rules = None
        self.load_framenet()
    
        if load_training:
            self.load_training_data(training_filename)
            
        if pretrained:
            self.load_trained_models(model_directory)
            
    def load_framenet(self):
        """Download FrameNet 1.7, load spaCy, and build the ``lu_frames`` lookup."""
        nltk.download("framenet_v17")
        self.nlp = spacy.load("en_core_web_sm")
        self.lu_frames = {}
        for lu in fn.lus():
            name = lu.name.replace(".", "_")
            frames = self.lu_frames.setdefault(name, [])
            # Compare the *frame* name (the original compared the LU name, so the
            # check never prevented a duplicate frame from being appended).
            if lu.frame.name not in frames:
                frames.append(lu.frame.name)
        # Sort once at the end instead of after every append.
        for frames in self.lu_frames.values():
            frames.sort()
    
    def load_training_data(self, filename = "../datasets/lexical_unit_sentences.csv"):
        """Read the training CSV into ``self.df`` and convert LU names to LU keys.

        Args:
            filename: path of a CSV file with a "Lexical Unit" column.
        """
        self.df = pd.read_csv(filename)
        # regex = False makes "." a literal dot on every pandas version; with a
        # regex default (pandas < 2.0) "." would match, and replace, every character.
        self.df["Lexical Unit"] = self.df["Lexical Unit"].str.replace(".", "_", regex = False)
        
    def load_trained_models(self, directory = "../models/lexical_units"):
        """Load ``rules.json`` and every ``*.pkl`` model found in ``directory``.

        Each pickle is stored in ``self.models`` under its file name without the
        ".pkl" extension. Other files in the directory are ignored.

        Args:
            directory: directory previously written by ``fit``.
        """
        self.models = {}
        with open(os.path.join(directory, "rules.json")) as rules_file:
            self.rules = json.load(rules_file)
        for filename in os.listdir(directory):
            stem, extension = os.path.splitext(filename)
            if extension != ".pkl":
                continue
            # NOTE(review): unpickling executes arbitrary code -- only point this
            # at a directory whose contents you trust.
            with open(os.path.join(directory, filename), "rb") as model_file:
                self.models[stem] = pkl.load(model_file)
            
    def get_word_lu(self, word, pos):
        """Return the LU key for ``word`` with POS suffix ``pos``, or None.

        The exact key "<word>_<pos>" is preferred. Otherwise, if exactly one known
        LU key starts with ``word`` (for example a multi-word LU), that key is
        returned regardless of its POS suffix.

        Args:
            word: token text; matching is case-insensitive.
            pos: LU suffix such as "v" or "n" (see ``POS_MAPPING``).
        """
        word = word.lower()
        # The original concatenated word and pos without the "_" separator, so
        # this exact lookup could never succeed.
        exact_key = "{}_{}".format(word, pos)
        if exact_key in self.lu_frames:
            return exact_key

        possible_lus = [name for name in self.lu_frames if name.startswith(word)]
        if len(possible_lus) == 1:
            return possible_lus[0]
        return None
            
    def pos_tag(self, sentence, doc = None):
        """Return ``(lowercased token text, LU suffix)`` pairs for a sentence.

        Tokens whose spaCy POS tag is not in ``POS_MAPPING`` are skipped.

        Args:
            sentence: raw sentence; only parsed when ``doc`` is None.
            doc: optional already-parsed spaCy document for ``sentence``.
        """
        if doc is None:
            doc = self.nlp(sentence)

        return [
            (token.text.lower(), self.POS_MAPPING[token.pos_])
            for token in doc
            if token.pos_ in self.POS_MAPPING
        ]
            
    def process_sentence(self, sentence, doc = None):
        """Extract per-lemma features from a sentence.

        Returns a dict keyed by lowercased lemma whose values are
        ``[relative position, dependency label, lemma of the head token]``.
        If a lemma occurs several times, the last occurrence wins. For a token
        labelled "prt" an extra "<verb lemma> <particle lemma>" entry is added,
        where the verb is the most recent "ROOT"/"ccomp" token seen so far.

        Args:
            sentence: raw sentence; only parsed when ``doc`` is None.
            doc: optional already-parsed spaCy document for ``sentence``.
        """
        # Feature ideas that are not implemented yet:
        # presence/absence of words
        # use parse tree (what does structure look like)
        # spacy has a dependency parser <-
        # look at children edge labels of children in dependency tree
        # surrounding words
        # word count
        # where lu is relative to sentence length <-
        # bool vars for certain words being present
        # lexical parse tree
        # named entities
        # POS counts
        
        if doc is None:
            doc = self.nlp(sentence)
        length = len(doc)
        results = {}
        first = None
        for i, token in enumerate(doc):
            results[token.lemma_.lower()] = [
                i / length, token.dep_, token.head.lemma_
            ]
            
            if token.dep_ in ["ROOT", "ccomp"]:
                first = token
            elif token.dep_ == "prt" and first is not None:
                # The dependency features are taken from the token directly
                # before the particle (presumably the verb -- TODO confirm).
                results["{} {}".format(first.lemma_.lower(), token.lemma_.lower())] = [
                    i / length, doc[i - 1].dep_, doc[i - 1].head.lemma_
                ]
            
        return results
    
    def match_lemmas(self, sentence, lu):
        """Return the ``process_sentence`` features of ``lu`` inside ``sentence``.

        The LU's word(s) are expanded into a set of lemmas and inflections; the
        first sentence lemma whose words all have a close match in that set is
        taken to be the LU.

        Args:
            sentence: training sentence containing the LU.
            lu: LU key such as "help_v".

        Returns:
            ``[location, relation, head]``, or ``[None, None, None]`` (after
            printing a message) when the LU cannot be located.
        """
        real_word =  " ".join(lu.split("_")[:-1])

        # Seed the candidate set with the surface forms and spaCy lemmas of the LU
        unique_lemmas = set()
        for token in self.nlp(real_word):
            unique_lemmas.add(token.text)
            unique_lemmas.add(token.lemma_)
        
        # Splitting also covers the single-word case, so one loop is enough
        for part in real_word.split(" "):
            for lemmas in getAllLemmas(part).values():
                unique_lemmas.update(word.lower() for word in lemmas)

        # Expand every base lemma with its inflections and their lemmas
        for lem in list(unique_lemmas):
            for inflections in getAllInflections(lem).values():
                for word in inflections:
                    unique_lemmas.add(word.lower())
                    for lemmas in getAllLemmas(word).values():
                        unique_lemmas.update(w.lower() for w in lemmas)

        # Return the features of the first sentence lemma that matches the LU
        candidates = list(unique_lemmas)
        features = self.process_sentence(sentence)
        for lemma in features:
            if all(len(get_close_matches(word, candidates)) > 0 for word in lemma.split(" ")):
                return features[lemma]
            
        print("Missing lexical unit {} in sentence {}".format(lu, sentence))
        
        return [None, None, None]

    def predict_frame(self, lu, processed_sentence):
        """Predict the frame of an ambiguous LU.

        Args:
            lu: LU key such as "help_v".
            processed_sentence: output of ``process_sentence`` for the sentence.

        Returns:
            For rule-based LUs, the stored {frame: probability} dict; for LUs
            with a trained model, the predicted frame name.
            NOTE(review): the two branches return different types -- confirm
            that callers expect the whole distribution for rule-based LUs.

        Raises:
            Exception: if the LU has neither a rule nor a model.
            KeyError: if the LU's word is not a key of ``processed_sentence``.
        """
        if lu in self.rules:
            # probs = self.rules[lu]
            # pred_frame = np.random.choice(list(probs.keys()), p = list(probs.values()))
            return self.rules[lu]
        elif lu in self.models:
            features = [processed_sentence["_".join(lu.split("_")[:-1])]]
            X = pd.DataFrame(features, columns = ["location", "relation", "head"])
            return self.models[lu].predict(X)[0]
        else:
            raise Exception("Unknown lexical unit: {}".format(lu))

    @staticmethod
    def _frame_distribution(frame_counts, frames):
        """Turn per-frame example counts into a {frame: probability} dict.

        Falls back to a uniform distribution over ``frames`` when there are no
        examples at all, instead of dividing by zero and storing NaN.
        """
        total = sum(frame_counts)
        if total == 0:
            return { frame: 1 / len(frames) for frame in frames }
        return (frame_counts / total).to_dict()
    
    def fit(
        self, df_train = None, output_dir = "../models/lexical_units", random_state = None,
        min_examples = 10
    ):
        """Build rules and train models for every LU that evokes several frames.

        An LU gets a frequency rule when any of its frames has fewer than
        ``min_examples`` sentences, or when fewer than
        ``min_examples * number of frames`` sentences yield usable features;
        otherwise a one-hot + decision-tree pipeline is trained on the
        "relation" and "head" features and pickled to ``output_dir``.
        Existing ``*.pkl`` files in ``output_dir`` are deleted first and
        ``rules.json`` is rewritten.

        Args:
            df_train: optional training DataFrame (copied); defaults to the
                already loaded data, loading the default CSV if there is none.
            output_dir: directory for the pickles and ``rules.json``; created if
                it does not exist.
            random_state: passed to ``DecisionTreeClassifier``.
            min_examples: minimum number of examples per frame needed to train
                a model (10 reproduces the previous hard-coded threshold).
        """
        # Create the directory if needed, then delete old pkl files in it
        os.makedirs(output_dir, exist_ok = True)
        for file in os.listdir(output_dir):
            if file.endswith(".pkl"):
                os.remove(os.path.join(output_dir, file))
        
        if df_train is not None:
            self.df = copy.deepcopy(df_train)
        elif self.df is None:
            self.load_training_data()
            
        self.rules = {}
        self.models = {}
        for lu, frames in self.lu_frames.items():
            if len(frames) <= 1:
                # Unambiguous LUs need neither a rule nor a model
                continue

            df_lu = self.df[self.df["Lexical Unit"] == lu]
            frame_counts = df_lu.groupby("Frame")["Sentence"].count()
            
            # An empty count table (no training rows for this LU) also gets a rule;
            # calling min() on it would raise ValueError.
            if len(frame_counts) == 0 or min(frame_counts) < min_examples:
                self.rules[lu] = self._frame_distribution(frame_counts, frames)
                continue

            df_lu_no_na = df_lu.dropna().reset_index(drop = True)
            features = [self.match_lemmas(sentence, lu) for sentence in df_lu_no_na["Sentence"]]
            X = pd.DataFrame(features, columns = ["location", "relation", "head"])
            X["Frame"] = df_lu_no_na["Frame"]
            # Drop the sentences in which the LU could not be located
            X = X.dropna()
            frame_labels = X["Frame"]
            X = X.drop("Frame", axis = 1)
            
            if len(X) < min_examples * len(frames):
                self.rules[lu] = self._frame_distribution(frame_counts, frames)
                continue

            cat_pipeline = Pipeline([
                ("ohe", OneHotEncoder(handle_unknown = "ignore"))
            ])
            col_transformer = ColumnTransformer([
                ("cat", cat_pipeline, ["relation", "head"])
            ])
            pipeline = Pipeline([
                ("preprocessing", col_transformer),
                ("model", DecisionTreeClassifier(random_state = random_state))
            ])
            pipeline.fit(X, frame_labels)
            with open(os.path.join(output_dir, "{}.pkl".format(lu)), "wb") as model_file:
                pkl.dump(pipeline, model_file)
            self.models[lu] = pipeline
                    
        with open(os.path.join(output_dir, "rules.json"), "w") as rules_file:
            json.dump(self.rules, rules_file, indent = 4)
    
    def predict(self, sentences, model_dir = None):
        """Predict ``(LU key, frame)`` pairs for every sentence.

        Rules and models are loaded from disk first when the instance has no
        ``models`` attribute or it is None.

        Args:
            sentences: iterable of raw sentences.
            model_dir: directory to load from in that case; the default of
                ``load_trained_models`` is used when None.

        Returns:
            One list per sentence containing ``(LU key, prediction)`` tuples; the
            prediction is the single frame name for unambiguous LUs and the
            result of ``predict_frame`` otherwise.
        """
        if getattr(self, "models", None) is None:
            if model_dir is None:
                self.load_trained_models()
            else:
                self.load_trained_models(model_dir)
                
        predictions = []
        for sentence in sentences:
            doc = self.nlp(sentence)
            pos = self.pos_tag(sentence, doc)
            processed_sentence = self.process_sentence(sentence, doc)
            curr = []
            for word, tag in pos:
                lu = self.get_word_lu(word, tag)
                if lu is None:
                    continue
                possible_frames = self.lu_frames[lu]
                if len(possible_frames) == 1:
                    curr.append((lu, possible_frames[0]))
                else:
                    curr.append((lu, self.predict_frame(lu, processed_sentence)))
            predictions.append(curr)
            
        return predictions

In [4]:
# Load the training data but skip loading previously saved rules/models.
model = LexicalUnitClassifier(load_training = True, pretrained = False)

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [5]:
# Train with the defaults: uses the data already loaded on `model`, deletes old
# .pkl files in the default output directory and rewrites rules.json there.
model.fit()

In [6]:
# Remove the in-memory models so that the next predict() call reloads the rules
# and pickled models from disk (predict() checks hasattr(self, "models")).
del model.models

In [7]:
# Run the classifier on one example sentence and show the predicted frames.
sentences = [df.loc[1, "Sentence"]]
predicted_frames = model.predict(sentences)

for sentence, frames in zip(sentences, predicted_frames):
    print(sentence)
    display(frames)
    print()

And now she took a better look at him , Folly could n't help noticing the strong , muscular lines of the broad back under that white shirt .


[('now_adv', 'Temporal_collocation'),
 ('better_v', 'Surpassing'),
 ('look_n', 'Perception_active'),
 ('at_prep', 'Spatial_co-location'),
 ('could_v', 'Possibility'),
 ('help_v', 'Assistance'),
 ('strong_a', 'Level_of_force_resistance'),
 ('muscular_a', 'Body_description_holistic'),
 ('of_prep', 'Partitive'),
 ('broad_a', 'Dimension'),
 ('back_n', 'Body_parts'),
 ('under_prep', 'Non-gradable_proximity'),
 ('that_adv', 'Degree'),
 ('white_a', 'Color'),
 ('shirt_n', 'Clothing')]


