In [97]:
import copy
import json
import nltk
from nltk.corpus import framenet as fn
import numpy as np
import os
import pandas as pd
import pickle as pkl
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
import spacy
from lemminflect import getAllLemmas, getAllInflections

In [98]:
# Load the annotated sentences: one row per (lexical unit, frame, sentence).
df = pd.read_csv("../datasets/lexical_unit_sentences.csv")

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() (that only adds a stray "None" to the output).
df.info()
display(df.describe())
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204022 entries, 0 to 204021
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Lexical Unit    204022 non-null  object
 1   Frame           204022 non-null  object
 2   Sentence        200751 non-null  object
 3   Sentence Count  204022 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 6.2+ MB


None

Unnamed: 0,Sentence Count
count,204022.0
mean,47.642269
std,52.379088
min,0.0
25%,19.0
50%,33.0
75%,59.0
max,547.0


Unnamed: 0,Lexical Unit,Frame,Sentence,Sentence Count
0,(can't) help.v,Self_control,"` Not if I can help it . """,11
1,(can't) help.v,Self_control,"And now she took a better look at him , Folly ...",11
2,(can't) help.v,Self_control,"` I could n't help feeling that … well , in yo...",11
3,(can't) help.v,Self_control,"Yet , looking into those liquid dark eyes , Fr...",11
4,(can't) help.v,Self_control,She could n't help the tinge of pink that floo...,11
...,...,...,...,...
204017,zone.n,Locale,Dubai 10-28 ( FP ) - Dubai 's Crown Prince She...,32
204018,zone.n,Locale,A Turbo Cat ferry makes a one - hour trip ( 7 ...,32
204019,zone.n,Locale,"Macau , now the Chinese Special Economic Zone ...",32
204020,zonk out.v,Fall_asleep,,0


In [99]:
class ClassifyLexicalUnits:
    """Map the words of a sentence to FrameNet lexical units (LUs) and frames.

    LU names are stored with every "." replaced by "_" (e.g. "zone.n" becomes
    "zone_n"). An LU that belongs to a single frame is resolved directly. An
    ambiguous LU is resolved either by a "rule" (a frame -> probability mapping
    that is sampled from) or by a fitted scikit-learn pipeline, depending on
    what `fit` produced or `load_trained_models` loaded for it.

    Attributes set by the methods below:
        df: training DataFrame, or None until loaded.
        models: dict of LU name -> fitted model, or None until fitted/loaded.
        rules: dict of LU name -> {frame: probability}, or None until
            fitted/loaded.
        lu_frames: dict of LU name -> sorted list of frame names.
        nlp: spaCy pipeline used for lemmas and dependency features.
        missing_count / total_count: diagnostics updated by `match_lemmas`.
    """

    def __init__(
        self, load_training = True, pretrained = True, 
        training_filename = "../datasets/lexical_unit_sentences.csv",
        model_directory = "../models/lexical_units"
    ):
        """Load FrameNet and, optionally, the training data and saved models.

        Args:
            load_training: when true, read `training_filename` into `self.df`.
            pretrained: when true, load rules and models from `model_directory`.
            training_filename: CSV path handed to `load_training_data`.
            model_directory: directory handed to `load_trained_models`.
        """
        self.df = None
        self.models = None
        self.rules = None
        self.missing_count = 0
        self.total_count = 0
        self.load_framenet()
    
        if load_training:
            self.load_training_data(training_filename)
            
        if pretrained:
            self.load_trained_models(model_directory)
            
    def load_framenet(self):
        """Download FrameNet 1.7, load spaCy and build the LU -> frames index."""
        nltk.download("framenet_v17")
        self.nlp = spacy.load("en_core_web_sm")

        # Collect frames in a set per LU so that each frame is listed once.
        # (The previous membership test compared the LU name, not the frame
        # name, against the frame list and therefore never filtered anything.)
        frames_by_lu = {}
        for lu in fn.lus():
            name = lu.name.replace(".", "_")
            frames_by_lu.setdefault(name, set()).add(lu.frame.name)

        self.lu_frames = { name: sorted(frames) for name, frames in frames_by_lu.items() }
    
    def load_training_data(self, filename = "../datasets/lexical_unit_sentences.csv"):
        """Read the training CSV into `self.df` and normalise its LU names.

        Args:
            filename: path of a CSV that has a "Lexical Unit" column.
        """
        self.df = pd.read_csv(filename)
        # regex = False: "." must be replaced literally. When the pattern is
        # treated as a regular expression, "." matches every character.
        self.df["Lexical Unit"] = self.df["Lexical Unit"].str.replace(".", "_", regex = False)
        
    def load_trained_models(self, directory = "../models/lexical_units"):
        """Load `rules.json` and every `*.pkl` model found in `directory`.

        Each model is stored in `self.models` under its file name without the
        ".pkl" extension. Files with any other (or no) extension are ignored.

        Args:
            directory: directory that contains rules.json and the pickles.
        """
        self.models = {}
        with open(os.path.join(directory, "rules.json")) as rules_file:
            self.rules = json.load(rules_file)

        for filename in os.listdir(directory):
            name, extension = os.path.splitext(filename)
            if extension != ".pkl":
                continue
            # SECURITY: unpickling executes arbitrary code; only point this at
            # model directories you trust.
            with open(os.path.join(directory, filename), "rb") as model_file:
                self.models[name] = pkl.load(model_file)

    @staticmethod
    def _suffix_for_tag(tag):
        """Return the LU part-of-speech suffix used for an NLTK POS tag."""
        # Tag families are matched on their first letter.
        for prefix, suffix in (("N", "_n"), ("J", "_a"), ("V", "_v"), ("R", "_adv")):
            if tag.startswith(prefix):
                return suffix

        # Remaining tags are matched exactly; anything else maps to "_scon".
        exact_suffixes = {
            "IN": "_prep", "CD": "_num", "CC": "_c", "UH": "_intj", "DT": "_art"
        }
        return exact_suffixes.get(tag, "_scon")
                
    def pos_tag(self, sentence):
        """Tokenise `sentence` and pair each token with an LU-style POS suffix.

        Args:
            sentence: the text to tag. Anything that is not a string (such as
                a NaN read from the CSV) yields an empty list.

        Returns:
            A list of (word, suffix) tuples, e.g. ("hill", "_n").
        """
        if not isinstance(sentence, str):
            return []

        tokens = nltk.word_tokenize(sentence)
        return [(word, self._suffix_for_tag(tag)) for word, tag in nltk.pos_tag(tokens)]
        
    def get_word_lu(self, word, pos):
        """Find the LU name that corresponds to `word` tagged with `pos`.

        Args:
            word: a token from the sentence.
            pos: the suffix produced by `pos_tag`, e.g. "_n".

        Returns:
            None when no LU name starts with the lower-cased word; the only
            candidate when exactly one LU name starts with it; otherwise the
            exact "<word><pos>" name if it is a known LU, else None.
        """
        lowered = word.lower()
        candidates = [name for name in self.lu_frames if name.startswith(lowered)]
        if not candidates:
            return None
        if len(candidates) == 1:
            # NOTE(review): this is a prefix match, so the single candidate may
            # be a longer word than the token itself.
            return candidates[0]

        # LU names are matched in lower case above, so the exact lookup must be
        # lower-cased as well; otherwise capitalised words never resolve here.
        exact_name = lowered + pos
        return exact_name if exact_name in self.lu_frames else None
            
    def process_sentence(self, sentence):
        """Parse `sentence` with spaCy and extract per-lemma features.

        Returns:
            A dict mapping each token's lower-cased lemma to
            [token index / number of tokens, dependency label, lemma of the
            head token]. If a lemma occurs more than once, the last occurrence
            overwrites the earlier ones.
        """
        # Feature ideas noted for future work:
        # presence/absence of words
        # use parse tree (what does structure look like)
        # spacy has a dependency parser <-
        # look at children edge labels of children in dependency tree
        # surrounding words
        # word count
        # where lu is relative to sentence length <-
        # bool vars for certain words being present
        # lexical parse tree
        # named entities
        # POS counts
        
        doc = self.nlp(sentence)
        length = len(doc)
        results = {}
        for i, token in enumerate(doc):
            results[token.lemma_.lower()] = [
                i / length, token.dep_, token.head.lemma_
            ]
            
        return results
    
    def match_lemmas(self, sentence, lu):
        """Return the features of the token in `sentence` that realises `lu`.

        The LU base form (the name without its trailing "_<pos>" part) is
        expanded to all its lemmas and all of their inflections; the first
        lemma of the parsed sentence that is in that set supplies the features.

        Side effects: increments `total_count` on every call; on failure
        prints a message and increments `missing_count`.

        Returns:
            [location, relation, head] as produced by `process_sentence`, or
            [None, None, None] when no token of the sentence matches.
        """
        self.total_count += 1
        real_word =  "_".join(lu.split("_")[:-1])

        # creates set to get all lemma options and fills with possibles keys for lemmas
        unique_lemmas = set()
        all_lemmas = getAllLemmas(real_word)
        for lemma in all_lemmas.values():
            for word in lemma:
                unique_lemmas.add(word.lower())

        # gets all iterations of the base lemmas to see all lemma options
        # (iterate over a snapshot because the set grows inside the loop)
        for lem in list(unique_lemmas):
            all_inflections = getAllInflections(lem)
            for lemma in all_inflections.values():
                for word in lemma:
                    unique_lemmas.add(word.lower())

        # gets the features for only the target lemma within the sample sentence
        features = self.process_sentence(sentence)
        for lemma in features:
            if lemma in unique_lemmas:
                return features[lemma]

        print("Missing lexical unit: {} out of total: {}".format(lu, self.total_count))
        self.missing_count += 1
        return [None, None, None]

    def predict_frame(self, lu, processed_sentence):
        """Choose a frame for an ambiguous lexical unit.

        Args:
            lu: normalised LU name, e.g. "watch_v".
            processed_sentence: the dict returned by `process_sentence`.

        Returns:
            A frame name: sampled from the rule probabilities when `lu` has a
            rule, otherwise predicted by the model stored for `lu`.

        Raises:
            Exception: when `lu` has neither a rule nor a model.
        """
        if lu in self.rules:
            probs = self.rules[lu]
            return np.random.choice(list(probs.keys()), p = list(probs.values()))
        elif lu in self.models:
            target = "_".join(lu.split("_")[:-1])
            # The LU base form is not always a lemma key of the parsed sentence
            # (multi-word LUs, prefix matches from get_word_lu). Fall back to
            # placeholder features instead of raising KeyError. The pipelines
            # built by fit() one-hot encode with handle_unknown = "ignore", so
            # the empty strings are treated as unknown categories.
            # NOTE(review): models loaded from disk are assumed to tolerate
            # unseen category values in the same way; confirm.
            features = [processed_sentence.get(target, [None, "", ""])]
            X = pd.DataFrame(features, columns = ["location", "relation", "head"])
            return self.models[lu].predict(X)[0]
        else:
            raise Exception("Unknown lexical unit: {}".format(lu))
    
    def fit(
        self, df_train = None, output_dir = "../models/lexical_units", random_state = None,
        min_samples_per_frame = 10
    ):
        """Build a rule or a decision-tree pipeline for every ambiguous LU.

        For each LU with more than one frame, the training sentences of that LU
        are turned into (location, relation, head) features. If any frame has
        fewer than `min_samples_per_frame` sentences, or fewer than
        `min_samples_per_frame * number_of_frames` feature rows remain, the LU
        gets a rule (observed frame frequencies); otherwise a one-hot +
        DecisionTreeClassifier pipeline is fitted on "relation" and "head".

        Only `rules.json` is written to `output_dir`; the fitted pipelines are
        kept in `self.models` and are not pickled.

        Args:
            df_train: optional training DataFrame with "Lexical Unit", "Frame"
                and "Sentence" columns. It is deep-copied and used as given, so
                its LU names must already use the "_" form of `lu_frames`.
            output_dir: directory for rules.json (created if it is missing).
            random_state: forwarded to DecisionTreeClassifier.
            min_samples_per_frame: sample threshold described above.
        """
        if df_train is not None:
            self.df = copy.deepcopy(df_train)
        elif self.df is None:
            self.load_training_data()
            
        self.rules = {}
        self.models = {}
        # Restart the diagnostics so the summary below describes this fit only.
        self.missing_count = 0
        self.total_count = 0

        for lu, frames in self.lu_frames.items():
            if len(frames) <= 1:
                continue

            df_lu = self.df[self.df["Lexical Unit"] == lu]
            # count() ignores missing sentences, so a frame can have count 0.
            frame_counts = df_lu.groupby("Frame")["Sentence"].count()
            total_sentences = frame_counts.sum()

            if total_sentences == 0:
                # No annotated sentence at all (or no rows): dividing by zero
                # would store NaN probabilities that cannot be sampled, and an
                # empty series has no minimum. Use a uniform rule over the
                # frames FrameNet lists for this LU.
                self.rules[lu] = { frame: 1.0 / len(frames) for frame in frames }
                continue

            df_lu_no_na = df_lu.dropna().reset_index(drop = True)
            features = [self.match_lemmas(sentence, lu) for sentence in df_lu_no_na["Sentence"]]
            X = pd.DataFrame(features, columns = ["location", "relation", "head"])
            # Indexes line up because df_lu_no_na was re-indexed from 0.
            X["Frame"] = df_lu_no_na["Frame"]
            X = X.dropna()
            frame_labels = X["Frame"]
            X = X.drop(columns = "Frame")

            too_few_samples = (
                frame_counts.min() < min_samples_per_frame
                or len(X) < min_samples_per_frame * len(frames)
            )
            if too_few_samples:
                self.rules[lu] = (frame_counts / total_sentences).to_dict()
            else:
                cat_pipeline = Pipeline([
                    ("ohe", OneHotEncoder(handle_unknown = "ignore"))
                ])
                # Only the categorical features are used; "location" is not
                # passed on to the model.
                col_transformer = ColumnTransformer([
                    ("cat", cat_pipeline, ["relation", "head"])
                ])
                pipeline = Pipeline([
                    ("preprocessing", col_transformer),
                    ("model", DecisionTreeClassifier(random_state = random_state))
                ])
                pipeline.fit(X, frame_labels)
                self.models[lu] = pipeline

        os.makedirs(output_dir, exist_ok = True)
        with open(os.path.join(output_dir, "rules.json"), "w") as rules_file:
            json.dump(self.rules, rules_file, indent = 4)

        print("Missing:", self.missing_count)
        print("Total:", self.total_count)
    
    def predict(self, sentences, model_dir = None):
        """Predict (lexical unit, frame) pairs for each sentence.

        Args:
            sentences: iterable of sentences. Entries that are not strings
                produce an empty prediction list.
            model_dir: directory to load rules/models from when none are
                loaded yet; the `load_trained_models` default is used if None.

        Returns:
            One list per sentence, each holding (lu_name, frame_name) tuples
            for the tokens that could be mapped to a lexical unit.
        """
        if self.models is None:
            if model_dir is None:
                self.load_trained_models()
            else:
                self.load_trained_models(model_dir)
                
        predictions = []
        # Iterate directly instead of indexing by position so that any
        # iterable (e.g. a pandas Series with a non-default index) works.
        for sentence in sentences:
            if not isinstance(sentence, str):
                # e.g. NaN from a CSV: there is nothing to parse.
                predictions.append([])
                continue

            processed_sentence = self.process_sentence(sentence)
            curr = []
            for word, tag in self.pos_tag(sentence):
                lu = self.get_word_lu(word, tag)
                if lu is None:
                    continue
                possible_frames = self.lu_frames[lu]
                if len(possible_frames) == 1:
                    curr.append((lu, possible_frames[0]))
                else:
                    curr.append((lu, self.predict_frame(lu, processed_sentence)))
            predictions.append(curr)
            
        return predictions

In [100]:
# Load FrameNet and the training CSV, but skip loading saved models from disk.
clu = ClassifyLexicalUnits(load_training = True, pretrained = False)

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [101]:
# Pass a fixed seed so the fitted decision trees are reproducible across runs.
clu.fit(random_state = 42)

Missing lexical unit: despite_prep out of total: 1223
Missing lexical unit: despite_prep out of total: 1224
Missing lexical unit: despite_prep out of total: 1225
Missing lexical unit: despite_prep out of total: 1226
Missing lexical unit: despite_prep out of total: 1227
Missing lexical unit: despite_prep out of total: 1228
Missing lexical unit: despite_prep out of total: 1229
Missing lexical unit: despite_prep out of total: 1230
Missing lexical unit: despite_prep out of total: 1231
Missing lexical unit: despite_prep out of total: 1232
Missing lexical unit: despite_prep out of total: 1233
Missing lexical unit: despite_prep out of total: 1234
Missing lexical unit: despite_prep out of total: 1235
Missing lexical unit: despite_prep out of total: 1236
Missing lexical unit: despite_prep out of total: 1237
Missing lexical unit: despite_prep out of total: 1238
Missing lexical unit: despite_prep out of total: 1239
Missing lexical unit: despite_prep out of total: 1240
Missing lexical unit: set up

In [103]:
# Number of lexical units for which fit() stored a model (instead of a rule).
len(clu.models)

347

In [None]:
# Classify one sentence from the dataset and show its (lexical unit, frame) pairs.
sentences = [df.loc[1, "Sentence"]]
predicted_frames = clu.predict(sentences)

for sentence_text, sentence_frames in zip(sentences, predicted_frames):
    print(sentence_text)
    display(sentence_frames)
    print()

Legend has it that a local woman climbed the hill every day to watch for her husband returning from across the sea ; one day the wife and her child were turned to stone as a permanent symbol of her enduring faith .


[('legendary_a', 'Fame'),
 ('that_adv', 'Degree'),
 ('local_a', 'Political_locales'),
 ('woman_n', 'People'),
 ('hill_n', 'Natural_features'),
 ('day_n', 'Calendric_unit'),
 ('watch_v', 'Perception_active'),
 ('for_prep', 'Duration_relation'),
 ('husband_n', 'Personal_relationship'),
 ('from_prep', 'Time_vector'),
 ('across_prep', 'Distributed_position'),
 ('sea_n', 'Natural_features'),
 ('one_num', 'Cardinal_numbers'),
 ('day_n', 'Calendric_unit'),
 ('wife_n', 'Personal_relationship'),
 ('child_n', 'People_by_age'),
 ('turned on_a', 'Biological_urge'),
 ('stone_v', 'Cause_harm'),
 ('as_prep', 'Performers_and_roles'),
 ('symbolize_v', 'Representing'),
 ('of_prep', 'Origin'),
 ('enduring_a', 'Duration_description'),
 ('faith_n', 'Trust')]




In [3]:

from spacy import displacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

sentence = 'Legend has it that a local woman climbed the hill every day to watch for her husband returning from across the sea ; one day the wife and her child were turned to stone as a permanent symbol of her enduring faith .'

# nlp function returns an object with individual token information, 
# linguistic features and relationships
doc = nlp(sentence)

# One shared row format keeps the header and the token rows aligned.
row_format = "{:<15} | {:<8} | {:<15} | {:<20}"
print(row_format.format('Token','Relation','Head', 'Children'))
print("-" * 70)

for token in doc:
    # Print the token, dependency nature, head and all dependents of the token
    print(row_format.format(
        str(token.text), str(token.dep_), str(token.head.text), str(list(token.children))
    ))

# Number of tokens in the sentence. (Previously `print(len)`, which printed
# the built-in function object instead of a count.)
print(len(doc))

# Use displaCy to visualize the dependency 
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

Token           | Relation | Head            | Children            
----------------------------------------------------------------------
Legend          | nsubj    | has             | []                  
has             | ccomp    | turned          | [Legend, it, climbed]
it              | dobj     | has             | []                  
that            | mark     | climbed         | []                  
a               | det      | woman           | []                  
local           | amod     | woman           | []                  
woman           | nsubj    | climbed         | [a, local]          
climbed         | ccomp    | has             | [that, woman, hill, day, watch]
the             | det      | hill            | []                  
hill            | dobj     | climbed         | [the]               
every           | det      | day             | []                  
day             | npadvmod | climbed         | [every]             
to              | aux      | watc