In [1]:
import copy
import json
import nltk
from nltk.corpus import framenet as fn
import numpy as np
import os
import pandas as pd
import pickle as pkl
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("../datasets/lexical_unit_sentences.csv")

display(df.info())
display(df.describe())
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88640 entries, 0 to 88639
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Lexical Unit    88640 non-null  object
 1   Frame Count     88640 non-null  int64 
 2   Frame           88640 non-null  object
 3   Sentence Count  88640 non-null  int64 
 4   Sentence        87409 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.4+ MB


None

Unnamed: 0,Frame Count,Sentence Count
count,88640.0,88640.0
mean,3.203847,57.749492
std,1.781488,59.231118
min,2.0,0.0
25%,2.0,21.0
50%,2.0,39.0
75%,4.0,73.0
max,11.0,401.0


Unnamed: 0,Lexical Unit,Frame Count,Frame,Sentence Count,Sentence
0,faith.n,2,Religious_belief,0,
1,faith.n,2,Trust,1,Legend has it that a local woman climbed the h...
2,degree.n,3,Quantity,0,
3,degree.n,3,Quantified_mass,29,Specialist labour or industrial correspondents...
4,degree.n,3,Quantified_mass,29,The incremental approach has also been known t...
...,...,...,...,...,...
88635,evacuate.v,4,Emptying,6,Some nearby buildings have also been evacuated...
88636,evacuate.v,4,Emptying,6,The government of the Maldives has decided to ...
88637,evacuate.v,4,Emptying,6,Let us assume you wish to evacuate the nightcl...
88638,evacuate.v,4,Emptying,6,"The fire brigade reappeared , bringing them so..."


In [9]:
class ClassifyLexicalUnits:
    def __init__(
        self, load_training = True, pretrained = True, 
        training_filename = "../datasets/lexical_unit_sentences.csv",
        model_directory = "../models/lexical_units"
    ):
        self.df = None
        self.models = None
        self.rules = None
        self.load_framenet()
    
        if load_training:
            self.load_training_data(training_filename)
            
        if pretrained:
            self.load_trained_models(model_directory)
            
    def load_framenet(self):
        nltk.download("framenet_v17")
        lexical_units = fn.lus()
        lu_names = list(set(map(lambda x: x.name.replace(".", "_"), lexical_units)))
        self.lu_frames = { key: [] for key in lu_names }
        for lu in lexical_units:
            name = str.replace(lu.name, ".", "_")
            if name not in self.lu_frames[name]:
                self.lu_frames[name].append(lu.frame.name)
                self.lu_frames[name] = list(sorted(self.lu_frames[name]))
    
    def load_training_data(self, filename = "../datasets/lexical_unit_sentences.csv"):
        self.df = pd.read_csv(filename)
        self.df["POS"] = self.df["Sentence"].apply(self.pos_tag)
        self.df["Lexical Unit"] = self.df["Lexical Unit"].str.replace(".", "_")
        
    def load_trained_models(self, directory = "../models/lexical_units"):
        self.models = {}
        self.rules = json.load(open("{}/rules.json".format(directory)))
        for filename in os.listdir(directory):
            comps = filename.split(".")
            if comps[1] == "pkl":
                self.models[comps[0]] = pkl.load(open(os.join(directory, filename), "rb"))
                
    def pos_tag(self, sentence):
        if type(sentence) != str:
            return []
        else:
            tokens = nltk.word_tokenize(sentence)
            base_tags = nltk.pos_tag(tokens)
            final_tags = []
            for word, tag in base_tags:
                if tag.startswith("N"):
                    final_tags.append((word, "_n"))
                elif tag.startswith("J"):
                    final_tags.append((word, "_a"))
                elif tag.startswith("V"):
                    final_tags.append((word, "_v"))
                elif tag.startswith("R"):
                    final_tags.append((word, "_adv"))
                elif tag == "IN":
                    final_tags.append((word, "_prep"))
                elif tag == "CD":
                    final_tags.append((word, "_num"))
                elif tag == "CC":
                    final_tags.append((word, "_c"))
                elif tag == "UH":
                    final_tags.append((word, "_intj"))
                elif tag == "DT":
                    final_tags.append((word, "_art"))
                else:
                    final_tags.append((word, "_scon"))
                    
            return final_tags
            
        
    def get_word_lu(self, word, pos):
        possible_lus = list(filter(lambda x: x.startswith("{}".format(word.lower())), self.lu_frames.keys()))
        if len(possible_lus) == 0:
            return None
        elif len(possible_lus) == 1:
            return possible_lus[0]
        else:
            tmp = word + pos
            if tmp in possible_lus:
                return tmp
            else:
                return None
            
    def process_sentence(self, sentence):
        # presence/absence of words
        # use parse tree (what does structure look like)
        # spacy has a dependency parser <-
        # look at children edge labels of children in dependency tree
        # surrounding words
        # word count
        # where lu is relative to sentence length <-
        # bool vars for certain words being present
        # lexical parse tree
        # named entities
        # POS counts
        return np.array([])
    
    def predict_frame(self, lu, sentence):
        if lu in self.rules.keys():
            probs = self.rules[lu]
            pred_frame = np.random.choice(list(probs.keys()), p = list(probs.values()))
            return pred_frame
        elif lu in self.models.keys():
            features = [self.process_sentence(sentence)]
            return None
            return self.models[lu].predict(features)[0]
        else:
            raise Exception("Unknown lexical unit: {}".format(lu))
    
    def fit(self, df_train = None, output_dir = "../models/lexical_units", random_state = None):
        if df_train is not None:
            self.df = copy.deepcopy(df_train)
            self.df["POS"] = self.df["Sentence"].apply(self.pos_tag)
        elif self.df is None:
            self.load_training_data()
            
        self.rules = {}
        self.models = {}
        for lu, frames in self.lu_frames.items():
            if len(frames) > 1:
                df_lu = self.df[self.df["Lexical Unit"] == lu]
                frame_counts = df_lu.groupby("Frame")["Sentence"].count()
                if min(frame_counts) < 10:
                    frame_counts = frame_counts / sum(frame_counts)
                    self.rules[lu] = frame_counts.to_dict()
                else:
                    features = df_lu["Sentence"].apply(self.process_sentence).to_numpy()
                    # model = DecisionTreeClassifier(random_state = random_state)
                    # model.fit(features, df_lu["Frame"])
                    # pkl.dump(model, open("{}/{}.pkl".format(output_dir, lu), "wb"))
                    # self.models[lu] = model
                    
        json.dump(self.rules, open("{}/rules.json".format(output_dir), "w"), indent = 4)
    
    def predict(self, sentences, model_dir = None):
        if self.models is None:
            if model_dir is None:
                self.load_trained_models()
            else:
                self.load_trained_models(model_dir)
                
        pos = list(map(self.pos_tag, sentences))
        predictions = []
        for i in range(len(sentences)):
            curr = []
            for word, tag in pos[i]:
                lu = self.get_word_lu(word, tag)
                if lu is not None:
                    possible_frames = self.lu_frames[lu]
                    if len(possible_frames) == 1:
                        curr.append((lu, possible_frames[0]))
                    else:
                        curr.append((lu, self.predict_frame(lu, sentences[i])))
            predictions.append(curr)
            
        return predictions

In [10]:
clu = ClassifyLexicalUnits(False)

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [11]:
large = clu.fit()

In [12]:
sentences = [df["Sentence"][1]]
predicted_frames = clu.predict(sentences)

for i in range(len(sentences)):
    print(sentences[i])
    display(predicted_frames[i])
    print()

Legend has it that a local woman climbed the hill every day to watch for her husband returning from across the sea ; one day the wife and her child were turned to stone as a permanent symbol of her enduring faith .


[('legendary_a', 'Fame'),
 ('that_adv', 'Degree'),
 ('local_a', 'Political_locales'),
 ('woman_n', 'People'),
 ('hill_n', 'Natural_features'),
 ('day_n', 'Calendric_unit'),
 ('watch_v', 'Perception_active'),
 ('for_prep', 'Taking_sides'),
 ('husband_n', 'Personal_relationship'),
 ('from_prep', 'Origin'),
 ('across_prep', 'Distributed_position'),
 ('sea_n', 'Natural_features'),
 ('one_num', 'Cardinal_numbers'),
 ('day_n', 'Calendric_unit'),
 ('wife_n', 'Personal_relationship'),
 ('child_n', 'People_by_age'),
 ('turned on_a', 'Biological_urge'),
 ('stone_v', 'Cause_harm'),
 ('as_prep', 'Performers_and_roles'),
 ('symbolize_v', 'Representing'),
 ('of_prep', 'Partitive'),
 ('enduring_a', 'Duration_description'),
 ('faith_n', 'Trust')]


