In [21]:
import copy
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import framenet as fn
import numpy as np
import os
import pandas as pd
import pickle as pkl
import seaborn as sns
import xgboost as xgb

nltk.download("wordnet")
nltk.download("framenet_v17")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


True

In [2]:
df = pd.read_csv("../datasets/lexical_unit_sentences.csv")

display(df.info())
display(df.describe())
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88640 entries, 0 to 88639
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Lexical Unit    88640 non-null  object
 1   Frame Count     88640 non-null  int64 
 2   Frame           88640 non-null  object
 3   Sentence Count  88640 non-null  int64 
 4   Sentence        87409 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.4+ MB


None

Unnamed: 0,Frame Count,Sentence Count
count,88640.0,88640.0
mean,3.203847,57.749492
std,1.781488,59.231118
min,2.0,0.0
25%,2.0,21.0
50%,2.0,39.0
75%,4.0,73.0
max,11.0,401.0


Unnamed: 0,Lexical Unit,Frame Count,Frame,Sentence Count,Sentence
0,faith.n,2,Religious_belief,0,
1,faith.n,2,Trust,1,Legend has it that a local woman climbed the h...
2,degree.n,3,Quantity,0,
3,degree.n,3,Quantified_mass,29,Specialist labour or industrial correspondents...
4,degree.n,3,Quantified_mass,29,The incremental approach has also been known t...
...,...,...,...,...,...
88635,evacuate.v,4,Emptying,6,Some nearby buildings have also been evacuated...
88636,evacuate.v,4,Emptying,6,The government of the Maldives has decided to ...
88637,evacuate.v,4,Emptying,6,Let us assume you wish to evacuate the nightcl...
88638,evacuate.v,4,Emptying,6,"The fire brigade reappeared , bringing them so..."


In [15]:
df[df["Lexical Unit"].str.contains("if\.")]["Lexical Unit"].value_counts()

Lexical Unit
if.scon    201
Name: count, dtype: int64

In [27]:
df["Lexical Unit"].drop_duplicates().str.split(".").apply(lambda x: x[1]).value_counts()

Lexical Unit
v       1092
n        614
a        235
prep      27
adv       17
scon       1
c          1
Name: count, dtype: int64

In [59]:
class ClassifyLexicalUnits:
    def __init__(
        self, pretrained = True, 
        training_filename = "../datasets/lexical_unit_sentences.csv",
        model_directory = "../models/lexical_units"
    ):
        self.df = None
        self.models = None
        self.load_framenet()
        if pretrained:
            # self.load_training_data(training_filename)
            self.load_trained_models(model_directory)
            
    def load_framenet(self):
        nltk.download("framenet_v17")
        lexical_units = fn.lus()
        lu_names = list(set(map(lambda x: x.name, lexical_units)))
        self.lu_frames = { key: [] for key in lu_names }
        for lu in lexical_units:
            if lu.frame.name not in self.lu_frames[lu.name]:
                self.lu_frames[lu.name].append(lu.frame.name)
    
    def load_training_data(self, filename = "../datasets/lexical_unit_sentences.csv"):
        self.df = pd.read_csv(filename)
        self.df["POS"] = self.df["Sentence"].apply(self.pos_tag)
        
    def load_trained_models(self, directory = "../models/lexical_units"):
        self.models = {}
        for filename in os.listdir(directory):
            comps = filename.split(".")
            if comps[1] == "pkl":
                self.models[comps[0]] = pkl.load(open(os.join(directory, filename), "rb"))
                
    def pos_tag(self, sentence):
        if type(sentence) != str:
            return []
        else:
            tokens = nltk.word_tokenize(sentence)
            base_tags = nltk.pos_tag(tokens)
            final_tags = []
            for word, tag in base_tags:
                if tag.startswith("N"):
                    final_tags.append((word, ".n"))
                elif tag.startswith("J"):
                    final_tags.append((word, ".a"))
                elif tag.startswith("V"):
                    final_tags.append((word, ".v"))
                elif tag.startswith("R"):
                    final_tags.append((word, ".adv"))
                elif tag == "IN":
                    final_tags.append((word, ".prep"))
                elif tag == "CD":
                    final_tags.append((word, ".num"))
                elif tag == "CC":
                    final_tags.append((word, ".c"))
                elif tag == "UH":
                    final_tags.append((word, ".intj"))
                elif tag == "DT":
                    final_tags.append((word, ".art"))
                else:
                    final_tags.append((word, ".scon"))
                    
            return final_tags
            
        
    def get_word_lu(self, word, pos):
        possible_lus = list(filter(lambda x: x.startswith("{}".format(word.lower())), self.lu_frames.keys()))
        if len(possible_lus) == 0:
            return None
        elif len(possible_lus) == 1:
            return possible_lus[0]
        else:
            tmp = word + pos
            if tmp in possible_lus:
                return tmp
            else:
                return None
            
    def process_sentence(self, sentence):
        pass
    
    def predict_frame(self, lu, sentence):
        features = self.process_sentence(sentence)
        return self.models[lu].predict(features)
    
    def fit(self, df_train = None, output_dir = "../models/lexical_units"):
        if df_train is not None:
            self.df = copy.deepcopy(df_train)
            self.df["POS"] = self.df["Sentence"].apply(self.pos_tag)
    
    def predict(self, sentences, model_dir = None):
        if self.models is None:
            if model_dir is None:
                self.load_trained_models()
            else:
                self.load_trained_models(model_dir)
                
        pos = list(map(self.pos_tag, sentences))
        predictions = []
        for i in range(len(sentences)):
            curr = []
            for word, tag in pos[i]:
                lu = self.get_word_lu(word, tag)
                if lu is not None:
                    possible_frames = self.lu_frames[lu]
                    if len(possible_frames) == 1:
                        curr.append(possible_frames[0])
                    # else:
                    #     curr.append(self.predict_frame(lu, sentences[i]))
            predictions.append(curr)
            
        return predictions

In [60]:
clu = ClassifyLexicalUnits()

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/ryanschaefer/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [61]:
clu.predict([df["Sentence"][1]])

[['Fame',
  'Degree',
  'Political_locales',
  'People',
  'Natural_features',
  'Personal_relationship',
  'Distributed_position',
  'Natural_features',
  'Cardinal_numbers',
  'Personal_relationship',
  'Biological_urge',
  'Performers_and_roles',
  'Representing',
  'Duration_description']]

In [29]:
pd.Series.value_counts(list(map(lambda x: x.split(".")[1], clu.lu_frames.keys())))

n       4694
v       3318
a       2042
adv      220
prep      99
num       31
idio      29
scon      12
art        6
c          5
intj       5
pron       1
Name: count, dtype: int64

In [33]:
list(filter(lambda x: x.endswith(".art"), clu.lu_frames.keys()))

['much.art', 'most.art', 'some.art', 'a few.art', 'little.art', 'no.art']