In [1]:
# NLP
import spacy
from collections import Counter
from spacy.matcher import Matcher
from dframcy import DframCy

# DATA MANIPULATION 
from tqdm import tqdm
import pandas as pd
import numpy as np

In [None]:
# Read the merged dataset from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data/merged.csv")
df.head()

(keep it simple stupid)

In [None]:
# Keep only the rows that have a constitution text, then renumber the index
# from 0 so positions and labels agree again.
df = df.loc[df['constitution'].notna()].reset_index(drop=True)
df.info()

In [None]:
# One-time setup: uncomment and run once to download the spaCy model loaded by the cells below.
#!python -m spacy download en_core_web_lg

In [None]:
def extract_keywords(nlp, text):
    """Count single-token and multi-word keyword frequencies in ``text``.

    Parameters
    ----------
    nlp : callable
        Loaded spaCy pipeline. It is called on ``text`` to produce the parsed
        document, and its ``vocab`` is used to build the ``Matcher``.
    text : str
        Raw text to analyse.

    Returns
    -------
    tuple of (Counter, Counter)
        ``word_freq``: occurrences of each lower-cased token text. No token is
        filtered out, so whatever the tokenizer emits is counted.
        ``complex_words_freq``: occurrences of the surface text of every span
        matched by one of the three patterns below (case is preserved).
    """
    # Parse once: the same Doc feeds both the token count and the matcher.
    # Running the pipeline a second time on the same text only costs time.
    doc = nlp(text)

    # COUNT FREQUENCY - SIMPLE TOKEN
    word_freq = Counter(token.text.lower() for token in doc)

    # COUNT FREQUENCY - COMPLEX WORD
    matcher = Matcher(nlp.vocab)
    pattern_1 = [{"POS": "NOUN"}, {"LOWER": "of"}, {"POS": "NOUN"}]
    matcher.add("NOUN-of-NOUN", [pattern_1])
    pattern_2 = [{"POS": "NOUN"}, {"POS": "ADJ"}]
    matcher.add("NOUN-ADJ", [pattern_2])
    pattern_3 = [{"POS": "NOUN"}, {"LOWER": "of"}, {"LOWER": "the"}, {"POS": "NOUN"}]
    matcher.add("NOUN-of-the-NOUN", [pattern_3])

    # Each match is (match_id, start, end); the slice doc[start:end] is the
    # matched span, and its text is what gets counted.
    complex_words_freq = Counter(
        doc[start:end].text for _match_id, start, end in matcher(doc)
    )

    return word_freq, complex_words_freq

In [None]:
# Load the model once: spacy.load() is expensive and does not depend on the row.
nlp = spacy.load("en_core_web_lg")

# Collect one Counter per row, in row order, and assign whole columns afterwards.
# Writing to the `row` yielded by df.iterrows() only changes a temporary copy,
# so the DataFrame itself would keep its placeholder values.
simple_keywords_per_row = []
complex_keywords_per_row = []

for text in tqdm(df["constitution"], desc='df rows - Keywords', total=df.shape[0]):
    simple_keywords, complex_keywords = extract_keywords(nlp, text)
    simple_keywords_per_row.append(simple_keywords)
    complex_keywords_per_row.append(complex_keywords)

# List assignment is positional, so element i lands on the i-th row.
df["simple_keywords"] = simple_keywords_per_row
df["complex_keywords"] = complex_keywords_per_row

In [None]:
# Build the pipeline and its DframCy wrapper once; neither depends on the row.
nlp = spacy.load("en_core_web_lg")
dframcy = DframCy(nlp)

# Collect per-row results and assign whole columns afterwards. Writing to the
# `row` yielded by df.iterrows() only changes a temporary copy, so the
# DataFrame itself would keep its placeholder values.
nouns_per_row = []
verbs_per_row = []
entities_per_row = []

for text in tqdm(df['constitution'], desc='df rows - Grammatical entities', total=df.shape[0]):
    docs = dframcy.nlp(text)
    anotation_dataframe, entity_dataframe = dframcy.to_dataframe(docs, separate_entity_dframe=True)

    # Guard against an entity frame without the 'ent_text' column
    # (NOTE(review): may happen when the text has no entities - confirm).
    if 'ent_text' in entity_dataframe.columns:
        entities_per_row.append(entity_dataframe['ent_text'].tolist())
    else:
        entities_per_row.append([])

    # "VERB" / "NOUN" are coarse-grained part-of-speech labels, which spaCy
    # exposes as `pos_`. `tag_` holds the fine-grained tags (e.g. "VB", "NN"),
    # so filtering on 'token_tag_' never matched anything.
    token_pos = anotation_dataframe['token_pos_']
    token_text = anotation_dataframe['token_text']
    verbs_per_row.append(token_text[token_pos == "VERB"].tolist())
    nouns_per_row.append(token_text[token_pos == "NOUN"].tolist())

# List assignment is positional, so element i lands on the i-th row.
df['nouns'] = nouns_per_row
df['verbs'] = verbs_per_row
df['entities'] = entities_per_row