In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import spacy
import pandas as pd
from pprint import pprint
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px
import re
import plotly.express as px
from spacy import displacy
from spacy.symbols import NOUN, DET, ADJ


In [None]:
# Fetch the small Dutch spaCy model through the spaCy CLI.
# NOTE(review): later cells also load nl_core_news_md and nl_core_news_lg, which
# this command does not download — they must already be installed.
!python -m spacy download nl_core_news_sm

In [None]:
%%timeit
# Benchmark: load the small Dutch model (the loaded pipeline is discarded).
spacy.load('nl_core_news_sm') 

In [None]:
%%timeit
# Benchmark: load the medium Dutch model (the loaded pipeline is discarded).
spacy.load('nl_core_news_md') 

In [None]:
%%timeit
# Benchmark: load the large Dutch model (the loaded pipeline is discarded).
spacy.load('nl_core_news_lg') 

In [None]:
# The medium Dutch model is the pipeline used by every later cell via `nlp`.
nlp = spacy.load('nl_core_news_md') 

In [None]:
# Load the pickled source DataFrame; its `text` column is consumed further down.
# (Presumably Wikipedia text, judging by the file name — confirm.)
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle('./data/processed/df_wiki_text.pickle')

In [None]:
# Parse a sample text and list the named entities the pipeline finds in it.
# NOTE(review): there is no space after "gevallen." in the sample — confirm that
# this is intentional.
doc = nlp("De deur is nu gesloten, in het slot gevallen.Je bent er doorheen gegaan.")
for ent in doc.ents:
    # Entity text, start/end character offsets and entity label.
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# For every noun in the sample doc, show its text, entity type, sentiment value
# and the tokens that depend on it.
for token in doc:
    if token.pos_ != "NOUN":
        continue
    print(token.text, token.ent_type_, token.sentiment, list(token.children))

In [None]:
# Draw the dependency parse inline in the notebook. `displacy.serve` starts a
# web server and blocks the kernel until it is interrupted, which stalls a
# "Run All"; `displacy.render` is the notebook-friendly equivalent.
displacy.render(doc, style="dep", jupyter=True)

In [None]:
def extract_de_of_het(doc):
    """Collect the nouns in ``doc`` together with the article they take.

    Two situations are recognised for each noun token, based on its direct
    dependency children:

    * Definite: the noun has a ``de``/``het`` child and no ``een``/``geen``
      child. The article is read from the determiner whose lemma is ``de`` or
      ``het``. Other determiners attached to the same noun are skipped, so
      they can no longer end up in the ``det`` field.
    * Indefinite with adjective: the noun has an ``een``/``geen`` child, no
      ``de``/``het`` child and at least one adjective child. The article is
      inferred from the adjective inflection: any adjective ending in ``e``
      gives ``de``, otherwise ``het``.

    Nouns matching neither situation are left out.

    Args:
        doc: Iterable of spaCy tokens (anything exposing ``pos_``, ``lemma_``,
            ``text``, ``vector`` and ``children``).

    Returns:
        list[dict]: One ``{'det', 'woord', 'woord_vec'}`` dict per matched
        noun, in document order. ``det`` is always ``'de'`` or ``'het'``.
    """
    articles = ('de', 'het')
    substantieven = []

    for token in doc:
        # String tags are equivalent to comparing `pos` with the spacy.symbols
        # constants and keep this function free of module-level imports.
        if token.pos_ != "NOUN":
            continue

        children = list(token.children)
        lemmas = {child.lemma_ for child in children}
        has_de_of_het = any(article in lemmas for article in articles)
        has_geen_of_een = 'geen' in lemmas or 'een' in lemmas

        if has_de_of_het and not has_geen_of_een:
            # Take the article itself, not merely the first determiner child.
            det = next(
                (child.lemma_ for child in children
                 if child.pos_ == "DET" and child.lemma_ in articles),
                None,
            )
            if det is None:
                continue
        elif has_geen_of_een and not has_de_of_het:
            adjectives = [child for child in children if child.pos_ == "ADJ"]
            if not adjectives:
                continue
            # An inflected adjective (ending in 'e') after een/geen marks a de-word.
            inflected = any(adj.text.endswith('e') for adj in adjectives)
            det = 'de' if inflected else 'het'
        else:
            continue

        substantieven.append(
            {'det': det, 'woord': token.text, 'woord_vec': token.vector}
        )

    return substantieven



    #         if token.lemma_[-1] == 'e': 
    #             substantieven.append({'det': 'het','woord' : token.head.lemma_})
    #         else : # Else it is a de woord
    #             substantieven.append({'det': 'de','woord' : token.head.lemma_})

def is_word_det_or_het(word):
    """Return True when ``word`` is exactly 'de' or 'het' (case-sensitive)."""
    return word in ('de', 'het')

def assert_that_only_wanted_articles_are_added(list_of_articles):
    """Check that every entry's 'det' value is 'de' or 'het'.

    Args:
        list_of_articles: Iterable of dicts that each carry a 'det' key.

    Raises:
        AssertionError: On the first entry whose 'det' is any other value.
    """
    for article in list_of_articles:
        is_wanted = is_word_det_or_het(article['det'])
        assert is_wanted, f"The extraction script found article other than het or de. It found {article['det']}"
    
    
def extract_de_of_het_simpler(doc):
    """Pair each noun in ``doc`` with every 'de'/'het' child attached to it.

    Only direct dependency children whose lemma is 'de' or 'het' are used.
    Nouns without such a child are skipped, and a noun with several such
    children produces one entry per child.

    Args:
        doc: Iterable of spaCy tokens.

    Returns:
        list[dict]: Entries with keys 'det' (the child's lemma), 'woord' (the
        noun's text) and 'woord_vec' (the noun's vector), in document order.

    Raises:
        AssertionError: If an entry ends up with a 'det' other than 'de'/'het'.
    """
    substantieven = [
        {
            'det': child.lemma_,
            'woord': token.text,
            'woord_vec': token.vector,
        }
        for token in doc
        if token.pos == NOUN
        for child in token.children
        if is_word_det_or_het(child.lemma_)
    ]

    # Safety net: the collected articles must all be 'de' or 'het'.
    assert_that_only_wanted_articles_are_added(substantieven)

    return substantieven



def pre_process_text_generator(text_list):
    """Lazily yield a cleaned copy of every string in ``text_list``.

    Each string is lower-cased, HTML-like tags are replaced by a space, the
    characters ``? | ! ' " #`` are deleted and the characters ``. | , ) ( /``
    are replaced by a space. Items that are not ``str`` are skipped, so the
    output can contain fewer items than the input.

    Args:
        text_list: Iterable of raw texts (non-string items are ignored).

    Yields:
        str: The cleaned text for each string item, in input order.
    """
    html_tag = re.compile('<.*?>')

    for raw in text_list:
        if not isinstance(raw, str):
            continue

        lowered = raw.lower()
        untagged = html_tag.sub(' ', lowered)
        # Inside a character class '|' is a literal, so pipes are removed as well.
        stripped = re.sub(r'[?|!|\'|"|#]', r'', untagged)
        # Here '\|' is an escaped pipe: backslashes themselves are left untouched.
        yield re.sub(r'[.|,|)|(|\|/]', r' ', stripped)
        


In [None]:
# Lazily clean every text of the corpus. Non-string rows are skipped by the
# generator, so the result below can hold fewer entries than `df` has rows.
cleaned_text = pre_process_text_generator(df.text.tolist())

# One list of {'det', 'woord', 'woord_vec'} dicts per parsed document.
substantieven_doc_wise = []
# The extraction reads only POS tags, lemmas, children and vectors, so the NER
# component is switched off for the duration of this block.
with nlp.disable_pipes(["ner"],):
    print(f"Using pipeline : {nlp.pipe_names}")
    # Stream the texts through the pipeline with 4 processes, 100 texts per batch.
    for doc in nlp.pipe(cleaned_text,n_process=4,batch_size=100):
        
        lidwoordenlijst = extract_de_of_het_simpler(doc)
        
        substantieven_doc_wise.append(lidwoordenlijst)



In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Number of per-document result lists, i.e. how many texts were parsed.
len(substantieven_doc_wise)

In [None]:
# Merge the per-document lists and build one table with a row per noun/article pair.
df_processed = pd.DataFrame(flatten(substantieven_doc_wise))

In [None]:
# Preview the first rows of the extracted noun/article table.
df_processed.head()

In [None]:
# Keep a single row per distinct word (column 'woord'), modifying the frame in place.
# NOTE(review): with pandas' default keep='first', a word that was seen with both
# 'de' and 'het' keeps whichever article occurred first — confirm this is acceptable.
df_processed.drop_duplicates(subset='woord',inplace=True)

In [None]:
# Sanity check: every remaining word must be a non-empty string. The type test
# runs first so that a non-string value fails the assertion instead of raising
# TypeError from len().
assert all(df_processed.woord.apply(lambda w : isinstance(w,str) and len(w)>0)), "Column 'woord' contains empty or non-string values"

In [None]:
# Count of each article divided by the number of rows, expressed as a percentage.
df_processed.det.value_counts()/df_processed.shape[0]*100

In [None]:
# Persist the de-duplicated word/article table (including the word vectors) as a pickle.
df_processed.to_pickle('./data/processed/woorden_met_hetofde.pickle')