In [None]:
import sys
sys.path.append('../gerlach_et_al_src/')

from filter_words import run_stopword_statistics
from filter_words import make_stopwords_filter
from filter_words import remove_stopwords_from_list_texts

from real_corpora import tranfer_real_corpus_toID_and_shuffle
from ldavb import ldavb_inference_terminal, obtain_ldavb_cpuTime_memory
from evaluation import obtain_nmi_unsup, state_dwz_nmi

In [None]:
import sys
sys.path.append('../src/')

from utils_tiramisu import *

from tqdm import tqdm

from pathlib import Path

# this is the same TIRAMISU_PATH as shown in start_here.ipynb
# NOTE(review): the right-hand side is missing, so this cell is a syntax error
# and none of its imports run until a value is filled in here.
TIRAMISU_PATH = 

In [None]:
import pandas as pd
import re
import numpy as np

Following the code from _Gerlach et al._ (2019), we filter stopwords at the document level, because Doc2Vec models are trained with documents as the text records of a corpus. 

`../cache/pdfs_word_excel_powerpoint_010924.parquet` is simply a Pandas DataFrame that contains the combined texts of the scanned/electronic PDFs and MS documents. The columns are `text`, which holds the raw text, and `nodeID`, which holds the nodeID of the split single-page PDF or of the MS document.

In [None]:
# Build `to_put_into_gensim`: a dict keyed by '<original path>---<documentID>'
# whose values are lists of one-element records, each holding the processed
# text of one page/file belonging to that document.
# `return_from_neo4j` is presumably provided by the `utils_tiramisu` star
# import and is used here as if it returns a DataFrame - TODO confirm.

# Single-page PDFs: `c` is a page split out of the PDF `e`; only pages that
# also have a CONVERT_TO edge to a PNG file are returned.
all_pdfs = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:CONVERT_TO] -> (f:File) 
where e.fileExtension = 'pdf' and f.fileExtension = 'png' 
return c.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Word / PowerPoint files: the file's own nodeID identifies the text record.
all_ms = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['doc', 'docx', 'ppt', 'pptx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Excel files are only queried so that their records can be dropped below.
all_excel = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['xls', 'xlsx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

corpus = pd.read_parquet('../cache/pdfs_word_excel_powerpoint_010924.parquet')

# Drop the Excel records first, then normalise the text exactly once.
# `.copy()` makes the filtered frame independent, so the column assignment
# below does not write into a slice of the loaded frame.
corpus = corpus.loc[~corpus.nodeID.isin(all_excel['nodeID'].to_list())].copy()

# Lower-case, trim, and collapse every run of non-word characters to one space.
corpus['processed'] = corpus['text'].apply(lambda x: re.sub(r'\W+', ' ', x.strip().lower()) )

corpus = corpus[['processed', 'nodeID']]

folder_structure = pd.concat([all_pdfs, all_ms])

# Page nodeID -> nodeID of the Document that the page is PART_OF.
map_nodeID_to_docID = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:PART_OF] -> (d:Document) 
where e.fileExtension = 'pdf' 
return c.nodeID as nodeID, d.nodeID as documentID 
""").set_index('nodeID').to_dict()['documentID']


merged = pd.merge(corpus, folder_structure, on = 'nodeID')
# Vectorised lookup; records without a Document get a missing value.
merged['documentID'] = merged['nodeID'].map(map_nodeID_to_docID)

# PDF pages are kept only when they could be assigned to a Document.
pdfs_with_paths = merged.loc[merged.documentID.notna()]

together = pd.concat([merged.loc[merged.fileExtension != 'pdf'], pdfs_with_paths])
# Records without a Document fall back to their own nodeID as the document ID.
# `isna()` covers both None and NaN, and `np.where` works positionally, so the
# index labels repeated by the concat above cannot cause alignment problems.
together['documentID'] = np.where(together['documentID'].isna(), together['nodeID'], together['documentID'])

together = together[['processed', 'fileExtension', 'path', 'documentID']].copy()

# Key that identifies one document: original file path plus document ID.
together['filePath'] = together['path'] + '---' + together['documentID']

# filePath -> [[processed_text], [processed_text], ...]; the next cell reads the
# text as the first item of each inner list.
to_put_into_gensim = (
    together[['processed', 'filePath']]
    .set_index('filePath')
    .groupby('filePath')
    .apply(lambda x : x.to_numpy().tolist())
    .to_dict()
)

In [None]:
# Tokenized documents and, in the same order, the keys they came from.
list_texts = []
list_nodeIDs = []

for file_path, records in tqdm(to_put_into_gensim.items(), total = len(to_put_into_gensim)):
    # The text is the first item of each record. str.split() without a
    # separator never yields empty or whitespace-padded tokens, so no further
    # cleaning of the individual tokens is needed.
    doc_tokens = [word for record in records for word in record[0].split()]

    # Documents without any token are skipped, which keeps both lists aligned.
    if not doc_tokens:
        continue
    list_nodeIDs.append(file_path)
    list_texts.append(doc_tokens)

In [None]:
%%time

# this is provided by Gerlach et al
path_stopword_list =  'stopwords_filtering/data/stopword_list_en'

## number of realizations for the random null model
N_s = 10

## get the statistics
df = run_stopword_statistics(list_texts,N_s=N_s,path_stopword_list=path_stopword_list)

## look at the entries
df.sort_values(by='F',ascending=False).head()

In [None]:
# For each cutoff value, build a stopword filter from the statistics in `df`,
# remove the filtered words from every document, report how many tokens remain,
# and write the filtered texts to a parquet file.

# Settings shared by every run.
# NOTE(review): presumably 'INFOR' selects the information-based ranking and
# 'p' a fractional cutoff - confirm against filter_words.make_stopwords_filter.
method = 'INFOR'
cutoff_type = 'p'

# Token count of the unfiltered corpus; it does not change between runs.
N = sum(len(doc) for doc in list_texts)

# Make sure the target directory exists before the first write.
output_dir = Path('../models/stopwords_filtering')
output_dir.mkdir(parents = True, exist_ok = True)

for fraction in tqdm([0, 0.01, 0.05, 0.1, 0.2, 0.5]):
    cutoff_val = fraction
    df_filter = make_stopwords_filter(df,
                                  method = method,
                                  cutoff_type = cutoff_type,
                                  cutoff_val = cutoff_val, )
    list_words_filter = list(df_filter.index)
    list_texts_filter = remove_stopwords_from_list_texts(list_texts, list_words_filter)

    N_filter = sum(len(doc) for doc in list_texts_filter)
    print('Remaining fraction of tokens',N_filter/N, fraction)

    # One row per document: the filtered text re-joined with single spaces and
    # the key collected alongside it in `list_nodeIDs`.
    to_df = pd.DataFrame({
        'text': [' '.join(tokens) for tokens in list_texts_filter],
        'nodeID': list_nodeIDs,
    })

    to_df.to_parquet(output_dir / f'filtered_text_240320_{fraction}_removed_22070.parquet')
