### Imports

In [110]:
from general_functions import *

In [100]:
import os
import pickle
from tqdm.notebook import tqdm
import re
import spacy

import pandas as pd

# Load the large English spaCy model once; the functions below use it through the global `nlp`.
nlp = spacy.load("en_core_web_lg")
# Switch off NER and dependency parsing: the code below only reads
# `pos_`, `lemma_`, `is_punct` and `is_stop` from each token.
nlp.disable_pipes('ner', 'parser')


['ner', 'parser']

In [89]:
# Scratch check: splice a "_CLEAN" suffix in front of the file extension of a pickle name.
s = "ADJ.pickle".split('.')
'_CLEAN.'.join(s)

'ADJ_CLEAN.pickle'

In [133]:
def check_path(path, OVERRIDE=False):
    """
    -->
        function that checks if file exists and if so, asks the user if they want to override it.

        Parameters:
            -----------
                path: str -> path to check
                OVERRIDE: bool (default: False) -> when True an existing path is accepted without prompting

        Returns:
            -----------
                bool -> True when the caller may write to `path`
                        (path does not exist, OVERRIDE is set, or the user answered 'y');
                        False when the user answered 'n' or gave an invalid answer.
    """
    # Nothing would be overwritten, so there is nothing to ask.
    if not os.path.exists(path):
        return True

    if OVERRIDE:
        print(f"Overriding the following file: {path}")
        return True

    decision = input(f"This following path already exists:\n '{path}'\nAre you sure you want to override?\n Continue? [y/n]: ")
    # Accept answers such as "Y" or " y " instead of treating them as invalid.
    decision = decision.strip().lower()

    if decision == 'y':
        # Print BEFORE returning; the message used to sit after the return and never ran.
        print("The process has been continued.")
        return True

    if decision == 'n':
        print("The process has been halted.")
        return False

    print("You did not enter a valid option. \nCanceling Operation...")
    return False

In [138]:
def lemmatise_paragraphs(df, OUTPUT_PATH, POS, ONLY_ENGLISH_WORDS=False, ENGLISH_WORD_LIST=None, OVERWRITE=False, NLP_MAX_LENGTH=1500000):
    """
    -->
        function that lemmatises the paragraphs of a single dataframe and pickles the result
        as a dict {paragraph_id: [lemma, ...]} holding only tokens with the requested part-of-speech.

        Parameters:
        -----------
            df: DataFrame -> must provide the columns `paragraph` (text) and `paragraph_id`
            OUTPUT_PATH: Str -> path of the .pickle file to write the lemmatised paragraphs to
            POS: string (e.g. "NOUN") -> options: (https://spacy.io/usage/spacy-101#annotations-pos-deps)
            ONLY_ENGLISH_WORDS: Bool -> also write a "<name>_CLEAN<ext>" file next to OUTPUT_PATH that
                                        only keeps lemmas found in ENGLISH_WORD_LIST
            ENGLISH_WORD_LIST: collection of lowercase words (default: None -> empty) used for the clean file
            OVERWRITE: Bool -> Whether you want to override existing output files without being asked
            NLP_MAX_LENGTH: Int (default: 1500000) -> Allowed number of characters per text

        Raises:
        -----------
            Exception -> when POS is not one of the supported part-of-speech tags
    """

    nlp.max_length = NLP_MAX_LENGTH

    #Checks if valid part-of-speech tag was provided
    POStags=["PROPN", "AUX", "NOUN", "ADJ", "VERB", "ADP", "SYM", "NUM"]
    if not isinstance(POS, str) or POS.upper() not in POStags:
        raise Exception(f'POSfilter only allows any of the following (SpaCy) part-of-speech tags: {POStags}.')
    # spaCy reports upper-case tags; normalise so that e.g. "noun" (accepted above) really matches.
    POS = POS.upper()

    if ENGLISH_WORD_LIST is None:
        ENGLISH_WORD_LIST = []

    output_dir, filename = os.path.split(OUTPUT_PATH)

    # Progress label: the city pair sits between '___' markers in the path; fall back to the file name.
    path_parts = OUTPUT_PATH.split('___')
    label = path_parts[1] if len(path_parts) > 1 else filename

    paragraphs_dict = {}
    if check_path(OUTPUT_PATH, OVERWRITE):
        docs = tqdm(nlp.pipe(df.paragraph, n_process=2, batch_size=1, disable=["ner", "parser"]),
                    desc=f"Lemmatising ({label})",
                    total=len(df.paragraph),
                    leave=False)

        # Pair ids and docs by position (not by index label) so frames without a 0..n-1 index work,
        # and reduce every Doc to its lemma list straight away instead of keeping all Docs in memory.
        for paragraph_id, doc in zip(df.paragraph_id, docs):
            lemmas = [word.lemma_ for word in doc if word.pos_ == POS and not word.is_punct and not word.is_stop]
            paragraphs_dict[paragraph_id] = [re.sub(r'\W+', '', lemma) for lemma in lemmas]

        # The target folder may not exist yet.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(OUTPUT_PATH, 'wb') as fp:
            pickle.dump(paragraphs_dict, fp)

    # "<name>.pickle" -> "<name>_CLEAN.pickle"; splitext only touches the final extension.
    stem, extension = os.path.splitext(filename)
    CLEAN_PATH = os.path.join(output_dir, f"{stem}_CLEAN{extension}")

    if ONLY_ENGLISH_WORDS and check_path(CLEAN_PATH, OVERWRITE):
        if not paragraphs_dict:
            # Lemmatisation was skipped (or produced nothing): reuse the file already on disk.
            # NOTE: pickle is only safe for files this pipeline wrote itself.
            with open(OUTPUT_PATH, 'rb') as file_read:
                paragraphs_dict = pickle.load(file_read)

        cleaned_paragraphs = {
            paragraph_id: remove_non_existing_words(lemmas, ENGLISH_WORD_LIST)
            for paragraph_id, lemmas in tqdm(paragraphs_dict.items(), desc='Removing non-existent words', leave=False)
        }

        with open(CLEAN_PATH, 'wb') as file_write:
            pickle.dump(cleaned_paragraphs, file_write)

In [139]:
def is_english_word(word, english_words):
    """
    -->
        function that tells whether `word`, lower-cased, is contained in `english_words`.

        Parameters:
        -----------
            word: str -> word to look up
            english_words: container (e.g. set) of words to check against
    """
    lowered = word.lower()
    return lowered in english_words

def remove_non_existing_words(wordlist: list, english_words) -> list:
    """
    -->
        function that drops every word that is not part of the supplied english vocabulary.
        Order and duplicates of the remaining words are preserved.

        Parameters:
        -----------
            wordlist: list of str -> words to filter
            english_words: non-empty collection of lowercase words (a set gives the fastest lookups)

        Raises:
        -----------
            Exception -> when `english_words` is empty
    """
    if not len(english_words):
        raise Exception("The supplied english words list is empty.")

    # Single pass with the same case-insensitive membership test as is_english_word;
    # avoids the former per-word scan through a list of rejected words.
    return [word for word in wordlist if word.lower() in english_words]

# def removing_non_existent_words(file_path, english_words):
#     if not 'CLEAN' in file_path:
#         with open(file_path, 'rb') as fp:
#             lemmatised_paragraphs = pickle.load(fp)

#             for paragraph_id in lemmatised_paragraphs:
#                 cleaned_lemmatised_paragraph = remove_non_existing_words(lemmatised_paragraphs[paragraph_id], english_words)
#                 lemmatised_paragraphs[paragraph_id] = cleaned_lemmatised_paragraph

#         new_file_path = file_path.replace('.pickle, CLEAN.pickle')
#         with open(new_file_path, 'wb') as fp:
#             pickle.dump(lemmatised_paragraphs, fp)
            

In [79]:
import os
import re
import time

from tqdm.notebook import tqdm
import pandas as pd

In [91]:
def check_POS(POS):
    """
    -->
        function that checks if valid (SpaCy) part-of-speech tag(s) were provided (case-insensitive).

        Parameters:
        -----------
            POS: str or list/tuple of str -> tag(s) to validate

        Returns:
        -----------
            True when every supplied tag is supported

        Raises:
        -----------
            Exception -> when a tag is unknown, is not a string,
                         or POS itself is neither a string nor a list/tuple
    """
    POStags=["PROPN", "AUX", "NOUN", "ADJ", "VERB", "ADP", "SYM", "NUM"]
    error_message = f'POSfilter only allows any of the following (SpaCy) part-of-speech tags: {POStags}.'

    if isinstance(POS, str):
        tags = [POS]
    elif isinstance(POS, (list, tuple)):
        tags = POS
    else:
        # Other types used to slip through as "valid".
        raise Exception(error_message)

    for tag in tags:
        if not isinstance(tag, str) or tag.upper() not in POStags:
            raise Exception(error_message)

    return True

In [168]:
def lemmatise(INPUT_DIR, POS, BATCHES=None, LEMMATISATION_TYPE='', ONLY_ENGLISH_WORDS=False, english_words_file="../../../input/english_words_alpha_370k.txt", OVERWRITE=False):
    """
    -->
        function that walks INPUT_DIR/<batch>/<citypair>/ and lemmatises the paragraphs csv of every
        city pair, writing one pickle per part-of-speech tag to <citypair>/lemmatisation/<TAG>.pickle.

        Parameters:
        -----------
            INPUT_DIR: Str -> directory holding the batch folders (named like 'biggest_cities_<number>')
            POS: list of str (e.g. ["NOUN", "VERB"]) -> part-of-speech tags to extract (case-insensitive)
            BATCHES: list of int (default: None) -> batch numbers to process; empty/None processes every folder
            LEMMATISATION_TYPE: Str -> currently unused
            ONLY_ENGLISH_WORDS: Bool -> also write *_CLEAN pickles restricted to words in english_words_file
            english_words_file: Str -> text file with one english word per line
            OVERWRITE: Bool -> Whether existing output files are overwritten without asking

        Raises:
        -----------
            Exception -> when POS is not a list of supported part-of-speech tags
    """
    BATCHES = [int(x) for x in (BATCHES or [])]
    reg_str = 'biggest_cities_([0-9]+)'

    #Checks if valid part-of-speech tag was provided
    POStags = ["PROPN", "AUX", "NOUN", "ADJ", "VERB", "ADP", "SYM", "NUM"]
    if not isinstance(POS, list) or any(not isinstance(tag, str) or tag.upper() not in POStags for tag in POS):
        raise Exception(f'POSfilter only allows any of the following (SpaCy) part-of-speech tags: {POStags}.')
    # Work with upper-case tags from here on; that is the form spaCy reports.
    POS = [tag.upper() for tag in POS]

    # Always defined, so the call below cannot hit a NameError when ONLY_ENGLISH_WORDS is False.
    ENGLISH_WORDS = set()
    if ONLY_ENGLISH_WORDS:
        with open(english_words_file) as word_file:
            # Skip blank lines, otherwise the empty string would count as an english word.
            ENGLISH_WORDS = set(word.strip().lower() for word in word_file if word.strip())

    batch_dirs = []
    for batch in sorted(os.listdir(INPUT_DIR)):
        batch_dir = os.path.join(INPUT_DIR, batch)
        # Stray files cannot be batches.
        if not os.path.isdir(batch_dir):
            continue
        if BATCHES:
            # Folders that do not follow the naming scheme are skipped instead of raising IndexError.
            match = re.search(reg_str, batch)
            if match is None or int(match.group(1)) not in BATCHES:
                continue
        batch_dirs.append(batch_dir)

    # Where the magic happens
    for batch_dir in tqdm(batch_dirs, desc=f"BATCHES: {BATCHES}"):

        for citypair in tqdm(sorted(os.listdir(batch_dir)), desc="City Pair", leave=False):
            citypair_dir = os.path.join(batch_dir, citypair)
            name_parts = citypair.split('___')
            # Expect '<prefix>___<city pair>___'; ignore anything else.
            if len(name_parts) < 2 or not os.path.isdir(citypair_dir):
                continue
            CITY_PAIR = name_parts[1]

            df_paragraphs_path = os.path.join(citypair_dir, f"{CITY_PAIR}.csv")
            if not os.path.exists(df_paragraphs_path):
                continue

            df = pd.read_csv(df_paragraphs_path)

            # The pickles are written into this folder, so it has to exist.
            lemmatisation_dir = os.path.join(citypair_dir, "lemmatisation")
            os.makedirs(lemmatisation_dir, exist_ok=True)

            for tag in tqdm(POS, desc=f"POS: {POS}", leave=False):
                POS_path = os.path.join(lemmatisation_dir, f"{tag}.pickle")
                lemmatise_paragraphs(df=df,
                                     OUTPUT_PATH=POS_path,
                                     POS=tag,
                                     OVERWRITE=OVERWRITE,
                                     ONLY_ENGLISH_WORDS=ONLY_ENGLISH_WORDS,
                                     ENGLISH_WORD_LIST=ENGLISH_WORDS,
                                     NLP_MAX_LENGTH=1500000)

                    
#             for foldertype in ['classification, frequencies', 'lemmatisation']:
#                 os.makedirs(f"{citypair_dir}/{foldertype}", exist_ok=True)


#             if lemmatisation:
#                 for tag in POS:

#                     FILE_PATH = os.path.join(root, file)
#                     FILE_OUTPUT_DIR = root + '/' + 'lemmatisation'


In [175]:
%%time

# Run configuration for this lemmatisation pass.
INPUT_DIR = "../../../../../data/clean/city_pair_paragraphs/"
BATCHES = [5]
POS = ["VERB", "ADJ"]
# LEMMATISATION_TYPE = 'quick', 'accurate'
ONLY_ENGLISH_WORDS = True
# Set to the value the call used to hard-code, so this constant is the single switch:
# True overwrites existing pickles without asking, False prompts for every existing file.
OVERWRITE = True

lemmatise(INPUT_DIR, POS, BATCHES=BATCHES, ONLY_ENGLISH_WORDS=ONLY_ENGLISH_WORDS, OVERWRITE=OVERWRITE)

BATCHES: [5]:   0%|          | 0/1 [00:00<?, ?it/s]

City Pair:   0%|          | 0/10 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (berlin_milan):   0%|          | 0/929 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/929 [00:00<?, ?it/s]

Lemmatising (berlin_milan):   0%|          | 0/929 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/929 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (london_berlin):   0%|          | 0/7389 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/7389 [00:00<?, ?it/s]

Lemmatising (london_berlin):   0%|          | 0/7389 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/7389 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (london_madrid):   0%|          | 0/2313 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/2313 [00:00<?, ?it/s]

Lemmatising (london_madrid):   0%|          | 0/2313 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/2313 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (london_milan):   0%|          | 0/2512 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/2512 [00:00<?, ?it/s]

Lemmatising (london_milan):   0%|          | 0/2512 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/2512 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (madrid_berlin):   0%|          | 0/1034 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/1034 [00:00<?, ?it/s]

Lemmatising (madrid_berlin):   0%|          | 0/1034 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/1034 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (madrid_milan):   0%|          | 0/1128 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/1128 [00:00<?, ?it/s]

Lemmatising (madrid_milan):   0%|          | 0/1128 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/1128 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (paris_berlin):   0%|          | 0/7612 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/7612 [00:00<?, ?it/s]

Lemmatising (paris_berlin):   0%|          | 0/7612 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/7612 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (paris_london):   0%|          | 0/21051 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/21051 [00:00<?, ?it/s]

Lemmatising (paris_london):   0%|          | 0/21051 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/21051 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (paris_madrid):   0%|          | 0/3277 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/3277 [00:00<?, ?it/s]

Lemmatising (paris_madrid):   0%|          | 0/3277 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/3277 [00:00<?, ?it/s]

POS: ['VERB', 'ADJ']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising (paris_milan):   0%|          | 0/3251 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/3251 [00:00<?, ?it/s]

Lemmatising (paris_milan):   0%|          | 0/3251 [00:00<?, ?it/s]

Removing non-existent words:   0%|          | 0/3251 [00:00<?, ?it/s]

CPU times: total: 5min 8s
Wall time: 18min 46s


In [173]:
# Load one cleaned lemma pickle for inspection.
# NOTE(review): the run cell above only requests POS = ["VERB", "ADJ"], so NOUN_CLEAN.pickle is only
# there if a NOUN pass was made separately — confirm before relying on this cell.
P = "../../../../../data/clean/city_pair_paragraphs/biggest_cities_5/cities___paris_london___/lemmatisation/NOUN_CLEAN.pickle"
with open(P, 'rb') as file_read:
    paragraphs = pickle.load(file_read)

FileNotFoundError: [Errno 2] No such file or directory: '../../../../../data/clean/city_pair_paragraphs/biggest_cities_5/cities___paris_london___/lemmatisation/NOUN_CLEAN.pickle'

In [172]:
# Number of entries in the loaded `paragraphs` object.
# NOTE(review): this cell's execution count (172) is lower than the load cell's (173), so the
# displayed value may stem from an earlier `paragraphs` — re-run top to bottom to confirm.
len(paragraphs)

1749