In [38]:
import os
import re
import pandas as pd
from tqdm.notebook import tqdm

import spacy

nlp = spacy.load("en_core_web_lg")
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

In [39]:
def get_english_words(path="../../../input/english_words_alpha_370k.txt"):
    if not os.path.exists(path):
        raise Exception("Provide a valid path to a file with English words.")

    with open(path) as word_file:
            ENGLISH_WORDS = set(word.strip().lower() for word in word_file)
    
    if not len(ENGLISH_WORDS):
        raise Exception("No wordlist could be found in the given file!")
            
    return ENGLISH_WORDS

In [63]:
def is_english_word(word, english_words):
    return word.lower() in english_words

In [40]:
def remove_non_existing_words_from_wordlist(wordlist: list, english_words) -> list:
    if not len(english_words):
        raise Exception("The supplied english words list is empty."
                       )
    wordset = set(wordlist)
    non_existent = []
    
    for word in wordset:
        if not is_english_word(word, english_words):
            non_existent.append(word)
            
    return([word for word in wordlist if word not in non_existent])

In [41]:
def keep_english_words_in_paragraphs(paragraphs, english_words=[], english_words_file="../../../input/english_words_alpha_370k.txt"):
    if not english_words:
        english_words = get_english_words(path=english_words_file)
        
    cleaned_paragraphs = []
    for paragraph in paragraphs:
        cleaned_paragraph = remove_non_existing_words_from_wordlist(wordlist=paragraph, english_words=english_words)
        cleaned_paragraphs.append(cleaned_paragraph)
        
    return cleaned_paragraphs

In [141]:
def lemmatise_paragraphs(paragraphs, POStag, NLP_MAX_LENGTH=1500000):
    """
    --> function that lemmatises the paragraphs of a single text file.
    """
    
    nlp.max_length = NLP_MAX_LENGTH
    
    #Checks if valid part-of-speech tag was provided
    POStags=["PROPN", "AUX", "NOUN", "ADJ", "VERB", "ADP", "SYM", "NUM"]
    if not isinstance(POStag, str) or POStag.upper() not in POStags:
        raise Exception(f'POSfilter only allows any of the following (SpaCy) part-of-speech tags: {POStags}.')
    
    processed_paragraphs = [text for text in tqdm(nlp.pipe(paragraphs, n_process=2, batch_size=1, disable=["ner", "parser"]), desc=f"Lemmatising ({POStag})...",total=len(paragraphs), leave=False)]
    lemmatized_paragraphs = [[word.lemma_ for word in paragraph if word.pos_ == POStag and not word.is_punct and not word.is_stop] for paragraph in processed_paragraphs]
    regexed_paragraphs= [[re.sub(r'\W+', '', word) for word in paragraph] for paragraph in lemmatized_paragraphs]
   
    return regexed_paragraphs

In [137]:
def lemmatise_city_pair(df=df, POS=POS, OVERWRITE=False, ONLY_ENGLISH_WORDS=False, ENGLISH_WORDS = [],
    english_words_file="../../../input/english_words_alpha_370k.txt", NLP_MAX_LENGTH=1500000):
    
    for tag in tqdm(POS, desc=f"POS: {POS}", leave=False):
        if OVERWRITE or tag not in df.columns:
            df[f"{tag}"] = lemmatise_paragraphs(paragraphs=df['paragraph'], POStag=tag, NLP_MAX_LENGTH=NLP_MAX_LENGTH)

        if ONLY_ENGLISH_WORDS and (OVERWRITE or f'{tag}_clean' not in df.columns):
            df[f'{tag}_clean'] = keep_english_words_in_paragraphs(paragraphs=df[tag], english_words=ENGLISH_WORDS)
            
    return df

In [138]:
def lemmatise(INPUT_DIR, POS, BATCHES=[], LEMMATISATION_TYPE='', ONLY_ENGLISH_WORDS=False, english_words_file="../../../input/english_words_alpha_370k.txt", OVERWRITE=False, NLP_MAX_LENGTH=1500000):   
    BATCHES = [str(batch) for batch in BATCHES]
    
    #Checks if valid part-of-speech tag was provided
    POStags = ["PROPN", "AUX", "NOUN", "ADJ", "VERB", "ADP", "SYM", "NUM"]
    if not isinstance(POS, list) or len([tag.upper() for tag in POS if tag not in POStags]):
        raise Exception(f'POSfilter only allows any of the following (SpaCy) part-of-speech tags: {POStags}.')
    
    if ONLY_ENGLISH_WORDS:
        with open(english_words_file) as word_file:
            ENGLISH_WORDS = set(word.strip().lower() for word in word_file)

    chosen_batches = [batch for batch in os.listdir(INPUT_DIR) if not BATCHES or batch in BATCHES]
    
#     # Where the magic happens
    for batch in tqdm(chosen_batches, desc=f"BATCHES: {BATCHES}"):
        batch_dir = os.path.join(INPUT_DIR, batch)
        
        for citypair in tqdm(os.listdir(batch_dir), desc="City Pair", leave=False):
            citypair_dir = os.path.join(batch_dir, citypair)
            CITY_PAIR = citypair.split('___')[1]

            df_paragraphs_path = f"{citypair_dir}/{CITY_PAIR}.csv"
            
            if os.path.exists(df_paragraphs_path):
                df = pd.read_csv(df_paragraphs_path)
                df = lemmatise_city_pair(df=df, POS=POS, OVERWRITE=OVERWRITE, ONLY_ENGLISH_WORDS=ONLY_ENGLISH_WORDS, ENGLISH_WORDS=ENGLISH_WORDS, NLP_MAX_LENGTH=NLP_MAX_LENGTH)
                df.to_csv(df_paragraphs_path, index=False)
            else:
                print(f"Batch: {batch}, City Pair: '{CITY_PAIR}' has no file at '{df_paragraphs_path}'.")
                

In [140]:
%%time

INPUT_DIR = "../../../../../data/clean/city_pair_paragraphs3/"
BATCHES = [5]
POS = ["NOUN", "VERB"]
# LEMMATISATION_TYPE = 'quick', 'accurate'
ONLY_ENGLISH_WORDS = True
OVERWRITE = True

df = lemmatise(INPUT_DIR, POS, BATCHES, ONLY_ENGLISH_WORDS=ONLY_ENGLISH_WORDS, OVERWRITE=OVERWRITE)

BATCHES: ['5']:   0%|          | 0/1 [00:00<?, ?it/s]

City Pair:   0%|          | 0/10 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/929 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/929 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/7389 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/7389 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/2313 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/2313 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/2512 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/2512 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/1034 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/1034 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/1128 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/1128 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/7612 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/7612 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/21051 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/21051 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/3277 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/3277 [00:00<?, ?it/s]

POS: ['NOUN', 'VERB']:   0%|          | 0/2 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/3251 [00:00<?, ?it/s]

Lemmatising...:   0%|          | 0/3251 [00:00<?, ?it/s]

CPU times: total: 5min 4s
Wall time: 20min 51s


In [10]:
df.columns

Index(['Title', 'city_pair', 'article_id', 'paragraph_id', 'paragraph'], dtype='object')

In [126]:
bm_df = pd.read_csv("../../../../../data/clean/city_pair_paragraphs3/5/cities___berlin_milan___/berlin_milan.csv", converters={'ADJ': literal_eval})

In [107]:
from ast import literal_eval
columns_to_convert = ['']
for column in columns_to_convert:
    try:
        bm_df.column = bm_df.column.apply(literal_eval)
    except:
        print("Could not convert column values to lists.")

In [127]:
bm_df

Unnamed: 0,Title,city_pair,article_id,paragraph_id,paragraph,ADJ,ADJ_clean
0,arthur schopenhauer,berlin_milan,700,1,"after his tenure in academia, he continued to ...","[old, unknown, paralyzed, right, unable, guilt...","['old', 'unknown', 'paralyzed', 'right', 'unab..."
1,asteroid,berlin_milan,791,2,one of the astronomers selected for the search...,"[catholic, 87th, zodiacal, like, final, fellow...","['catholic', 'zodiacal', 'like', 'final', 'fel..."
2,transport in armenia,berlin_milan,1096,3,there are plenty of air connections between ye...,"[regional, daily, major, main, yearly]","['regional', 'daily', 'major', 'main', 'yearly']"
3,barcelona,berlin_milan,4443,4,"since 2009, 'the brandery', an urban fashion s...","[urban, global, annual, seventh, important]","['urban', 'global', 'annual', 'seventh', 'impo..."
4,europe,berlin_milan,9239,5,when considering the commuter belts or metropo...,"[metropolitan, comparable, available, large]","['metropolitan', 'comparable', 'available', 'l..."
...,...,...,...,...,...,...,...
924,caspar voght,berlin_milan,3878145,933,"at the age of 12, caspar voght fell seriously ...","[permanent, facial, inclined, little, grand, s...","['permanent', 'facial', 'inclined', 'little', ..."
925,hasmik papian,berlin_milan,3909402,934,after her debut at the armenian national opera...,"[national, international, numerous, prestigiou...","['national', 'international', 'numerous', 'pre..."
926,michèle crider,berlin_milan,3936952,935,"since 1991, crider has been heard regularly in...","[great, metropolitan, successful, orange, nati...","['great', 'metropolitan', 'successful', 'orang..."
927,surendranath dasgupta,berlin_milan,3939717,936,the impressions that he had made by his speech...,"[international, illuminated, second]","['international', 'illuminated', 'second']"


In [124]:
bm_df.ADJ_clean = bm_df.ADJ_clean.apply(literal_eval)

ValueError: malformed node or string: ['old', 'unknown', 'paralyzed', 'right', 'unable', 'guilty', 'annual']

In [121]:
# bm_df.ADJ_clean = bm_df.ADJ_clean.apply(literal_eval)

for index, row in bm_df.iterrows():
    if len(row.ADJ) != len(row.ADJ_clean):
        #print(type(row.ADJ_clean))
        print(set(row.ADJ).difference(set(row.ADJ_clean))) #, '\n', row.ADJ_clean)
    #print('-->', lenrow.ADJ, '\n==', row.ADJ_clean)

{'87th'}
{'rossinian'}
{'68th'}
{'austro'}
{'operabase', 'calaf'}
{'87th'}
{'wizz'}
{'edizioni'}
{'eldoret'}
{'andré', 'opéra'}
{'tiergarten'}
{'legnanese', 'brianza', 'bustocco'}
{'upmarket'}
{'firelands'}
{'madama'}
{'10th', 'euwe'}
{'opéra'}
{'ailey', '16th', '50th', '1st'}
{'fckh8'}
{'19961997'}
{'duetten'}
{'treybal', '15th', '14th', '2nd'}
{'voix', 'caramoor'}
{'lichbild'}
{'madama'}
{'philarmonic'}
{'click4sky'}
{'1st'}
{'walküre'}
{'mašín'}
{'janin', 'graeco'}
{'grzegorz'}
{'neue'}
{'imcs', '7th'}
{'micropolitan'}
{'18th', '19th'}
{'viennas', 'hungarys'}
{'9th'}
{'wessenberg'}
{'20th', '19th'}
{'opéra'}
{'18th', 'kindl'}
{'järvi', 'rundfunk', 'sinfonieorchester', 'inbal'}
{'calaf'}
{'stokla'}
{'bloblike', 'doomy'}
{'monteggia'}
{'22nd'}
{'zandonai'}
{'opéra'}
{'václav', 'saarbrücken', 'gerd'}
{'cremonese'}
{'custo'}
{'petipa'}
{'groundbreaking'}
{'nur'}
{'mouvement', '1st'}
{'jiří', 'ladislav'}
{'nanki'}
{'opéra'}
{'ernani'}
{'mtsensk'}
{'passante'}
{'6th'}
{'leipsic'}
{'18th'}