In [311]:
from ast import literal_eval

def import_lemmatised_paragraphs(INPUT_DIR, POS, BATCHES=[], ONLY_ENGLISH_WORDS=False, merged_POS=True, sort_by_paragraphs=False):
    BATCHES = [str(batch) for batch in BATCHES]
    
    POStags = ["PROPN", "AUX", "NOUN", "ADJ", "VERB", "ADP", "SYM", "NUM"]
    if not isinstance(POS, list) or len([tag for tag in POS if tag.upper() not in POStags]):
        raise Exception(f'POSfilter only allows any of the following (SpaCy) part-of-speech tags: {POStags}.')
    
    chosen_batches = [batch for batch in os.listdir(INPUT_DIR) if not BATCHES or batch in BATCHES]
    
    # Where the magic happens
    data_list = []
    missing_POS = dict()
    collected_POS = set()
    
    for batch in tqdm(chosen_batches, desc=f"BATCHES: {BATCHES}"):
        batch_dir = os.path.join(INPUT_DIR, batch)
        
        for citypair in tqdm(os.listdir(batch_dir), desc="City Pair", leave=False):
            citypair_dir = os.path.join(batch_dir, citypair)
            CITY_PAIR = citypair.split('___')[1]

            df_paragraphs_path = f"{citypair_dir}/{CITY_PAIR}.csv"
            df = pd.read_csv(df_paragraphs_path)
            
            sub_df = df[['paragraph', 'paragraph_id']]
            if merged_POS:
                sub_df['merged_POS'] = [[] for _ in range(df.shape[0])]
            
            combined_POS = None
            for tag in POS:
                if ONLY_ENGLISH_WORDS:
                    column_name = f'{tag}_clean'   
                else:
                    column_name = f'{tag}'
                
                if column_name not in df.columns:
                    if not column_name in missing_POS.keys():
                        missing_POS[column_name] = []
                        
                    missing_POS[column_name].append(CITY_PAIR)
                    
                else:
                    string_to_list = df[column_name].apply(literal_eval)
                    
                    if merged_POS:
                        sub_df['merged_POS'] += string_to_list    
                        
                    else:   
                        sub_df[tag] = string_to_list
                            
                    collected_POS.add(tag)

             
            citypair_dict = {'batch': batch, 'city_pair': CITY_PAIR, 'paragraphs_count': len(df), 'english_words': ONLY_ENGLISH_WORDS, 'collected_POS': collected_POS, 'lemmatized_paragraphs': sub_df}
            data_list.append(citypair_dict)
    
    if sort_by_paragraphs:
        data_list = sorted(data_list, key=lambda k: k['paragraphs_count'], reverse=True)
    
    if len(missing_POS):
        print(f'The following POS tags are missing: {missing_POS}')
    
    return data_list

In [346]:
INPUT_DIR = "../../../../../data/clean/city_pair_paragraphs3/"
BATCHES = [5]
POS = ["NOUN", "VERB", "ADJ"]
ONLY_ENGLISH_WORDS = True
sort_by_paragraphs_count = True
merged_POS = True

data_list = import_lemmatised_paragraphs(INPUT_DIR, POS, BATCHES, ONLY_ENGLISH_WORDS=ONLY_ENGLISH_WORDS, merged_POS=merged_POS, sort_by_paragraphs=sort_by_paragraphs_count)

BATCHES: ['5']:   0%|          | 0/1 [00:00<?, ?it/s]

City Pair:   0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['merged_POS'] = [[] for _ in range(df.shape[0])]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['merged_POS'] += string_to_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['merged_POS'] = [[] for _ in range(df.shape[0])]
A value is trying to be set on a copy of a slice from 

In [348]:
frames = [citypair['lemmatized_paragraphs'] for citypair in data_list]
citypairs = [citypair['city_pair'] for citypair in data_list]
len(frames)
citypairs

['paris_london',
 'paris_berlin',
 'london_berlin',
 'paris_madrid',
 'paris_milan',
 'london_milan',
 'london_madrid',
 'madrid_milan',
 'madrid_berlin',
 'berlin_milan']

In [349]:
result = pd.concat(frames, keys=citypairs)
#result.set_index('paragraph_id', inplace=True)
len(result.paragraph)
result
# result.loc["paris_london"]

Unnamed: 0,Unnamed: 1,paragraph,paragraph_id,merged_POS
paris_london,0,a major revision of the work by composer and a...,23053,"[revision, work, composer, instrumentation, sa..."
paris_london,1,"in 1900, manhattan alone had 130,000 horses, p...",23054,"[horse, streetcar, wagon, carriage, waste, ped..."
paris_london,2,"when lord rockingham, the whig leader and frie...",23055,"[whig, leader, friend, cause, consul, peace, d..."
paris_london,3,"on 2 december 1926, hitchcock married the engl...",23056,"[screenwriter, alma, oratory, couple, floor, h..."
paris_london,4,"in 1750, amsterdam was the fourth largest city...",23057,"[city, capital, city, seat, government, republ..."
...,...,...,...,...
berlin_milan,924,"at the age of 12, caspar voght fell seriously ...",933,"[age, smallpox, scarring, friend, adolescent, ..."
berlin_milan,925,after her debut at the armenian national opera...,934,"[debut, opera, soloist, opera, career, theater..."
berlin_milan,926,"since 1991, crider has been heard regularly in...",935,"[world, opera, house, opera, garden, opera, op..."
berlin_milan,927,the impressions that he had made by his speech...,936,"[impression, speech, debate, invitation, repre..."


In [350]:
# list(new_df[new_df['paragraph_id'] == 23065]['merged_POS'])[0]
#[data_list[1]['lemmatized_paragraphs']['paragraph_id'] == 21065]

In [None]:
len(pd.read_csv('../../../../../data/clean/city_pair_paragraphs/biggest_cities_5/cities___paris_london___/paris_london.csv'))

In [None]:
import os
import pickle

word_list_location = "../../../../data/enwiki_city_pairs_lemmatised/lemmatised_paragraphs/"
city_pair_wordlists = []
city_pairs = []
for file in os.scandir(word_list_location):
    with open(file.path, 'rb') as fp:
        city_pair_wordlists.append(pickle.load(fp))
        city_pairs.append(file.name.split('__')[1])