# Init

In [9]:
import sys
import gc
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
from sciosci.assets import keyword_assets as kw

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
tqdm.pandas()
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sahand/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
gc.collect()

35

# Load

In [6]:
year_from = 1900
year_to = 2020

MAKE_SENTENCE_CORPUS = False
MAKE_SENTENCE_CORPUS_ADVANCED = True
MAKE_REGULAR_CORPUS = True
GET_WORD_FREQ_IN_SENTENCE = False
PROCESS_KEYWORDS = False

stops = ['a','an','we','result','however','yet','since','previously','although','propose','proposed','this']
nltk.download('stopwords')
stop_words = list(set(stopwords.words("english")))+stops


# data_path_rel = '/home/sahand/GoogleDrive/Data/Relevant Results _ DOI duplication - scopus keywords - document types - 31 july.csv'
data_path_rel = '/home/sahand/Data/AI ALL 1900-2019 - reformat'
data_full_relevant = pd.read_csv(data_path_rel)

root_dir = '/home/sahand/Data/Corpus/AI ALL/'
subdir = 'AI ALL nolem stopword removed thesaurus/' # no_lemmatization_no_stopwords
gc.collect()

[nltk_data] Downloading package stopwords to /home/sahand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0

# Initial Pre-Processing :
Following tags requires WoS format. Change them otherwise.

In [7]:
data_filtered = data_full_relevant.copy()
data_filtered = data_filtered[pd.notnull(data_filtered['PY'])]

data_filtered = data_filtered[data_filtered['PY'].astype('int')>year_from-1]
data_filtered = data_filtered[data_filtered['PY'].astype('int')<year_to]

# Remove columns without keywords/abstract list 
data_with_keywords = data_filtered[pd.notnull(data_filtered['DE'])]
data_with_abstract = data_filtered[pd.notnull(data_filtered['AB'])]

# Remove numbers from abstracts to eliminate decimal points and other unnecessary data
data_with_abstract['AB'] = data_with_abstract['AB'].progress_apply(lambda x: kw.find_and_remove_c(x) if pd.notnull(x) else np.nan).str.lower()
data_with_abstract['AB'] = data_with_abstract['AB'].progress_apply(lambda x: kw.find_and_remove_term(x,'et al.') if pd.notnull(x) else np.nan)
data_with_abstract['AB'] = data_with_abstract['AB'].progress_apply(lambda x: kw.find_and_remove_term(x,'eg.') if pd.notnull(x) else np.nan)
data_with_abstract['AB'] = data_with_abstract['AB'].progress_apply(lambda x: kw.find_and_remove_term(x,'ie.') if pd.notnull(x) else np.nan)
data_with_abstract['AB'] = data_with_abstract['AB'].progress_apply(lambda x: kw.find_and_remove_term(x,'vs.') if pd.notnull(x) else np.nan)
data_with_abstract['AB'] = data_with_abstract['AB'].progress_apply(lambda x: kw.find_and_remove_term(x,'ieee') if pd.notnull(x) else np.nan)
data_with_abstract['AB'] = data_with_abstract['AB'].progress_apply(lambda x: kw.find_and_remove_term(x,'fig.','figure') if pd.notnull(x) else np.nan)

# gc.collect()
abstracts = []
for abstract in tqdm(data_with_abstract['AB'].values.tolist()):
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", abstract)
    for number in numbers:
        abstract = kw.find_and_remove_term(abstract,number)
    abstracts.append(abstract)
data_with_abstract['AB'] = abstracts.copy()
del  abstracts

year_list = pd.DataFrame(data_with_abstract['PY'].values.tolist(),columns=['year'])
year_list.to_csv(root_dir+subdir+str(year_from)+'-'+str(year_to-1)+' corpus years',index=False) # Save year indices to disk for further use
gc.collect()

100%|██████████| 2118312/2118312 [01:04<00:00, 32703.67it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
100%|██████████| 2118312/2118312 [00:07<00:00, 283472.38it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
100%|██████████| 2118312/2118312 [00:06<00:00, 320108.14it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

10

# Sentence Making

## Simple

In [4]:
if MAKE_SENTENCE_CORPUS is True:
    thesaurus = pd.read_csv('data/thesaurus/thesaurus_for_ai_keyword_with_().csv')
    thesaurus = thesaurus.fillna('')
    print("\nSentence maker and thesaurus matching. \nThis will take some time...")
    
    data_with_abstract['AB_no_c'] = data_with_abstract['AB'].apply(lambda x: kw.find_and_remove_c(x) if pd.notnull(x) else np.nan)
    sentence_corpus = []
    
    for index,row in tqdm(data_with_abstract.iterrows(),total=data_with_abstract.shape[0]):
        words = re.split('( |\\n|\.|\?|!|:|;|,|_|\[|\])',row['AB_no_c'].lower())
        new_words = []
        year = row['PY']
        flag_word_removed = False
        for w_idx,word in enumerate(words):
            if flag_word_removed is True:
                if word==' ':
                    flag_word_removed = False
                    continue
            if word in thesaurus['alt'].values.tolist():
                word_old = word
                buffer_word = word
                word = thesaurus[thesaurus['alt']==word]['original'].values.tolist()[0]
#                print("changed '",word_old,"' to '",word,"'.")
                
            new_words.append(word)
            
        row = ''.join(new_words)
        
        sentences = re.split('(\. |\? |\\n)',row)
        sentences = [i+j for i,j in zip(sentences[0::2], sentences[1::2])]
        
        for sentence_n in sentences:
            sentence_corpus.append([index,sentence_n,year])
    
    sentence_corpus = pd.DataFrame(sentence_corpus,columns=['article_index','sentence','year'])
    
    sentence_corpus.to_csv(root_dir+subdir+str(year_from)+'-'+str(year_to-1)+' corpus sentences abstract-title',index=False,header=True)

gc.collect()

20

## Advanced

In [12]:
if MAKE_SENTENCE_CORPUS_ADVANCED is True:    
    data_with_abstract['TI_AB'] = data_with_abstract.TI.map(str) + ". " + data_with_abstract.AB
    data_fresh = data_with_abstract[['TI_AB','PY']].copy()
    data_fresh['TI_AB'] = data_fresh['TI_AB'].str.lower()
    
    del data_with_abstract
    gc.collect()
    
    data_tmp = data_fresh[1:10]
    data_fresh[-2:-1]

    print("\nSentence extraction")
    sentences = []
    years = []
    indices = []
    for index,row in tqdm(data_fresh.iterrows(),total=data_fresh.shape[0]):
        abstract_str = row['TI_AB']
        year = row['PY']
        abstract_sentences = re.split('\. |\? |\\n',abstract_str)
        length = len(abstract_sentences)
        
        sentences.extend(abstract_sentences)
        years.extend([year for x in range(length)])
        indices.extend([index for x in range(length)])
        
    print("\nTokenizing")
    tmp = []
    for sentence in tqdm(sentences):
        tmp.append(word_tokenize(sentence))
    sentences = tmp.copy()
    del tmp

    print("\nString pre processing for abstracts: lower and strip")
    sentences = [list(map(str.lower, x)) for x in sentences]
    sentences = [list(map(str.strip, x)) for x in sentences]
    
    tmp = []
    print("\nString pre processing for abstracts: lemmatize and stop word removal")
    for string_list in tqdm(sentences, total=len(sentences)):
        tmp_list = [kw.string_pre_processing(x,stemming_method='None',lemmatization=False,stop_word_removal=True,stop_words_extra=stops,verbose=False,download_nltk=False) for x in string_list]
        tmp.append(tmp_list)
    sentences = tmp.copy()
    del tmp
    
    tmp = []
    print("\nString pre processing for abstracts: null word removal")
    for string_list in tqdm(sentences, total=len(sentences)):
        tmp.append([x for x in string_list if x!=''])
    sentences = tmp.copy()
    del tmp
    
    print("\nThesaurus matching")
    sentences = kw.thesaurus_matching(sentences)
    
    print("\nStitiching words")
    tmp = []
    for words in tqdm(sentences, total=len(sentences)):
        tmp.append(' '.join(words))
    sentences = tmp.copy()
    del tmp
    
    sentence_df = pd.DataFrame(indices,columns=['article_index'])
    sentence_df['sentence'] = sentences
    sentence_df['year'] = years
    sentence_df.to_csv(root_dir+subdir+str(year_from)+'-'+str(year_to-1)+' corpus sentences abstract-title',index=False,header=True)
    
# gc.collect()

  0%|          | 0/2118312 [00:00<?, ?it/s]


Sentence extraction


100%|██████████| 2118312/2118312 [04:06<00:00, 8590.69it/s]
  0%|          | 0/17531442 [00:00<?, ?it/s]


Tokenizing


100%|██████████| 17531442/17531442 [47:21<00:00, 6170.38it/s]  



String pre processing for abstracts: lower and strip


  0%|          | 0/17531442 [00:00<?, ?it/s]


String pre processing for abstracts: lemmatize and stop word removal


100%|██████████| 17531442/17531442 [25:08:04<00:00, 193.75it/s]    



String pre processing for abstracts: null word removal


100%|██████████| 17531442/17531442 [01:49<00:00, 159943.56it/s]
  0%|          | 0/17531442 [00:00<?, ?it/s]


Thesaurus matching


100%|██████████| 17531442/17531442 [27:02<00:00, 10801.96it/s]
  0%|          | 65799/17531442 [00:00<00:26, 657982.78it/s]


Stitiching words


100%|██████████| 17531442/17531442 [00:14<00:00, 1216825.97it/s]


In [26]:
sentence_df = pd.read_csv(root_dir+subdir+str(year_from)+'-'+str(year_to-1)+' corpus sentences abstract-title')

In [27]:
print(sentence_df.shape)
sentence_df = sentence_df[pd.notnull(sentence_df['sentence'])]
print(sentence_df.shape)

(17531442, 3)
(16564332, 3)


In [40]:
abstracts_df = sentence_df.groupby(['article_index','year'])['sentence'].progress_apply('. '.join).reset_index()
abstracts_df.columns = ['article_index','year','abstract']

100%|██████████| 2118312/2118312 [03:11<00:00, 11042.47it/s]


In [41]:
abstracts_df.shape

(2118312, 3)

In [45]:
abstracts_df.to_csv(root_dir+subdir+str(year_from)+'-'+str(year_to-1)+' corpus abstract-title',index=False,header=True)

### Replace n-grams

In [96]:
data_keywords = pd.read_csv(root_dir+'../n-gram author keyword taxonomy 300k.csv')

wanted_grams = [2,3,4,5,6] # Statistically, 5 seems to be a proper cutting point as the frequency table suggests. Refer to : "Get statsitic of n in n-grams of corpus" block in drafts.
periods = [[1990,2005],[2005,2008],[2008,2011],[2011,2014],[2014,2017],[2017,2019]]
thesaurus = []

# =============================================================================
# Prepare keywords
# =============================================================================
print('\nPreparing keywords...')
data_keywords['grams'] = [len(x.split()) for x in data_keywords.keywords.values.tolist()]
data_keywords = data_keywords[data_keywords['count']>1]

# =============================================================================
# Make keyword dictionary/thesaurus for all wanted gram counts
# =============================================================================
print('\nPreparing thesaurus...')
idx = 0
for grams_count in tqdm(wanted_grams):
    data_keywords_tmp = data_keywords[data_keywords.grams==wanted_grams[idx]].copy()
    data_keywords_tmp['keywords'] = data_keywords_tmp.keywords.str.lower().str.strip().str.replace('  ',' ')
    keywords_underscored = data_keywords_tmp.keywords.str.lower().str.strip().str.replace(' ','_').str.upper().values.tolist()
    keywords_spaced = data_keywords_tmp.keywords.str.lower().str.strip().values.tolist()
    thesaurus.append(dict(zip(keywords_spaced,keywords_underscored)))
    idx+=1
    
thesaurus[2]['fpga'] = 'FIELD_PROGRAMMABLE_GATE_ARRAY'
thesaurus[3]['anfis'] = 'ADAPTIVE_NEURO_FUZZY_INFERENCE_SYSTEM'
thesaurus[3]['lssvm'] = ''

100%|██████████| 5/5 [00:00<00:00, 68.93it/s]


Preparing keywords...

Preparing thesaurus...





In [99]:
def multiple_replace(string, rep_dict):
    pattern = re.compile("|".join([re.escape(k) for k in sorted(rep_dict,key=len,reverse=True)]), flags=re.DOTALL)
    return pattern.sub(lambda x: rep_dict[x.group(0)], string)

period_names = []
for period in periods[:]:
    period_name = str(period[0])+'-'+str(period[1]-1)
    print('Processing for period:',period_name)
    period_names.append(period_name)
    abstracts_period = abstracts_df[(abstracts_df['year']>=period[0]) & (abstracts_df['year']<period[1])].copy()
    for thesaurus_gram in list(reversed(thesaurus)):
        abstracts_period['abstract'] = abstracts_period['abstract'].progress_apply(lambda x: multiple_replace(x,thesaurus_gram))
    abstracts_period.to_csv(root_dir+'n-gram by 2 repetition keywords '+period_name,index=False,header=False) 

  0%|          | 934/210571 [00:00<00:22, 9338.14it/s]

Processing for period: 2005-2007


100%|██████████| 210571/210571 [00:21<00:00, 9860.79it/s] 
100%|██████████| 210571/210571 [02:03<00:00, 1705.40it/s]
100%|██████████| 210571/210571 [11:57<00:00, 293.63it/s]
100%|██████████| 210571/210571 [49:55<00:00, 70.29it/s] 
100%|██████████| 210571/210571 [1:58:21<00:00, 29.65it/s]  
  0%|          | 0/296780 [00:00<?, ?it/s]

Processing for period: 2008-2010


100%|██████████| 296780/296780 [00:29<00:00, 10083.55it/s]
100%|██████████| 296780/296780 [02:52<00:00, 1716.69it/s]
100%|██████████| 296780/296780 [17:09<00:00, 288.27it/s]
100%|██████████| 296780/296780 [1:11:03<00:00, 69.62it/s]
100%|██████████| 296780/296780 [2:50:54<00:00, 28.94it/s]  


Processing for period: 2011-2013


100%|██████████| 366163/366163 [00:38<00:00, 9503.20it/s]
100%|██████████| 366163/366163 [03:42<00:00, 1646.25it/s]
100%|██████████| 366163/366163 [22:03<00:00, 276.58it/s]
100%|██████████| 366163/366163 [1:32:05<00:00, 66.26it/s]
100%|██████████| 366163/366163 [3:40:05<00:00, 27.73it/s]  


Processing for period: 2014-2016


100%|██████████| 420980/420980 [00:44<00:00, 9398.92it/s]
100%|██████████| 420980/420980 [04:22<00:00, 1604.26it/s]
100%|██████████| 420980/420980 [26:15<00:00, 267.23it/s]
100%|██████████| 420980/420980 [1:50:29<00:00, 63.50it/s]
100%|██████████| 420980/420980 [4:24:28<00:00, 26.53it/s]  


Processing for period: 2017-2018


100%|██████████| 332030/332030 [00:36<00:00, 9102.12it/s]
100%|██████████| 332030/332030 [03:30<00:00, 1576.64it/s]
100%|██████████| 332030/332030 [21:15<00:00, 260.36it/s]
100%|██████████| 332030/332030 [1:29:45<00:00, 61.65it/s]
100%|██████████| 332030/332030 [3:40:12<00:00, 25.13it/s]  


Make sentences again

In [101]:
root_dir

'/home/sahand/Data/Corpus/AI ALL/'

In [103]:
abst_data = pd.read_csv(root_dir+'n-grams/'+'1900-2019 n-gram by 2 repetition keywords',names=['article_index','year','abstract'])

sentences = []
years = []
indices = []
for index,row in tqdm(abst_data.iterrows(),total=abst_data.shape[0]):
    index = row['article_index']
    year = row['year']
    abstract_sentences = row['abstract'].split('. ')
    length = len(abstract_sentences)

    sentences.extend(abstract_sentences)
    years.extend([year for x in range(length)])
    indices.extend([index for x in range(length)])

sent_df = pd.DataFrame(indices,columns=['article_index'])
sent_df['sentence'] = sentences
sent_df['year'] = years
sent_df.to_csv(root_dir+'n-grams/'+'1900-2019 sentences n-gram by 2 repetition keywords',index=False,header=True)


100%|██████████| 1897668/1897668 [04:08<00:00, 7641.99it/s]


# Regular Corpus Making

In [None]:
if MAKE_REGULAR_CORPUS is False:
    sys.exit('Did not continue to create normal corpus. If you want a corpus, set it to True at init section.')
# =============================================================================
#   Get word frequency in sentence corpus -- OPTIONAL
# =============================================================================
if GET_WORD_FREQ_IN_SENTENCE is True:
    import pandas as pd
    import numpy as np
    from tqdm import tqdm
    
    file = root_dir+subdir+str(year_from)+'-'+str(year_to-1)+' corpus sentences abstract-title'#'/home/sahand/GoogleDrive/Data/corpus/AI ALL/1900-2019 corpus sentences abstract-title'
    file = pd.read_csv(file)
    size = 500000
    unique = []
    for data_start_point in tqdm(np.arange(0,file.shape[0],size)):
        if data_start_point+size<file.shape[0]:
            end_point = data_start_point+size
        else:
            end_point = file.shape[0]-1
    #    print(data_start_point,end_point)
        str_split = list(file.sentence[data_start_point:end_point].str.split())
        str_flat = pd.DataFrame([item for sublist in str_split for item in sublist])
        str_flat.columns = ['words']
        str_flat.head()
    
        unique = unique+list(str_flat.words.unique())
    
    unique = pd.DataFrame(unique)
    unique.columns = ['words']
    unique = list(unique.words.unique())
    len(unique)
# =============================================================================
# Tokenize (Author Keywords and Abstracts+Titles)
# =============================================================================
abstracts = []
keywords = []
keywords_index = []
abstracts_pure = []
for index,paper in tqdm(data_with_abstract.iterrows(),total=data_with_abstract.shape[0]):
    keywords_str = paper['DE']
    keywords_index_str = paper['ID']
    abstract_str = paper['AB']
    title_str = paper['TI']
    abstract_dic = word_tokenize(title_str+' '+abstract_str)
    abstract_dic_pure = abstract_dic.copy()
    if pd.notnull(paper['DE']):
        keywords_dic = word_tokenize(keywords_str)
        keywords.append(keywords_str.split(';'))
        abstract_dic.extend(keywords_dic)
    else:
        keywords.append([])
    if pd.notnull(paper['ID']):
        keywords_index.append(keywords_index_str.split(';'))
    else:
        keywords_index.append([])
    abstracts.append(abstract_dic)
    abstracts_pure.append(abstract_dic_pure)

# Add to main df. Not necessary
data_with_abstract['AB_split'] = abstracts_pure 
data_with_abstract['AB_KW_split'] = abstracts

# =============================================================================
# Strip and lowe case 
# =============================================================================
abstracts_pure = [list(map(str.strip, x)) for x in abstracts_pure]
abstracts_pure = [list(map(str.lower, x)) for x in abstracts_pure]

abstracts = [list(map(str.strip, x)) for x in abstracts]
abstracts = [list(map(str.lower, x)) for x in abstracts]

keywords = [list(map(str.strip, x)) for x in keywords]
keywords = [list(map(str.lower, x)) for x in keywords]

keywords_index = [list(map(str.strip, x)) for x in keywords_index]
keywords_index = [list(map(str.lower, x)) for x in keywords_index]
# =============================================================================
# Pre Process 
# =============================================================================
tmp_data = []
print("\nString pre processing for abstracts")
for string_list in tqdm(abstracts, total=len(abstracts)):
    tmp_list = [kw.string_pre_processing(x,stemming_method='None',lemmatization=True,stop_word_removal=True,stop_words_extra=stops,verbose=False,download_nltk=False) for x in string_list]
    tmp_data.append(tmp_list)
abstracts = tmp_data.copy()
del tmp_data

tmp_data = []
for string_list in tqdm(abstracts_pure, total=len(abstracts_pure)):
    tmp_list = [kw.string_pre_processing(x,stemming_method='None',lemmatization=True,stop_word_removal=True,stop_words_extra=stops,verbose=False,download_nltk=False) for x in string_list]
    tmp_data.append(tmp_list)
abstracts_pure = tmp_data.copy()
del tmp_data

print("\nString pre processing for keywords")
tmp_data = []
for string_list in tqdm(keywords, total=len(keywords)):
    tmp_list = []
    for string in string_list:
        tmp_sub_list = string.split()
        tmp_list.append(' '.join([kw.string_pre_processing(x,stemming_method='None',lemmatization=True,stop_word_removal=True,stop_words_extra=stops,verbose=False,download_nltk=False) for x in tmp_sub_list]))
    tmp_data.append(tmp_list)
keywords = tmp_data.copy()
del tmp_data

tmp_data = []
for string_list in tqdm(keywords_index, total=len(keywords_index)):
    tmp_list = []
    for string in string_list:
        tmp_sub_list = string.split()
        tmp_list.append(' '.join([kw.string_pre_processing(x,stemming_method='None',lemmatization=True,stop_word_removal=True,stop_words_extra=stops,verbose=False,download_nltk=False) for x in tmp_sub_list]))
    tmp_data.append(tmp_list)
keywords_index = tmp_data.copy()
del tmp_data

#tmp_data = []
#for string_list in tqdm(keywords, total=len(keywords)):
#    tmp_list = []
#    for sub_string_list in string_list:
#        tmp_list.append(' '.join(sub_string_list))
#    tmp_data.append(tmp_list)
#keywords = tmp_data.copy()
#del tmp_data

# =============================================================================
# Clean-up dead words
# =============================================================================
tmp_data = []
for string_list in tqdm(abstracts, total=len(abstracts)):
    tmp_data.append([x for x in string_list if x!=''])
abstracts = tmp_data.copy()
del tmp_data

tmp_data = []
for string_list in tqdm(abstracts_pure, total=len(abstracts_pure)):
    tmp_data.append([x for x in string_list if x!=''])
abstracts_pure = tmp_data.copy()
del tmp_data

tmp_data = []
for string_list in tqdm(keywords, total=len(keywords)):
    tmp_data.append([x for x in string_list if x!=''])
keywords = tmp_data.copy()
del tmp_data

tmp_data = []
for string_list in tqdm(keywords_index, total=len(keywords_index)):
    tmp_data.append([x for x in string_list if x!=''])
keywords_index = tmp_data.copy()
del tmp_data
# =============================================================================
# Break-down abstracts again
# =============================================================================
tmp_data = []
for abstract in tqdm(abstracts):
    words = []
    for word in abstract:
        words = words+word.split()
    tmp_data.append(words)
abstracts = tmp_data.copy()
del tmp_data

tmp_data = []
for abstract in tqdm(abstracts_pure):
    words = []
    for word in abstract:
        words = words+word.split()
    tmp_data.append(words)
abstracts_pure = tmp_data.copy()
del tmp_data

# =============================================================================
# Thesaurus matching
# =============================================================================
print("\nThesaurus matching")

abstracts_backup = abstracts.copy()
abstracts_pure_backup = abstracts_pure.copy()
keywords_backup = keywords.copy()
keywords_index_backup = keywords_index.copy()

abstracts = abstracts_backup.copy()
abstracts_pure = abstracts_pure_backup.copy()
keywords = keywords_backup.copy()
keywords_index = keywords_index_backup.copy()

abstracts = kw.thesaurus_matching(abstracts)
abstracts_pure = kw.thesaurus_matching(abstracts_pure)
keywords = kw.thesaurus_matching(keywords)
keywords_index = kw.thesaurus_matching(keywords_index)

# =============================================================================
# Term to string corpus for co-word analysis
# =============================================================================
print("\nTerm to string corpus for co-word analysis")
corpus_abstract = []
for words in tqdm(abstracts, total=len(abstracts)):
    corpus_abstract.append(' '.join(words))

corpus_abstract_pure = []
for words in tqdm(abstracts_pure, total=len(abstracts_pure)):
    corpus_abstract_pure.append(' '.join(words))

corpus_keywords = []
for words in tqdm(keywords, total=len(keywords)):
    corpus_keywords.append(';'.join(words))
    
corpus_keywords_index = []
for words in tqdm(keywords_index, total=len(keywords_index)):
    corpus_keywords_index.append(';'.join(words))


# =============================================================================
# Remove substrings : 
#   be careful with this one! It might remove parts of a string or half of a word
# =============================================================================
thesaurus = pd.read_csv('data/thesaurus/to_remove.csv')
thesaurus['alt'] = ''
thesaurus = thesaurus.values.tolist()
print("\nRemoving substrings")

corpus_abstract_tr = []
for paragraph in tqdm(corpus_abstract, total=len(corpus_abstract)):
    paragraph = kw.filter_string(paragraph,thesaurus)
    corpus_abstract_tr.append(paragraph)

corpus_abstract_pure_tr = []
for paragraph in tqdm(corpus_abstract_pure, total=len(corpus_abstract_pure)):
    paragraph = kw.filter_string(paragraph,thesaurus)
    corpus_abstract_pure_tr.append(paragraph)

corpus_keywords_tr = []
for paragraph in tqdm(corpus_keywords, total=len(corpus_keywords)):
    paragraph = kw.filter_string(paragraph,thesaurus)
    corpus_keywords_tr.append(paragraph)
    
corpus_keywords_index_tr = []
for paragraph in tqdm(corpus_keywords_index, total=len(corpus_keywords_index)):
    paragraph = kw.filter_string(paragraph,thesaurus)
    corpus_keywords_index_tr.append(paragraph)
    
# =============================================================================
# Final clean-up (double space and leading space)
# =============================================================================
tmp_data = []
for paragraph in tqdm(corpus_abstract, total=len(corpus_abstract)):
    paragraph = ' '.join(paragraph.split())
    tmp_data.append(paragraph)
corpus_abstract = tmp_data.copy()
del tmp_data

tmp_data = []
for paragraph in tqdm(corpus_abstract_tr, total=len(corpus_abstract_tr)):
    paragraph = ' '.join(paragraph.split())
    tmp_data.append(paragraph)
corpus_abstract_tr = tmp_data.copy()
del tmp_data

tmp_data = []
for paragraph in tqdm(corpus_abstract_pure, total=len(corpus_abstract_pure)):
    paragraph = ' '.join(paragraph.split())
    tmp_data.append(paragraph)
corpus_abstract_pure = tmp_data.copy()
del tmp_data

tmp_data = []
for paragraph in tqdm(corpus_abstract_pure_tr, total=len(corpus_abstract_pure_tr)):
    paragraph = ' '.join(paragraph.split())
    tmp_data.append(paragraph)
corpus_abstract_pure_tr = tmp_data.copy()
del tmp_data

tmp_data = []
for paragraph in tqdm(corpus_keywords, total=len(corpus_keywords)):
    paragraph = ' '.join(paragraph.split(' '))
    paragraph = ';'.join(paragraph.split(';'))
    tmp_data.append(paragraph)
corpus_keywords = tmp_data.copy()
del tmp_data

tmp_data = []
for paragraph in tqdm(corpus_keywords_tr, total=len(corpus_keywords_tr)):
    paragraph = ' '.join(paragraph.split(' '))
    paragraph = ';'.join(paragraph.split(';'))
    tmp_data.append(paragraph)
corpus_keywords_tr = tmp_data.copy()
del tmp_data
tmp_data = []
for paragraph in tqdm(corpus_keywords_index, total=len(corpus_keywords_index)):
    paragraph = ' '.join(paragraph.split(' '))
    paragraph = ';'.join(paragraph.split(';'))
    tmp_data.append(paragraph)
corpus_keywords_index = tmp_data.copy()
del tmp_data

tmp_data = []
for paragraph in tqdm(corpus_keywords_index_tr, total=len(corpus_keywords_index_tr)):
    paragraph = ' '.join(paragraph.split(' '))
    paragraph = ';'.join(paragraph.split(';'))
    tmp_data.append(paragraph)
corpus_keywords_index_tr = tmp_data.copy()
del tmp_data

# =============================================================================
# Write to disk
# =============================================================================
corpus_abstract = pd.DataFrame(corpus_abstract,columns=['words'])
corpus_abstract_tr = pd.DataFrame(corpus_abstract_tr,columns=['words'])
corpus_abstract_pure = pd.DataFrame(corpus_abstract_pure,columns=['words'])
corpus_abstract_pure_tr = pd.DataFrame(corpus_abstract_pure_tr,columns=['words'])
corpus_keywords = pd.DataFrame(corpus_keywords,columns=['words'])
corpus_keywords_tr = pd.DataFrame(corpus_keywords_tr,columns=['words'])
corpus_keywords_index = pd.DataFrame(corpus_keywords_index,columns=['words'])
corpus_keywords_index_tr = pd.DataFrame(corpus_keywords_index_tr,columns=['words'])

corpus_abstract.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' abstract_title_keys',index=False,header=False)
corpus_abstract_tr.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' abstract_title_keys-terms_removed' ,index=False,header=False)
corpus_abstract_pure.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' abstract_title',index=False,header=False)
corpus_abstract_pure_tr.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' abstract_title-terms_removed',index=False,header=False)
corpus_keywords.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' keywords',index=False,header=False)
corpus_keywords_tr.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' keywords-terms_removed',index=False,header=False)
corpus_keywords_index.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' keywords_index',index=False,header=False)
corpus_keywords_index_tr.to_csv(root_dir+subdir+''+str(year_from)+'-'+str(year_to-1)+' keywords_index-terms_removed',index=False,header=False)