In [1]:
import pandas as pd
import re
import nltk
import json
import ast
import os

In [2]:
# Directory prefix from which the topic-modelling results CSV is read.
PATH = '../coronavirus_twenty_years_of_research/technical_validation/'
# Number of clusters: selects the results file name and bounds the
# per-cluster export loops (range(k)).
k = 8

In [3]:
# Load the NMF topic-modelling results file whose name embeds the cluster count k.
df = pd.read_csv(PATH + "NMF_topic_modelling_results({}clusters).csv".format(k))

In [4]:
# Cleaning title and abstract

def removeWhiteSpaces(text):
    """
    Collapse every run of whitespace in `text` into a single space.

    Line feeds and carriage returns are turned into spaces, repeated spaces
    are squeezed, and any remaining whitespace run is replaced by one space.
    Leading and trailing whitespace is stripped from the returned string.
    """
    # Line breaks (LF and CR) become ordinary spaces.
    flattened = text.replace("\n", " ").replace(chr(13), " ")
    # Squeeze consecutive spaces down to one.
    flattened = re.sub(' +', ' ', flattened)
    # \s matches any whitespace, including the collection [ \t\n\r\f\v]
    # (see https://www.delftstack.com/howto/python/how-to-remove-whitespace-in-a-string/).
    normalised = re.sub(r"\s+", " ", flattened)
    return normalised.strip()

def clear_tags(dataObj, tags_only=False):
    """
    Cleaning - remove tags, URL fragments and special-character residue.

    Steps, in order:
      1. drop literal backslash-n sequences and the "['" / "']" wrappers;
      2. remove <jats:title> / <title> elements together with their content
         (sub-headings such as "Purpose");
      3. replace every remaining <...> tag with a space;
      4. remove URL fragments;
      5. unless `tags_only` is true, blank out HTML-entity residue;
      6. normalise whitespace with removeWhiteSpaces.

    Parameters
    ----------
    dataObj : str
        Raw title or abstract text.
    tags_only : bool, default False
        When true, step 5 (entity/symbol removal) is skipped.

    Returns
    -------
    str
        The cleaned, whitespace-normalised text.
    """
    dataObj = dataObj.replace("\\n", '')
    dataObj = dataObj.replace("['", '')
    dataObj = dataObj.replace("']", '')

    # Del Tag + Content (sub-titles):   <jats:title content-type="abstract-subheading">Purpose</jats:title>
    # A regex removes exactly "<tag ...> ... </tag>". The previous slice used a
    # fixed +13 offset (the length of "</jats:title>"), which swallowed 5 extra
    # characters after a plain "</title>", and its resume offset skipped a
    # sub-heading that directly followed a removed one.
    # An opening tag without a matching closing tag is left for the generic
    # tag stripper below, so only the tag itself (not its content) is dropped.
    for tag in ('jats:title', 'title'):
        element = r'<{0}(?:\s[^>]*)?>.*?</{0}\s*>'.format(re.escape(tag))
        dataObj = re.sub(element, ' ', dataObj, flags=re.DOTALL)

    # Del ALL Tags <....>  (e.g. <p>, <jats:p>, <jats:sec>, <sec>, <jats:italic>, <jats:bold>)
    dataObj = re.sub('<[^<]+?>', ' ', dataObj)

    # Del URLs
    # NOTE(review): this pattern only matches the scheme prefix ("http://") or
    # "domain.tld/" plus ONE following character, so the rest of a URL path
    # stays in the text. Kept as-is pending confirmation of the intended scope.
    re_url = r'https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|info)/[a-z0-9.\-]'
    dataObj = re.sub(re_url, '', dataObj)

    if not tags_only:
        ### Del Symbols
        dataObj = re.sub(r'[&;]\d+;*', ' ', dataObj)
        dataObj = re.sub(r'&[A-Z]{4}', ' ', dataObj)
        # NOTE(review): \W matches NON-word characters; if named entities such
        # as "&nbsp;" were the target this was presumably meant to be \w - confirm.
        dataObj = re.sub(r'&\W{2,10};', ' ', dataObj)
        dataObj = re.sub(r'&#\d{2,4};', ' ', dataObj)

        # Alphabetic entity/tag names are only removed as whole tokens, so that
        # ordinary words containing them ("sample", "individual") survive.
        whole_token_only = {"amp", "div", "ldquo", "rdquo"}
        # The three single-space entries are no-ops as written; presumably they
        # stood for special space characters - TODO confirm. Whitespace is
        # normalised by removeWhiteSpaces below in any case.
        redun = ["amp", ";lt", ";gt", "&lt", "&gt", ";p", "div", "&#x0D;", "ldquo", "rdquo", " ", " ", " ", "#160", "/p", ";"]
        for substr in redun:
            if substr in whole_token_only:
                dataObj = re.sub(r'\b' + re.escape(substr) + r'\b', ' ', dataObj)
            else:
                dataObj = dataObj.replace(substr, " ")

    return removeWhiteSpaces(dataObj)

In [5]:
# cleaning subjects, ISSN, container-title

def cleaning_columns(x):
    """
    Parse a stringified Python list (used for the ISSN and subject columns).

    Parameters
    ----------
    x : str or float
        Cell value. A float (e.g. NaN for a missing cell) is treated as
        "no value".

    Returns
    -------
    list or str
        A list of the parsed elements, or the string 'nan' for a float input.

    Raises
    ------
    ValueError, SyntaxError
        If `x` is not a valid Python literal.
    """
    if isinstance(x, float):
        return 'nan'
    # literal_eval only accepts literals, so cell content read from the CSV
    # can no longer execute arbitrary expressions the way eval() allowed.
    return list(ast.literal_eval(x))

def cleaning_container_title(x):
    """
    Return the last entry of a stringified list of container titles.

    Parameters
    ----------
    x : str or float
        Cell value. A float (e.g. NaN for a missing cell) is treated as
        "no value".

    Returns
    -------
    The last element of the parsed list, or the string 'nan' when the input
    is a float or the parsed list is empty.

    Raises
    ------
    ValueError, SyntaxError
        If `x` is not a valid Python literal.
    """
    if isinstance(x, float):
        return 'nan'
    # literal_eval instead of eval: the cell content comes from a CSV file.
    values = ast.literal_eval(x)
    # An empty list has no last element; report it like a missing value
    # instead of raising IndexError.
    if not values:
        return 'nan'
    return values[-1]

In [6]:
# Cleaning created, published, and last-updated
def cleaning_date_columns(date):
    """
    Flatten a stringified {'date-parts': [[Y, M, D]]} dict into 'Y-M-D'.

    The 'date-parts' value is rendered with str(), brackets are removed and
    ', ' separators become '-'. Numbers are NOT zero-padded
    (e.g. '2020-3-15').

    Parameters
    ----------
    date : str or any
        Cell value to parse.

    Returns
    -------
    The flattened date string, or `date` unchanged when it cannot be parsed
    as a literal or has no 'date-parts' entry.
    """
    try:
        row_value = ast.literal_eval(date)
        parts = str(row_value['date-parts'])
    # Only the expected parse/lookup failures fall back to the raw value; a
    # bare except would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, KeyError):
        return date
    return parts.replace('[', '').replace(']', '').replace(', ', '-')

def cleaning_last_updated(date):
    """
    Extract the first 10 characters of the '$date' entry of a stringified dict.

    Parameters
    ----------
    date : str or any
        Cell value to parse.

    Returns
    -------
    The first 10 characters of row['$date'], or `date` unchanged when it
    cannot be parsed as a literal or has no usable '$date' entry.
    """
    try:
        return ast.literal_eval(date)['$date'][:10]
    # Only the expected parse/lookup failures fall back to the raw value; a
    # bare except would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, KeyError):
        return date

In [7]:
#Cleaning link

def cleaning_link_text(text):
    """
    Return the 'URL' of the first entry in a stringified list of link dicts.

    Parameters
    ----------
    text : str or any
        Cell value to parse.

    Returns
    -------
    row[0]['URL'] of the parsed value, or `text` unchanged when it cannot be
    parsed as a literal, is empty, or its first entry has no 'URL'.
    """
    try:
        return ast.literal_eval(text)[0]['URL']
    # Only the expected parse/lookup failures fall back to the raw value; a
    # bare except would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError):
        return text

In [8]:
#Cleaning author

def clean_author_list(x):
    """
    Turn a stringified list of author dicts into a list of display names.

    Only entries that have a 'given' key are kept. Each is rendered as
    "<given> <family>", or just "<given>" when 'family' is absent.

    Parameters
    ----------
    x : str or float
        Cell value. A float (e.g. NaN for a missing cell) is treated as
        "no value".

    Returns
    -------
    list of str, or '' for a float input.

    Raises
    ------
    ValueError, SyntaxError
        If `x` is not a valid Python literal.
    """
    if isinstance(x, float):
        return ''
    cleaned_authors = []
    # literal_eval instead of eval: the cell content comes from a CSV file.
    for person in ast.literal_eval(x):
        if 'given' not in person:
            continue
        if 'family' in person:
            cleaned_authors.append(person['given'] + " " + person['family'])
        else:
            # Previously a missing 'family' raised KeyError and aborted the
            # whole column apply.
            cleaned_authors.append(person['given'])
    return cleaned_authors

In [9]:
# cleaning funder

def clean_funder(x):
    """
    Turn a stringified list of funder dicts into a list of funder names.

    Entries without a 'name' key are skipped.

    Parameters
    ----------
    x : str or float
        Cell value. A float (e.g. NaN for a missing cell) is treated as
        "no value".

    Returns
    -------
    list of the 'name' values, or the string 'nan' for a float input.

    Raises
    ------
    ValueError, SyntaxError
        If `x` is not a valid Python literal.
    """
    if isinstance(x, float):
        return 'nan'
    # literal_eval instead of eval: the cell content comes from a CSV file.
    return [funder['name'] for funder in ast.literal_eval(x) if 'name' in funder]

In [10]:
# cleaning cluster-coefficient

def clean_topic_correlation(x):
    """
    Convert a stringified {topic: value} dict into a list of
    ("cluster<topic>", value) pairs.

    Parameters
    ----------
    x : str
        Python-literal representation of a dict.

    Returns
    -------
    list of (str, value) tuples, in the dict's iteration order.

    Raises
    ------
    ValueError, SyntaxError
        If `x` is not a valid Python literal.
    """
    # literal_eval instead of eval: the cell content comes from a CSV file.
    correlations = ast.literal_eval(x)
    # `topic` rather than `k`, so the notebook-level cluster count `k` is not shadowed.
    labelled = {"cluster" + str(topic): value for topic, value in correlations.items()}
    return list(labelled.items())
    

In [11]:
# Free-text columns: every cell is passed through str() before tag cleaning.
for text_column in ('title', 'abstract'):
    df[text_column] = df[text_column].apply(lambda raw: clear_tags(str(raw)))

# Every other column maps one-to-one onto the helper that parses it.
column_cleaners = {
    'ISSN': cleaning_columns,
    'subject': cleaning_columns,
    'container-title': cleaning_container_title,
    'created': cleaning_date_columns,
    'published': cleaning_date_columns,
    'last-updated': cleaning_last_updated,
    'link': cleaning_link_text,
    'author': clean_author_list,
    'funder': clean_funder,
}
for column_name, cleaner in column_cleaners.items():
    df[column_name] = df[column_name].apply(cleaner)

# Rename the topic columns first, then parse the coefficient dicts under the new name.
df = df.rename(columns={"research-topic":"cluster", "topic-correlation":"cluster-coefficient"})
df['cluster-coefficient'] = df['cluster-coefficient'].apply(clean_topic_correlation)

In [12]:
# Sort by creation date, cast volume/issue/created/published to str,
# then keep only the exported columns in their output order.
sorted_df = (
    df.sort_values(by=['created'])
      .astype({'volume': str, 'issue': str, 'created': str, 'published': str})
)
sorted_df = sorted_df[[
    'DOI', 'title', 'abstract',
    'author', 'created', 'published', 'subject',
    'URL', 'link', "ISSN", 'container-title', 'source', 'type',
    'publisher', 'volume', 'issue', 'funder', 'cluster', 'cluster-coefficient',
]]

In [24]:
# Split on the 'created' string: rows comparing below '2020-01' versus above it
# (plain string comparison; a value exactly equal to '2020-01' lands in neither frame).
pre_2020_mask = sorted_df['created'].lt('2020-01')
post_2020_mask = sorted_df['created'].gt('2020-01')
before_2020_df = sorted_df[pre_2020_mask].reset_index(drop=True)
after_2020_df = sorted_df[post_2020_mask].reset_index(drop=True)

In [25]:
# write JSON pre 2020: one file per article, grouped into per-cluster folders
for c in range(k):
    OUTPUT_PATH = '../coronavirus_twenty_years_of_research/clusters/cluster{}/pre_2020/'.format(str(c))
    tmp_df = before_2020_df.loc[before_2020_df['cluster'] == c].reset_index(drop=True)
    if tmp_df.empty:
        # No articles for this cluster: do not create an empty folder.
        continue

    # Build the records from the cluster-filtered frame. They used to come
    # from the whole before_2020_df while being indexed by the row position
    # in tmp_df, so each file received a different article's record.
    dict_records = tmp_df.to_dict('records')

    os.makedirs(OUTPUT_PATH, exist_ok=True)
    for record in dict_records:
        # '/' in a DOI would be read as a path separator in the file name.
        doi = record['DOI'].replace('/', '-')
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is fixed to UTF-8 instead of the platform default.
        with open(OUTPUT_PATH + '{}.json'.format(doi), 'w', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False, indent=2)

In [28]:
# write JSON post 2020: one file per article, grouped by cluster and creation month
for c in range(k):
    tmp_df = after_2020_df.loc[after_2020_df['cluster'] == c].reset_index(drop=True)
    for Y in range(2020, 2029):
        for M in range(1, 13):
            OUTPUT_PATH = '../coronavirus_twenty_years_of_research/clusters/cluster{}/{}-{}/'.format(str(c), Y, M)

            # Select data by date. The month field is matched as a whole,
            # anchored at the start of the string: a plain substring test for
            # "2020-1" also matched "2020-10", "2020-11" and "2020-12", which
            # duplicated October-December articles into the January folder.
            # "0?" accepts both unpadded and zero-padded months.
            month_pattern = r"{}-0?{}(?:-|$)".format(Y, M)
            sub_tmp_df = tmp_df.loc[tmp_df['created'].str.match(month_pattern)]
            if len(sub_tmp_df) == 0:
                continue

            dict_records = sub_tmp_df.to_dict('records')

            os.makedirs(OUTPUT_PATH, exist_ok=True)
            for record in dict_records:
                # '/' in a DOI would be read as a path separator in the file name.
                doi = record['DOI'].replace('/', '-')
                # ensure_ascii=False emits raw non-ASCII characters, so the
                # file encoding is fixed to UTF-8 instead of the platform default.
                with open(OUTPUT_PATH + '{}.json'.format(doi), 'w', encoding='utf-8') as f:
                    json.dump(record, f, ensure_ascii=False, indent=2)