In [1]:
import bs4 # to read xml tree structure
from bs4 import BeautifulSoup as bs
import os
import csv
import pandas as pd

In [2]:
xml_dir = "../data/xml" # directory to store all the xml files
# list of all the xml files; sorted because os.listdir returns entries in
# arbitrary order, and filtered so stray non-xml entries are never parsed
xml_list = sorted(
    os.path.join(xml_dir, ele)
    for ele in os.listdir(xml_dir)
    if ele.lower().endswith(".xml")
)

In [3]:
# display the collected xml file paths
xml_list

['../data/xml/pmc_result_20210101_20210105.xml',
 '../data/xml/pmc_result_20210106_20210110.xml']

In [4]:
def load_file(filename):
    """
    It reads & parses one xml file

    :filename: the name of the xml file to be read
    :return: every <article> element found in the parsed file
    """
    # the context manager closes the file handle even if reading fails
    # NOTE(review): the file is decoded with the platform default encoding —
    # confirm that this matches the encoding of the xml files
    with open(filename, 'r') as input_file:
        contents = input_file.read()
    # embedding the XML with beautiful soup module
    soup = bs(contents, 'xml')
    # extracts of the article units
    return soup.find_all("article")

In [5]:
def track_abstract(index:list, tempo:list)-> list:
    """
    It is meant to use the indices of whitespace to sort out the appropriate indices that contain the abstract

    NOTE(review): the function has no body besides this docstring, so calling it
    currently returns None rather than the annotated list.

    :index: list of indices of whitespace in a complete abstract and its other details in list form
    :tempo: not described in the original docstring; its purpose cannot be determined here — TODO confirm
    :return: indices of abstract (intended; nothing is returned yet)
    """

def extract_tags(xml_papers) -> list:
    """
    It extracts the title, pmc id, doi and acknowledgement of every article

    Any field that cannot be found in an article is reported as 'na'.

    :xml_papers: iterable of parsed <article> elements (as returned by load_file)
    :return: list of [art_title, pmc_id, doi, acklge] rows, one row per article
    """
    missing = "na"
    result = []

    for paper in xml_papers:
        # reset for every article so a missing id never inherits the value
        # found in the previous article
        pmc_id, doi = missing, missing

        titles = paper.find_all('article-title')
        art_title = titles[0].text if titles else missing

        # getting pmc & doi
        metas = paper.find_all("article-meta")
        art_ids = metas[0].find_all("article-id") if metas else []
        for art_id in art_ids:
            # .get avoids a KeyError on an <article-id> without 'pub-id-type'
            id_type = art_id.attrs.get('pub-id-type')
            if id_type == 'pmc':
                pmc_id = art_id.text
            elif id_type == 'doi':
                # ids of any other type must not reset a doi that was already found
                doi = art_id.text

        # getting acknowledgement
        acks = paper.find_all("ack")
        if not acks:
            acklge = missing
        else:
            paragraphs = acks[0].find_all("p")
            # first paragraph when there is one, otherwise the whole <ack> text
            acklge = paragraphs[0].text if paragraphs else acks[0].text

        result.append([art_title, pmc_id, doi, acklge])

    return result

In [6]:
def save_file(out_filename: str, input_item: list) -> None:
    """
    It writes list output as a 'csv' file

    :out_filename:   path of the output file; an existing file is overwritten
    :input_item:   processed data as a list of rows, each row being a list of fields
    :return:   None
    """
    # newline="" lets the csv module control line endings itself, so fields with
    # embedded newlines stay intact and no extra "\r" is written on Windows;
    # utf-8 matches the default encoding pandas.read_csv uses to load the file back
    with open(out_filename, "w", newline="", encoding="utf-8") as s:
        csv.writer(s).writerows(input_item)

In [None]:
# parse every raw xml file into its list of <article> elements
xml_extract_list = [load_file(xml_path) for xml_path in xml_list]

# pull the tags of interest out of each parsed file and pool the rows
annotated_data = []
for xml_papers in xml_extract_list:
    annotated_data.extend(extract_tags(xml_papers))

In [None]:
# destination of the processed output file
out_filename = "../data/ack_data.csv"

# write the extracted rows to that csv file
save_file(input_item=annotated_data, out_filename=out_filename)

In [None]:
# checking the processed csv file

# loading the csv that was just written; out_filename is reused so this check
# always reads the file that was saved, even if the output path is changed
df = pd.read_csv(out_filename, header=None)

# name the columns of the processed csv file, in the order the rows were built
df.columns = ['article_title', 'pmc_id', 'doi', 'acknowledgement'] # art_title,pmc_id,doi,acklge
df