# TIMC PUBMED 100 files


In [4]:
#allows for multiple cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Import and Clean

In [1]:
#python file for running the extraction script on morrowind

import os
import glob
import gzip
import csv
import time
from lxml import etree

journals = ['Antonie van Leeuwenhoek', 'International journal of systematic and evolutionary microbiology']

path = '/DATA2/choberts/pubmed_abstracts_10-03-23/'
xml_files = glob.glob(os.path.join(path, '*.xml.gz'))

# file_limit = 2  # set number of files to process
# xml_files = xml_files[:file_limit]  # apply file limit

start_time = time.time()  # start timing

count = 0  # counter for articles from two journals

# Open the logfile for writing
with open('logfile.txt', 'w', encoding='utf-8') as logfile:
    # Define a function to write to the logfile and print progress updates
    def log(message):
        logfile.write(f"{message}\n")
        print(message)

    # Open the output CSV file for writing
    with open('output.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Journal Name', 'Year', 'Month', 'PubMedID', 'Title', 'Abstract'])

        log("Started processing XML files.")

        for i, file_path in enumerate(xml_files):
            try:
                with gzip.open(file_path, 'rb') as f:
                    content = f.read()
            except OSError as e:
                log(f"Error opening file: {file_path}. Reason: {str(e)}")
                continue

            try:
                root = etree.fromstring(content)
            except etree.XMLSyntaxError as e:
                log(f"Error parsing XML: {file_path}. Reason: {str(e)}")
                continue

            log(f"Processing file {i+1}/{len(xml_files)}: {file_path}")

            for article in root.xpath('.//PubmedArticle'):
                abstract_element = article.xpath('.//AbstractText')[0] if len(article.xpath('.//AbstractText')) > 0 else None
                journal_name_element = article.xpath('.//Journal/Title')[0] if len(article.xpath('.//Journal/Title')) > 0 else None

                if abstract_element is None or journal_name_element is None or journal_name_element.text not in journals:
                    continue

                title = article.xpath('.//ArticleTitle/text()')[0] if len(article.xpath('.//ArticleTitle/text()')) > 0 else ''
                abstract = abstract_element.text if abstract_element is not None else ''
                pubmed_id = article.xpath('.//PMID/text()')[0] if len(article.xpath('.//PMID/text()')) > 0 else ''

                if not pubmed_id.isnumeric():
                    log(f"Unexpected PubMedID in file: {file_path}. Skipping this record.")
                    continue

                pub_date_element = article.xpath('.//PubDate')[0] if len(article.xpath('.//PubDate')) > 0 else None
                if pub_date_element is not None:
                    pub_year = pub_date_element.xpath('.//Year/text()')[0] if len(pub_date_element.xpath('.//Year/text()')) > 0 else ''
                    pub_month = pub_date_element.xpath('.//Month/text()')[0] if len(pub_date_element.xpath('.//Month/text()')) > 0 else ''
                    #pub_date = f"{pub_year}-{pub_month}"
                else:
                    pub_date = ''

                writer.writerow([journal_name_element.text, pub_year, pub_month, pubmed_id, title, abstract])
                count += 1  # increment counter

        log("Finished processing XML files.")

end_time = time.time()  # end timing

log(f"Process took {end_time - start_time} seconds.")
log(f"Total articles processed from specified journals: {count}")

# Close the logfile
logfile.close()


In [2]:
import pandas as pd 

df = pd.read_csv("output.csv")
print(df.info())
print(df.head())
print(df.tail())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     77 non-null     object
 1   Abstract  63 non-null     object
dtypes: object(2)
memory usage: 1.3+ KB
None
                                               Title  \
0  Ottowia flava sp. nov., isolated from fish int...   
1  Bioinformatic analyses of a potential Salmonel...   
2  Hymenobacter oligotrophus sp. nov., isolated f...   
3  Flavisolibacter galbus sp. nov., isolated from...   
4  Vibrio profundi sp. nov., isolated from a deep...   

                                            Abstract  
0  A novel Gram-negative bacterium, non-motile an...  
1  Foodborne Enterobacteriaceae pathogens, especi...  
2  A taxonomic study of a Gram-stain negative, ro...  
3  A Gram-stain negative, non-motile, and yellow-...  
4  A Gram-stain negative, rod-shaped, facultative...  
                                  

### Optimizing the Search

In [2]:
import os
import gzip
from lxml import etree
import csv
import pandas as pd 

data = []
journals = ['Antonie van Leeuwenhoek', 'International journal of systematic and evolutionary microbiology']

path = 'TIMC-quinoneMining/pubmed-data'
file_limit = 1  # set the number of files to process

# get a list of all .xml.gz files in the path
xml_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.xml.gz')]

# process the files
for i, file_path in enumerate(xml_files):
    if i >= file_limit:
        break
    with gzip.open(file_path, 'rb') as f:
        content = f.read()
        root = etree.fromstring(content)
        for article in root.findall('.//PubmedArticle'):
            if article.find('.//ArticleTitle') is None:
                continue
            if article.find('.//AbstractText') is None:
                continue
            journal_name = article.find('.//Journal/Title').text or ''
            if journal_name not in journals:
                continue
            title = article.find('.//ArticleTitle').text or ''
            abstract = article.find('.//AbstractText').text or ''
            data.append([title, abstract])

# write the data to a CSV file
with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Abstract'])
    writer.writerows(data)

df = pd.read_csv('output.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     23 non-null     object
 1   Abstract  33 non-null     object
dtypes: object(2)
memory usage: 656.0+ bytes


Unnamed: 0,Title,Abstract
0,"Kineobactrum sediminis gen. nov., sp. nov., is...","A novel Gram-stain-negative, rod-shaped marine..."
1,"Cutaneotrichosporon suis sp. nov., a lipolytic...","Two conspecific yeast strains, which based on ..."
2,"Shewanella psychromarinicola sp. nov., a psych...","Two Gram-stain-negative, rod-shaped, facultati..."
3,,"A rod-shaped, spore-forming, thermophilic, che..."
4,,"Two Gram-stain-negative, catalase- and oxidase..."


# Search for the Quinone/Ubiquinone keywords

In [None]:
quinones = {}

quinones[" Q "] = ["quinone"]
quinones["UQ"] = ["ubiquinone", "uq", " q "]
quinones["UQ-5"] = ["q5 ", "q-5", "uq-5", "uq5", "ubiquinone(q5)", "ubiquinone (q5)", "ubiquinone(5)", "ubiquinone (5)", "uq(5)", "uq (5)"]
quinones["UQ-6"] = ["q6 ", "q-6", "uq-6", "uq6", "ubiquinone(q6)", "ubiquinone (q6)", "ubiquinone(6)", "ubiquinone (6)", "uq(6)", "uq (6)"]
quinones["UQ-7"] = ["q7 ", "q-7", "uq-7", "uq7", "ubiquinone(q7)", "ubiquinone (q7)", "ubiquinone(7)", "ubiquinone (7)", "uq(7)", "uq (7)"]
quinones["UQ-8"] = ["q8 ", "q-8", "uq-8", "uq8", "ubiquinone(q8)", "ubiquinone (q8)", "ubiquinone(8)", "ubiquinone (8)", "uq(8)", "uq (8)"]
quinones["UQ-9"] = ["q9 ", "q-9", "uq-9", "uq9", "ubiquinone(q9)", "ubiquinone (q9)", "ubiquinone(9)", "ubiquinone (9)", "uq(9)", "uq (9)"]
quinones["UQ-10"] = ["q10 ", "q-10", "uq-10", "uq10", "ubiquinone(q10)", "ubiquinone (q10)", "ubiquinone(10)", "ubiquinone (10)", "uq(10)", "uq (10)"]

quinones["MK"] = ["menaquinone", " mq ", " mk "]
quinones["MK-5"] = ["mq5", "mq-5", "mk-5", "mk5", "menaquinone(mq5)", "menaquinone(mk5)", "menaquinone (mq5)", "menaquinone (mk5)", "menaquinone(5)", "menaquinone (5)", "mk(5)", "mq(5)", "mq (5)", "mk (5)"]
quinones["MK-6"] = ["mq6", "mq-6", "mk-6", "mk6", "menaquinone(mq6)", "menaquinone(mk6)", "menaquinone (mq6)", "menaquinone (mk6)", "menaquinone(6)", "menaquinone (6)", "mk(6)", "mq(6)", "mq (6)", "mk (6)"]
quinones["MK-7"] = ["mq7", "mq-7", "mk-7", "mk7", "menaquinone(mq7)", "menaquinone(mk7)", "menaquinone (mq7)", "menaquinone (mk7)", "menaquinone(7)", "menaquinone (7)", "mk(7)", "mq(7)", "mq (7)", "mk (7)"]
quinones["MK-8"] = ["mq8", "mq-8", "mk-8", "mk8", "menaquinone(mq8)", "menaquinone(mk8)", "menaquinone (mq8)", "menaquinone (mk8)", "menaquinone(8)", "menaquinone (8)", "mk(8)", "mq(8)", "mq (8)", "mk (8)"]
quinones["MK-9"] = ["mq9", "mq-9", "mk-9", "mk9", "menaquinone(mq9)", "menaquinone(mk9)", "menaquinone (mq9)", "menaquinone (mk9)", "menaquinone(9)", "menaquinone (9)", "mk(9)", "mq(9)", "mq (9)", "mk (9)"]
quinones["MK-10"] = ["mq10", "mq-10", "mk-10", "mk10", "menaquinone(mq10)", "menaquinone(mk10)", "menaquinone (mq10)", "menaquinone (mk10)", "menaquinone(10)", "menaquinone (10)", "mk(10)", "mq(10)", "mq (10)", "mk (10)"]

quinones["PQ"] = ["plastoquinone", " PQ "]
quinones["K1"] = ["phylloquinone", " K1 ", " K2 ", " K3 ", "phytomenadione"]


### Creating a new dictionary

In [8]:
#work case sensitivity

quinones = {
"q": ["quinone", "{},{}-benzoquinone", "{},{}-dimethyl-{},{}-benzoquinone", "{}-methyl-{},{}-naphthoquinone", "{},{}-dimethoxy-{},{}-naphthoquinone", "{},{},{},{}-tetramethyl-{},{}-benzoquinone"],
"uq": ["ubiquinone", "coenzyme Q", "coQ", "UQ", "Ubiquinone{}", "Coenzyme Q{}", "CoQ{}"],
"uq-{}": ["UQ{}", "UQ{}H{}", "UQ{}H{}", "ubiquinone-{}", "ubiquinone-{}(H{})", "ubiquinone-{}(H{})", "Ubiquinone-{}", "Ubiquinone-{}(H{})", "Ubiquinone-{}(H{})"],
"mk": ["menaquinone", "vitamin K2", "MK", "Menaquinone", "Vitamin K2"],
"mk-{}": ["MK{}", "MK{}H{}", "MK{}H{}", "menaquinone-{}", "menaquinone-{}(H{})", "menaquinone-{}(H{})", "Menaquinone-{}", "Menaquinone-{}(H{})", "Menaquinone-{}(H{})"],
"pq": ["plastoquinone", "PQ", "Plastoquinone"],
"rhq": ["rhodoquinone", "RQ", "Rhodoquinone"],
"caldq": ["Caldariella quinone", "CQ", "Caldariella Quinone"],
"dmk": ["demethyl-menaquinone", "DMK", "Demethyl-menaquinone"],
"mmk": ["methyl-menaquinone", "MMK", "Methyl-menaquinone"],
"{}-mmk": ["{}-methyl-menaquinone", "{}-MMK", "{}-methyl-Menaquinone"],
"{}-mmk": ["{}-methyl-menaquinone", "{}-MMK", "{}-Methyl-menaquinone"],
"{},{}-dmmk": ["{},{}-dimethyl-menaquinone", "{},{}-DMMK", "{},{}-Dimethyl-menaquinone"]
}

### Search data using dictionary

In [12]:

import pandas as pd 
df = pd.read_csv("output100.csv")
df.fillna("", inplace=True)
#del df["PubDate"] #deleting column

#checking to see where the keywords shows up
def contains_quinone(row):
    matches = {}
    for k, v in quinones.items():
        for term in v:
            found_in_title = term in row["Title"].lower()
            found_in_abstract = term in row["Abstract"].lower()
            if found_in_title and found_in_abstract:
                location = "Both"
            elif found_in_title:
                location = "Title"
            elif found_in_abstract:
                location = "Abstract"
            else:
                continue
            pattern = term.format(*[""] * term.count("{}"))
            if pattern not in matches:
                matches[pattern] = []
            matches[pattern].append(k)
    if matches:
        matches_str = ", ".join(f"'{k}': ({', '.join(matches[k])})" for k in matches)
        return matches_str, ", ".join(matches.keys())
    else:
        return None, None



df.fillna("", inplace=True)

#concat the keyword search and location found
df[["Quinone", "Quinone Location"]] = df.apply(contains_quinone, axis=1, result_type="expand")

#df[["Quinone", "Quinone Location"]] = df.apply(lambda row: pd.Series(contains_quinone(row)), axis=1)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419 entries, 0 to 418
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             419 non-null    object
 1   Abstract          419 non-null    object
 2   Quinone           36 non-null     object
 3   Quinone Location  36 non-null     object
dtypes: object(4)
memory usage: 13.2+ KB


### Exploring the Results

In [26]:
#new dataframes for each search
dfNEW = df[(df["Quinone"].isnull())]


In [27]:
for index, row in dfNEW.iterrows():
    print("%s \n\t %s" % (row["Title"],row["Abstract"]))

Ottowia flava sp. nov., isolated from fish intestines. 
	 A novel Gram-negative bacterium, non-motile and short rod-shaped, designated strain GY511
Bioinformatic analyses of a potential Salmonella-virus-FelixO1 biocontrol phage BPS15S6 and the characterisation and anti-Enterobacteriaceae-pathogen activity of its endolysin LyS15S6. 
	 Foodborne Enterobacteriaceae pathogens, especially Salmonella, still seriously threaten food safety. To establish a foundation for further developing phage- and endolysin-based methods combating these pathogens, in this study, the newly isolated Salmonella-virus-FelixO1 phage BPS15S6 for biocontrol purposes was characterised by genomic bioinformatic analysis, and then its endolysin LyS15S6 was obtained using a prokaryotic expression system, characterised in vitro and evaluated in the antibacterial efficacy. It was shown that BPS15S6 had an 87,609-bp genome with 130 open reading frames and does not appear to carry known lysogeny-associated genes and other d

In [None]:
# Group the data by the "Quinone" column and count the frequency of each keyword match
grouped = df.groupby("Quinone")["Quinone Location"].apply(lambda x: pd.Series(x.str.split(", ").sum()).value_counts())
print(grouped)

In [None]:
### TO DO
#search for abstracts with ONE species and ONE quinone discovery/findings
#extract the species names and genus names from title
#search for "only" or !"predominant, main, major" near the quinone match