## Searching Abstracts


In [None]:
#search for abstracts with ONE species and ONE quinone discovery/findings
#extract the species names and genus names from title
#search for "only" or !"predominant, main, major" near the quinone match


In [2]:
#allows for multiple cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

## Extract Files

In [3]:
import os
import gzip
from lxml import etree
import csv

data = []
journals = ['Antonie van Leeuwenhoek', 'International journal of systematic and evolutionary microbiology']

path = 'TIMC-quinoneMining/pubmed-data'
file_limit = 1  # set the number of files to process

# get a list of all .xml.gz files in the path
xml_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.xml.gz')]

# process the files
for i, file_path in enumerate(xml_files):
    if i >= file_limit:
        break
    with gzip.open(file_path, 'rb') as f:
        content = f.read()
        root = etree.fromstring(content)
        for article in root.findall('.//PubmedArticle'):
            if article.find('.//ArticleTitle') is None:
                continue
            if article.find('.//AbstractText') is None:
                continue
            journal_name = article.find('.//Journal/Title').text or ''
            if journal_name not in journals:
                continue
            title = article.find('.//ArticleTitle').text or ''
            abstract = article.find('.//AbstractText').text or ''
            pubmed_id = article.find('.//PMID').text or ''
            pub_date = article.find('.//PubDate')
            if pub_date is not None:
                pub_year_element = pub_date.find('.//Year')
                if pub_year_element is not None:
                    pub_year = pub_year_element.text
                else:
                    pub_year = ''
            else:
                pub_year = ''
            data.append([journal_name, pub_year, pubmed_id, title, abstract])

# write the data to a CSV file
with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Journal Name', 'Year of Publication', 'PubMedID', 'Title', 'Abstract'])
    writer.writerows(data)


df = pd.read_csv("output.csv")
df.info()
df.head()
df.tail()


58

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Journal Name         33 non-null     object
 1   Year of Publication  33 non-null     int64 
 2   PubMedID             33 non-null     int64 
 3   Title                23 non-null     object
 4   Abstract             33 non-null     object
dtypes: int64(2), object(3)
memory usage: 1.4+ KB


Unnamed: 0,Journal Name,Year of Publication,PubMedID,Title,Abstract
0,International journal of systematic and evolut...,2019,31145673,"Kineobactrum sediminis gen. nov., sp. nov., is...","A novel Gram-stain-negative, rod-shaped marine..."
1,International journal of systematic and evolut...,2019,31145674,"Cutaneotrichosporon suis sp. nov., a lipolytic...","Two conspecific yeast strains, which based on ..."
2,International journal of systematic and evolut...,2019,31145675,"Shewanella psychromarinicola sp. nov., a psych...","Two Gram-stain-negative, rod-shaped, facultati..."
3,International journal of systematic and evolut...,2019,31145676,,"A rod-shaped, spore-forming, thermophilic, che..."
4,International journal of systematic and evolut...,2019,31145678,,"Two Gram-stain-negative, catalase- and oxidase..."


Unnamed: 0,Journal Name,Year of Publication,PubMedID,Title,Abstract
28,International journal of systematic and evolut...,2019,31169487,"Desertihabitans aurantiacus gen. nov., sp. nov...",The taxonomic position of an actinobacterium i...
29,International journal of systematic and evolut...,2019,31169491,"Acidimangrovimonas sediminis gen. nov., sp. no...","A Gram-stain-negative, aerobic, non-motile, sh..."
30,International journal of systematic and evolut...,2019,31169492,"Sphingobium terrigena sp. nov., isolated from ...","A Gram-stain-negative, strictly aerobic bacter..."
31,Antonie van Leeuwenhoek,2019,31172329,"Flavisolibacter galbus sp. nov., isolated from...","A Gram-stain negative, non-motile, and yellow-..."
32,Antonie van Leeuwenhoek,2019,31172330,"Vibrio profundi sp. nov., isolated from a deep...","A Gram-stain negative, rod-shaped, facultative..."


In [4]:
df = pd.read_csv("output.csv")
df.info()
df.head()
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Journal Name         33 non-null     object
 1   Year of Publication  33 non-null     int64 
 2   PubMedID             33 non-null     int64 
 3   Title                23 non-null     object
 4   Abstract             33 non-null     object
dtypes: int64(2), object(3)
memory usage: 1.4+ KB


Unnamed: 0,Journal Name,Year of Publication,PubMedID,Title,Abstract
0,International journal of systematic and evolut...,2019,31145673,"Kineobactrum sediminis gen. nov., sp. nov., is...","A novel Gram-stain-negative, rod-shaped marine..."
1,International journal of systematic and evolut...,2019,31145674,"Cutaneotrichosporon suis sp. nov., a lipolytic...","Two conspecific yeast strains, which based on ..."
2,International journal of systematic and evolut...,2019,31145675,"Shewanella psychromarinicola sp. nov., a psych...","Two Gram-stain-negative, rod-shaped, facultati..."
3,International journal of systematic and evolut...,2019,31145676,,"A rod-shaped, spore-forming, thermophilic, che..."
4,International journal of systematic and evolut...,2019,31145678,,"Two Gram-stain-negative, catalase- and oxidase..."


Unnamed: 0,Journal Name,Year of Publication,PubMedID,Title,Abstract
28,International journal of systematic and evolut...,2019,31169487,"Desertihabitans aurantiacus gen. nov., sp. nov...",The taxonomic position of an actinobacterium i...
29,International journal of systematic and evolut...,2019,31169491,"Acidimangrovimonas sediminis gen. nov., sp. no...","A Gram-stain-negative, aerobic, non-motile, sh..."
30,International journal of systematic and evolut...,2019,31169492,"Sphingobium terrigena sp. nov., isolated from ...","A Gram-stain-negative, strictly aerobic bacter..."
31,Antonie van Leeuwenhoek,2019,31172329,"Flavisolibacter galbus sp. nov., isolated from...","A Gram-stain negative, non-motile, and yellow-..."
32,Antonie van Leeuwenhoek,2019,31172330,"Vibrio profundi sp. nov., isolated from a deep...","A Gram-stain negative, rod-shaped, facultative..."


## Dictionary Search

In [3]:
#work case sensitivity
#for each token if there is quinone, and not in exact match store somewhere 
quinones = {
"q": ["quinone", "{},{}-benzoquinone", "{},{}-dimethyl-{},{}-benzoquinone", "{}-methyl-{},{}-naphthoquinone", "{},{}-dimethoxy-{},{}-naphthoquinone", "{},{},{},{}-tetramethyl-{},{}-benzoquinone"],
"uq": ["ubiquinone", "coenzyme Q", "coQ", "UQ", "Ubiquinone{}", "Coenzyme Q{}", "CoQ{}"],
"uq-{}": ["UQ{}", "UQ{}H{}", "UQ{}H{}", "ubiquinone-{}", "ubiquinone-{}(H{})", "ubiquinone-{}(H{})", "Ubiquinone-{}", "Ubiquinone-{}(H{})", "Ubiquinone-{}(H{})"],
"mk": ["menaquinone", "vitamin K2", "MK", "Menaquinone", "Vitamin K2"],
"mk-{}": ["MK{}", "MK{}H{}", "MK{}H{}", "menaquinone-{}", "menaquinone-{}(H{})", "menaquinone-{}(H{})", "Menaquinone-{}", "Menaquinone-{}(H{})", "Menaquinone-{}(H{})"],
"pq": ["plastoquinone", "PQ", "Plastoquinone"],
"rhq": ["rhodoquinone", "RQ", "Rhodoquinone"],
"caldq": ["Caldariella quinone", "CQ", "Caldariella Quinone", "Caldariellaquinone"],
"dmk": ["demethyl-menaquinone", "DMK", "Demethyl-menaquinone"],
"mmk": ["methyl-menaquinone", "MMK", "Methyl-menaquinone"],
"{}-mmk": ["{}-methyl-menaquinone", "{}-MMK", "{}-methyl-Menaquinone"],
"{}-mmk": ["{}-methyl-menaquinone", "{}-MMK", "{}-Methyl-menaquinone"],
"{},{}-dmmk": ["{},{}-dimethyl-menaquinone", "{},{}-DMMK", "{},{}-Dimethyl-menaquinone"],
#case sensitiviity
"Q": ["Quinone", "{},{}-Benzoquinone", "{},{}-Dimethyl-{},{}-Benzoquinone", "{}-Methyl-{},{}-Naphthoquinone", "{},{}-Dimethoxy-{},{}-Naphthoquinone", "{},{},{},{}-Tetramethyl-{},{}-Benzoquinone"],
"UQ": ["Ubiquinone", "Coenzyme Q", "CoQ", "UQ", "Ubiquinone{}", "Coenzyme Q{}", "CoQ{}"],
"UQ-{}": ["UQ{}", "UQ{}H{}", "UQ{}H{}", "Ubiquinone-{}", "Ubiquinone-{}(H{})", "Ubiquinone-{}(H{})", "Ubiquinone-{}", "Ubiquinone-{}(H{})", "Ubiquinone-{}(H{})"],
"MK": ["Menaquinone", "Vitamin K2", "MK", "Menaquinone", "Vitamin K2"],
"MK-{}": ["MK{}", "MK{}H{}", "MK{}H{}", "Menaquinone-{}", "Menaquinone-{}(H{})", "Menaquinone-{}(H{})", "Menaquinone-{}", "Menaquinone-{}(H{})", "Menaquinone-{}(H{})"],
"PQ": ["Plastoquinone", "PQ", "Plastoquinone"],
"RHQ": ["Rhodoquinone", "RQ", "Rhodoquinone"],
"CALDQ": ["Caldariella quinone", "CQ", "Caldariella Quinone"],
"DMK": ["Demethyl-menaquinone", "DMK", "Demethyl-menaquinone"],
"MMK": ["Methyl-menaquinone", "MMK", "Methyl-menaquinone"],
"{}-MMK": ["{}-Methyl-menaquinone", "{}-MMK", "{}-Methyl-Menaquinone"],
"{}-MMK": ["{}-Methyl-menaquinone", "{}-MMK", "{}-Methyl-menaquinone"],
"{},{}-DMMK": ["{},{}-Dimethyl-menaquinone", "{},{}-DMMK", "{},{}-Dimethyl-menaquinone"]

}

In [5]:
species_genus_pattern = re.compile(r"[A-Z][a-z]*\s[a-z]*")

def extract_species_genus(title):
    return ", ".join(species_genus_pattern.findall(title))

def find_keyword_sentence(text, keyword):
    sentences = sent_tokenize(text)
    return "; ".join(sentence for sentence in sentences if keyword.lower() in sentence.lower())

def contains_quinone(row):
    matches = {}
    keyword_sentences = []
 
    for k, v in quinones.items():
        for term in v:
            pattern = term.format(*["\d+"] * term.count("{}"))
            regex = re.compile(pattern, re.IGNORECASE)

            match_in_title = regex.search(row["Title"])
            match_in_abstract = regex.search(row["Abstract"])

            if match_in_title or match_in_abstract:
                match_text = match_in_title.group(0) if match_in_title else match_in_abstract.group(0)

                if match_text not in matches:
                    matches[match_text] = []
                matches[match_text].append(k)

                keyword_sentence_title = find_keyword_sentence(row["Title"], match_text)
                keyword_sentence_abstract = find_keyword_sentence(row["Abstract"], match_text)

                if keyword_sentence_title:
                    keyword_sentences.append(keyword_sentence_title)
                if keyword_sentence_abstract:
                    keyword_sentences.append(keyword_sentence_abstract)

    if matches:
        matches_str = ", ".join(f"'{k}': ({', '.join(matches[k])})" for k in matches)
        return matches_str, "; ".join(keyword_sentences)
    else:
        return None, None


df = pd.read_csv("output500.csv")
df.fillna("", inplace=True)

df[["Quinone", "Keyword Sentence"]] = df.apply(contains_quinone, axis=1, result_type="expand")
df["Species and Genus"] = df["Title"].apply(extract_species_genus)

#df = df[(~df["Quinone"].isnull())]
df.info()
df.head()
df.tail()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1956 entries, 0 to 1955
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Journal Name         1956 non-null   object
 1   Year of Publication  1956 non-null   object
 2   PubMedID             1956 non-null   int64 
 3   Title                1956 non-null   object
 4   Abstract             1956 non-null   object
 5   Quinone              259 non-null    object
 6   Keyword Sentence     259 non-null    object
 7   Species and Genus    1956 non-null   object
dtypes: int64(1), object(7)
memory usage: 122.4+ KB


Unnamed: 0,Journal Name,Year of Publication,PubMedID,Title,Abstract,Quinone,Keyword Sentence,Species and Genus
0,Antonie van Leeuwenhoek,2019.0,31147966,"Ottowia flava sp. nov., isolated from fish int...","A novel Gram-negative bacterium, non-motile an...",,,Ottowia flava
1,Antonie van Leeuwenhoek,2019.0,31147967,Bioinformatic analyses of a potential Salmonel...,"Foodborne Enterobacteriaceae pathogens, especi...",,,Bioinformatic analyses
2,Antonie van Leeuwenhoek,2019.0,31165292,"Hymenobacter oligotrophus sp. nov., isolated f...","A taxonomic study of a Gram-stain negative, ro...",,,Hymenobacter oligotrophus
3,Antonie van Leeuwenhoek,2019.0,31172329,"Flavisolibacter galbus sp. nov., isolated from...","A Gram-stain negative, non-motile, and yellow-...",,,"Flavisolibacter galbus, Jeju"
4,Antonie van Leeuwenhoek,2019.0,31172330,"Vibrio profundi sp. nov., isolated from a deep...","A Gram-stain negative, rod-shaped, facultative...",,,Vibrio profundi


Unnamed: 0,Journal Name,Year of Publication,PubMedID,Title,Abstract,Quinone,Keyword Sentence,Species and Genus
1951,Antonie van Leeuwenhoek,2012.0,22711299,Molecular signatures for the phylum Synergiste...,Species belonging to the phylum Synergistetes ...,,,"Molecular signatures, Synergistetes and"
1952,Antonie van Leeuwenhoek,2012.0,22718122,Structured morphological modeling as a framewo...,Successful application of a computational mode...,,,"Structured morphological, Streptomyces species"
1953,Antonie van Leeuwenhoek,2012.0,22733059,Saccharomyces bacillaris is not a synonym of C...,Torulopsis bacillaris (Kroemer and Krumbholz) ...,,,"Saccharomyces bacillaris, Candida stellata, St..."
1954,Antonie van Leeuwenhoek,2012.0,22733060,Extracellular sugar phosphates are assimilated...,Filamentous microorganisms of the bacterial ge...,,,"Extracellular sugar, Streptomyces in"
1955,Antonie van Leeuwenhoek,2012.0,22733061,"Pseudonocardia nantongensis sp. nov., a novel ...","A novel isolate, designated strain KLBMP 1282(...","'quinone': (q, Q), 'ubiquinone': (uq, UQ), 'MK...",Strain KLBMP 1282(T) contained MK-8(H(4)) as t...,"Pseudonocardia nantongensis, Tamarix chinensis"


In [74]:
#looking at Keyword Sentence & Species and Genus
count = 0
for index, row in df.iterrows():
    count += 1
    print("%d. Title: %s \n\t Quinone Found: %s \n\t %s\n\n" % (count, row["Species and Genus"], row["Quinone"], row["Keyword Sentence"]))


1. Title: Oceanobacillus aidingensis 
	 Quinone Found: 'quinone': (q, Q), 'MK': (mk, MK) 
	 The diamino acid in the peptidoglycan and the major quinone system were determined to be meso-diaminopimelic acid (meso-DAP) and MK-7, respectively.; The diamino acid in the peptidoglycan and the major quinone system were determined to be meso-diaminopimelic acid (meso-DAP) and MK-7, respectively.; The diamino acid in the peptidoglycan and the major quinone system were determined to be meso-diaminopimelic acid (meso-DAP) and MK-7, respectively.; The diamino acid in the peptidoglycan and the major quinone system were determined to be meso-diaminopimelic acid (meso-DAP) and MK-7, respectively.


2. Title: Variibacter gotjawalensis 
	 Quinone Found: 'quinone': (q, Q) 
	 The major fatty acids were identified as C18:1ω7c, C16:0 and C17:0, the predominant isoprenoid quinone as Q-10, the polar lipids as diphosphatidylglycerol, phosphatidylglycerol, phosphatidylethanolamine, phosphatidylcholine, an unid

In [None]:
##code to format the columns
# df = pd.read_csv("output500.csv")
# new_order = [ 'Journal Name', 'Year of Publication', 'PubMedID', 'Title', 'Abstract', 'Quinone', 'Keyword Sentence', 'Species and Genus', 'Species and Quinone', 'Matched Keywords']
# cols_to_drop = ['Quinone', 'Keyword Sentence', 'Species and Genus', 'Species and Quinone', 'Matched Keywords']
# df = df.reindex(columns=new_order)
# df.drop(cols_to_drop, axis=1, inplace=True)
# df.to_csv('output500.csv', index=False)


## Analyzing the search results

In [80]:
df.columns

Index(['Journal Name', 'Year of Publication', 'PubMedID', 'Title', 'Abstract',
       'Quinone', 'Keyword Sentence', 'Species and Genus'],
      dtype='object')

In [None]:
# for compressing .xml -> .xml.gz
# import gzip

# def compress_xml_file(input_file_path, output_file_path):
#     with open(input_file_path, 'rb') as input_file:
#         with gzip.open(output_file_path, 'wb') as output_file:
#             output_file.writelines(input_file)

# # Example usage
# input_file_path = '/Users/suraj/Desktop/TIMC/TIMC-quinoneMining/pubmed-data/pubmed22n1105.xml'
# output_file_path = '/Users/suraj/Desktop/TIMC/TIMC-quinoneMining/pubmed-data/pubmed22n1105.xml.gz'
# compress_xml_file(input_file_path, output_file_path)