In [49]:
# allows for multiple cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#importing necessary libraries
import os
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import gzip
from lxml import etree
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [None]:
import glob
import gzip
import csv
import time
from lxml import etree

journals = ['Antonie van Leeuwenhoek', 'International journal of systematic and evolutionary microbiology', 'Biochemical and biophysical research communications']

path = 'TIMC-quinoneMining/pubmed-data'
xml_files = glob.glob(os.path.join(path, '*.xml.gz'))

file_limit = 1  # set number of files to process
xml_files = xml_files[:file_limit]  # apply file limit

start_time = time.time()  # start timing

count = 0  # initialize counter

with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Journal Name', 'Date of Publication', 'PubMedID', 'Title', 'Abstract'])

    for i, file_path in enumerate(xml_files):
        try:
            with gzip.open(file_path, 'rb') as f:
                content = f.read()
        except OSError as e:
            print(f"Error opening file: {file_path}. Reason: {str(e)}")
            continue

        try:
            root = etree.fromstring(content)
        except etree.XMLSyntaxError as e:
            print(f"Error parsing XML: {file_path}. Reason: {str(e)}")
            continue

        for article in root.findall('.//PubmedArticle'):
            abstract_element = article.find('.//AbstractText')
            journal_name_element = article.find('.//Journal/Title')

            if abstract_element is None or journal_name_element is None or journal_name_element.text not in journals:
                continue

            title = article.findtext('.//ArticleTitle', default='')
            abstract = abstract_element.text or ''
            pubmed_id = article.findtext('.//PMID', default='')

            if not pubmed_id.isnumeric():
                print(f"Unexpected PubMedID in file: {file_path}. Skipping this record.")
                continue

            pub_date_element = article.find('.//PubDate')
            if pub_date_element is not None:
                pub_year = pub_date_element.findtext('.//Year', default='')
                pub_month = pub_date_element.findtext('.//Month', default='')
                pub_date = f"{pub_year}-{pub_month}"
            else:
                pub_date = ''

            writer.writerow([journal_name_element.text, pub_date, pubmed_id, title, abstract])
            count += 1  # increment counter

end_time = time.time()  # end timing

print(f"Process took {end_time - start_time} seconds.")
print(f"Total articles processed from specified journals: {count}")


In [53]:
### Building Quinone Dictionary
df = pd.read_csv("output.csv")
df.info()
df.head()
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Journal Name         79 non-null     object
 1   Date of Publication  79 non-null     object
 2   PubMedID             79 non-null     int64 
 3   Title                69 non-null     object
 4   Abstract             79 non-null     object
dtypes: int64(1), object(4)
memory usage: 3.2+ KB


Unnamed: 0,Journal Name,Date of Publication,PubMedID,Title,Abstract
0,International journal of systematic and evolut...,2019-Aug,31145673,"Kineobactrum sediminis gen. nov., sp. nov., is...","A novel Gram-stain-negative, rod-shaped marine..."
1,International journal of systematic and evolut...,2019-Aug,31145674,"Cutaneotrichosporon suis sp. nov., a lipolytic...","Two conspecific yeast strains, which based on ..."
2,International journal of systematic and evolut...,2019-Aug,31145675,"Shewanella psychromarinicola sp. nov., a psych...","Two Gram-stain-negative, rod-shaped, facultati..."
3,International journal of systematic and evolut...,2019-Oct,31145676,,"A rod-shaped, spore-forming, thermophilic, che..."
4,International journal of systematic and evolut...,2019-Oct,31145678,,"Two Gram-stain-negative, catalase- and oxidase..."


Unnamed: 0,Journal Name,Date of Publication,PubMedID,Title,Abstract
74,Biochemical and biophysical research communica...,2019-07,31171360,Crystal structure of pentameric shell protein ...,"Carboxysome, encapsulating an enzymatic core w..."
75,Biochemical and biophysical research communica...,2019-07,31171361,miR-374a/Myc axis modulates iron overload-indu...,The transformation of hepatic stellate cells (...
76,Biochemical and biophysical research communica...,2019-07,31171362,In situ reactivity of electrochemically genera...,There is enough proof to believe that free-rad...
77,Antonie van Leeuwenhoek,2019-Oct,31172329,"Flavisolibacter galbus sp. nov., isolated from...","A Gram-stain negative, non-motile, and yellow-..."
78,Antonie van Leeuwenhoek,2019-Nov,31172330,"Vibrio profundi sp. nov., isolated from a deep...","A Gram-stain negative, rod-shaped, facultative..."


In [14]:
### Building Quinone Dictionary
df = pd.read_csv("output50.csv")
df.info()
df.head()
df.tail()

quinones = {
    "q": ["quinone", 
          "{},{}-benzoquinone","{},{}-bq", 
          "{},{}-dimethyl-{},{}-benzoquinone","{},{}-dimethyl-{},{}-bq", "{},{}-d-{},{}-benzoquinone","{},{}-d-{},{}-bq", 
          "{}-methyl-{},{}-naphthoquinone", "{}-methyl-{},{}-nq", "{}-m-{},{}-naphthoquinone", "{}-m-{},{}-nq",
          "{},{}-dimethoxy-{},{}-naphthoquinone", "{},{}-dimethoxy-{},{}-nq","{},{}-d-{},{}-naphthoquinone", "{},{}-d-{},{}-nq",
          "{},{},{},{}-tetramethyl-{},{}-benzoquinone", "{},{},{},{}-tetramethyl-{},{}-bq", "{},{},{},{}-t-{},{}-benzoquinone", "{},{},{},{}-t-{},{}-bq",],
    "uq": ["ubiquinone", "coenzyme Q", "coQ", "UQ", "Ubiquinone{}", "Coenzyme Q{}", "CoQ{}", "uq{}", "coQ{}", "UQ{}", "Ubq{}", "CoQ{}", "CoQ{}"],
    "uq-{}": ["UQ{}", "UQ{}H{}", "ubiquinone-{}", "ubiquinone-{}(H{})", "Ubiquinone-{}", "Ubiquinone-{}(H{})", "uq-{}", "uq{}H{}", "Ubq-{}", "Ubq{}(H{})"],
    "mk": ["menaquinone", "vitamin K2", "MK", "Menaquinone", "Vitamin K2", "mk", "vK2", "MK", "Mk", "VK2"],
    "mk-{}": ["MK{}", "MK{}H{}", "menaquinone-{}", "menaquinone-{}(H{})", "Menaquinone-{}", "Menaquinone-{}(H{})", "mk-{}", "mk{}H{}", "Mk-{}", "Mk{}(H{})"],
    "pq": ["plastoquinone", "PQ", "Plastoquinone", "pq", "Pq"],
    "rhq": ["rhodoquinone", "RQ", "Rhodoquinone", "rhq", "Rhq"],
    "caldq": ["Caldariella quinone", "CQ", "Caldariella Quinone", "Caldariellaquinone", "caldq", "Caldq", "Cq"],
    "dmk": ["demethyl-menaquinone", "DMK", "Demethyl-menaquinone", "dmk", "Dmk"],
    "mmk": ["methyl-menaquinone", "MMK", "Methyl-menaquinone", "mmk", "Mmk"],
    "{}-mmk": ["{}-methyl-menaquinone", "{}-MMK", "{}-methyl-Menaquinone", "{}-Methyl-menaquinone", "{}-mmk", "{}-Mmk"],
    "{},{}-dmmk": ["{},{}-dimethyl-menaquinone", "{},{}-DMMK", "{},{}-Dimethyl-menaquinone", "{},{}-dmmk", "{},{}-Dmmk"],
}

### Extracting Quninones, Species and Genus, and Keywords
# regex pattern to match species and genus names
species_genus_pattern = re.compile(r"[A-Z][a-z]*\s[a-z]*")

# function to extract species and genus names from a title
def extract_species_genus(title):
    return ", ".join(species_genus_pattern.findall(title))

# function to find sentences containing a keyword in a given text
def find_keyword_sentence(text, keyword):
    sentences = sent_tokenize(text)
    return "; ".join(sentence for sentence in sentences if keyword.lower() in sentence.lower())

# function to check if a row contains any quinone terms
def contains_quinone(row):
    matches = {}
    keyword_sentences = set()

    # iterate over quinone terms
    for k, v in quinones.items():
        for term in v:
            # create a regex pattern for term
            pattern = term.format(*["\d+"] * term.count("{}"))
            regex = re.compile(pattern, re.IGNORECASE)

            # search for term in title and abstract of row
            match_in_title = regex.search(row["Title"])
            match_in_abstract = regex.search(row["Abstract"])

            # if there is a match, store matched text and related information
            if match_in_title or match_in_abstract:
                match_text = match_in_title.group(0) if match_in_title else match_in_abstract.group(0)

                if match_text not in matches:
                    matches[match_text] = []
                matches[match_text].append(k)

                # find keyword sentences containing matched text in title and abstract
                keyword_sentence_title = find_keyword_sentence(row["Title"], match_text)
                keyword_sentence_abstract = find_keyword_sentence(row["Abstract"], match_text)

                if keyword_sentence_title:
                    keyword_sentences.add(keyword_sentence_title)
                if keyword_sentence_abstract:
                    keyword_sentences.add(keyword_sentence_abstract)

    # if there are matches, return matches and related keyword sentences
    if matches:
        matches_str = ", ".join(f"'{k}': ({', '.join(matches[k])})" for k in matches)
        return matches_str, "; ".join(keyword_sentences)
    else:
        return None, None


df = pd.read_csv("output50.csv")
df.fillna("", inplace=True)

# apply contains_quinone function to each row of DataFrame and expand results into columns
df[["Quinone", "Keyword Sentence"]] = df.apply(contains_quinone, axis=1, result_type="expand")

# extract species and genus names from titles
df["Species and Genus"] = df["Title"].apply(extract_species_genus)

# df['Journal Name'].describe()

df = df[(~df["Quinone"].isnull())] # df with just quinone matches
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Journal Name         420 non-null    object
 1   Date of Publication  420 non-null    object
 2   PubMedID             420 non-null    int64 
 3   Title                346 non-null    object
 4   Abstract             419 non-null    object
dtypes: int64(1), object(4)
memory usage: 16.5+ KB


Unnamed: 0,Journal Name,Date of Publication,PubMedID,Title,Abstract
0,International journal of systematic and evolut...,2019-Aug,31145673,"Kineobactrum sediminis gen. nov., sp. nov., is...","A novel Gram-stain-negative, rod-shaped marine..."
1,International journal of systematic and evolut...,2019-Aug,31145674,"Cutaneotrichosporon suis sp. nov., a lipolytic...","Two conspecific yeast strains, which based on ..."
2,International journal of systematic and evolut...,2019-Aug,31145675,"Shewanella psychromarinicola sp. nov., a psych...","Two Gram-stain-negative, rod-shaped, facultati..."
3,International journal of systematic and evolut...,2019-Oct,31145676,,"A rod-shaped, spore-forming, thermophilic, che..."
4,International journal of systematic and evolut...,2019-Oct,31145678,,"Two Gram-stain-negative, catalase- and oxidase..."


Unnamed: 0,Journal Name,Date of Publication,PubMedID,Title,Abstract
415,International journal of systematic and evolut...,2016-Dec,27618795,"Labrenzia salina sp. nov., isolated from the r...","A novel, halophilic, motile, rod-shaped, Gram-..."
416,International journal of systematic and evolut...,2016-Dec,27619232,Isolation and characterization of a novel Gram...,The taxonomic position of a Gram-stain negativ...
417,International journal of systematic and evolut...,2016-Dec,27620694,"Patulibacter brassicae sp. nov., isolated from...","A novel actinobacterial strain, designated SDT..."
418,International journal of systematic and evolut...,2016-Dec,27620848,Genome-based phylogeny and taxonomy of the 'En...,Understanding of the phylogeny and interrelati...
419,International journal of systematic and evolut...,2016-Dec,27620889,"Saccharomonospora xiaoerkulensis sp. nov., iso...","A novel actinomycete, strain TRM 41495T, was i..."


<class 'pandas.core.frame.DataFrame'>
Int64Index: 108 entries, 40 to 419
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Journal Name         108 non-null    object
 1   Date of Publication  108 non-null    object
 2   PubMedID             108 non-null    int64 
 3   Title                108 non-null    object
 4   Abstract             108 non-null    object
 5   Quinone              108 non-null    object
 6   Keyword Sentence     108 non-null    object
 7   Species and Genus    108 non-null    object
dtypes: int64(1), object(7)
memory usage: 7.6+ KB


In [46]:

## keyword analysis - create a counter dictionary and update it with quinones from data
## taxonomy classification - NCBI's API for taxonony data?
## quinone length analysis
## visualization 

# statistics - on taxonomy distribution, chains of quinones, lengths varying different genus/order, variablility in the quinone chain.

# crosscheck species/genus mentioned in title with abstract to find sentence where that specific species/genus was mentioned.
### check what to do with pubmedResults.txt file
# classify organisms using the taxonomy based on species and genus names extracted.
# the hieacal classification order - missing species
# look for artices give a species, validate quinone info.
# get the tail length in a new column.

# determine frequency of occurrence of each type of quinone in data set.
from collections import Counter
from nltk.tokenize import word_tokenize

quinone_counter = Counter()

for quinone_description in df["Quinone Description"]:
    tokens = word_tokenize(quinone_description)
    quinone_counter.update([token for token in tokens if token in quinone_types])

print(quinone_counter)

# identify relationships between types of quinones found and species or genus of organisms.
# look for patterns or trends in data that might indicate a relationship between length of quinone tail and species or genus of organism.
# create visualizations to better understand and communicate these patterns or relationships.
# based on these patterns or relationships, make predictions about types of quinones that might be found in other species or genus.

# using 'Species and Genus' classify organisms according to their taxonomy 
# link organisms to specific quinones using 'Quinone Found' 
# understand quinone structure: information about type of quinone and length of quinone tail can be used to study structural diversity of quinones across different organisms.


Counter({'quinone': 178, 'menaquinone': 138, 'ubiquinone': 76, 'Menaquinone': 5})


In [None]:
# formating outputs
outputFile = open("pubMedResults50.txt", "w+")
for count, (index, row) in enumerate(df.iterrows(), start=1):
    outputFile.write(f"{count}.Journal: {row['Journal Name']}\n\tPubMedID: {row['PubMedID']}\n\tSpecies and Genus: {row['Species and Genus']}\n\tQuinone Found: {row['Quinone']}\n\t{row['Keyword Sentence']}\n\n")
outputFile.close()

In [None]:
# Feedback on your intermediate report:

# Please provide a title: 
    #Taxonomy Classification and Quinone Analysis from Scientific Publications: A Natural Language Processing Approach
# The report is clear and fulfills the requirements, but it contains many typos that should be corrected.

# There is no explicit related work section, and only one reference is given, this has to be strengthened.
# State of the art - explain the importance of work and relate it to how the last real research done on this was from the 1981 paper
# quinones are sometimes used as taxonomic markers - specific quiones can be link to specific species
# molecule that is a diagnostic for certain species
# look at relation same species order have same quinones

# There is a figure that reports experiments but no details are given. - explain to novice reader
# What was the motivation and the experimental conditions?
# Unique topic, computer science to improve the field of biology
# biological context - info, importantance, understand of quinone distrubution lilited. info in journals, microbiology intrest in quinones, TIMC motivations - pathways oninone biosynthese, resparatory chains, qunone type breath different substrates, what conditions organism can grow and colonize. quinone and organism give a hint of what conditions are used to grow them

# team - charaterizing quinone pathways by identifying genes in production
# confront with team data - validate, discover new pathways? 
# figures to explain the pathways, variants, etc.
# very expiremental work - plenty of surprises

# The purpose is to extract all this uncollected quinone data and compare it to research, in a way validating the journals and updating any new discoveries or information previously unknown.

# species -> quinones
