In [None]:
import os
import pickle
from urllib.error import HTTPError
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
from Bio import Entrez
from scipy.cluster.hierarchy import dendrogram, linkage

import parse
import taxonomic_classification

1. The `count_keywords` function returns the number of articles that cite DeepLabCut for each species. Note that the function is designed to exclude the "others" category.

In [None]:
# Count, per species keyword, the articles in the PubMed RIS export (titles included in the search).
df = parse.count_keywords(
    file_name='madlc_pubmed.ris',
    include_titles=True,
)

2. Since this function takes a while to run, let's save its output as a pickle file in the repository.

In [None]:
# Persist the keyword counts so the slow parsing step does not have to be re-run.
with open('count_keywords_pubmed.pkl', mode='wb') as pkl_file:
    pickle.dump(df, pkl_file)

3. Load the keyword-count dictionary back from its pickle file.

In [None]:
# Reload the cached counts from the same file the save cell writes
# ('count_keywords_pubmed.pkl'), so a fresh Restart & Run All round-trips cleanly.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('count_keywords_pubmed.pkl', 'rb') as file:
    df = pickle.load(file)

4. From the dictionary, get the list of animals: only each animal's name/label, without the counts.

In [None]:
# Keep just the dictionary keys (the animal names/labels); the counts are not needed here.
animal_list = [label for label in df.keys()]

In [None]:
# Display the list of animal labels taken from the dictionary keys.
animal_list

In [None]:
# Look up the entry stored under the 'insect' label.
df['insect']

5. Convert this list of common names to a list of scientific names.

The NCBI E-utilities we are using require an email address to be specified. So first, we need to set our email before making the API calls. 

In [None]:
# NCBI asks every E-utilities client to identify itself with an e-mail address.
# Take it from the NCBI_EMAIL environment variable so no personal address is
# committed with the notebook; if the variable is unset, Entrez.email is left as is.
ncbi_email = os.environ.get("NCBI_EMAIL")
if ncbi_email:
    Entrez.email = ncbi_email

# Translate the common names into scientific names with the project helper.
scientific_names = taxonomic_classification.common_to_scientific_names(animal_list)

In [None]:
def common_to_scientific(common_names:list, specific_species=None):
    """
    Convert specific animal common names to scientific names via the NCBI Taxonomy database.

    Only names listed in ``specific_species`` are looked up; every other entry
    (broad categories like 'primate' or 'crab') is passed through unchanged.

    Args:
        common_names (list): A list of common animal names.
        specific_species (iterable of str, optional): The names that should be
            looked up. Defaults to
            ('mouse', 'dog', 'sheep', 'antelope', 'ant', 'jellyfish').

    Returns:
        list: One entry per input name, in input order. An entry is the
              scientific name when the lookup succeeds; otherwise (broad category,
              no match, or any error during the lookup) it is the unchanged common name.
    """
    if specific_species is None:
        specific_species = ('mouse', 'dog', 'sheep', 'antelope', 'ant', 'jellyfish')  # Add more specific species here
    # Materialise once so a generator argument is not consumed by the first membership test.
    lookup_names = tuple(specific_species)

    def read_and_close(handle):
        # Entrez.read may raise on a malformed reply; release the handle either way.
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    scientific_names = []

    for common_name in common_names:
        if common_name not in lookup_names:
            scientific_names.append(common_name)
            continue

        try:
            record = read_and_close(
                Entrez.esearch(db="taxonomy", term=f"{common_name}[Common Name]")
            )

            if record["Count"] == "0" or not record["IdList"]:
                scientific_names.append(common_name)  # Keep the common name if no scientific name is found
                continue

            # Use the first hit returned by the search.
            taxid = record["IdList"][0]
            records = read_and_close(
                Entrez.efetch(db="taxonomy", id=taxid, retmode="xml")
            )
            scientific_names.append(records[0]["ScientificName"])
        except Exception as e:
            # Deliberate best-effort: report the failure and fall back to the common name.
            print(f"An error occurred for '{common_name}': {str(e)}")
            scientific_names.append(common_name)  # Keep the common name in case of an error

    return scientific_names


In [None]:
# Convert the animal labels with the notebook-local helper defined in this notebook.
sn = common_to_scientific(common_names=animal_list)

In [None]:
# Display the names returned by common_to_scientific.
sn

In [None]:
def classifyanimals(animal_list:list, api_key:str):
    """
    Classify a list of animals based on their scientific names using the NCBI Taxonomy Database.

    Args:
        animal_list (list): A list of scientific names of animals to be classified.
        api_key (str): A valid API key for access to NCBI services. You need a user in the NCBI server and
                find the API keys in user settings.

    Returns:
        pd.DataFrame: One row per entry of ``animal_list``, in input order, holding the
                     first taxonomy record fetched for that name.
                     If no ID is found, or an HTTPError occurs, the row is built from an
                     empty record, so its cells are missing values (NaN).
    """
    Entrez.api_key = api_key

    def read_and_close(handle):
        # Parse the reply and always release the handle, even if parsing fails.
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    results = []
    for animal in animal_list:
        # An empty mapping (rather than None) keeps pd.DataFrame able to build the
        # table when failed and successful lookups are mixed.
        taxonomy_data = {}
        try:
            record = read_and_close(Entrez.esearch(db="taxonomy", term=animal))

            if record["IdList"]:
                # Use the first hit returned by the search.
                taxid = record["IdList"][0]
                records = read_and_close(
                    Entrez.efetch(db="taxonomy", id=taxid, retmode="xml")
                )
                taxonomy_data = records[0]
        except HTTPError as e:
            print(f"HTTPError for '{animal}': {e}")

        results.append(taxonomy_data)

    return pd.DataFrame(results)

In [None]:
# Read the NCBI API key from the NCBI_API_KEY environment variable instead of
# committing it with the notebook (raises KeyError if the variable is not set).
# SECURITY: a literal key used to be stored in this cell; treat it as leaked and revoke it.
r = classifyanimals(sn, api_key=os.environ["NCBI_API_KEY"])

#######################################

In [None]:
# Drop the None entries from scientific_names, then display what is left.
filtered_list = list(filter(lambda item: item is not None, scientific_names))
filtered_list

6. Classify a list of animals based on their scientific names using the NCBI Taxonomy Database.

In [None]:
# Read the NCBI API key from the NCBI_API_KEY environment variable instead of
# committing it with the notebook (raises KeyError if the variable is not set).
# SECURITY: a literal key used to be stored in this cell; treat it as leaked and revoke it.
classified_data = taxonomic_classification.classify_animal(
    animal_list=filtered_list,
    api_key=os.environ["NCBI_API_KEY"],
)

In [None]:
# Show the 'Division' field of the classification result.
classified_data['Division']
# Alternative view of the same result, currently disabled:
#classified_data['Lineage']

In [None]:
import pandas as pd
import requests

def classify(animal_list: list, api_key: str, timeout: float = 30.0):
    """
    Look up each name with the ITIS ``searchForAnyMatch`` web service and tabulate the result.

    Args:
        animal_list (list): Names to search for, one request per name.
        api_key (str): Not used by this function; kept so existing calls that
            pass it keep working.
        timeout (float): Seconds to wait for each HTTP request before giving up,
            so a stalled connection cannot hang the notebook.

    Returns:
        pd.DataFrame: Columns ``ScientificName`` (the searched name) and ``Category``
            (the ``name`` of the first ``commonNames`` entry in the response).
            Names whose response has no usable ``commonNames`` entry, or whose
            request fails, are reported on stdout and omitted. The two columns are
            present even when no name produced a row.
    """
    itis_url = "https://www.itis.gov/ITISWebService/jsonservice/searchForAnyMatch?srchKey="

    categorized_results = []

    for animal in animal_list:
        try:
            # Percent-encode the search key so spaces, '&', '#', etc. cannot corrupt the query string.
            search_key = quote(str(animal), safe="")
            response = requests.get(f"{itis_url}{search_key}", timeout=timeout)
            itis_data = response.json()

            if "commonNames" in itis_data:
                common_names = itis_data["commonNames"]
                if common_names:
                    common_name = common_names[0].get("name", "")
                    result = {"ScientificName": animal, "Category": common_name}
                    categorized_results.append(result)
            else:
                print(f"No commonNames field in response for '{animal}'")
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next name.
            print(f"An error occurred for '{animal}': {e}")

    # Explicit columns keep the schema stable when nothing matched.
    df = pd.DataFrame(categorized_results, columns=["ScientificName", "Category"])
    return df.dropna()


In [None]:
# classify() queries ITIS and does not use its api_key argument, so no secret is needed here.
# The value is read from the environment (empty string if unset) rather than committed.
# SECURITY: a literal NCBI key used to be stored in this cell; treat it as leaked and revoke it.
d = classify(animal_list=animal_list, api_key=os.environ.get("NCBI_API_KEY", ""))

# NEW CODE

In [2]:
from reading_pdf import extract_section_text, get_animals_from_abstract, analyze_papers_from_abstracts, check_deeplabcut_citation, analyze_papers

In [3]:
# Folder holding the PDFs to scan. Set the PAPERS_DIR environment variable to point
# at your own library; the original author's local path is kept as the fallback.
# Named `papers_dir` rather than `dir` so the built-in dir() is not shadowed in the kernel.
papers_dir = os.environ.get("PAPERS_DIR", '/Users/annateruel/Documents/Papers Library')
papers_with_deeplabcut_count, _, papers_with_deeplabcut_dict, _ = analyze_papers(papers_dir)

Reading /Users/annateruel/Documents/Papers Library/Hausmann-Measuring and modeling the motor system with machine learning-2021-arXiv.pdf...
No table of contents found, reading full document.
Reading /Users/annateruel/Documents/Papers Library/Hunter-Dopamine Neuron Stimulation Induces Context-Dependent Dyskinesias in Non-Parkinsonian Rats-2021-SSRN Electronic Journal.pdf...
No table of contents found, reading full document.
Reading /Users/annateruel/Documents/Papers Library/Pang-State-dependent central synaptic regulation by GLP-1 is essential for energy homeostasis-2024-Research Square.pdf...
No table of contents found, reading full document.
Reading /Users/annateruel/Documents/Papers Library/Ota-Implementing machine learning methods for imaging flow cytometry-2020-Microscopy.pdf...
Table of contents found.
Reading /Users/annateruel/Documents/Papers Library/Villafranca-Faus-Integrating pheromonal and spatial information in the amygdalo-hippocampal network-2021-Nature Communications.pdf

In [4]:
# Run the abstract-based animal extraction over the dictionary returned by analyze_papers
# (presumably the papers that cite DeepLabCut, judging by the variable name; confirm in reading_pdf).
animals_in_papers = analyze_papers_from_abstracts(papers_with_deeplabcut_dict)

Processing /Users/annateruel/Documents/Papers Library/Hausmann-Measuring and modeling the motor system with machine learning-2021-arXiv.pdf...
No table of contents found, reading full document.
Processing /Users/annateruel/Documents/Papers Library/Hunter-Dopamine Neuron Stimulation Induces Context-Dependent Dyskinesias in Non-Parkinsonian Rats-2021-SSRN Electronic Journal.pdf...
No table of contents found, reading full document.
Processing /Users/annateruel/Documents/Papers Library/Pang-State-dependent central synaptic regulation by GLP-1 is essential for energy homeostasis-2024-Research Square.pdf...
No table of contents found, reading full document.
Processing /Users/annateruel/Documents/Papers Library/Ota-Implementing machine learning methods for imaging flow cytometry-2020-Microscopy.pdf...
Table of contents found.
Processing /Users/annateruel/Documents/Papers Library/Villafranca-Faus-Integrating pheromonal and spatial information in the amygdalo-hippocampal network-2021-Nature Com

In [None]:
# Print one "<paper>: <animals>" line per entry of the result.
for entry in animals_in_papers.items():
    paper, animals = entry
    print(f"{paper}: {animals}")