In [1]:
from pathlib import Path
from zipfile import ZipFile
import deepsearch as ds
import os
import json
import re
import pdf2doi
import csv
import pandas as pd
pdf2doi.config.set("verbose", False)
from get_extraction import get_spans
from ipydatagrid import DataGrid

In [None]:
# Deep Search connection settings; both values are blank placeholders.
PROFILE_NAME = ""  # the profile to use
# Project Key : https://ds4sd.github.io/deepsearch-toolkit/#getting-started
PROJ_KEY = ""
# API client built from the environment for the chosen profile.
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

In [3]:
def convert_and_download_documents(source_path: Path, api, proj_key, output_dir: Path):
    """Convert documents with the Deep Search toolkit and download the results.

    Args:
        source_path (Path): Location of the documents to convert.
        api: Deep Search API client (the object built with ``ds.CpsApi``).
        proj_key: Key of the Deep Search project the conversion runs in.
        output_dir (Path): Directory the conversion results are downloaded to.
    """
    conversion = ds.convert_documents(
        source_path=source_path,
        proj_key=proj_key,
        api=api,
        progress_bar=True,
    )
    conversion.download_all(progress_bar=True, result_dir=output_dir)


def process_zip_archive(zip_file: Path) -> "Path | None":
    """Extract the first JSON member of a zip archive next to the archive.

    Only the first member whose name ends with ``.json`` is extracted; any
    further members are ignored.

    Args:
        zip_file (Path): Path to the zip archive.

    Returns:
        Path | None: Path of the file written to disk, or ``None`` when the
        archive contains no ``.json`` member.
    """
    with ZipFile(zip_file) as archive:
        for member in archive.namelist():
            if member.endswith(".json"):
                # extract() sanitises the member name (drops ".." and absolute
                # prefixes) and returns the path it really wrote, so use that
                # instead of rebuilding the path from the raw member name.
                return Path(archive.extract(member, zip_file.parent))
    # Made explicit: the archive held no JSON file.
    return None

In [4]:
# Working folder for the publication files, given as a relative path.
input_dir = Path("publication")

In [5]:
"""convert_and_download_documents(
        source_path=input_dir,
        api=api, 
        proj_key=PROJ_KEY, 
        output_dir=input_dir
)
print("Done")
for zip_file in [file for file in input_dir.iterdir() if file.name[-4:] == ".zip"]:
    process_zip_archive(zip_file=zip_file)
    zip_file.unlink()"""

'convert_and_download_documents(\n        source_path=input_dir,\n        api=api, \n        proj_key=PROJ_KEY, \n        output_dir=input_dir\n)\nprint("Done")\nfor zip_file in [file for file in input_dir.iterdir() if file.name[-4:] == ".zip"]:\n    process_zip_archive(zip_file=zip_file)\n    zip_file.unlink()'

In [6]:
# Run pdf2doi over the "publication" folder. Later code reads
# results[0]["identifier"] as the DOI, so this is presumably a list with one
# dict per PDF — confirm against the pdf2doi documentation.
results = pdf2doi.pdf2doi("publication")

In [7]:
# Turn every converted JSON document in `json_folder_path` into a training
# sample: a tab-separated text file (title + abstract up to the results
# section) and a copy of the JSON, both named after a running counter and the DOI.
json_folder_path = "publication"
outpu_folder_path = "publication/Training_data/"
counter = 0

# Start-of-text boilerplate that must not end up in the training text.
_COPYRIGHT_LINE = re.compile(r"© \d{4} ")


def _extract_title(document):
    """Return the title from the description, else the first level-1 subtitle, else ""."""
    description = document.get("description")
    title = description.get("title") if isinstance(description, dict) else None
    if title:
        print("Title:", title)
        return title
    for item in document.get("main-text", []):
        if item.get("type") == "subtitle-level-1" and item.get("text"):
            print(item["text"])
            return item["text"]
    return ""


def _lookup_doi(json_file_name, pdf_results):
    """Return the DOI of the PDF sharing the JSON file's base name.

    Falls back to the first pdf2doi result when no base name matches, which
    is what this cell always did before.
    """
    stem = os.path.splitext(json_file_name)[0]
    for result in pdf_results:
        if not result:
            continue
        # pdf2doi stores the PDF location under "path". Assumes the converted
        # JSON keeps the PDF's base name — TODO confirm for Deep Search exports.
        pdf_name = os.path.basename(result.get("path") or "")
        if os.path.splitext(pdf_name)[0] == stem:
            return result.get("identifier")
    if pdf_results and pdf_results[0]:
        return pdf_results[0].get("identifier")
    return None


def _find_section_bounds(main_text):
    """Return (start, end): first "ABSTRACT" item up to the next "RESULTS" item.

    A missing marker falls back to the start or the end of the document, so
    the indices are never undefined or carried over from a previous file.
    """
    start = 0
    for index, item in enumerate(main_text):
        if "ABSTRACT" in (item.get("text") or ""):
            start = index
            break
    end = len(main_text)
    for index, item in enumerate(main_text[start:], start=start):
        if "RESULTS" in (item.get("text") or ""):
            end = index
            break
    return start, end


def _is_boilerplate(text):
    """Return True for copyright lines, the "* s ı" marker and Supporting Information headers."""
    return (
        _COPYRIGHT_LINE.match(text) is not None
        or "* s ı" in text
        or text.startswith("Supporting Information")
    )


# The output folder is not guaranteed to exist on a fresh checkout.
os.makedirs(outpu_folder_path, exist_ok=True)

# Sorted so the counter prefix is reproducible; os.listdir order is arbitrary.
for json_file_name in sorted(os.listdir(json_folder_path)):
    if not json_file_name.endswith(".json"):
        continue
    json_file_path = os.path.join(json_folder_path, json_file_name)

    # Explicit UTF-8: the text contains characters such as "©" and "ı".
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    title = _extract_title(data)
    text_segments = ["Title:" + title + "\t"]

    doi = _lookup_doi(json_file_name, results)
    print("DOI: ", doi)

    # Extract text from abstract to Results and Discussion
    main_text = data.get("main-text", [])
    start_index, end_index = _find_section_bounds(main_text)
    print(end_index)

    for item in main_text[start_index:end_index]:
        text = item.get("text")
        if isinstance(text, str) and not _is_boilerplate(text):
            text_segments.append(text)

    full_text = "\t".join(text_segments)

    if not doi:
        print("DOI not found.")
        continue

    # "/" is not allowed in a file name, so the DOI is written with dots.
    output_stem = f"{counter + 1}_" + doi.replace("/", ".")

    text_output_file_path = os.path.join(outpu_folder_path, output_stem + ".txt")
    with open(text_output_file_path, "w", encoding="utf-8") as text_output_file:
        text_output_file.write(full_text)
    print(f"Text has been written to {text_output_file_path}")

    # Save the original JSON next to the text, under the same DOI-based name.
    json_output_file_path = os.path.join(outpu_folder_path, output_stem + ".json")
    with open(json_output_file_path, "w", encoding="utf-8") as json_output_file:
        json.dump(data, json_output_file, indent=4)
    counter = counter + 1

Talarodrides A-F, Nonadrides from the Antarctic Sponge-Derived Fungus Talaromyces sp. HDN1820200
DOI:  10.1021/acs.jnatprod.1c00203
19
Text has been written to publication/Training_data/1_10.1021.acs.jnatprod.1c00203.txt


In [8]:
# get_spans comes from the project's get_extraction module; it is unpacked
# here into three values, of which only data_dict is used below.
# NOTE(review): full_text and doi are leftovers of the extraction loop, so
# they describe only the last JSON file that loop processed.
extracted_text, data_dict, positions = get_spans(full_text)
# Attach the DOI so downstream code can read it from the same mapping.
data_dict['DOI'] = doi
print(data_dict)

{'compound_group': "Talarodrides A-F', 'talarodrides A-F(1-6)'", 'compound_class': 'Nonadrides, dimeric maleic anhydride nonadrides, maleic anhydride nonadrides', 'organism_part': 'nan', 'organism_or_species': 'Talaromyces sp', 'geo_location': 'Antarctic', 'Kingdom': 'Fungi', 'trivial_name': 'Talarodride A (1), talarodride B (2)', 'location': 'Antarctic Sponge', 'iupac_name': 'nan', 'abbreviation': 'nan', 'iupac_like_name': 'nan', 'DOI': '10.1021/acs.jnatprod.1c00203'}


In [12]:
def get_COCONUT_input(text_data, SMILES_list):
    """Build the COCONUT submission table, one row per SMILES string.

    Every row carries the same text-derived metadata; only
    ``canonical_smiles`` differs between rows.

    Args:
        text_data: Mapping with the keys ``trivial_name``, ``DOI``,
            ``organism_or_species``, ``organism_part``, ``geo_location``
            and ``location``.
        SMILES_list: Iterable of SMILES strings.

    Returns:
        pandas.DataFrame: Table with the fixed COCONUT columns; it has no
        rows when ``SMILES_list`` is empty.

    Raises:
        KeyError: If ``text_data`` lacks one of the required keys.
    """
    columns = ['canonical_smiles', 'reference_id', 'name', 'doi', 'link', 'organism',
               'organism_part', 'coconut_id', 'mol_filename', 'structural_comments',
               'geo_location', 'location']

    # Loop-invariant metadata, computed once. str() keeps a non-string
    # trivial_name (e.g. a float NaN) from raising on .split().
    shared = {
        'reference_id': '',
        'name': str(text_data['trivial_name']).split(",")[0].strip(),
        'doi': text_data['DOI'],
        'link': '',
        'organism': text_data['organism_or_species'],
        'organism_part': text_data['organism_part'],
        'coconut_id': '',
        'mol_filename': '',
        'structural_comments': '',
        'geo_location': text_data['geo_location'],
        'location': text_data['location'],
    }

    # Build all records first and construct the frame once, instead of
    # growing it with pd.concat on every iteration.
    records = [{'canonical_smiles': smiles, **shared} for smiles in SMILES_list]
    return pd.DataFrame(records, columns=columns)

In [13]:
# Collect the predicted SMILES column from the image-extraction CSV.
with open('Image_Data_Extraction/final_output.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    smiles_list = [record['Predicted Smiles'] for record in reader]

print(smiles_list)

['CC(=O)NC1=CC=C(C=C1)OO', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)OO', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)OO', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)OC[C@@H]2[C@@H](CO)O[C@](CO)([C@H]2O)OC3[C@@H]([C@H]([C@@H]([C@@H](COC4[C@@H]([C@H]([C@H]([C@@H](CO)O4)O)O)O)O3)O)O)OOO']


In [14]:
# Combine the text-derived metadata with the predicted SMILES into one table.
df = get_COCONUT_input(data_dict,smiles_list)

In [15]:
# Show the table in an editable ipydatagrid widget (editable=True).
grid = DataGrid(df, base_row_size=30, base_column_size=300, editable=True)
grid

DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, base_column_size=300, base_row_size=…

In [26]:
# Read the table back from the grid widget and export it as CSV.
# NOTE(review): grid.data presumably reflects edits made in the widget, so the
# exported file depends on manual interaction and is not reproducible from
# code alone — confirm against the ipydatagrid documentation.
updated_df = grid.data
updated_df.to_csv('data_updated.csv', index=False)