# Create Index

- First we load all required packages.

In [1]:
import requests
import os
from pathlib import Path

import pandas as pd
import numpy as np
import bibtexparser

from paperqa import Settings
from paperqa.agents.search import get_directory_index

#if all(os.environ.get(var) for var in ["OPENAI_API_KEY", "CR_API_MAILTO", "CR_API_AGENT", "SEMANTIC_SCHOLAR_API_KEY"]):
#    pass
#else:
#    raise EnvironmentError("Required environment variables are not set.")

- Testing the Semantic Scholar API

- Status can also be checked here: https://status.api.semanticscholar.org/

In [2]:
import requests

paperId = "649def34f8be52c8b66281af98ae884c09aef38b"

# Define the API endpoint URL
url = f"http://api.semanticscholar.org/graph/v1/paper/{paperId}"

# Define the query parameters
query_params = {"fields": "title,year,abstract,citationCount"}

# Directly define the API key (Reminder: Securely handle API keys in production environments)
#api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
#api_key = "e9i75ckBya3PRMOJdCUWu7gD3xUl2mAh20woJMUI"

# Define headers with API key
headers = {"x-api-key": None}

# Send the API request
response = requests.get(url, params=query_params, headers=headers)

# Check response status
if response.status_code == 200:
   response_data = response.json()
   # Process and print the response data as needed
   print(response_data)
else:
   print(f"Request failed with status code {response.status_code}: {response.text}")

{'paperId': '649def34f8be52c8b66281af98ae884c09aef38b', 'title': 'Construction of the Literature Graph in Semantic Scholar', 'abstract': 'We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org.', 'year': 2018, 'citationCount': 381}


- For now I will delete the semantic scholar API key, and just slow down the concurrency

In [3]:
#del os.environ['SEMANTIC_SCHOLAR_API_KEY']

- Create the `manifest.csv` file that contains all the required information.

In [4]:
export_dir_name = "2024-01-09_MS_Export"
project_dir = Path("/") / "Users" / "pschafer" / "Projects" / "MS_expert"
paper_dir = project_dir / export_dir_name
index_dir = project_dir / f"{export_dir_name}_index"
bib_file = paper_dir / f"{export_dir_name}.bib"
manifest_file = project_dir / f"{export_dir_name}_manifest.csv"

with open(bib_file, "r") as bibfile:
    bib_database = bibtexparser.load(bibfile)

attributes = ["title", "doi", "file"]
manifest_df_list = []

# Access the entries
entries = bib_database.entries
for entry in entries:
    check_vec = np.array([attribute in entry.keys() for attribute in attributes])
    if np.all(check_vec):
        entry_df = pd.DataFrame({"title": [entry["title"]],
                                 "doi": [entry["doi"]],
                                 "file_location": [entry["file"]]})
        manifest_df_list.append(entry_df)
    else:
        print(entry)
        print(check_vec)
manifest_df = pd.concat(manifest_df_list, axis=0)
manifest_df.reset_index()
manifest_df["file_location"] = [str(paper_dir) + "/" + f for f in manifest_df["file_location"]]

# test
#manifest_df = manifest_df.iloc[0:10, :]
manifest_df.to_csv(manifest_file)
manifest_df

{'pmid': '35658012', 'abstract': 'The immune system is highly time-of-day dependent. Pioneering studies in the 1960s were the first to identify immune responses to be under a circadian control. Only in the last decade, however, have the molecular factors governing circadian immune rhythms been identified. These studies have revealed a highly complex picture of the interconnectivity of rhythmicity within immune cells with that of their environment. Here, we provide a global overview of the circadian immune system, focusing on recent advances in the rapidly expanding field of circadian immunology.', 'doi': '10.1126/sciimmunol.abm2465', 'pages': 'eabm2465', 'number': '72', 'volume': '7', 'journal': 'Science Immunology', 'year': '2022', 'author': 'Wang, Chen and Lutes, Lydia Kay and Barnoud, Coline and Scheiermann, Christoph', 'title': 'The Circadian Immune System', 'ENTRYTYPE': 'article', 'ID': '10.1126/sciimmunol.abm2465'}
[ True  True False]
{'pmid': '34767455', 'pmcid': 'PMC8855935', '

Unnamed: 0,title,doi,file_location
0,Spatially Resolved Gene Signatures of White Ma...,10.1038/s41593-024-01765-6,/Users/pschafer/Projects/MS_expert/2024-01-09_...
0,Tensor-Based Insights into Systems Immunity an...,10.1016/j.it.2023.03.003,/Users/pschafer/Projects/MS_expert/2024-01-09_...
0,Cell Type Mapping Reveals Tissue Niches and In...,10.1038/s41593-024-01796-z,/Users/pschafer/Projects/MS_expert/2024-01-09_...
0,"Location, Location, Location: {{Tissue}} Resid...",10.1126/sciimmunol.aas9673,/Users/pschafer/Projects/MS_expert/2024-01-09_...
0,Innate Lymphoid Cells: {{A}} New Paradigm in I...,10.1126/science.aaa6566,/Users/pschafer/Projects/MS_expert/2024-01-09_...
...,...,...,...
0,{{TCR}} Sequencing Paired with Massively Paral...,10.1038/s41590-019-0544-5,/Users/pschafer/Projects/MS_expert/2024-01-09_...
0,Dual {{TCR T Cells}}: {{Identity Crisis}} or {...,10.4049/jimmunol.1800904,/Users/pschafer/Projects/MS_expert/2024-01-09_...
0,The Immunoregulatory Landscape of Human Tuberc...,10.1038/s41590-021-01121-x,/Users/pschafer/Projects/MS_expert/2024-01-09_...
0,Hematopoiesis in Numbers,10.1016/j.it.2021.10.006,/Users/pschafer/Projects/MS_expert/2024-01-09_...


- Building the index

In [5]:
# let's use a somewhat smaller llm for now
settings = Settings(paper_directory=paper_dir,
                    use_absolute_paper_directory=True,
                    index_directory=index_dir,
                    index_absolute_directory=True,
                    manifest_file=manifest_file,
                    llm="gpt-4o-mini", # smaller than default (bc cheaper)
                    summary_llm="gpt-4o-mini", # smaller than default (bc cheaper)
                    embedding="text-embedding-3-small", # default
                    temperature=0, # default
                    )
settings.agent.index.concurrency = 1 # reducing not to hit rate limits (bc my S2 key is not working anymore)
settings

Settings(llm='gpt-4o-mini', llm_config=None, summary_llm='gpt-4o-mini', summary_llm_config=None, embedding='text-embedding-3-small', embedding_config=None, temperature=0.0, batch_size=1, texts_index_mmr_lambda=1.0, index_absolute_directory=True, index_directory=PosixPath('/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export_index'), index_recursively=True, verbosity=0, manifest_file=PosixPath('/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export_manifest.csv'), paper_directory=PosixPath('/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export'), answer=AnswerSettings(evidence_k=10, evidence_detailed_citations=True, evidence_retrieval=True, evidence_summary_length='about 100 words', evidence_skip_summary=False, answer_max_sources=5, max_answer_attempts=None, answer_length='about 200 words, but can be longer', max_concurrent_requests=4, answer_filter_extra_background=False, get_evidence_if_no_contexts=True), parsing=ParsingSettings(chunk_size=5000, page_size_limit=1280000, use_doc_

In [6]:
settings.agent.index.paper_directory

PosixPath('/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export')

In [7]:
settings.agent.index.index_directory

PosixPath('/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export_index')

In [8]:
settings.agent.index.concurrency

1

- I still get `429` errors here, indicating time outs, because I am positing to many requests...

In [8]:
#built_index = await get_directory_index(settings=settings)

[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/27191/Tortosa-Carreres et al. - 2024 - Predictive potential of serum and cerebrospinal fluid biomarkers for disease activity in treated mul.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/8775/Mey and DeSilva - 2022 - Endothelial-to-mesenchymal transition in multiple .pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/22752/Machado-Santos et al. - 2018 - The compartmentalized inflammatory response in the multiple sclerosis brain is composed of tissue-re.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/27200/Krämer et al. - 2023 - Bruton tyrosine kinase inhibitors for multiple sclerosis.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/fil

MuPDF error: format error: cmsOpenProfileFromMem failed



SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.


MuPDF error: format error: cmsOpenProfileFromMem failed



SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
Metadata not found for Quantitating CD8+ T Cell Memory Development in SemanticScholarProvider.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
Metadata not found for Research Briefing: Human Mucosal-Associated Invariant T (MAIT) Cells Exhibit Many Function

In [9]:
print(settings.get_index_name())

pqa_index_8a1af8a9281b0ff67e51ecdf7bfc4dca


In [10]:
print(await built_index.index_files)

{'/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/22154/Weng - 2023 - Transcriptome-based measurement of CD8+ T cell age and its applications.pdf': '46f7f58a1e4e27f3ed8db79305471594', '/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/20635/Schirmer et al. - 2021 - Diversity and Function of Glial Cell Types in Multiple Sclerosis.pdf': '652af1a8fec8356740a54b4923eb92f1', '/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/23072/Mey and DeSilva - 2022 - Endothelial-to-mesenchymal transition in multiple sclerosis Good cells gone bad.pdf': '1c1d78817f46897681e12f52274e4b92', '/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/23075/Fischbach et al. - 2024 - CD19-targeted chimeric antigen receptor T cell therapy in two patients with multiple sclerosis.pdf': 'd62030b76e6d2fbdef1b612fdccceeeb', '/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/23086/Pollard and Bijker - 2021 - A guide to vaccinology from basic principles to new deve

In [15]:
len((await built_index.index_files).keys())

522

In [43]:
await built_index.save_index()

In [9]:
import anyio

index_settings = settings.agent.index
#index_settings.paper_directory = "/"

paper_directory = anyio.Path(index_settings.paper_directory)
valid_papers_rel_file_paths = [
        file.relative_to(paper_directory)
        async for file in (
            paper_directory.rglob("*")
            if index_settings.recurse_subdirectories
            else paper_directory.iterdir()
        )
        if file.suffix in {".txt", ".pdf", ".html"}
    ]
len(valid_papers_rel_file_paths)

522

In [10]:
await valid_papers_rel_file_paths[0].absolute()

Path('/Users/pschafer/Projects/MS_expert/files/23075/Fischbach et al. - 2024 - CD19-targeted chimeric antigen receptor T cell therapy in two patients with multiple sclerosis.pdf')

In [11]:
index_unique_file_paths = (await built_index.index_files).keys()
list(index_unique_file_paths)[0]

NameError: name 'built_index' is not defined

In [67]:
index_unique_file_paths = (await built_index.index_files).keys()
extra_index_files = index_unique_file_paths - {str(f) for f in valid_papers_rel_file_paths}
len(extra_index_files)
#index_settings.sync_with_paper_directory
#for extra_file in extra_index_files:

522

In [12]:
from paperqa.agents.models import QueryRequest
from paperqa.agents.search import SearchIndex, SearchDocumentStorage
from paperqa import Docs
from paperqa.agents.main import run_agent, AgentSettings


query = QueryRequest(
    query="What do you know about the role of the iron metabolism in multiple sclerosis?", settings=settings
    )
docs = Docs()

answers_index = SearchIndex(
    fields=[*SearchIndex.REQUIRED_FIELDS, "question"],
    index_name="answers",
    index_directory=query.settings.agent.index.index_directory,
    storage=SearchDocumentStorage.JSON_MODEL_DUMP,
)
await answers_index.docs_index_directory

Path('/Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export_index/answers/docs')

In [13]:
index_settings.use_absolute_paper_directory

True

In [14]:
docs

Docs(id=UUID('acd2aa93-da74-4714-a36d-92b4634e8dd4'), docs={}, texts=[], docnames=set(), texts_index=NumpyVectorStore(mmr_lambda=1.0, texts_hashes=set(), texts=[]), name='default', index_path=PosixPath('/Users/pschafer/.paperqa/default'), deleted_dockeys=set())

In [15]:
agent_type = AgentSettings.model_fields["agent_type"].default

response = await run_agent(docs, query, agent_type)
f"agent_response: {response}"

[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/23454/Ulutekin et al. - 2024 - B cell depletion attenuates CD27 signaling of T helper cells in multiple sclerosis.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/22322/Kappes and Wiest - 2023 - Doubling down to make killer T cells.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/22254/Nakaya et al. - 2011 - Systems biology of vaccination for seasonal influenza in humans.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/21598/Mendiola et al. - 2023 - Defining blood-induced microglia functions in neurodegeneration through multiomic profiling.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/23075/Fischbach et al. - 2024 - CD19-target

test


[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/22070/Ginhoux et al. - 2022 - Single-cell immunology Past, present, and future.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/21613/De Simone et al. - 2018 - Single Cell T Cell Receptor Sequencing Techniques and Future Challenges.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/20620/Lerma-Martin et al. - 2022 - Spatial cell type mapping of multiple sclerosis lesions.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/22284/Germain et al. - 2011 - Systems Biology in Immunology A Computational Modeling Perspective.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/20698/Wang et al. - 2021 - Antibodies from Multiple Sclerosis Brain Identified Epstein-Barr Virus Nuc

CancelledError: 

In [None]:
# test questions
from paperqa.agents.main import agent_query
from paperqa.agents.models import QueryRequest

settings = Settings(paper_directory=paper_dir,
                    use_absolute_paper_directory=True,
                    index_directory=index_dir,
                    index_absolute_directory=True,
                    manifest_file=manifest_file,
                    llm="gpt-4o-mini", # smaller than default (bc cheaper)
                    summary_llm="gpt-4o-mini", # smaller than default (bc cheaper)
)
settings.agent.index.sync_with_paper_directory = True
settings.agent.index.paper_directory = "/"
settings.verbosity = 3

answer_response_1 = await agent_query(
    query=QueryRequest(
        query="What do you know about the role of the iron metabolism in multiple sclerosis?", settings=settings
    )
)

[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/27191/Tortosa-Carreres et al. - 2024 - Predictive potential of serum and cerebrospinal fluid biomarkers for disease activity in treated mul.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/8775/Mey and DeSilva - 2022 - Endothelial-to-mesenchymal transition in multiple .pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/22752/Machado-Santos et al. - 2018 - The compartmentalized inflammatory response in the multiple sclerosis brain is composed of tissue-re.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/27200/Krämer et al. - 2023 - Bruton tyrosine kinase inhibitors for multiple sclerosis.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/fil

CancelledError: 

In [53]:
answer_response_1

AnswerResponse(session=PQASession(id=UUID('4f26ba55-0473-4ae9-b2cd-4c33cf35d44b'), question='What do you know about the role of the iron metabolism in multiple sclerosis?', answer='I cannot answer.', has_successful_answer=False, context='\n\nValid Keys: ', contexts=[], references='', formatted_answer='Question: What do you know about the role of the iron metabolism in multiple sclerosis?\n\nI cannot answer.\n', graded_answer=None, cost=0.00922705, token_counts={'gpt-4o-2024-08-06': [3236, 110], 'gpt-4o-mini': [227, 5]}, config_md5='ad06402d71626d2c954c56e03e43885a', tool_history=[['paper_search'], ['paper_search'], ['paper_search'], ['complete'], ['gen_answer']], used_contexts=set()), bibtex=None, status=<AgentStatus.UNSURE: 'unsure'>, timing_info=None, duration=0.0, stats=None)

In [22]:
answer_response_2 = await agent_query(
    query=QueryRequest(
        query="What is multiple sclerosis", settings=settings
    )
)

[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/27191/Tortosa-Carreres et al. - 2024 - Predictive potential of serum and cerebrospinal fluid biomarkers for disease activity in treated mul.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/8775/Mey and DeSilva - 2022 - Endothelial-to-mesenchymal transition in multiple .pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/files/22752/Machado-Santos et al. - 2018 - The compartmentalized inflammatory response in the multiple sclerosis brain is composed of tissue-re.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/files/27200/Krämer et al. - 2023 - Bruton tyrosine kinase inhibitors for multiple sclerosis.pdf from index.[/bold red]
[bold red]Removing /Users/pschafer/Projects/MS_expert/2024-01-09_MS_Export/Exported Items/fil

In [23]:
answer_response_2

AnswerResponse(session=PQASession(id=UUID('50353f7f-ea02-4cc1-9717-45be55260d9f'), question='What is multiple sclerosis', answer='I cannot answer.', has_successful_answer=False, context='\n\nValid Keys: ', contexts=[], references='', formatted_answer='Question: What is multiple sclerosis\n\nI cannot answer.\n', graded_answer=None, cost=0.00902055, token_counts={'gpt-4o-2024-08-06': [3190, 101], 'gpt-4o-mini': [217, 5]}, config_md5='31024d5c8e705dc5e4e85603fa19212e', tool_history=[['paper_search'], ['paper_search'], ['paper_search'], ['complete'], ['gen_answer']], used_contexts=set()), bibtex=None, status=<AgentStatus.UNSURE: 'unsure'>, timing_info=None, duration=0.0, stats=None)

# Appendix

- This is the old code, based on the readcube pipeline.

In [26]:
#info = pd.read_csv("export_2024-10-2.csv")
#
#pdf_files = list(Path("export_2024-10-2").iterdir())
#
#pdf_table = pd.DataFrame({
#    "year": [pdf_file.name.split("_")[0] if re.match("[0-9]+", pdf_file.name.split("_")[0]) else "NaN" for pdf_file in pdf_files],
#    "pmid": [float(pdf_file.name.split("_")[-1].replace(".pdf", "")) if re.match("[0-9]+", pdf_file.name.split("_")[-1]) else np.nan for pdf_file in pdf_files],
#    "path": [pdf_file.resolve() for pdf_file in pdf_files],
#    "title": [pdf_file.name.split("_")[3] if len(pdf_file.name.split("_")) > 1 else "NaN" for pdf_file in pdf_files],
#    "first_auth": [pdf_file.name.split("_")[1] if len(pdf_file.name.split("_")) > 1 else "NaN" for pdf_file in pdf_files],
#    "last_auth": [pdf_file.name.split("_")[2] if len(pdf_file.name.split("_")) > 1 else "NaN" for pdf_file in pdf_files],
#    "name": [file.name for file in pdf_files]
#})
#
#info["file_location"] = ["" for _ in range(len(info))]
#info["match_via"] = ["" for _ in range(len(info))]
#for row_idx in range(info.shape[0]):
#    title_oi = info["title"][row_idx].replace(":", "-").replace("/", "-")
#    first_auth = info["author"][row_idx].split(",")[0]
#    last_auth = info["author"][row_idx].split(",")[-1]
#    year = info["year"][row_idx]
#    pmid = info["pmid"][row_idx]
#
#    # match via pmid
#    if not np.isnan(pmid) and pmid in pdf_table["pmid"].to_list():
#
#        for pdf_idx in range(pdf_table.shape[0]):
#            pmid_pdf = pdf_table["pmid"][pdf_idx]
#            file_location = pdf_table["path"][pdf_idx]
#
#            if not np.isnan(pmid_pdf):
#                if np.isclose(pmid, pmid_pdf):
#                    info["file_location"].values[row_idx] = file_location
#                    info["match_via"].values[row_idx] = "pmid"
#
#    # match via title
#    else:
#        for pdf_idx in range(pdf_table.shape[0]):
#            pmid_pdf = pdf_table["pmid"][pdf_idx]
#            file_location = pdf_table["path"][pdf_idx]
#            pdf_name = pdf_table["name"][pdf_idx]
#
#            if title_oi in pdf_name:
#                info["file_location"].values[row_idx] = file_location
#                info["match_via"].values[row_idx] = "title"
#
#info_subset = info[["title", "doi", "file_location"]].query("file_location != ''")
#info_subset.to_csv("manifest.csv")
#info_subset