In [6]:
import pandas as pd
import usi_utils

In [7]:
# Column schema shared by every file-statistics table read in this notebook.
# Text-like columns stay `object`; sizes and the update flag are integers,
# spectrum counts are floats.
dtype = dict(
    filepath=object,
    dataset=object,
    collection=object,
    is_update="int64",
    update_name=object,
    create_time=object,
    size="int64",
    size_mb="int64",
    sample_type=object,
    spectra_ms1="float64",
    spectra_ms2="float64",
    instrument_vendor=object,
    instrument_model=object,
    file_processed=object,
    file_usi=object,
)

# Read the mzML and the mzXML listing with the same schema, then stack them
# into one table (row labels of the two inputs are kept as they are).
mzml_stats, mzxml_stats = (
    pd.read_csv(stats_path, dtype=dtype)
    for stats_path in (
        "../local_files/all_gnps_files/all_mzml.txt",
        "../local_files/all_gnps_files/all_mzxml.txt",
    )
)
file_stats = pd.concat([mzml_stats, mzxml_stats])

In [8]:
# Build one USI per row by passing each (filepath, dataset) pair to
# usi_utils.create_simple_file_usi, then display the table.
file_stats["file_usi"] = list(
    map(
        usi_utils.create_simple_file_usi,
        file_stats["filepath"],
        file_stats["dataset"],
    )
)
file_stats

Unnamed: 0,filepath,dataset,collection,is_update,update_name,create_time,size,size_mb,sample_type,spectra_ms1,spectra_ms2,instrument_vendor,instrument_model,file_processed,file_usi
0,MSV000073062/73062/mzml/peaklist1.mzml,MSV000073062,73062,0,,2013-06-07 00:00:00,7166071,6,DEFAULT,0.0,0.0,,,FAILED,mzspec:MSV000073062:peaklist1
1,MSV000073062/73062/mzml/peaklist2.mzml,MSV000073062,73062,0,,2013-06-07 00:00:00,8821747,8,DEFAULT,0.0,0.0,,,FAILED,mzspec:MSV000073062:peaklist2
2,MSV000073062/73062/mzml/peaklist3.mzml,MSV000073062,73062,0,,2013-06-07 00:00:00,5751122,5,DEFAULT,0.0,0.0,,,FAILED,mzspec:MSV000073062:peaklist3
3,MSV000073062/73062/mzml/peaklist4.mzml,MSV000073062,73062,0,,2013-06-07 00:00:00,10366660,9,DEFAULT,0.0,0.0,,,FAILED,mzspec:MSV000073062:peaklist4
4,MSV000073062/73062/mzml/peaklist5.mzml,MSV000073062,73062,0,,2013-06-07 00:00:00,2216166,2,DEFAULT,0.0,0.0,,,FAILED,mzspec:MSV000073062:peaklist5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484281,MSV000091804/updates/2023-04-28_Bjosephc_21f27...,MSV000091804,peak,1,2023-04-28_Bjosephc_21f275f7,2023-04-26 03:17:00,160338738,152,DEFAULT,8275.0,97.0,Thermo Fisher Scientific,Q Exactive HF,DONE,mzspec:MSV000091804:2022081803
484282,MSV000091804/updates/2023-04-28_Bjosephc_21f27...,MSV000091804,peak,1,2023-04-28_Bjosephc_21f275f7,2023-04-26 03:16:00,152195548,145,DEFAULT,7066.0,33.0,Thermo Fisher Scientific,Q Exactive HF,DONE,mzspec:MSV000091804:2022081806
484283,MSV000091804/updates/2023-04-28_Bjosephc_262e3...,MSV000091804,peak,1,2023-04-28_Bjosephc_262e347f,2023-04-26 02:57:00,68710868,65,DEFAULT,3603.0,0.0,Thermo Fisher Scientific,Q Exactive HF,DONE,mzspec:MSV000091804:BJ2022111603
484284,MSV000091804/updates/2023-04-28_Bjosephc_262e3...,MSV000091804,peak,1,2023-04-28_Bjosephc_262e347f,2023-04-26 02:57:00,71670891,68,DEFAULT,3795.0,0.0,Thermo Fisher Scientific,Q Exactive HF,DONE,mzspec:MSV000091804:BJ2022111604


In [9]:
# Order rows by MS2 spectrum count, highest first, so that drop_duplicates
# (which keeps the first occurrence) retains the row with the most MS2
# spectra for every USI.
file_stats = (
    file_stats
    .sort_values(by="spectra_ms2", ascending=False)
    .drop_duplicates(subset="file_usi")
)

In [10]:
# Save the deduplicated table without the row labels.
# The path uses forward slashes: they resolve on Windows, Linux and macOS and
# match the path this file is read back from. A backslash-separated path is
# only split into directories on Windows.
file_stats.to_csv("../local_files/all_gnps_files/all_gnps_files.csv", index=False)

In [11]:
# Reload the saved table with the shared schema and add a constant
# `has_stats` flag marking every row as coming from the statistics table.
file_stats = pd.read_csv(
    "../local_files/all_gnps_files/all_gnps_files.csv",
    dtype=dtype,
).assign(has_stats=True)
file_stats.shape

(679308, 16)

In [18]:
import requests

# Download, as a single streamed CSV, every row of the dataset-cache
# `filename` table whose filepath ends with ".mzxml".
# timeout=(connect, read) in seconds: without a timeout, requests waits
# indefinitely on an unresponsive server and blocks the kernel.
response = requests.get(
    "https://gnps-datasetcache.ucsd.edu/datasette/database/filename"
    ".csv?_stream=on&_sort=filepath&filepath__endswith=.mzxml&_size=max",
    timeout=(10, 300),
)
# Fail loudly on HTTP error codes instead of saving an error page later on.
response.raise_for_status()

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): gnps-datasetcache.ucsd.edu:443
DEBUG:urllib3.connectionpool:https://gnps-datasetcache.ucsd.edu:443 "GET /datasette/database/filename.csv?_stream=on&_sort=filepath&filepath__endswith=.mzxml&_size=max HTTP/1.1" 200 80761312


In [30]:
# Cache the downloaded CSV text on disk.
# - Forward slashes resolve on every platform and match the other paths in
#   this notebook.
# - encoding="utf-8" avoids the platform default codec, which may not be able
#   to encode every character of the listing.
# - newline="" writes line endings exactly as received instead of translating
#   them, which would otherwise double carriage returns on Windows.
with open(
    "../local_files/all_gnps_files/response.csv",
    mode="w",
    encoding="utf-8",
    newline="",
) as f:
    f.write(response.text)

In [None]:
# Load the cached download into a DataFrame. read_csv needs the file to read,
# and `shape` is an attribute holding (rows, columns), not a method.
response_df = pd.read_csv("../local_files/all_gnps_files/response.csv")
response_df.shape

In [13]:
from masst_utils import SPECIAL_MASSTS

# --- First row: totals over the complete statistics table -------------------
num_files = len(file_stats)
ms1_total = file_stats["spectra_ms1"].sum()
ms2_total = file_stats["spectra_ms2"].sum()
rows = [
    dict(
        spectra_ms1_sum=ms1_total,
        spectra_ms2_sum=ms2_total,
        spectra_ms1_mean=ms1_total / num_files,
        spectra_ms2_mean=ms2_total / num_files,
        size_mb=file_stats["size_mb"].sum(),
        num_files=num_files,
        # Count of rows whose processing status is exactly "DONE".
        num_files_processed=int((file_stats["file_processed"] == "DONE").sum()),
        description="ALL_GNPS/MASSIVE_mzML_mzXML",
    )
]

# --- One further row per entry of SPECIAL_MASSTS ----------------------------
dfs = []
for masst in SPECIAL_MASSTS:
    # Read the metadata table of this entry and tag every row with its prefix.
    annotated = pd.read_csv(masst.metadata_file)
    annotated["masst"] = masst.prefix

    # Left merge keeps every metadata row; rows without a matching USI in
    # file_stats get missing values, including a missing `has_stats`.
    annotated = annotated.merge(file_stats, on="file_usi", how="left")

    listed_count = len(annotated)
    matched_count = int((annotated["has_stats"] == True).sum())
    ms1_total = annotated["spectra_ms1"].sum()
    ms2_total = annotated["spectra_ms2"].sum()

    rows.append(
        dict(
            spectra_ms1_sum=ms1_total,
            spectra_ms2_sum=ms2_total,
            # Means are taken over the matched rows only.
            spectra_ms1_mean=ms1_total / matched_count,
            spectra_ms2_mean=ms2_total / matched_count,
            size_mb=annotated["size_mb"].sum(),
            num_files=listed_count,
            num_files_processed=int((annotated["file_processed"] == "DONE").sum()),
            num_files_with_stats=matched_count,
            files_with_stats_percent=matched_count / listed_count,
            description=masst.prefix,
        )
    )
    dfs.append(annotated)

sum_df = pd.DataFrame(rows)
sum_df

Unnamed: 0,spectra_ms1_sum,spectra_ms2_sum,spectra_ms1_mean,spectra_ms2_mean,size_mb,num_files,num_files_processed,description,num_files_with_stats,files_with_stats_percent
0,2259182000.0,6782255000.0,3325.711448,9984.064711,122272081.0,679308,376061,ALL_GNPS/MASSIVE_mzML_mzXML,,
1,2062382.0,8665498.0,585.072908,2458.297305,40819.0,3579,0,food,3525.0,0.984912
2,60901210.0,90595310.0,1002.35699,1491.0844,2797302.0,60781,10223,microbe,60758.0,0.999622
3,28895400.0,48253240.0,1823.284011,3044.752903,949264.0,20134,8472,plant,15848.0,0.787126


In [14]:
# Stack the merged per-entry tables into one frame and save it without the
# row labels.
combined_path = "../local_files/all_gnps_files/file_summary_combined.csv"
all_masst = pd.concat(objs=dfs)
all_masst.to_csv(path_or_buf=combined_path, index=False)


In [15]:
# Show only the rows that found a match in the statistics table
# (`has_stats` is missing, hence not True, for unmatched rows).
matched_mask = all_masst["has_stats"] == True
all_masst[matched_mask]

Unnamed: 0,MassIVE,Filename,node_id,file_usi,masst,filepath,dataset,collection,is_update,update_name,...,file_processed,has_stats,Filepath,Taxaname_file,Taxaname_alternative,Taxa_NCBI,Taxa_Assigment,ReDU_Availability,Blank,QC
2,MSV000084900,15NAVY01_v1_brk_1_GA4_01_39548.mzXML,peanut,mzspec:MSV000084900:15NAVY01_v1_brk_1_GA4_01_3...,food,MSV000084900/peak/Global_Foodomics_composite_d...,MSV000084900,peak,0.0,,...,No,True,,,,,,,,
3,MSV000084900,15NAVY01_v1_brk_2_GB4_01_39546.mzXML,grain/grass,mzspec:MSV000084900:15NAVY01_v1_brk_2_GB4_01_3...,food,MSV000084900/ccms_peak/Global_Foodomics_compos...,MSV000084900,ccms_peak,0.0,,...,No,True,,,,,,,,
4,MSV000084900,15NAVY01_v1_brk_3_GC4_01_39547.mzXML,grain/grass,mzspec:MSV000084900:15NAVY01_v1_brk_3_GC4_01_3...,food,MSV000084900/ccms_peak/Global_Foodomics_compos...,MSV000084900,ccms_peak,0.0,,...,No,True,,,,,,,,
5,MSV000084900,15NAVY01_v1_lun_1_GA5_01_39630.mzXML,complex,mzspec:MSV000084900:15NAVY01_v1_lun_1_GA5_01_3...,food,MSV000084900/peak/Global_Foodomics_composite_d...,MSV000084900,peak,0.0,,...,No,True,,,,,,,,
6,MSV000084900,15NAVY01_v1_lun_2_GB5_01_39627.mzXML,fruit,mzspec:MSV000084900:15NAVY01_v1_lun_2_GB5_01_3...,food,MSV000084900/ccms_peak/Global_Foodomics_compos...,MSV000084900,ccms_peak,0.0,,...,No,True,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20129,MSV000091100,MSV000091100/DN-187_positive_40eV+.mzXML,,mzspec:MSV000091100:DN-187_positive_40eV+,plant,MSV000091100/peak/pos_40/DN-187_positive_40eV+...,MSV000091100,peak,0.0,,...,DONE,True,MSV000091100/peak/pos_40/DN-187_positive_40eV+...,Pogostemon cablin,,28511,,No,No,No
20130,MSV000091100,MSV000091100/DN-191_positive_40eV+.mzXML,,mzspec:MSV000091100:DN-191_positive_40eV+,plant,MSV000091100/peak/pos_40/DN-191_positive_40eV+...,MSV000091100,peak,0.0,,...,DONE,True,MSV000091100/peak/pos_40/DN-191_positive_40eV+...,Sinomenium acutum,,152363,,No,No,No
20131,MSV000091100,MSV000091100/DN-193_positive_40eV+.mzXML,,mzspec:MSV000091100:DN-193_positive_40eV+,plant,MSV000091100/peak/pos_40/DN-193_positive_40eV+...,MSV000091100,peak,0.0,,...,DONE,True,MSV000091100/peak/pos_40/DN-193_positive_40eV+...,Ephedra sinica,,33152,,No,No,No
20132,MSV000091100,MSV000091100/DN-194_positive_40eV+.mzXML,,mzspec:MSV000091100:DN-194_positive_40eV+,plant,MSV000091100/peak/pos_40/DN-194_positive_40eV+...,MSV000091100,peak,0.0,,...,DONE,True,MSV000091100/peak/pos_40/DN-194_positive_40eV+...,Prunus persica,,3760,,No,No,No


In [16]:
# Save the summary table; row labels are left out of the file.
sum_df.to_csv(
    path_or_buf="../local_files/all_gnps_files/file_summary.csv",
    index=False,
)

In [17]:
# Number of non-null values in every column, broken down by processing status.
status_groups = file_stats.groupby("file_processed")
status_groups.count().reset_index()

Unnamed: 0,file_processed,filepath,dataset,collection,is_update,update_name,create_time,size,size_mb,sample_type,spectra_ms1,spectra_ms2,instrument_vendor,instrument_model,file_usi,has_stats
0,DONE,376061,376061,376061,376061,8490,376061,376061,376061,376061,376061,376061,376061,376061,376061,376061
1,FAILED,66903,66903,66903,66903,642,66903,66903,66903,66903,66903,66903,42,42,66903,66903
2,No,236344,236344,236344,236344,8332,236344,236344,236344,236344,235791,235796,233068,233068,236344,236344
