# 2.1 Metadata

Author: Sandra Godinho Silva \
Creation date: 07/09/2020 \
Version: 0.1

In [1]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 10)

## Import dataset

In [5]:
dataset = pd.read_csv("../1_Dataset_creation/Dataset.csv")
dataset.head(2)

Unnamed: 0,Genome_ID,Bin_Id,Marker lineage,Completeness,Contamination,Strain_heterogeneity,Quality_score,classification,Domain,Phyla,...,scaf_L90,ctg_N90,ctg_L90,scaf_max,ctg_max,scaf_n_gt50K,scaf_pct_gt50K,gc_avg,gc_std,Classification_quality
0,GCA_000016645.1,GCA_000016645.1_ASM1664v1_genomic,f__Flavobacteriaceae (UID2817),99.65,0.14,0.0,98.95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,Bacteria,Bacteroidota,...,6096872,1,6096872,6096872,6096872,1,100.0,0.34113,0.0,Medium
1,GCA_000023285.1,GCA_000023285.1_ASM2328v1_genomic,p__Bacteroidetes (UID2605),100.0,0.0,0.0,100.0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,Bacteria,Bacteroidota,...,2612925,1,2612925,2612925,2612925,1,100.0,0.39588,0.0,Medium


In [4]:
genomes = dataset["Genome_ID"]
genomes.to_csv("assembly_ids.csv", index=False,  header=False)

In [5]:
len(genomes)

2680

## 2.1 Import metadata coupled with GenBank

***On the command line:*** \
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt

In [6]:
genbank_metadata = pd.read_csv("assembly_summary_genbank.txt", sep="\t", skiprows=1)

In [7]:
genbank_metadata.head(2)

Unnamed: 0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,version_status,assembly_level,release_type,genome_rep,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material
0,GCA_000001215.4,PRJNA13812,SAMN02803731,,reference genome,7227,7227,Drosophila melanogaster,,,latest,Chromosome,Major,Full,2014/08/01,Release 6 plus ISO1 MT,The FlyBase Consortium/Berkeley Drosophila Gen...,GCF_000001215.4,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,
1,GCA_000001405.28,PRJNA31257,,,reference genome,9606,9606,Homo sapiens,,,latest,Chromosome,Patch,Full,2019/02/28,GRCh38.p13,Genome Reference Consortium,GCF_000001405.39,different,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,


In [8]:
genbank_filtered = pd.merge(genomes, genbank_metadata, how="left", left_on="Genome_ID", right_on="# assembly_accession", indicator="merge")
genbank_filtered["merge"].value_counts()

both          2679
left_only        1
right_only       0
Name: merge, dtype: int64

In [9]:
genbank_filtered.drop(columns=["merge"], inplace=True)
genbank_filtered.head(2)

Unnamed: 0,Genome_ID,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,version_status,assembly_level,release_type,genome_rep,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material
0,GCA_000016645.1,GCA_000016645.1,PRJNA16082,SAMN02598357,,representative genome,376686.0,986.0,Flavobacterium johnsoniae UW101,strain=UW101; ATCC 17061,,latest,Complete Genome,Major,Full,2007/05/01,ASM1664v1,US DOE Joint Genome Institute,GCF_000016645.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,assembly from type material
1,GCA_000023285.1,GCA_000023285.1,PRJNA29403,SAMN00001911,,representative genome,521097.0,1018.0,Capnocytophaga ochracea DSM 7271,strain=DSM 7271,,latest,Complete Genome,Major,Full,2009/08/26,ASM2328v1,US DOE Joint Genome Institute (JGI-PGF),GCF_000023285.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,assembly from type material


In [10]:
#REMOVE empty columns
null_cols = genbank_filtered.isnull().all()
genbank_filtered = genbank_filtered[genbank_filtered.columns[~null_cols]]

In [11]:
#genbank_filtered.info()

## 2.2 Import Metadata from PATRIC

Download Genome features for the taxon Flavobacteriia on 7 September 2020

In [12]:
patric = pd.read_csv("PATRIC_genome_Flavobacteria.csv")

In [13]:
l = patric.columns.to_list()
print(l)

['Genome ID', 'Genome Name', 'Organism Name', 'NCBI Taxon ID', 'Genome Status', 'Strain', 'Serovar', 'Biovar', 'Pathovar', 'MLST', 'Other Typing', 'Culture Collection', 'Type Strain', 'Completion Date', 'Publication', 'BioProject Accession', 'BioSample Accession', 'Assembly Accession', 'SRA Accession', 'GenBank Accessions', 'RefSeq Accessions', 'Sequencing Centers', 'Sequencing Status', 'Sequencing Platform', 'Sequencing Depth', 'Assembly Method', 'Chromosomes', 'Plasmids', 'Contigs', 'Genome Length', 'GC Content', 'PATRIC CDS', 'RefSeq CDS', 'Isolation Site', 'Isolation Source', 'Isolation Comments', 'Collection Date', 'Isolation Country', 'Geographic Location', 'Latitude', 'Longitude', 'Altitude', 'Depth', 'Other Environmental', 'Host Name', 'Host Gender', 'Host Age', 'Host Health', 'Body Sample Site', 'Body Sample Subsite', 'Other Clinical', 'AntiMicrobial Resistance', 'AntiMicrobial Resistance Evidence', 'Gram Stain', 'Cell Shape', 'Motility', 'Sporulation', 'Temperature Range', 'O

In [14]:
#REMOVE uninteresting columns
patric = patric.drop(columns=['Genome ID','PATRIC CDS','Coarse Consistency', 'Fine Consistency', 'Checkm Completeness', 'Checkm Contamination','Genome Quality','Date Inserted', 'Date Modified'])

In [15]:
#REMOVE empty columns
null_cols = patric.isnull().all()
patric = patric[patric.columns[~null_cols]]

In [16]:
patric.head(2)

Unnamed: 0,Genome Name,Organism Name,NCBI Taxon ID,Genome Status,Strain,Serovar,Pathovar,MLST,Other Typing,Culture Collection,Type Strain,Completion Date,Publication,BioProject Accession,BioSample Accession,Assembly Accession,SRA Accession,GenBank Accessions,RefSeq Accessions,Sequencing Centers,Sequencing Status,Sequencing Platform,Sequencing Depth,Assembly Method,Chromosomes,Plasmids,Contigs,Genome Length,GC Content,RefSeq CDS,Isolation Source,Isolation Comments,Collection Date,Isolation Country,Geographic Location,Latitude,Longitude,Altitude,Depth,Other Environmental,Host Name,Host Gender,Host Age,Host Health,Body Sample Site,Other Clinical,Gram Stain,Cell Shape,Motility,Sporulation,Temperature Range,Optimal Temperature,Salinity,Oxygen Requirement,Habitat,Disease,Comments,Additional Metadata,Genome Quality Flags
0,Muricauda taeanensis strain JCM 17757,,1005926,WGS,JCM 17757,,,,,,,2018-09-20T00:00:00Z,,PRJNA437398,SAMN09939642,GCA_003584105.1,,QXFO01000000,,"Second Institute of Oceanography, State Oceani...",,Illumina HiSeq X-Ten,308x,ABySS v. 1.9.0,,,28,4101660.0,45.60536,3644.0,tidal mudflat,,,South Korea,South Korea: Taean coast of the Yellow Sea,,,,5m,env_biome:marine biome,,,,,,,,,,,,,,,,,This project was used for calculating ANI valu...,,
1,Myroides injenensis M09-0166,Myroides injenensis M09-0166,1008457,WGS,,,,,,,,2012-02-04T00:00:00Z,22535932.0,PRJDA171989,SAMD00036610,GCA_000246945.2,,BAEX00000000,-,Korea Research Institute of Bioscience and Bio...,WGS,454,36x,,,,226,3458160.0,31.35,,urine of a patient with fever,,,,,,,,,,"Human, Homo sapiens",,,,,,,,,,,,,,,,The Myroides strains are not normal human micr...,,


In [17]:
patric_filtered = pd.merge(genomes, patric, how="left", left_on="Genome_ID", right_on="Assembly Accession", indicator="merge")
patric_filtered["merge"].value_counts()

both          2081
left_only      605
right_only       0
Name: merge, dtype: int64

In [18]:
patric_filtered["Additional Metadata"] = patric_filtered["Additional Metadata"].str.replace("�","")
patric_filtered.drop(columns=["merge"], inplace=True)
patric_filtered.head(2)

Unnamed: 0,Genome_ID,Genome Name,Organism Name,NCBI Taxon ID,Genome Status,Strain,Serovar,Pathovar,MLST,Other Typing,Culture Collection,Type Strain,Completion Date,Publication,BioProject Accession,BioSample Accession,Assembly Accession,SRA Accession,GenBank Accessions,RefSeq Accessions,Sequencing Centers,Sequencing Status,Sequencing Platform,Sequencing Depth,Assembly Method,Chromosomes,Plasmids,Contigs,Genome Length,GC Content,RefSeq CDS,Isolation Source,Isolation Comments,Collection Date,Isolation Country,Geographic Location,Latitude,Longitude,Altitude,Depth,Other Environmental,Host Name,Host Gender,Host Age,Host Health,Body Sample Site,Other Clinical,Gram Stain,Cell Shape,Motility,Sporulation,Temperature Range,Optimal Temperature,Salinity,Oxygen Requirement,Habitat,Disease,Comments,Additional Metadata,Genome Quality Flags
0,GCA_000016645.1,Flavobacterium johnsoniae UW101,Flavobacterium johnsoniae UW101,376686.0,Complete,ATCC 17061,,,,,ATCC 17061,Yes,2007-05-01T00:00:00Z,,PRJNA16082,SAMN02598357,GCA_000016645.1,,CP000685,NC_009441,DOE Joint Genome Institute,complete,,,,1.0,,1.0,6096872.0,34.1,5017.0,soil,isolated from soil in England and is the type ...,,United Kingdom,United Kingdom: England,,,,,,,,,,,,-,Rod,Yes,No,Mesophilic,20-30,Non-halophilic,Aerobic,Multiple,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,,
1,GCA_000023285.1,Capnocytophaga ochracea DSM 7271,Capnocytophaga ochracea DSM 7271,521097.0,Complete,DSM 7271,,,,,DSM 7271,Yes,2009-04-30T00:00:00Z,21304645.0,PRJNA29403,SAMN00001911,GCA_000023285.1,"SRR013476,SRR013477",CP001632,NC_013162,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,complete,,,,1.0,,1.0,2612925.0,39.6,2171.0,the human oral cavity,isolated from the human oral cavity,,,,,,,,,"Human, Homo sapiens",,,,,,-,Rod,Yes,No,Mesophilic,35-37,,Facultative,Host-associated,,Capnocytophaga ochracea DSM 7271. Capnocytoph...,,


In [19]:
#REMOVE empty columns
null_cols = patric_filtered.isnull().all()
patric_filtered = patric_filtered[patric_filtered.columns[~null_cols]]

### 2.2.1 Split Patric column: Additional Metadata

In [21]:
assembly_accession = patric_filtered["Genome_ID"].to_list()

In [22]:
splitted = patric_filtered["Additional Metadata"].str.split(";", expand=True)
splitted = splitted.rename(index=patric_filtered["Genome_ID"])
splitted = splitted.where(pd.notnull(splitted), None)

In [23]:
col_list = []
for i, cell in splitted.iteritems():
    for x in cell:
        if x is None:   #while y == nan:
            continue
        else:
            key = x.split(":")[0]
            value = x.split(":")[1:]
            if key not in col_list:
                col_list.append(key)
print(col_list)

['collected_by', 'sample_type', 'biomaterial_provider', 'identified_by', 'passage_history', 'isolate', 'specimen_voucher', '', 'lab_host', 'mating_type']


In [24]:
patric_addit_metadata = pd.DataFrame(columns=col_list, index=patric_filtered["Genome_ID"])

In [25]:
for i, cell in splitted.iterrows():
    for x in cell:
        cell = x
        if x is None:
            continue
        else:
            if ":" in x:
                value = x.split(":")[1]
                key = x.split(":")[0]
                if key in col_list:
                    patric_addit_metadata.loc[i,key] = str(value)

In [26]:
patric_addit_metadata = patric_addit_metadata.reset_index()
patric_addit_metadata = patric_addit_metadata.drop_duplicates(subset=['Genome_ID'])
patric_addit_metadata.tail(2)

Unnamed: 0,Genome_ID,collected_by,sample_type,biomaterial_provider,identified_by,passage_history,isolate,specimen_voucher,Unnamed: 9,lab_host,mating_type
2684,GCA_903819445.1,,,,,,,,,,
2685,GCA_903969135.1,Yvan Rahb,,,,,,,,,


## 2.3 Biosamples

BioSamples stores and supplies descriptions and metadata about biological samples used in research and development by academia and industry. Samples are either 'reference' samples (e.g. from 1000 Genomes, HipSci, FAANG) or have been used in an assay database such as the European Nucleotide Archive (ENA) or ArrayExpress.

In [27]:
biosamples = pd.read_csv("Biosamples_db.csv", low_memory=False)
l = biosamples.columns.to_list()
#print(l)

In [28]:
biosamples.head(2)

Unnamed: 0,biosample_title,contact_email,first_name,growth phase,last_name,name,organism,publication_date,sample_accession,source_name,strain,submission_model,submission_package,submission_package_name,submitter,taxonomy_id,treatment,collection_date,culture_collection,geo_loc_name,isolation_source,sample_name,sample_type,type-material,comment_paragraph,depth,derived-from,env_broad_scale,env_local_scale,env_medium,environmental-sample,isolate,lat_lon,metagenome-source,metagenomic,sample-type,elev,environmetal-sample,collected_by,host,host_tissue_sampled,temp,env_biome,env_feature,env_material,isol_growth_condt,locus_tag_prefix,num_replicons,project_name,ref_biomaterial,16S recovered,16S recovery software,analysis project type,assembly quality,assembly software,bin parameters,binning software,completeness score,completeness software,contamination score,contig L50,decontamination software,note,number of contigs,reassembly post binning,relative coverage Anaerobic (SRR10097246),relative coverage Anoxic (SRR10375098),relative coverage Infiltration (SRR10097247),relative coverage Low pH (SRR10097245),relative coverage Oxic (SRR10375099),relative coverage Reference (SRR10375097),taxa id,total assembly size,nat_host,assembly_method_and_version,completeness_estimated,contamination_estimated,genome_coverage,mapping_method_and_version,quality_assessment_method_and_version,ArrayExpress-SPECIES,ENA-FIRST-PUBLIC,ENA-LAST-UPDATE,SUBJECT_ID,contact_lab,identified_by,isolate-name-alias,environmental_sample,metagenome_source,biomaterial_provider,culture media,replicate,biome,feature,material,alternate_ID,culture-collection,altitude,Genus,ProjectAccession,PublicAccession,Species,attribute_package,isolate_name_alias,host_disease,rel_to_oxygen,Alias,ENA checklist,INSDC center alias,INSDC center name,INSDC first public,INSDC last update,INSDC status,SRA accession,Sample Name,Title,serotype,subsrc_note,annotation_method,assembly_method,biosamplemodel,geographic location (country and/or atlantic ocean),geographic location (region and locality),mapping_method,quality_assessment_method,refinement_method,Cell Shape,Funding Program,GOLD Stamp ID,Gene Calling Method,Gram Staining,Isolation Comments,Isolation Site,Motility,Temperature Optimum,Temperature Range,Type Strain,alt_elev,biotic_relationship,country,environment,investigation_type,isolation-source,trophic_level,pH,Body Sample Site,Body Sample SubSite,Diseases,Host Name,Host Health,Sporulation,experimental_factor,breeding_method,lat-lon,ENA-CHECKLIST,External Id,Submitter Id,subsrc-note,collection date,environment (biome),environment (feature),environment (material),geographic location (country and/or sea),geographic location (latitude),geographic location (longitude),host common name,host-associated environmental package,investigation type,project name,sequencing method,STRAIN,co_assembly_bin,derived from,description,Sequence processing,completeness,contamination,host_age,host_sex,oxy_stat_samp,samp_store_dur,samp_store_temp,FDA CVM ID,bio_material,common name,host scientific name,samp_mat_process,estimated_size,ArrayExpress-STRAIN_OR_LINE,Tax ID,Collaborator ID,host_description,host_disease_outcome,host_disease_stage,host_health_state,identification method,strain_name_alias,WGA amplification approach,metagenomic source,sample derived from,single cell or viral particle lysis approach,sorting technology,taxonomic identity marker,passage_history,samp_size,timepoint,Description,subsource_note,lab_host,collection_room,relative_week,assembly_tool,bin_id,Broker name,number_of_identified_antimicrobial_resistance_genes,MAG_number,derived_from,alkalinity (carbonate/bicarbonate),salinity,original-organism-name,Phenotypes,env_package,label,specific_host,binning parameters,plant-associated environmental package,culture collection,repository,risk group,type status,Assembly Method,Genome Coverage,Sequencing Technology,nat-host,collection-date,Historical Monobactam Produced,Unique Monobactam Clusters Identified,extrachrom_elements,pathogenicity,source_material_id,genotype,mating_type,serovar,specimen_voucher,Salinity (PSU),collected-by,orgmod_note,assembly,sequencing_meth,finishing_strategy,geo-loc-name,finishing strategy (depth of coverage),host taxid,misc_param: HMP body site,misc_param: HMP supersite,nucleic acid extraction,project_type,sop,source_mat_id,Salinity,age,body_site,disease,health_state,Isolation source,Host,Sample name,"geographic location (country and/or sea,region)",isolation source,environmental package,specific host,lat lon,host_subject_id,pathotype,subgroup,subtype,isolate name alias,zoonotic agent,Extraction Date,Extraction Method,Laboratory Host,Passage Date,sample type,HOST TAXON NAME,sample_description,host health state,geographic location (elevation),sediment environmental package,anonymized_name,host_status,supplier_name,plant product,ecotype,growth condition,abs_air_humidity,air_temp,build_occup_type,building_setting,carb_dioxide,filter_type,heat_cool_type,indoor_space,light_type,occup_samp,occupant_dens_samp,organism_count,rel_air_humidity,space_typ_state,typ_occupant_dens,ventilation_type,Type-material,samp_collect_device,Depth,Size fraction,genomovar,geographic location (altitude),env_packatge,lib_reads_seqd,metagenomic_source,env material,other identifiers,assembly_method_version,mapping_method_version,quality_assessment_method_version,value,state,time,method,dna treatment,pacbio sequencing chemistry,"samp_size (Library conc., pM)",cyanobacterial_culture,missing,samp_store_loc,sample_alias,ethnicity,vector,host_growth_cond,host_life_stage,c-source,host_body_habitat,host_taxid,misc_param,lab-host,v_type,Replicate,DIP (uM),NH4 (uM),NO2 (uM),NO3 (uM),O2 (uM),Salinity (psu),ph,beta_lactamase_family,carbapenemase,edta_inhibitor_tested,Sample Number,health_disease_stat,host_spec_range,assembly-note,metagenome-source seawater metagenome,type_status,indoor_surf,samp_sort_meth,samp_vol_we_dna_ext,surf_material,time point,Strain,sample comment,growth_med,host_genotype,plant_body_site,plant_product,BioSampleModel,closest_match,percent_match,Environmental feature,Environmental material,ammonium,biomass,carb_nitro_ratio,chlorophyll,diss_inorg_nitro,diss_inorg_phosp,diss_org_carb,diss_org_nitro,nitrate,nitrite,phosphate,silicate,tidal_stage,tot_diss_nitro,Alistipes,Anaerophaga,Anaerostipes,Bacteroidaceae,Bacteroidales,Bacteroides,Bacteroidetes,Bacteroidia,Barnesiella,Blautia,Butyrivibrio,Campylobacterales,Clostridia,Clostridiales,Deferribacteraceae,Deferribacterales,Deferribacteres,Deltaproteobacteria,Desulfovibrionaceae,Desulfovibrionales,Epsilonproteobacteria,Firmicutes,Flammeovirgaceae,Flavobacteria,Flavobacteriaceae,Flavobacteriales,Flexithrix,Helicobacter,Helicobacteraceae,Hespellia,Incertae_Sedis_XIV,Lachnospiraceae,Lawsonia,Marinilabiaceae,Mollicutes,Mucispirillum,Oscillibacter,Parasporobacterium,Peptococcaceae,Porphyromonadaceae,Proteobacteria,Rikenellaceae,Robinsoniella,Ruminococcaceae,Sandarakinotalea,Sphingobacteria,Sphingobacteriales,Sporobacterium,Tenericutes,X1,X103,X10763,X111,X119,X1291,X13004,X14,X14089,X168,X2,X229,X230,X2660,X28,X321,X3422,X4,X5217,X5802,X6075,X62,X6209,X7,X8,X8841,X9,breed,sex,tissue,locus_tag prefix,sample ID,strains,host sex,cell_type,"geographic location (country and/or sea, region)"
0,Iron_minus_A,shicheng@msu.edu,Shicheng,Log,Chen,"Microbiology and Molecular Genetics, Michigan ...",Elizabethkingia anophelis,2020-05-07T00:00:00.000,SAMN12086807,bacterial culture_iron_minus,Ag1,Generic,Generic.1.0,Generic,"Microbiology and Molecular Genetics, Michigan ...",1117645,Minus iron supplement,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Iron_minus_B,shicheng@msu.edu,Shicheng,Log,Chen,"Microbiology and Molecular Genetics, Michigan ...",Elizabethkingia anophelis,2020-05-07T00:00:00.000,SAMN12086806,bacterial culture_iron_minus,Ag1,Generic,Generic.1.0,Generic,"Microbiology and Molecular Genetics, Michigan ...",1117645,Minus iron supplement,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [29]:
biosamples_filtered = pd.merge(patric_filtered, biosamples, how="left", left_on="BioSample Accession", right_on="sample_accession", indicator="merge")
biosamples_filtered["merge"].value_counts()

both          1973
left_only      713
right_only       0
Name: merge, dtype: int64

In [30]:
biosamples_filtered.drop(columns=["merge"], inplace=True)
biosamples_filtered.head(2)

Unnamed: 0,Genome_ID,Genome Name,Organism Name,NCBI Taxon ID,Genome Status,Strain_x,Serovar,Pathovar,MLST,Other Typing,Culture Collection,Type Strain_x,Completion Date,Publication,BioProject Accession,BioSample Accession,Assembly Accession,SRA Accession,GenBank Accessions,RefSeq Accessions,Sequencing Centers,Sequencing Status,Sequencing Platform,Sequencing Depth,Assembly Method_x,Chromosomes,Plasmids,Contigs,Genome Length,GC Content,RefSeq CDS,Isolation Source,Isolation Comments_x,Collection Date,Isolation Country,Geographic Location,Latitude,Longitude,Altitude,Depth_x,Other Environmental,Host Name_x,Host Gender,Host Age,Host Health_x,Body Sample Site_x,Other Clinical,Gram Stain,Cell Shape_x,Motility_x,Sporulation_x,Temperature Range_x,Optimal Temperature,Salinity_x,Oxygen Requirement,Habitat,Disease,Comments,Additional Metadata,Genome Quality Flags,biosample_title,contact_email,first_name,growth phase,last_name,name,organism,publication_date,sample_accession,source_name,strain,submission_model,submission_package,submission_package_name,submitter,taxonomy_id,treatment,collection_date,culture_collection,geo_loc_name,isolation_source,sample_name,sample_type,type-material,comment_paragraph,depth,derived-from,env_broad_scale,env_local_scale,env_medium,environmental-sample,isolate,lat_lon,metagenome-source,metagenomic,sample-type,elev,environmetal-sample,collected_by,host,host_tissue_sampled,temp,env_biome,env_feature,env_material,isol_growth_condt,locus_tag_prefix,num_replicons,project_name,ref_biomaterial,16S recovered,16S recovery software,analysis project type,assembly quality,assembly software,bin parameters,binning software,completeness score,completeness software,contamination score,contig L50,decontamination software,note,number of contigs,reassembly post binning,relative coverage Anaerobic (SRR10097246),relative coverage Anoxic (SRR10375098),relative coverage Infiltration (SRR10097247),relative coverage Low pH (SRR10097245),relative coverage Oxic (SRR10375099),relative coverage Reference (SRR10375097),taxa id,total assembly size,nat_host,assembly_method_and_version,completeness_estimated,contamination_estimated,genome_coverage,mapping_method_and_version,quality_assessment_method_and_version,ArrayExpress-SPECIES,ENA-FIRST-PUBLIC,ENA-LAST-UPDATE,SUBJECT_ID,contact_lab,identified_by,isolate-name-alias,environmental_sample,metagenome_source,biomaterial_provider,culture media,replicate,biome,feature,material,alternate_ID,culture-collection,altitude,Genus,ProjectAccession,PublicAccession,Species,attribute_package,isolate_name_alias,host_disease,rel_to_oxygen,Alias,ENA checklist,INSDC center alias,INSDC center name,INSDC first public,INSDC last update,INSDC status,SRA accession,Sample Name,Title,serotype,subsrc_note,annotation_method,assembly_method,biosamplemodel,geographic location (country and/or atlantic ocean),geographic location (region and locality),mapping_method,quality_assessment_method,refinement_method,Cell Shape_y,Funding Program,GOLD Stamp ID,Gene Calling Method,Gram Staining,Isolation Comments_y,Isolation Site,Motility_y,Temperature Optimum,Temperature Range_y,Type Strain_y,alt_elev,biotic_relationship,country,environment,investigation_type,isolation-source,trophic_level,pH,Body Sample Site_y,Body Sample SubSite,Diseases,Host Name_y,Host Health_y,Sporulation_y,experimental_factor,breeding_method,lat-lon,ENA-CHECKLIST,External Id,Submitter Id,subsrc-note,collection date,environment (biome),environment (feature),environment (material),geographic location (country and/or sea),geographic location (latitude),geographic location (longitude),host common name,host-associated environmental package,investigation type,project name,sequencing method,STRAIN,co_assembly_bin,derived from,description,Sequence processing,completeness,contamination,host_age,host_sex,oxy_stat_samp,samp_store_dur,samp_store_temp,FDA CVM ID,bio_material,common name,host scientific name,samp_mat_process,estimated_size,ArrayExpress-STRAIN_OR_LINE,Tax ID,Collaborator ID,host_description,host_disease_outcome,host_disease_stage,host_health_state,identification method,strain_name_alias,WGA amplification approach,metagenomic source,sample derived from,single cell or viral particle lysis approach,sorting technology,taxonomic identity marker,passage_history,samp_size,timepoint,Description,subsource_note,lab_host,collection_room,relative_week,assembly_tool,bin_id,Broker name,number_of_identified_antimicrobial_resistance_genes,MAG_number,derived_from,alkalinity (carbonate/bicarbonate),salinity,original-organism-name,Phenotypes,env_package,label,specific_host,binning parameters,plant-associated environmental package,culture collection,repository,risk group,type status,Assembly Method_y,Genome Coverage,Sequencing Technology,nat-host,collection-date,Historical Monobactam Produced,Unique Monobactam Clusters Identified,extrachrom_elements,pathogenicity,source_material_id,genotype,mating_type,serovar,specimen_voucher,Salinity (PSU),collected-by,orgmod_note,assembly,sequencing_meth,finishing_strategy,geo-loc-name,finishing strategy (depth of coverage),host taxid,misc_param: HMP body site,misc_param: HMP supersite,nucleic acid extraction,project_type,sop,source_mat_id,Salinity_y,age,body_site,disease,health_state,Isolation source,Host,Sample name,"geographic location (country and/or sea,region)",isolation source,environmental package,specific host,lat lon,host_subject_id,pathotype,subgroup,subtype,isolate name alias,zoonotic agent,Extraction Date,Extraction Method,Laboratory Host,Passage Date,sample type,HOST TAXON NAME,sample_description,host health state,geographic location (elevation),sediment environmental package,anonymized_name,host_status,supplier_name,plant product,ecotype,growth condition,abs_air_humidity,air_temp,build_occup_type,building_setting,carb_dioxide,filter_type,heat_cool_type,indoor_space,light_type,occup_samp,occupant_dens_samp,organism_count,rel_air_humidity,space_typ_state,typ_occupant_dens,ventilation_type,Type-material,samp_collect_device,Depth_y,Size fraction,genomovar,geographic location (altitude),env_packatge,lib_reads_seqd,metagenomic_source,env material,other identifiers,assembly_method_version,mapping_method_version,quality_assessment_method_version,value,state,time,method,dna treatment,pacbio sequencing chemistry,"samp_size (Library conc., pM)",cyanobacterial_culture,missing,samp_store_loc,sample_alias,ethnicity,vector,host_growth_cond,host_life_stage,c-source,host_body_habitat,host_taxid,misc_param,lab-host,v_type,Replicate,DIP (uM),NH4 (uM),NO2 (uM),NO3 (uM),O2 (uM),Salinity (psu),ph,beta_lactamase_family,carbapenemase,edta_inhibitor_tested,Sample Number,health_disease_stat,host_spec_range,assembly-note,metagenome-source seawater metagenome,type_status,indoor_surf,samp_sort_meth,samp_vol_we_dna_ext,surf_material,time point,Strain_y,sample comment,growth_med,host_genotype,plant_body_site,plant_product,BioSampleModel,closest_match,percent_match,Environmental feature,Environmental material,ammonium,biomass,carb_nitro_ratio,chlorophyll,diss_inorg_nitro,diss_inorg_phosp,diss_org_carb,diss_org_nitro,nitrate,nitrite,phosphate,silicate,tidal_stage,tot_diss_nitro,Alistipes,Anaerophaga,Anaerostipes,Bacteroidaceae,Bacteroidales,Bacteroides,Bacteroidetes,Bacteroidia,Barnesiella,Blautia,Butyrivibrio,Campylobacterales,Clostridia,Clostridiales,Deferribacteraceae,Deferribacterales,Deferribacteres,Deltaproteobacteria,Desulfovibrionaceae,Desulfovibrionales,Epsilonproteobacteria,Firmicutes,Flammeovirgaceae,Flavobacteria,Flavobacteriaceae,Flavobacteriales,Flexithrix,Helicobacter,Helicobacteraceae,Hespellia,Incertae_Sedis_XIV,Lachnospiraceae,Lawsonia,Marinilabiaceae,Mollicutes,Mucispirillum,Oscillibacter,Parasporobacterium,Peptococcaceae,Porphyromonadaceae,Proteobacteria,Rikenellaceae,Robinsoniella,Ruminococcaceae,Sandarakinotalea,Sphingobacteria,Sphingobacteriales,Sporobacterium,Tenericutes,X1,X103,X10763,X111,X119,X1291,X13004,X14,X14089,X168,X2,X229,X230,X2660,X28,X321,X3422,X4,X5217,X5802,X6075,X62,X6209,X7,X8,X8841,X9,breed,sex,tissue,locus_tag prefix,sample ID,strains,host sex,cell_type,"geographic location (country and/or sea, region)"
0,GCA_000016645.1,Flavobacterium johnsoniae UW101,Flavobacterium johnsoniae UW101,376686.0,Complete,ATCC 17061,,,,,ATCC 17061,Yes,2007-05-01T00:00:00Z,,PRJNA16082,SAMN02598357,GCA_000016645.1,,CP000685,NC_009441,DOE Joint Genome Institute,complete,,,,1.0,,1.0,6096872.0,34.1,5017.0,soil,isolated from soil in England and is the type ...,,United Kingdom,United Kingdom: England,,,,,,,,,,,,-,Rod,Yes,No,Mesophilic,20-30,Non-halophilic,Aerobic,Multiple,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,,,Sample from Flavobacterium johnsoniae UW101,,,,,NCBI,Flavobacterium johnsoniae UW101,2014-01-28T00:00:00.000,SAMN02598357,,UW101; ATCC 17061,Generic,Generic.1.0,Generic,NCBI,376686.0,,,,,,CP000685,,type strain of Flavobacterium johnsoniae,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,GCA_000023285.1,Capnocytophaga ochracea DSM 7271,Capnocytophaga ochracea DSM 7271,521097.0,Complete,DSM 7271,,,,,DSM 7271,Yes,2009-04-30T00:00:00Z,21304645.0,PRJNA29403,SAMN00001911,GCA_000023285.1,"SRR013476,SRR013477",CP001632,NC_013162,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,complete,,,,1.0,,1.0,2612925.0,39.6,2171.0,the human oral cavity,isolated from the human oral cavity,,,,,,,,,"Human, Homo sapiens",,,,,,-,Rod,Yes,No,Mesophilic,35-37,,Facultative,Host-associated,,Capnocytophaga ochracea DSM 7271. Capnocytoph...,,,Generic sample from Capnocytophaga ochracea DS...,,,,,"Joint Genome Institute, U.S. Department of Energy",Capnocytophaga ochracea DSM 7271,2009-02-19T00:00:00.000,SAMN00001911,,DSM 7271,Generic,Generic.1.0,Generic,"Joint Genome Institute, U.S. Department of Energy",521097.0,,,,,,FWCB,,type strain of Capnocytophaga ochracea,Capnocytophaga ochracea DSM 7271,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [31]:
#REMOVE empty columns
null_cols = biosamples_filtered.isnull().all()
biosamples_filtered = biosamples_filtered[biosamples_filtered.columns[~null_cols]]

In [32]:
biosamples_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2686 entries, 0 to 2685
Columns: 377 entries, Genome_ID to locus_tag prefix
dtypes: float64(34), object(343)
memory usage: 7.7+ MB


# 2.4 SRA

**SRA data manipulation was done in jupyter notebook:** 2.1.1_SRA_attribute

In [33]:
sra = pd.read_csv("SRA_metadata.csv")
sra.head(2)

Unnamed: 0,assembly_accession,strain,finishing strategy (depth of coverage),biotic_relationship,geo_loc_name,host,collected_by,host_disease,isolation_source,sample_name,isolation-source,sample_type,FDA CVM ID,culture_collection,assembly_method,collection_date,env_feature,synonym,Strain,SUBJECT_ID,Alias,type-material,misc_param,collection date,Isolation Site,Isolation source,country,note,GOLD Stamp ID,investigation_type,culture-collection,lat_lon,alt_elev,locus_tag_prefix,nat-host,assembly_method_version,env_material,model,sample_description,Sample Description,ArrayExpress-STRAIN_OR_LINE,ENA checklist,project_type,estimated_size,Temperature Optimum,isol_growth_condt,Motility,feature,project_name,assembly,num_replicons,isolate,depth,env_biome,altitude,ref_biomaterial,package,geographic location,ArrayExpress-StrainOrLine,ArrayExpress-Phenotype,ArrayExpress-SPECIES,INSDC center alias,geo-loc-name,environment,host_sex,elev,PublicAccession,completeness_estimated,geographic location (country and/or sea),latitude and longitude,specific_host,ArrayExpress-Species,ENA-FIRST-PUBLIC,INSDC center name,specific host,Body Sample Site,pH,biome,biomaterial_provider,isolate_name_alias,contamination_estimated,host scientific name,anonymized_name,ENA-LAST-UPDATE,INSDC first public,Cell Shape,BioSampleModel,environmental_sample,material,identified_by,host health state,INSDC last update,Diseases,strain_name_alias,genome_coverage,INSDC status,host taxid,isolation source,Gram Staining,Temperature Range,Sporulation,sequencing_meth,Historical Monobactam Produced,identification method,supplier_name,SRA accession,host_disease_outcome,Unique Monobactam Clusters Identified,ProjectAccession,Sample Name,Type Strain,Phenotypes,Gene Calling Method,temp,Title,investigation type,Host,rel_to_oxygen,host_age,Species,mapping_method,serovar,sop,Isolation Comments,finishing_strategy,Genus,mapping_method_version,environment (biome),attribute_package,metagenome_source,environment (feature),trophic_level,metagenomic,environment (material),Funding Program,quality_assessment_method,quality_assessment_method_version,geographic location (latitude),source_mat_id,host_tissue_sampled,geographic location (longitude),Host Name,subsrc_note,Host Health,value,plant-associated environmental package,project name,sequencing method,Body Sample SubSite
0,GCA_000023285.1,DSM 7271,,,,,,,,,,,,,,,,,,,,type strain of Capnocytophaga ochracea,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,GCA_000143785.1,ATCC 35910,"Level 6, Finished33.84x;7",,,,,,,,,,,,,,,,,,,type strain of Chryseobacterium gleum,"HMP body site, Vaginal",,,,,,,,ATCC:35910,,,,,,,,,,,,Reference Genome,5562000.0,,,,Homo sapiens-associated habitat ENVO:00009003,Chryseobacterium gleum F93,Newbler v. 2.0.0-Eclipse,,,,,,,,,,,,,,,,,,,,,,,,,Homo sapiens,,,terrestrial biome ENVO:00000446,,,,,,,,,MIGS.ba,,biological product ENVO:02000043,,,,,,,,9606.0,,,,,,,,,,,,,,,,,,,,,,,,,,http://hmpdacc.org/doc/CommonGeneAnnotation_SO...,,,,,,,,,,,,,,,,ATCC 35910,,,,,,,,,454-GS-FLX,


----

# 2.5 MERGE

In [34]:
merge1 = pd.merge(genbank_filtered, biosamples_filtered, how="left", left_on="Genome_ID", right_on="Genome_ID", indicator="merge")
merge1["merge"].value_counts()

both          2686
right_only       0
left_only        0
Name: merge, dtype: int64

In [35]:
merge2 = pd.merge(merge1, sra, how="left", left_on="Genome_ID", right_on="assembly_accession", indicator="merge2")
merge2["merge2"].value_counts()

left_only     2149
both           537
right_only       0
Name: merge2, dtype: int64

In [36]:
merge2.head(2)

Unnamed: 0,Genome_ID,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate_x,version_status,assembly_level,release_type,genome_rep,seq_rel_date,asm_name,submitter_x,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,Genome Name,Organism Name,NCBI Taxon ID,Genome Status,Strain_x,Serovar,Pathovar,MLST,Other Typing,Culture Collection,Type Strain_x,Completion Date,Publication,BioProject Accession,BioSample Accession,Assembly Accession,SRA Accession,GenBank Accessions,RefSeq Accessions,Sequencing Centers,Sequencing Status,Sequencing Platform,Sequencing Depth,Assembly Method_x,Chromosomes,Plasmids,Contigs,Genome Length,GC Content,RefSeq CDS,Isolation Source,Isolation Comments_x,Collection Date,Isolation Country,Geographic Location,Latitude,Longitude,Altitude,Depth_x,Other Environmental,Host Name_x,Host Gender,Host Age,Host Health_x,Body Sample Site_x,Other Clinical,Gram Stain,Cell Shape_x,Motility_x,Sporulation_x,Temperature Range_x,Optimal Temperature,Salinity_x,Oxygen Requirement,Habitat,Disease,Comments,Additional Metadata,Genome Quality Flags,biosample_title,contact_email,first_name,last_name,name,organism,publication_date,sample_accession,strain_x,submission_model,submission_package,submission_package_name,submitter_y,taxonomy_id,collection_date_x,culture_collection_x,geo_loc_name_x,isolation_source_x,sample_name_x,sample_type_x,type-material_x,comment_paragraph,depth_x,derived-from,env_broad_scale,env_local_scale,env_medium,environmental-sample,isolate_y,lat_lon_x,metagenome-source,metagenomic_x,sample-type,elev_x,environmetal-sample,collected_by_x,host_x,host_tissue_sampled_x,temp_x,env_biome_x,env_feature_x,env_material_x,isol_growth_condt_x,locus_tag_prefix_x,num_replicons_x,project_name_x,ref_biomaterial_x,16S recovered,16S recovery software,analysis project type,assembly quality,assembly software,bin parameters,binning software,completeness score,completeness software,contamination score,contig L50,decontamination software,note_x,number of contigs,reassembly post binning,relative coverage Anaerobic (SRR10097246),relative coverage Anoxic (SRR10375098),relative coverage Infiltration (SRR10097247),relative coverage Low pH (SRR10097245),relative coverage Oxic (SRR10375099),relative coverage Reference (SRR10375097),taxa id,total assembly size,nat_host,assembly_method_and_version,completeness_estimated_x,contamination_estimated_x,genome_coverage_x,mapping_method_and_version,quality_assessment_method_and_version,ENA-FIRST-PUBLIC_x,ENA-LAST-UPDATE_x,SUBJECT_ID_x,contact_lab,identified_by_x,environmental_sample_x,metagenome_source_x,biomaterial_provider_x,biome_x,feature_x,material_x,culture-collection_x,altitude_x,isolate_name_alias_x,host_disease_x,rel_to_oxygen_x,Alias_x,ENA checklist_x,INSDC center alias_x,INSDC center name_x,INSDC first public_x,INSDC last update_x,INSDC status_x,SRA accession_x,Sample Name_x,Title_x,serotype,subsrc_note_x,annotation_method,assembly_method_x,biosamplemodel,geographic location (country and/or atlantic ocean),geographic location (region and locality),mapping_method_x,quality_assessment_method_x,refinement_method,Cell Shape_y,Funding Program_x,GOLD Stamp ID_x,Gene Calling Method_x,Gram Staining_x,Isolation Comments_y,Isolation Site_x,Motility_y,Temperature Optimum_x,Temperature Range_y,Type Strain_y,alt_elev_x,biotic_relationship_x,country_x,environment_x,investigation_type_x,isolation-source_x,trophic_level_x,pH_x,Body Sample Site_y,Body Sample SubSite_x,Diseases_x,Host Name_y,Host Health_y,Sporulation_y,experimental_factor,lat-lon,ENA-CHECKLIST,External Id,Submitter Id,subsrc-note,collection date_x,environment (biome)_x,environment (feature)_x,environment (material)_x,geographic location (country and/or sea)_x,geographic location (latitude)_x,geographic location (longitude)_x,host common name,host-associated environmental package,investigation type_x,project name_x,sequencing method_x,STRAIN,description,completeness,contamination,host_age_x,host_sex_x,oxy_stat_samp,samp_store_dur,samp_store_temp,FDA CVM ID_x,bio_material,common name,host scientific name_x,samp_mat_process,estimated_size_x,Tax ID,Collaborator ID,host_description,host_disease_outcome_x,host_disease_stage,host_health_state,identification method_x,strain_name_alias_x,WGA amplification approach,metagenomic source,sample derived from,single cell or viral particle lysis approach,sorting technology,taxonomic identity marker,passage_history,samp_size,Description,subsource_note,lab_host,collection_room,relative_week,assembly_tool,bin_id,Broker name,MAG_number,derived_from,alkalinity (carbonate/bicarbonate),salinity,Phenotypes_x,env_package,specific_host_x,binning parameters,plant-associated environmental package_x,culture collection,repository,risk group,type status,Assembly Method_y,Genome Coverage,Sequencing Technology,nat-host_x,collection-date,Historical Monobactam Produced_x,Unique Monobactam Clusters Identified_x,extrachrom_elements,pathogenicity,source_material_id,genotype,mating_type,serovar_x,specimen_voucher,Salinity (PSU),collected-by,orgmod_note,assembly_x,sequencing_meth_x,finishing_strategy_x,geo-loc-name_x,finishing strategy (depth of coverage)_x,host taxid_x,misc_param: HMP body site,misc_param: HMP supersite,nucleic acid extraction,project_type_x,sop_x,source_mat_id_x,Salinity_y,body_site,health_state,Isolation source_x,Host_x,isolation source_x,environmental package,specific host_x,lat lon,host_subject_id,pathotype,subgroup,subtype,Extraction Date,Extraction Method,Laboratory Host,Passage Date,sample type,host health state_x,anonymized_name_x,host_status,supplier_name_x,abs_air_humidity,air_temp,build_occup_type,building_setting,carb_dioxide,filter_type,heat_cool_type,indoor_space,light_type,occup_samp,occupant_dens_samp,organism_count,rel_air_humidity,space_typ_state,typ_occupant_dens,ventilation_type,Type-material,samp_collect_device,geographic location (altitude),env_packatge,lib_reads_seqd,metagenomic_source,env material,other identifiers,assembly_method_version_x,mapping_method_version_x,quality_assessment_method_version_x,value_x,cyanobacterial_culture,missing,samp_store_loc,ethnicity,host_body_habitat,host_taxid,misc_param_x,ph,metagenome-source seawater metagenome,type_status,indoor_surf,samp_sort_meth,samp_vol_we_dna_ext,surf_material,BioSampleModel_x,ammonium,biomass,carb_nitro_ratio,chlorophyll,diss_inorg_nitro,diss_inorg_phosp,diss_org_carb,diss_org_nitro,nitrate,nitrite,phosphate,silicate,tidal_stage,tot_diss_nitro,locus_tag prefix,merge,assembly_accession,strain_y,finishing strategy (depth of coverage)_y,biotic_relationship_y,geo_loc_name_y,host_y,collected_by_y,host_disease_y,isolation_source_y,sample_name_y,isolation-source_y,sample_type_y,FDA CVM ID_y,culture_collection_y,assembly_method_y,collection_date_y,env_feature_y,synonym,Strain,SUBJECT_ID_y,Alias_y,type-material_y,misc_param_y,collection date_y,Isolation Site_y,Isolation source_y,country_y,note_y,GOLD Stamp ID_y,investigation_type_y,culture-collection_y,lat_lon_y,alt_elev_y,locus_tag_prefix_y,nat-host_y,assembly_method_version_y,env_material_y,model,sample_description,Sample Description,ArrayExpress-STRAIN_OR_LINE,ENA checklist_y,project_type_y,estimated_size_y,Temperature Optimum_y,isol_growth_condt_y,Motility,feature_y,project_name_y,assembly_y,num_replicons_y,isolate,depth_y,env_biome_y,altitude_y,ref_biomaterial_y,package,geographic location,ArrayExpress-StrainOrLine,ArrayExpress-Phenotype,ArrayExpress-SPECIES,INSDC center alias_y,geo-loc-name_y,environment_y,host_sex_y,elev_y,PublicAccession,completeness_estimated_y,geographic location (country and/or sea)_y,latitude and longitude,specific_host_y,ArrayExpress-Species,ENA-FIRST-PUBLIC_y,INSDC center name_y,specific host_y,Body Sample Site,pH_y,biome_y,biomaterial_provider_y,isolate_name_alias_y,contamination_estimated_y,host scientific name_y,anonymized_name_y,ENA-LAST-UPDATE_y,INSDC first public_y,Cell Shape,BioSampleModel_y,environmental_sample_y,material_y,identified_by_y,host health state_y,INSDC last update_y,Diseases_y,strain_name_alias_y,genome_coverage_y,INSDC status_y,host taxid_y,isolation source_y,Gram Staining_y,Temperature Range,Sporulation,sequencing_meth_y,Historical Monobactam Produced_y,identification method_y,supplier_name_y,SRA accession_y,host_disease_outcome_y,Unique Monobactam Clusters Identified_y,ProjectAccession,Sample Name_y,Type Strain,Phenotypes_y,Gene Calling Method_y,temp_y,Title_y,investigation type_y,Host_y,rel_to_oxygen_y,host_age_y,Species,mapping_method_y,serovar_y,sop_y,Isolation Comments,finishing_strategy_y,Genus,mapping_method_version_y,environment (biome)_y,attribute_package,metagenome_source_y,environment (feature)_y,trophic_level_y,metagenomic_y,environment (material)_y,Funding Program_y,quality_assessment_method_y,quality_assessment_method_version_y,geographic location (latitude)_y,source_mat_id_y,host_tissue_sampled_y,geographic location (longitude)_y,Host Name,subsrc_note_y,Host Health,value_y,plant-associated environmental package_y,project name_y,sequencing method_y,Body Sample SubSite_y,merge2
0,GCA_000016645.1,GCA_000016645.1,PRJNA16082,SAMN02598357,,representative genome,376686.0,986.0,Flavobacterium johnsoniae UW101,strain=UW101; ATCC 17061,,latest,Complete Genome,Major,Full,2007/05/01,ASM1664v1,US DOE Joint Genome Institute,GCF_000016645.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,assembly from type material,Flavobacterium johnsoniae UW101,Flavobacterium johnsoniae UW101,376686.0,Complete,ATCC 17061,,,,,ATCC 17061,Yes,2007-05-01T00:00:00Z,,PRJNA16082,SAMN02598357,GCA_000016645.1,,CP000685,NC_009441,DOE Joint Genome Institute,complete,,,,1.0,,1.0,6096872.0,34.1,5017.0,soil,isolated from soil in England and is the type ...,,United Kingdom,United Kingdom: England,,,,,,,,,,,,-,Rod,Yes,No,Mesophilic,20-30,Non-halophilic,Aerobic,Multiple,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,,,Sample from Flavobacterium johnsoniae UW101,,,,NCBI,Flavobacterium johnsoniae UW101,2014-01-28T00:00:00.000,SAMN02598357,UW101; ATCC 17061,Generic,Generic.1.0,Generic,NCBI,376686.0,,,,,CP000685,,type strain of Flavobacterium johnsoniae,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,both,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
1,GCA_000023285.1,GCA_000023285.1,PRJNA29403,SAMN00001911,,representative genome,521097.0,1018.0,Capnocytophaga ochracea DSM 7271,strain=DSM 7271,,latest,Complete Genome,Major,Full,2009/08/26,ASM2328v1,US DOE Joint Genome Institute (JGI-PGF),GCF_000023285.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,assembly from type material,Capnocytophaga ochracea DSM 7271,Capnocytophaga ochracea DSM 7271,521097.0,Complete,DSM 7271,,,,,DSM 7271,Yes,2009-04-30T00:00:00Z,21304645.0,PRJNA29403,SAMN00001911,GCA_000023285.1,"SRR013476,SRR013477",CP001632,NC_013162,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,complete,,,,1.0,,1.0,2612925.0,39.6,2171.0,the human oral cavity,isolated from the human oral cavity,,,,,,,,,"Human, Homo sapiens",,,,,,-,Rod,Yes,No,Mesophilic,35-37,,Facultative,Host-associated,,Capnocytophaga ochracea DSM 7271. Capnocytoph...,,,Generic sample from Capnocytophaga ochracea DS...,,,,"Joint Genome Institute, U.S. Department of Energy",Capnocytophaga ochracea DSM 7271,2009-02-19T00:00:00.000,SAMN00001911,DSM 7271,Generic,Generic.1.0,Generic,"Joint Genome Institute, U.S. Department of Energy",521097.0,,,,,FWCB,,type strain of Capnocytophaga ochracea,Capnocytophaga ochracea DSM 7271,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,both,GCA_000023285.1,DSM 7271,,,,,,,,,,,,,,,,,,,,type strain of Capnocytophaga ochracea,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,both


In [37]:
merge2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2686 entries, 0 to 2685
Columns: 550 entries, Genome_ID to merge2
dtypes: category(2), float64(49), object(499)
memory usage: 11.3+ MB


In [38]:
merge3 = pd.merge(merge2, patric_addit_metadata, how="left", left_on="Genome_ID", right_on="Genome_ID", indicator="merge3", suffixes="_z")
merge3["merge3"].value_counts()

both          2686
right_only       0
left_only        0
Name: merge3, dtype: int64

In [39]:
merge = merge3.drop(columns=["merge", "merge2", "merge3", "Additional Metadata"])

In [40]:
to_replace=[r"^missing$", r"^Missing$",  r"^'Missing'", 
            r"^Unclassified$", r"^unclassified$",
            r"^NA$", r"^na$", r'^None$',
            r"^not determined$", r"^Not determined$",
            r"^Not collected$", r"^not collected$", r'^Not Collected$'
            r'^Not recorded$', r'^not recorded$',r'^Not Recorded$', 
            r"^unspecified$", r"^Unspecified$",
            r"^Not Applicable$",r"^Not applicable$", r'^not applicable$',
            r"^not available: to be reported later$", r"^not available$",
            r"^Unknown", r"^unknown", 
            r'^"$', r"^'$", r'^_$',r'^-$'
            ]

In [41]:
merge = merge.replace({w: np.nan for w in to_replace}, regex=True)

In [42]:
merge.head(2)

Unnamed: 0,Genome_ID,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate_x,version_status,assembly_level,release_type,genome_rep,seq_rel_date,asm_name,submitter_x,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,Genome Name,Organism Name,NCBI Taxon ID,Genome Status,Strain_x,Serovar,Pathovar,MLST,Other Typing,Culture Collection,Type Strain_x,Completion Date,Publication,BioProject Accession,BioSample Accession,Assembly Accession,SRA Accession,GenBank Accessions,RefSeq Accessions,Sequencing Centers,Sequencing Status,Sequencing Platform,Sequencing Depth,Assembly Method_x,Chromosomes,Plasmids,Contigs,Genome Length,GC Content,RefSeq CDS,Isolation Source,Isolation Comments_x,Collection Date,Isolation Country,Geographic Location,Latitude,Longitude,Altitude,Depth_x,Other Environmental,Host Name_x,Host Gender,Host Age,Host Health_x,Body Sample Site_x,Other Clinical,Gram Stain,Cell Shape_x,Motility_x,Sporulation_x,Temperature Range_x,Optimal Temperature,Salinity_x,Oxygen Requirement,Habitat,Disease,Comments,Genome Quality Flags,biosample_title,contact_email,first_name,last_name,name,organism,publication_date,sample_accession,strain_x,submission_model,submission_package,submission_package_name,submitter_y,taxonomy_id,collection_date_x,culture_collection_x,geo_loc_name_x,isolation_source_x,sample_name_x,sample_type_x,type-material_x,comment_paragraph,depth_x,derived-from,env_broad_scale,env_local_scale,env_medium,environmental-sample,isolate_y,lat_lon_x,metagenome-source,metagenomic_x,sample-type,elev_x,environmetal-sample,collected_by_x,host_x,host_tissue_sampled_x,temp_x,env_biome_x,env_feature_x,env_material_x,isol_growth_condt_x,locus_tag_prefix_x,num_replicons_x,project_name_x,ref_biomaterial_x,16S recovered,16S recovery software,analysis project type,assembly quality,assembly software,bin parameters,binning software,completeness score,completeness software,contamination score,contig L50,decontamination software,note_x,number of contigs,reassembly post binning,relative coverage Anaerobic (SRR10097246),relative coverage Anoxic (SRR10375098),relative coverage Infiltration (SRR10097247),relative coverage Low pH (SRR10097245),relative coverage Oxic (SRR10375099),relative coverage Reference (SRR10375097),taxa id,total assembly size,nat_host,assembly_method_and_version,completeness_estimated_x,contamination_estimated_x,genome_coverage_x,mapping_method_and_version,quality_assessment_method_and_version,ENA-FIRST-PUBLIC_x,ENA-LAST-UPDATE_x,SUBJECT_ID_x,contact_lab,identified_by_x,environmental_sample_x,metagenome_source_x,biomaterial_provider_x,biome_x,feature_x,material_x,culture-collection_x,altitude_x,isolate_name_alias_x,host_disease_x,rel_to_oxygen_x,Alias_x,ENA checklist_x,INSDC center alias_x,INSDC center name_x,INSDC first public_x,INSDC last update_x,INSDC status_x,SRA accession_x,Sample Name_x,Title_x,serotype,subsrc_note_x,annotation_method,assembly_method_x,biosamplemodel,geographic location (country and/or atlantic ocean),geographic location (region and locality),mapping_method_x,quality_assessment_method_x,refinement_method,Cell Shape_y,Funding Program_x,GOLD Stamp ID_x,Gene Calling Method_x,Gram Staining_x,Isolation Comments_y,Isolation Site_x,Motility_y,Temperature Optimum_x,Temperature Range_y,Type Strain_y,alt_elev_x,biotic_relationship_x,country_x,environment_x,investigation_type_x,isolation-source_x,trophic_level_x,pH_x,Body Sample Site_y,Body Sample SubSite_x,Diseases_x,Host Name_y,Host Health_y,Sporulation_y,experimental_factor,lat-lon,ENA-CHECKLIST,External Id,Submitter Id,subsrc-note,collection date_x,environment (biome)_x,environment (feature)_x,environment (material)_x,geographic location (country and/or sea)_x,geographic location (latitude)_x,geographic location (longitude)_x,host common name,host-associated environmental package,investigation type_x,project name_x,sequencing method_x,STRAIN,description,completeness,contamination,host_age_x,host_sex_x,oxy_stat_samp,samp_store_dur,samp_store_temp,FDA CVM ID_x,bio_material,common name,host scientific name_x,samp_mat_process,estimated_size_x,Tax ID,Collaborator ID,host_description,host_disease_outcome_x,host_disease_stage,host_health_state,identification method_x,strain_name_alias_x,WGA amplification approach,metagenomic source,sample derived from,single cell or viral particle lysis approach,sorting technology,taxonomic identity marker,passage_history_,samp_size,Description,subsource_note,lab_host_,collection_room,relative_week,assembly_tool,bin_id,Broker name,MAG_number,derived_from,alkalinity (carbonate/bicarbonate),salinity,Phenotypes_x,env_package,specific_host_x,binning parameters,plant-associated environmental package_x,culture collection,repository,risk group,type status,Assembly Method_y,Genome Coverage,Sequencing Technology,nat-host_x,collection-date,Historical Monobactam Produced_x,Unique Monobactam Clusters Identified_x,extrachrom_elements,pathogenicity,source_material_id,genotype,mating_type_,serovar_x,specimen_voucher_,Salinity (PSU),collected-by,orgmod_note,assembly_x,sequencing_meth_x,finishing_strategy_x,geo-loc-name_x,finishing strategy (depth of coverage)_x,host taxid_x,misc_param: HMP body site,misc_param: HMP supersite,nucleic acid extraction,project_type_x,sop_x,source_mat_id_x,Salinity_y,body_site,health_state,Isolation source_x,Host_x,isolation source_x,environmental package,specific host_x,lat lon,host_subject_id,pathotype,subgroup,subtype,Extraction Date,Extraction Method,Laboratory Host,Passage Date,sample type,host health state_x,anonymized_name_x,host_status,supplier_name_x,abs_air_humidity,air_temp,build_occup_type,building_setting,carb_dioxide,filter_type,heat_cool_type,indoor_space,light_type,occup_samp,occupant_dens_samp,organism_count,rel_air_humidity,space_typ_state,typ_occupant_dens,ventilation_type,Type-material,samp_collect_device,geographic location (altitude),env_packatge,lib_reads_seqd,metagenomic_source,env material,other identifiers,assembly_method_version_x,mapping_method_version_x,quality_assessment_method_version_x,value_x,cyanobacterial_culture,missing,samp_store_loc,ethnicity,host_body_habitat,host_taxid,misc_param_x,ph,metagenome-source seawater metagenome,type_status,indoor_surf,samp_sort_meth,samp_vol_we_dna_ext,surf_material,BioSampleModel_x,ammonium,biomass,carb_nitro_ratio,chlorophyll,diss_inorg_nitro,diss_inorg_phosp,diss_org_carb,diss_org_nitro,nitrate,nitrite,phosphate,silicate,tidal_stage,tot_diss_nitro,locus_tag prefix,assembly_accession,strain_y,finishing strategy (depth of coverage)_y,biotic_relationship_y,geo_loc_name_y,host_y,collected_by_y,host_disease_y,isolation_source_y,sample_name_y,isolation-source_y,sample_type_y,FDA CVM ID_y,culture_collection_y,assembly_method_y,collection_date_y,env_feature_y,synonym,Strain,SUBJECT_ID_y,Alias_y,type-material_y,misc_param_y,collection date_y,Isolation Site_y,Isolation source_y,country_y,note_y,GOLD Stamp ID_y,investigation_type_y,culture-collection_y,lat_lon_y,alt_elev_y,locus_tag_prefix_y,nat-host_y,assembly_method_version_y,env_material_y,model,sample_description,Sample Description,ArrayExpress-STRAIN_OR_LINE,ENA checklist_y,project_type_y,estimated_size_y,Temperature Optimum_y,isol_growth_condt_y,Motility,feature_y,project_name_y,assembly_y,num_replicons_y,isolate_,depth_y,env_biome_y,altitude_y,ref_biomaterial_y,package,geographic location,ArrayExpress-StrainOrLine,ArrayExpress-Phenotype,ArrayExpress-SPECIES,INSDC center alias_y,geo-loc-name_y,environment_y,host_sex_y,elev_y,PublicAccession,completeness_estimated_y,geographic location (country and/or sea)_y,latitude and longitude,specific_host_y,ArrayExpress-Species,ENA-FIRST-PUBLIC_y,INSDC center name_y,specific host_y,Body Sample Site,pH_y,biome_y,biomaterial_provider_y,isolate_name_alias_y,contamination_estimated_y,host scientific name_y,anonymized_name_y,ENA-LAST-UPDATE_y,INSDC first public_y,Cell Shape,BioSampleModel_y,environmental_sample_y,material_y,identified_by_y,host health state_y,INSDC last update_y,Diseases_y,strain_name_alias_y,genome_coverage_y,INSDC status_y,host taxid_y,isolation source_y,Gram Staining_y,Temperature Range,Sporulation,sequencing_meth_y,Historical Monobactam Produced_y,identification method_y,supplier_name_y,SRA accession_y,host_disease_outcome_y,Unique Monobactam Clusters Identified_y,ProjectAccession,Sample Name_y,Type Strain,Phenotypes_y,Gene Calling Method_y,temp_y,Title_y,investigation type_y,Host_y,rel_to_oxygen_y,host_age_y,Species,mapping_method_y,serovar_y,sop_y,Isolation Comments,finishing_strategy_y,Genus,mapping_method_version_y,environment (biome)_y,attribute_package,metagenome_source_y,environment (feature)_y,trophic_level_y,metagenomic_y,environment (material)_y,Funding Program_y,quality_assessment_method_y,quality_assessment_method_version_y,geographic location (latitude)_y,source_mat_id_y,host_tissue_sampled_y,geographic location (longitude)_y,Host Name,subsrc_note_y,Host Health,value_y,plant-associated environmental package_y,project name_y,sequencing method_y,Body Sample SubSite_y,collected_by,sample_type,biomaterial_provider,identified_by,passage_historyz,isolatez,specimen_voucherz,Unnamed: 555,lab_hostz,mating_typez
0,GCA_000016645.1,GCA_000016645.1,PRJNA16082,SAMN02598357,,representative genome,376686.0,986.0,Flavobacterium johnsoniae UW101,strain=UW101; ATCC 17061,,latest,Complete Genome,Major,Full,2007/05/01,ASM1664v1,US DOE Joint Genome Institute,GCF_000016645.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,assembly from type material,Flavobacterium johnsoniae UW101,Flavobacterium johnsoniae UW101,376686.0,Complete,ATCC 17061,,,,,ATCC 17061,Yes,2007-05-01T00:00:00Z,,PRJNA16082,SAMN02598357,GCA_000016645.1,,CP000685,NC_009441,DOE Joint Genome Institute,complete,,,,1.0,,1.0,6096872.0,34.1,5017.0,soil,isolated from soil in England and is the type ...,,United Kingdom,United Kingdom: England,,,,,,,,,,,,,Rod,Yes,No,Mesophilic,20-30,Non-halophilic,Aerobic,Multiple,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,,Sample from Flavobacterium johnsoniae UW101,,,,NCBI,Flavobacterium johnsoniae UW101,2014-01-28T00:00:00.000,SAMN02598357,UW101; ATCC 17061,Generic,Generic.1.0,Generic,NCBI,376686.0,,,,,CP000685,,type strain of Flavobacterium johnsoniae,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,GCA_000023285.1,GCA_000023285.1,PRJNA29403,SAMN00001911,,representative genome,521097.0,1018.0,Capnocytophaga ochracea DSM 7271,strain=DSM 7271,,latest,Complete Genome,Major,Full,2009/08/26,ASM2328v1,US DOE Joint Genome Institute (JGI-PGF),GCF_000023285.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,,assembly from type material,Capnocytophaga ochracea DSM 7271,Capnocytophaga ochracea DSM 7271,521097.0,Complete,DSM 7271,,,,,DSM 7271,Yes,2009-04-30T00:00:00Z,21304645.0,PRJNA29403,SAMN00001911,GCA_000023285.1,"SRR013476,SRR013477",CP001632,NC_013162,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,complete,,,,1.0,,1.0,2612925.0,39.6,2171.0,the human oral cavity,isolated from the human oral cavity,,,,,,,,,"Human, Homo sapiens",,,,,,,Rod,Yes,No,Mesophilic,35-37,,Facultative,Host-associated,,Capnocytophaga ochracea DSM 7271. Capnocytoph...,,Generic sample from Capnocytophaga ochracea DS...,,,,"Joint Genome Institute, U.S. Department of Energy",Capnocytophaga ochracea DSM 7271,2009-02-19T00:00:00.000,SAMN00001911,DSM 7271,Generic,Generic.1.0,Generic,"Joint Genome Institute, U.S. Department of Energy",521097.0,,,,,FWCB,,type strain of Capnocytophaga ochracea,Capnocytophaga ochracea DSM 7271,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,GCA_000023285.1,DSM 7271,,,,,,,,,,,,,,,,,,,,type strain of Capnocytophaga ochracea,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [43]:
#merge.columns.duplicated()
merge_reordered = merge.reindex(sorted(merge.columns), axis=1)
merge_reordered.head(2)

Unnamed: 0,Unnamed: 1,# assembly_accession,16S recovered,16S recovery software,Alias_x,Alias_y,Altitude,ArrayExpress-Phenotype,ArrayExpress-SPECIES,ArrayExpress-STRAIN_OR_LINE,ArrayExpress-Species,ArrayExpress-StrainOrLine,Assembly Accession,Assembly Method_x,Assembly Method_y,BioProject Accession,BioSample Accession,BioSampleModel_x,BioSampleModel_y,Body Sample Site,Body Sample Site_x,Body Sample Site_y,Body Sample SubSite_x,Body Sample SubSite_y,Broker name,Cell Shape,Cell Shape_x,Cell Shape_y,Chromosomes,Collaborator ID,Collection Date,Comments,Completion Date,Contigs,Culture Collection,Depth_x,Description,Disease,Diseases_x,Diseases_y,ENA checklist_x,ENA checklist_y,ENA-CHECKLIST,ENA-FIRST-PUBLIC_x,ENA-FIRST-PUBLIC_y,ENA-LAST-UPDATE_x,ENA-LAST-UPDATE_y,External Id,Extraction Date,Extraction Method,FDA CVM ID_x,FDA CVM ID_y,Funding Program_x,Funding Program_y,GC Content,GOLD Stamp ID_x,GOLD Stamp ID_y,GenBank Accessions,Gene Calling Method_x,Gene Calling Method_y,Genome Coverage,Genome Length,Genome Name,Genome Quality Flags,Genome Status,Genome_ID,Genus,Geographic Location,Gram Stain,Gram Staining_x,Gram Staining_y,Habitat,Historical Monobactam Produced_x,Historical Monobactam Produced_y,Host Age,Host Gender,Host Health,Host Health_x,Host Health_y,Host Name,Host Name_x,Host Name_y,Host_x,Host_y,INSDC center alias_x,INSDC center alias_y,INSDC center name_x,INSDC center name_y,INSDC first public_x,INSDC first public_y,INSDC last update_x,INSDC last update_y,INSDC status_x,INSDC status_y,Isolation Comments,Isolation Comments_x,Isolation Comments_y,Isolation Country,Isolation Site_x,Isolation Site_y,Isolation Source,Isolation source_x,Isolation source_y,Laboratory Host,Latitude,Longitude,MAG_number,MLST,Motility,Motility_x,Motility_y,NCBI Taxon ID,Optimal Temperature,Organism Name,Other Clinical,Other Environmental,Other Typing,Oxygen Requirement,Passage Date,Pathovar,Phenotypes_x,Phenotypes_y,Plasmids,ProjectAccession,PublicAccession,Publication,RefSeq Accessions,RefSeq CDS,SRA Accession,SRA accession_x,SRA accession_y,STRAIN,SUBJECT_ID_x,SUBJECT_ID_y,Salinity (PSU),Salinity_x,Salinity_y,Sample Description,Sample Name_x,Sample Name_y,Sequencing Centers,Sequencing Depth,Sequencing Platform,Sequencing Status,Sequencing Technology,Serovar,Species,Sporulation,Sporulation_x,Sporulation_y,Strain,Strain_x,Submitter Id,Tax ID,Temperature Optimum_x,Temperature Optimum_y,Temperature Range,Temperature Range_x,Temperature Range_y,Title_x,Title_y,Type Strain,Type Strain_x,Type Strain_y,Type-material,Unique Monobactam Clusters Identified_x,Unique Monobactam Clusters Identified_y,WGA amplification approach,abs_air_humidity,air_temp,alkalinity (carbonate/bicarbonate),alt_elev_x,alt_elev_y,altitude_x,altitude_y,ammonium,analysis project type,annotation_method,anonymized_name_x,anonymized_name_y,asm_name,assembly quality,assembly software,assembly_accession,assembly_level,assembly_method_and_version,assembly_method_version_x,assembly_method_version_y,assembly_method_x,assembly_method_y,assembly_tool,assembly_x,assembly_y,attribute_package,bin parameters,bin_id,binning parameters,binning software,bio_material,biomass,biomaterial_provider,biomaterial_provider_x,biomaterial_provider_y,biome_x,biome_y,bioproject,biosample,biosample_title,biosamplemodel,biotic_relationship_x,biotic_relationship_y,body_site,build_occup_type,building_setting,carb_dioxide,carb_nitro_ratio,chlorophyll,collected-by,collected_by,collected_by_x,collected_by_y,collection date_x,collection date_y,collection-date,collection_date_x,collection_date_y,collection_room,comment_paragraph,common name,completeness,completeness score,completeness software,completeness_estimated_x,completeness_estimated_y,contact_email,contact_lab,contamination,contamination score,contamination_estimated_x,contamination_estimated_y,contig L50,country_x,country_y,culture collection,culture-collection_x,culture-collection_y,culture_collection_x,culture_collection_y,cyanobacterial_culture,decontamination software,depth_x,depth_y,derived-from,derived_from,description,diss_inorg_nitro,diss_inorg_phosp,diss_org_carb,diss_org_nitro,elev_x,elev_y,env material,env_biome_x,env_biome_y,env_broad_scale,env_feature_x,env_feature_y,env_local_scale,env_material_x,env_material_y,env_medium,env_package,env_packatge,environment (biome)_x,environment (biome)_y,environment (feature)_x,environment (feature)_y,environment (material)_x,environment (material)_y,environment_x,environment_y,environmental package,environmental-sample,environmental_sample_x,environmental_sample_y,environmetal-sample,estimated_size_x,estimated_size_y,ethnicity,excluded_from_refseq,experimental_factor,extrachrom_elements,feature_x,feature_y,filter_type,finishing strategy (depth of coverage)_x,finishing strategy (depth of coverage)_y,finishing_strategy_x,finishing_strategy_y,first_name,ftp_path,gbrs_paired_asm,genome_coverage_x,genome_coverage_y,genome_rep,genotype,geo-loc-name_x,geo-loc-name_y,geo_loc_name_x,geo_loc_name_y,geographic location,geographic location (altitude),geographic location (country and/or atlantic ocean),geographic location (country and/or sea)_x,geographic location (country and/or sea)_y,geographic location (latitude)_x,geographic location (latitude)_y,geographic location (longitude)_x,geographic location (longitude)_y,geographic location (region and locality),health_state,heat_cool_type,host common name,host health state_x,host health state_y,host scientific name_x,host scientific name_y,host taxid_x,host taxid_y,host-associated environmental package,host_age_x,host_age_y,host_body_habitat,host_description,host_disease_outcome_x,host_disease_outcome_y,host_disease_stage,host_disease_x,host_disease_y,host_health_state,host_sex_x,host_sex_y,host_status,host_subject_id,host_taxid,host_tissue_sampled_x,host_tissue_sampled_y,host_x,host_y,identification method_x,identification method_y,identified_by,identified_by_x,identified_by_y,indoor_space,indoor_surf,infraspecific_name,investigation type_x,investigation type_y,investigation_type_x,investigation_type_y,isol_growth_condt_x,isol_growth_condt_y,isolate_,isolate_name_alias_x,isolate_name_alias_y,isolate_x,isolate_y,isolatez,isolation source_x,isolation source_y,isolation-source_x,isolation-source_y,isolation_source_x,isolation_source_y,lab_host_,lab_hostz,last_name,lat lon,lat-lon,lat_lon_x,lat_lon_y,latitude and longitude,lib_reads_seqd,light_type,locus_tag prefix,locus_tag_prefix_x,locus_tag_prefix_y,mapping_method_and_version,mapping_method_version_x,mapping_method_version_y,mapping_method_x,mapping_method_y,material_x,material_y,mating_type_,mating_typez,metagenome-source,metagenome-source seawater metagenome,metagenome_source_x,metagenome_source_y,metagenomic source,metagenomic_source,metagenomic_x,metagenomic_y,misc_param: HMP body site,misc_param: HMP supersite,misc_param_x,misc_param_y,missing,model,name,nat-host_x,nat-host_y,nat_host,nitrate,nitrite,note_x,note_y,nucleic acid extraction,num_replicons_x,num_replicons_y,number of contigs,occup_samp,occupant_dens_samp,organism,organism_count,organism_name,orgmod_note,other identifiers,oxy_stat_samp,pH_x,pH_y,package,paired_asm_comp,passage_history_,passage_historyz,pathogenicity,pathotype,ph,phosphate,plant-associated environmental package_x,plant-associated environmental package_y,project name_x,project name_y,project_name_x,project_name_y,project_type_x,project_type_y,publication_date,quality_assessment_method_and_version,quality_assessment_method_version_x,quality_assessment_method_version_y,quality_assessment_method_x,quality_assessment_method_y,reassembly post binning,ref_biomaterial_x,ref_biomaterial_y,refinement_method,refseq_category,rel_air_humidity,rel_to_oxygen_x,rel_to_oxygen_y,relation_to_type_material,relative coverage Anaerobic (SRR10097246),relative coverage Anoxic (SRR10375098),relative coverage Infiltration (SRR10097247),relative coverage Low pH (SRR10097245),relative coverage Oxic (SRR10375099),relative coverage Reference (SRR10375097),relative_week,release_type,repository,risk group,salinity,samp_collect_device,samp_mat_process,samp_size,samp_sort_meth,samp_store_dur,samp_store_loc,samp_store_temp,samp_vol_we_dna_ext,sample derived from,sample type,sample-type,sample_accession,sample_description,sample_name_x,sample_name_y,sample_type,sample_type_x,sample_type_y,seq_rel_date,sequencing method_x,sequencing method_y,sequencing_meth_x,sequencing_meth_y,serotype,serovar_x,serovar_y,silicate,single cell or viral particle lysis approach,sop_x,sop_y,sorting technology,source_mat_id_x,source_mat_id_y,source_material_id,space_typ_state,species_taxid,specific host_x,specific host_y,specific_host_x,specific_host_y,specimen_voucher_,specimen_voucherz,strain_name_alias_x,strain_name_alias_y,strain_x,strain_y,subgroup,submission_model,submission_package,submission_package_name,submitter_x,submitter_y,subsource_note,subsrc-note,subsrc_note_x,subsrc_note_y,subtype,supplier_name_x,supplier_name_y,surf_material,synonym,taxa id,taxid,taxonomic identity marker,taxonomy_id,temp_x,temp_y,tidal_stage,tot_diss_nitro,total assembly size,trophic_level_x,trophic_level_y,typ_occupant_dens,type status,type-material_x,type-material_y,type_status,value_x,value_y,ventilation_type,version_status,wgs_master
0,,GCA_000016645.1,,,,,,,,,,,GCA_000016645.1,,,PRJNA16082,SAMN02598357,,,,,,,,,,Rod,,1.0,,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,2007-05-01T00:00:00Z,1.0,ATCC 17061,,,,,,,,,,,,,,,,,,,,34.1,,,CP000685,,,,6096872.0,Flavobacterium johnsoniae UW101,,Complete,GCA_000016645.1,,United Kingdom: England,,,,Multiple,,,,,,,,,,,,,,,,,,,,,,,,isolated from soil in England and is the type ...,,United Kingdom,,,soil,,,,,,,,,Yes,,376686.0,20-30,Flavobacterium johnsoniae UW101,,,,Aerobic,,,,,,,,,NC_009441,5017.0,,,,,,,,Non-halophilic,,,,,DOE Joint Genome Institute,,,complete,,,,,No,,,ATCC 17061,,,,,,Mesophilic,,,,,Yes,,,,,,,,,,,,,,,,,,ASM1664v1,,,,Complete Genome,,,,,,,,,,,,,,,,,,,,,PRJNA16082,SAMN02598357,Sample from Flavobacterium johnsoniae UW101,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000016645.1,,,Full,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,strain=UW101; ATCC 17061,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NCBI,,,,,,,,,,,,,,Flavobacterium johnsoniae UW101,,Flavobacterium johnsoniae UW101,,,,,,,identical,,,,,,,,,,,,,,,2014-01-28T00:00:00.000,,,,,,,,,,representative genome,,,,assembly from type material,,,,,,,,Major,,,,,,,,,,,,,,,SAMN02598357,,CP000685,,,,,2007/05/01,,,,,,,,,,,,,,,,,986.0,,,,,,,,,UW101; ATCC 17061,,,Generic,Generic.1.0,Generic,US DOE Joint Genome Institute,NCBI,,,,,,,,,,,376686.0,,376686.0,,,,,,,,,,type strain of Flavobacterium johnsoniae,,,,,,latest,
1,,GCA_000023285.1,,,,,,,,,,,GCA_000023285.1,,,PRJNA29403,SAMN00001911,,,,,,,,,,Rod,,1.0,,,Capnocytophaga ochracea DSM 7271. Capnocytoph...,2009-04-30T00:00:00Z,1.0,DSM 7271,,,,,,,,,,,,,,,,,,,,39.6,,,CP001632,,,,2612925.0,Capnocytophaga ochracea DSM 7271,,Complete,GCA_000023285.1,,,,,,Host-associated,,,,,,,,,"Human, Homo sapiens",,,,,,,,,,,,,,,isolated from the human oral cavity,,,,,the human oral cavity,,,,,,,,,Yes,,521097.0,35-37,Capnocytophaga ochracea DSM 7271,,,,Facultative,,,,,,,,21304645.0,NC_013162,2171.0,"SRR013476,SRR013477",,,,,,,,,,,,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,,,complete,,,,,No,,,DSM 7271,,,,,,Mesophilic,,,,,Yes,,,,,,,,,,,,,,,,,,ASM2328v1,,,GCA_000023285.1,Complete Genome,,,,,,,,,,,,,,,,,,,,,PRJNA29403,SAMN00001911,Generic sample from Capnocytophaga ochracea DS...,,,,,,,,,,,,,,,,,,,,Capnocytophaga ochracea DSM 7271,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000023285.1,,,Full,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,strain=DSM 7271,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Joint Genome Institute, U.S. Department of Energy",,,,,,,,,,,,,,Capnocytophaga ochracea DSM 7271,,Capnocytophaga ochracea DSM 7271,,,,,,,identical,,,,,,,,,,,,,,,2009-02-19T00:00:00.000,,,,,,,,,,representative genome,,,,assembly from type material,,,,,,,,Major,,,,,,,,,,,,,,,SAMN00001911,,FWCB,,,,,2009/08/26,,,,,,,,,,,,,,,,,1018.0,,,,,,,,,DSM 7271,DSM 7271,,Generic,Generic.1.0,Generic,US DOE Joint Genome Institute (JGI-PGF),"Joint Genome Institute, U.S. Department of Energy",,,,,,,,,,,521097.0,,521097.0,,,,,,,,,,type strain of Capnocytophaga ochracea,type strain of Capnocytophaga ochracea,,,,,latest,


In [44]:
columns = merge_reordered.columns.to_list()

In [45]:
with open('Attributes.csv', 'w') as f:
    for item in columns:
        f.write("%s\n" % item)

**Open file that has columns to be joined together:**

In [46]:
with open("Attributes_reformated.csv", "r+") as f:
    lines = f.readlines()

In [47]:
#create dictionary with info from loaded file
d={}
for line in lines:
    words = line.split(",")
    words = [x.strip() for x in words if x.strip()]
    d[words[0]] = words[1:]

In [48]:
new_cols=[]
to_drop = []
dummy = merge_reordered.copy() # copy of the dataframe (for security reasons)

In [49]:
for i,a in d.items(): #i - name of new column; a - columns to merge; d : dictionary[i] = a
    #print("New column: " + i + "_merged; combining: " +  str(a))
    #print(i)
    to_drop.append(a[0])
    dummy[i + "_merged"] = dummy[a[0]]
    for x in a[1:]:
        dummy[i + "_merged"] = dummy[i + "_merged"].combine_first(dummy[x])
        #print("added " +  x)
        to_drop.append(x)
    new_cols.append(str(i) + "_merged")

In [50]:
to_look = dummy.reindex(sorted(dummy.columns), axis=1)
to_look.to_csv("before_cleaning.csv")

In [51]:
# drop columns already used in the joined columns:
dummy3 = dummy.drop(columns=to_drop)

# rename columns created by the join to the final name:
dummy3.columns = dummy3.columns.str.replace('_merged', '')
dummy3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2686 entries, 0 to 2685
Columns: 299 entries,  to WGS master
dtypes: float64(44), object(255)
memory usage: 6.1+ MB


In [52]:
length = len(dummy3.index)

In [53]:
dummy3.loc["SUM", :] = dummy3.isnull().sum(axis = 0)
#dummy3.to_csv("Dummy3.csv")

#remove columns that have less than 10 entries
dummy4 = dummy3[dummy3.columns[dummy3.loc["SUM"] <(length - 10)]]
dummy4.drop(index="SUM", inplace=True)
dummy4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2686 entries, 0 to 2685
Columns: 169 entries, Alias to WGS master
dtypes: float64(9), object(160)
memory usage: 3.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [54]:
dummy4.head(2)

Unnamed: 0,Alias,Altitude,Anonymized name,Arrayexpress species,Arrayexpress strain or line,ASM name,Assembly accession,Assembly method,Assembly level,Biomaterial provider,Bioproject accession,Biosample accession,Biosample model,Biosample title,Biotic relationship,Body sample site,Cell shape,Chromosomes,Collected by,Collection date,Comment paragraph,Completeness estimated,Completion date,Contact email,Contact lab,Contamination estimated,Contigs,Country,Culture collection,Depth,Derived from,Description,Disease,Elevation,ENA checklist,ENA-first-public,ENA-last-update,Env biome,Env broad scale,Env feature,Env local scale,Env material,Env medium,Environment,Environmental sample,Estimated size,Excluded from refseq,FDA cvm ID,Finishing strategy (depth of coverage),First name,Ftp path,Gbrs paired ASM,Gc content,Genbank accessions,Gene calling method,Genome coverage,Genome ID,Genome length,Genome name,Genome quality flags,Genome rep,Genome status,Genotype,Geo loc name,Geographic location,Geographic location (country and/or atlantic ocean),GOLD stamp ID,Gram stain,Habitat,Host,Host age,Host disease,Host disease outcome,Host gender,Host health,Host name,Host scientific name,Host sex,Host taxid,Host tissue sampled,Identification method,Identified by,Infraspecific name,INSDC center alias,INSDC center name,INSDC first public,INSDC last update,INSDC status,Investigation type,Isol growth condt,Isolate,Isolation comments,Isolation country,Isolation site,Isolation source,Last name,lat lon,Latitude,Locus tag prefix,Longitude,Mapping method and version,Material,Metagenome source,Metagenomic,Misc param,Misc param: hmp supersite,Mlst,Motility,Name,Ncbi taxon ID,Note,Num replicons,Organism,Organism name,Other clinical,Other environmental,Oxygen requirement,Paired ASM comp,pH,Phenotypes,Plasmids,Project name,Project type,Publication,Publication date,Quality assessment method and version,Ref biomaterial,Refseq accessions,Refseq category,Refseq cds,Rel to oxygen,Relation to type material,Release type,Salinity,Samp size,Sample accession,Sample name,Sample type,Seq rel date,Sequencing centers,Sequencing depth,Sequencing method,Sequencing platform,Sequencing status,Sop,Source mat ID,Species taxid,Specific host,Sporulation,SRA accession,Strain,Strain name alias,Subject ID,Submission model,Submission package,Submitter,Subsrc note,Supplier name,Taxid,Temperature,Temperature optimum,Temperature range,Title,Trophic level,Type strain,Type material,Value,Version status,WGS master
0,,,,,,ASM1664v1,GCA_000016645.1,,Complete Genome,,PRJNA16082,SAMN02598357,,Sample from Flavobacterium johnsoniae UW101,,,Rod,1.0,,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,,2007-05-01T00:00:00Z,,,,1.0,,ATCC 17061,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000016645.1,34.1,CP000685,,,GCA_000016645.1,6096872.0,Flavobacterium johnsoniae UW101,,Full,Complete,,,United Kingdom: England,,,,Multiple,,,,,,,,,,,,,,strain=UW101; ATCC 17061,,,,,,,,,isolated from soil in England and is the type ...,United Kingdom,,soil,,,,,,,,,,,,,Yes,NCBI,376686.0,,,Flavobacterium johnsoniae UW101,Flavobacterium johnsoniae UW101,,,Aerobic,identical,,,,,,,2014-01-28T00:00:00.000,,,NC_009441,representative genome,5017.0,,assembly from type material,Major,Non-halophilic,,SAMN02598357,CP000685,,2007/05/01,DOE Joint Genome Institute,,,,complete,,,986.0,,No,,ATCC 17061,,,Generic,Generic.1.0,US DOE Joint Genome Institute,,,376686,,20-30,Mesophilic,,,Yes,type strain of Flavobacterium johnsoniae,,latest,
1,,,,,,ASM2328v1,GCA_000023285.1,,Complete Genome,,PRJNA29403,SAMN00001911,,Generic sample from Capnocytophaga ochracea DS...,,,Rod,1.0,,,Capnocytophaga ochracea DSM 7271,,2009-04-30T00:00:00Z,,,,1.0,,DSM 7271,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000023285.1,39.6,CP001632,,,GCA_000023285.1,2612925.0,Capnocytophaga ochracea DSM 7271,,Full,Complete,,,,,,,Host-associated,,,,,,,"Human, Homo sapiens",,,,,,,strain=DSM 7271,,,,,,,,,isolated from the human oral cavity,,,the human oral cavity,,,,,,,,,,,,,Yes,"Joint Genome Institute, U.S. Department of Energy",521097.0,,,Capnocytophaga ochracea DSM 7271,Capnocytophaga ochracea DSM 7271,,,Facultative,identical,,,,,,21304645.0,2009-02-19T00:00:00.000,,,NC_013162,representative genome,2171.0,,assembly from type material,Major,,,SAMN00001911,FWCB,,2009/08/26,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,,,,complete,,,1018.0,,No,"SRR013476,SRR013477",DSM 7271,,,Generic,Generic.1.0,US DOE Joint Genome Institute (JGI-PGF),,,521097,,35-37,Mesophilic,,,Yes,type strain of Capnocytophaga ochracea,,latest,


In [55]:
dummy4.drop(columns=["Completeness estimated","Contamination estimated","Gc content","Genome length","Genome coverage"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [56]:
dummy4.to_csv("Metadata.csv", index=False)

## 2.6 Check if sample is marine or not 
From coordinates (latitude and longitude). \
Python package: global_land_mask

In [57]:
metadata = dummy4.copy()

In [58]:
pattern = '|'.join([" N"," W", " S", " E", " DD", "N", "E"])
metadata["latitude_longitude"] = metadata["lat lon"].str.replace(pattern, "")

In [59]:
metadata["Lat"] = metadata["Latitude"].str.replace(" DD", "")
metadata["Long"] = metadata["Longitude"].str.replace(" DD", "")

In [60]:
new = metadata["latitude_longitude"].str.split(" ", expand=True)

In [61]:
# making separate column names
metadata["lat"]= new[0] 
metadata["lon"]= new[1] 

In [62]:
metadata["Latitude"] = metadata["Lat"].combine_first(metadata["lat"])
metadata["Longitude"] = metadata["Long"].combine_first(metadata["lon"])
metadata.drop(columns=["latitude_longitude","Lat", "Long", "lat lon", "lat", "lon"], inplace=True)

In [63]:
df = metadata[["Assembly accession", "Latitude", "Longitude"]]

In [64]:
from global_land_mask import globe
d = {}
for i in df.iterrows():
    for x in i[1:]:
        accession = x[0]
        #print(accession)
        lat = (x[1])
        #print(lat)
        lon = (x[2])
        #print(lon)
        try:
            lat = float(lat)
            lon = float(lon)
            try:
                is_on_land = globe.is_land(lat,lon)
                d[accession] = is_on_land
                #print("added to dic")
            except:
                d[accession] = None
                #print("out of bonds")
        except ValueError:
            d[accession] = None
            #print("Not a float")
        #print("\n")

In [65]:
is_marine = pd.DataFrame.from_dict(d, orient='index', columns=["Marine_coordinates"])

In [66]:
merge = pd.merge(metadata, is_marine, how="left", left_on="Assembly accession", right_index=True, indicator="merge")
merge["merge"].value_counts()

both          2686
right_only       0
left_only        0
Name: merge, dtype: int64

In [67]:
merge["Marine_coordinates"].value_counts()

True     393
False    270
Name: Marine_coordinates, dtype: int64

In [68]:
merge = merge.drop(columns=["merge"])
merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2686 entries, 0 to 2685
Columns: 164 entries, Alias to Marine_coordinates
dtypes: float64(7), object(157)
memory usage: 3.4+ MB


In [69]:
merge = merge.set_index("Assembly accession")
merge.to_csv("Metadata.csv")

## 2.7 Classifier

after running classifier_v3.0.py:

In [86]:
metadata_class = pd.read_csv("Metadata_classification.csv")
metadata_class.head(2)

Unnamed: 0,Assembly accession,Alias,Altitude,Anonymized name,Arrayexpress species,Arrayexpress strain or line,ASM name,Assembly method,Assembly level,Biomaterial provider,Bioproject accession,Biosample accession,Biosample model,Biosample title,Biotic relationship,Body sample site,Cell shape,Chromosomes,Collected by,Collection date,Comment paragraph,Completion date,Contact email,Contact lab,Contigs,Country,Culture collection,Depth,Derived from,Description,Disease,Elevation,ENA checklist,ENA-first-public,ENA-last-update,Env biome,Env broad scale,Env feature,Env local scale,Env material,Env medium,Environment,Environmental sample,Estimated size,Excluded from refseq,FDA cvm ID,Finishing strategy (depth of coverage),First name,Ftp path,Gbrs paired ASM,Genbank accessions,Gene calling method,Genome ID,Genome name,Genome quality flags,Genome rep,Genome status,Genotype,Geo loc name,Geographic location,Geographic location (country and/or atlantic ocean),GOLD stamp ID,Gram stain,Habitat,Host,Host age,Host disease,Host disease outcome,Host gender,Host health,Host name,Host scientific name,Host sex,Host taxid,Host tissue sampled,Identification method,Identified by,Infraspecific name,INSDC center alias,INSDC center name,INSDC first public,INSDC last update,INSDC status,Investigation type,Isol growth condt,Isolate,Isolation comments,Isolation country,Isolation site,Isolation source,Last name,Latitude,Locus tag prefix,Longitude,Mapping method and version,Material,Metagenome source,Metagenomic,Misc param,Misc param: hmp supersite,Mlst,Motility,Name,Ncbi taxon ID,Note,Num replicons,Organism,Organism name,Other clinical,Other environmental,Oxygen requirement,Paired ASM comp,pH,Phenotypes,Plasmids,Project name,Project type,Publication,Publication date,Quality assessment method and version,Ref biomaterial,Refseq accessions,Refseq category,Refseq cds,Rel to oxygen,Relation to type material,Release type,Salinity,Samp size,Sample accession,Sample name,Sample type,Seq rel date,Sequencing centers,Sequencing depth,Sequencing method,Sequencing platform,Sequencing status,Serovar,Sop,Source mat ID,Species taxid,Specific host,Sporulation,SRA accession,Strain,Strain name alias,Subject ID,Submission model,Submission package,Submitter,Subsrc note,Supplier name,Taxid,Temperature,Temperature optimum,Temperature range,Title,Trophic level,Type strain,Type material,Value,Version status,WGS master,Marine_coordinates,MAG_words,MAG,Human-associated_words,Human-associated,Terrestrial_words,Terrestrial,Marine_words,Marine,Freshwater_words,Freshwater,Origin,Conflict_words,Conflict?
0,GCA_000016645.1,,,,,,ASM1664v1,,Complete Genome,,PRJNA16082,SAMN02598357,,Sample from Flavobacterium johnsoniae UW101,,,Rod,1.0,,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,2007-05-01T00:00:00Z,,,1.0,,ATCC 17061,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000016645.1,CP000685,,GCA_000016645.1,Flavobacterium johnsoniae UW101,,Full,Complete,,,United Kingdom: England,,,,Multiple,,,,,,,,,,,,,,strain=UW101; ATCC 17061,,,,,,,,,isolated from soil in England and is the type ...,United Kingdom,,soil,,,,,,,,,,,,Yes,NCBI,376686.0,,,Flavobacterium johnsoniae UW101,Flavobacterium johnsoniae UW101,,,Aerobic,identical,,,,,,,2014-01-28T00:00:00.000,,,NC_009441,representative genome,5017.0,,assembly from type material,Major,Non-halophilic,,SAMN02598357,CP000685,,2007/05/01,DOE Joint Genome Institute,,,,complete,,,,986.0,,No,,ATCC 17061,,,Generic,Generic.1.0,US DOE Joint Genome Institute,,,376686.0,,20-30,Mesophilic,,,Yes,type strain of Flavobacterium johnsoniae,,latest,,,,,,,['soil'],Terrestrial,,,,,Terrestrial,Terrestrial,
1,GCA_000023285.1,,,,,,ASM2328v1,,Complete Genome,,PRJNA29403,SAMN00001911,,Generic sample from Capnocytophaga ochracea DS...,,,Rod,1.0,,,Capnocytophaga ochracea DSM 7271,2009-04-30T00:00:00Z,,,1.0,,DSM 7271,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000023285.1,CP001632,,GCA_000023285.1,Capnocytophaga ochracea DSM 7271,,Full,Complete,,,,,,,Host-associated,,,,,,,"Human, Homo sapiens",,,,,,,strain=DSM 7271,,,,,,,,,isolated from the human oral cavity,,,the human oral cavity,,,,,,,,,,,,Yes,"Joint Genome Institute, U.S. Department of Energy",521097.0,,,Capnocytophaga ochracea DSM 7271,Capnocytophaga ochracea DSM 7271,,,Facultative,identical,,,,,,21304645.0,2009-02-19T00:00:00.000,,,NC_013162,representative genome,2171.0,,assembly from type material,Major,,,SAMN00001911,FWCB,,2009/08/26,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,,,,complete,,,,1018.0,,No,"SRR013476,SRR013477",DSM 7271,,,Generic,Generic.1.0,US DOE Joint Genome Institute (JGI-PGF),,,521097.0,,35-37,Mesophilic,,,Yes,type strain of Capnocytophaga ochracea,,latest,,,,,"['""human', 'homo sapiens""']",Human-associated,,,,,,,Human-associated,Human-associated,


In [87]:
metadata_class["Conflict?"].value_counts()

Yes    7
Name: Conflict?, dtype: int64

In [88]:
metadata_class["Conflict?"] = ["Unclassified" if str(x) == "Yes" else None for x in metadata_class['Conflict?']]
metadata_class["Conflict?"].value_counts()

Unclassified    7
Name: Conflict?, dtype: int64

In [89]:
metadata_class["Origin"] = metadata_class["Conflict?"].combine_first(metadata_class["Origin"])
metadata_class["Origin"] = ["Non_marine" if  str(x) == "Terrestrial" or str(x) =="Freshwater" or str(x) =="Human-associated" else "Marine" if str(x)=="Marine" else "Unclassified" for x in metadata_class['Origin']]
metadata_class["Origin"].value_counts()

Unclassified    1594
Marine           660
Non_marine       432
Name: Origin, dtype: int64

In [90]:
metadata_class.drop(columns=["MAG_words","Human-associated_words","Human-associated","Terrestrial_words","Terrestrial","Marine_words","Marine","Freshwater_words","Freshwater","Conflict_words","Conflict?"], inplace=True)

In [91]:
metadata_class.head()

Unnamed: 0,Assembly accession,Alias,Altitude,Anonymized name,Arrayexpress species,Arrayexpress strain or line,ASM name,Assembly method,Assembly level,Biomaterial provider,Bioproject accession,Biosample accession,Biosample model,Biosample title,Biotic relationship,Body sample site,Cell shape,Chromosomes,Collected by,Collection date,Comment paragraph,Completion date,Contact email,Contact lab,Contigs,Country,Culture collection,Depth,Derived from,Description,Disease,Elevation,ENA checklist,ENA-first-public,ENA-last-update,Env biome,Env broad scale,Env feature,Env local scale,Env material,Env medium,Environment,Environmental sample,Estimated size,Excluded from refseq,FDA cvm ID,Finishing strategy (depth of coverage),First name,Ftp path,Gbrs paired ASM,Genbank accessions,Gene calling method,Genome ID,Genome name,Genome quality flags,Genome rep,Genome status,Genotype,Geo loc name,Geographic location,Geographic location (country and/or atlantic ocean),GOLD stamp ID,Gram stain,Habitat,Host,Host age,Host disease,Host disease outcome,Host gender,Host health,Host name,Host scientific name,Host sex,Host taxid,Host tissue sampled,Identification method,Identified by,Infraspecific name,INSDC center alias,INSDC center name,INSDC first public,INSDC last update,INSDC status,Investigation type,Isol growth condt,Isolate,Isolation comments,Isolation country,Isolation site,Isolation source,Last name,Latitude,Locus tag prefix,Longitude,Mapping method and version,Material,Metagenome source,Metagenomic,Misc param,Misc param: hmp supersite,Mlst,Motility,Name,Ncbi taxon ID,Note,Num replicons,Organism,Organism name,Other clinical,Other environmental,Oxygen requirement,Paired ASM comp,pH,Phenotypes,Plasmids,Project name,Project type,Publication,Publication date,Quality assessment method and version,Ref biomaterial,Refseq accessions,Refseq category,Refseq cds,Rel to oxygen,Relation to type material,Release type,Salinity,Samp size,Sample accession,Sample name,Sample type,Seq rel date,Sequencing centers,Sequencing depth,Sequencing method,Sequencing platform,Sequencing status,Serovar,Sop,Source mat ID,Species taxid,Specific host,Sporulation,SRA accession,Strain,Strain name alias,Subject ID,Submission model,Submission package,Submitter,Subsrc note,Supplier name,Taxid,Temperature,Temperature optimum,Temperature range,Title,Trophic level,Type strain,Type material,Value,Version status,WGS master,Marine_coordinates,MAG,Origin
0,GCA_000016645.1,,,,,,ASM1664v1,,Complete Genome,,PRJNA16082,SAMN02598357,,Sample from Flavobacterium johnsoniae UW101,,,Rod,1.0,,,Flavobacterium johnsoniae ATCC 17061. Flavobac...,2007-05-01T00:00:00Z,,,1.0,,ATCC 17061,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000016645.1,CP000685,,GCA_000016645.1,Flavobacterium johnsoniae UW101,,Full,Complete,,,United Kingdom: England,,,,Multiple,,,,,,,,,,,,,,strain=UW101; ATCC 17061,,,,,,,,,isolated from soil in England and is the type ...,United Kingdom,,soil,,,,,,,,,,,,Yes,NCBI,376686.0,,,Flavobacterium johnsoniae UW101,Flavobacterium johnsoniae UW101,,,Aerobic,identical,,,,,,,2014-01-28T00:00:00.000,,,NC_009441,representative genome,5017.0,,assembly from type material,Major,Non-halophilic,,SAMN02598357,CP000685,,2007/05/01,DOE Joint Genome Institute,,,,complete,,,,986.0,,No,,ATCC 17061,,,Generic,Generic.1.0,US DOE Joint Genome Institute,,,376686.0,,20-30,Mesophilic,,,Yes,type strain of Flavobacterium johnsoniae,,latest,,,,Non_marine
1,GCA_000023285.1,,,,,,ASM2328v1,,Complete Genome,,PRJNA29403,SAMN00001911,,Generic sample from Capnocytophaga ochracea DS...,,,Rod,1.0,,,Capnocytophaga ochracea DSM 7271,2009-04-30T00:00:00Z,,,1.0,,DSM 7271,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000023285.1,CP001632,,GCA_000023285.1,Capnocytophaga ochracea DSM 7271,,Full,Complete,,,,,,,Host-associated,,,,,,,"Human, Homo sapiens",,,,,,,strain=DSM 7271,,,,,,,,,isolated from the human oral cavity,,,the human oral cavity,,,,,,,,,,,,Yes,"Joint Genome Institute, U.S. Department of Energy",521097.0,,,Capnocytophaga ochracea DSM 7271,Capnocytophaga ochracea DSM 7271,,,Facultative,identical,,,,,,21304645.0,2009-02-19T00:00:00.000,,,NC_013162,representative genome,2171.0,,assembly from type material,Major,,,SAMN00001911,FWCB,,2009/08/26,US DOE Joint Genome Institute (JGI-PGF)|DOE Jo...,,,,complete,,,,1018.0,,No,"SRR013476,SRR013477",DSM 7271,,,Generic,Generic.1.0,US DOE Joint Genome Institute (JGI-PGF),,,521097.0,,35-37,Mesophilic,,,Yes,type strain of Capnocytophaga ochracea,,latest,,,,Non_marine
2,GCA_000023465.1,,,,,,ASM2346v1,,Complete Genome,,PRJNA38641,SAMN02603855,,Sample from Zunongwangia profunda SM-A87,,,Rod,1.0,,,Zunongwangia profunda SM-A87.This strain will ...,2010-04-21T00:00:00Z,,,1.0,,DSM:18752,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000023465.1,CP001650,,GCA_000023465.1,Zunongwangia profunda SM-A87,,Full,Complete,,,,,,,Aquatic,,,,,,,,,,,,,,strain=SMA-87,,,,,,,,,,,,,,,,,,,,,,,,No,NCBI,655815.0,type strain of Zunongwangia profunda,,Zunongwangia profunda SM-A87,Zunongwangia profunda SM-A87,,,Aerobic,identical,,,,,,20398413.0,2014-01-30T00:00:00.000,,,NC_014041,representative genome,4653.0,,assembly from type material,Major,,,SAMN02603855,CP001650,,2010/04/21,ShanDong University|Beijing Institute of Genom...,,,,complete,,,,398743.0,,No,,SM-A87,,,Generic,Generic.1.0,ShanDong University,,,655815.0,,,Mesophilic,,,,type strain of Zunongwangia profunda,,latest,,,,Unclassified
3,GCA_000023725.1,,,,,,ASM2372v1,,Complete Genome,,PRJNA38559,SAMN02604231,,Sample from Flavobacteriaceae bacterium 3519-10,,,,1.0,,,Flavobacteriaceae bacterium 3519-10.This strai...,2009-08-05T00:00:00Z,,,1.0,Antarctica: Vostok Station,,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000023725.1,CP001673,,GCA_000023725.1,Flavobacteriaceae bacterium 3519-10,,Full,Complete,,,Antarctica: Vostok Station,,,,,,,,,,,,,,,,,,strain=3519-10,,,,,,,,,,,,"glacial ice, 3519 m depth",,,,,,,,,,,,,NCBI,531844.0,,,Flavobacteriaceae bacterium 3519-10,Flavobacteriaceae bacterium 3519-10,,,,identical,,,,,,18622572.0,2014-01-30T15:13:19.717,,,NC_013062,,2534.0,,,Major,,,SAMN02604231,CP001673,,2009/08/05,University of Nevada Las Vegas,,,,complete,,,,531844.0,,,,3519,,,Generic,Generic.1.0,University of Nevada Las Vegas,,,531844.0,,,,,,,,,latest,,,,Non_marine
4,GCA_000024125.1,,,,,,ASM2412v1,,Complete Genome,,PRJNA13461,SAMN02603916,,Sample from Robiginitalea biformata HTCC2501,,,Rod,1.0,,,Robiginitalea biformata strain HTCC2501. Strai...,2006-02-24T00:00:00Z,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,GCF_000024125.1,CP001712,,GCA_000024125.1,Robiginitalea biformata HTCC2501,,Full,Complete,,,,,,,Aquatic,,,,,,,,,,,,,,strain=HTCC2501,,,,,,,,,,,,,,,,,,,,,,,,,NCBI,313596.0,,,Robiginitalea biformata HTCC2501,Robiginitalea biformata HTCC2501,,,Aerobic,identical,,,,,,19767438.0,2014-01-30T00:00:00.000,,,NC_013222,representative genome,3211.0,,assembly from type material,Major,Mesophilic,,SAMN02603916,CP001712,,2009/09/11,J. Craig Venter Institute,,,,complete,,,,252307.0,,,,HTCC2501,,,Generic,Generic.1.0,The Gordon and Betty Moore Foundation Marine M...,,,313596.0,,30,Mesophilic,,,Yes,type strain of Robiginitalea biformata,,latest,,,,Unclassified


In [92]:
metadata_class.to_csv("Metadata_final.csv", index=False)