In [205]:
import pandas as pd
import networkx as nx

# Load the RefSeq assembly summary. The file is read as tab-separated, and
# header=1 takes the column names from row index 1 (0-based) of the file.
refseq = pd.read_csv('assembly_summary_refseq.csv', header=1, sep='\t', low_memory=False)

#--------------------Filters Eukaryotes----------------------
# Assembly level: keep rows whose assembly_level is one of `values`.
# (A 'Chromosome'-only filter used to be assigned to the same name just before
# this one and was overwritten immediately, so that dead assignment is gone.)
values = ['Chromosome','Complete Genome']
refseq_assembly_level = refseq[refseq['assembly_level'].isin(values)]

# Annotation provider: keep rows annotated by 'NCBI RefSeq'.
refseq_assembly_level_NCBI = refseq_assembly_level[
    refseq_assembly_level['annotation_provider'] == 'NCBI RefSeq'
]

# Group: keep rows whose group is one of `tax`.
tax = ['invertebrate','vertebrate_mammalian','vertebrate_other','plant']
refseq_assembly_level_NCBI_group = refseq_assembly_level_NCBI[
    refseq_assembly_level_NCBI['group'].isin(tax)
]



# Exploratory look-ups
#
# Every statement in this section is a bare expression: the view is built and
# then discarded, so nothing is displayed unless a line is run on its own or
# is the last expression of a notebook cell.

# Boolean column masks, computed once against refseq's columns. The filtered
# frames are row subsets of refseq, so the same masks select the same columns.
cols_level_name = refseq.columns.isin(['assembly_level','organism_name'])
cols_level_name_group = refseq.columns.isin(['assembly_level','organism_name','group'])
cols_paired_name_group = refseq.columns.isin(['gbrs_paired_asm','organism_name','group'])

# Columns of RefSeq and a few row/column slices of the raw and filtered tables
refseq.columns
refseq.iloc[:, cols_level_name]
refseq.head(10)
refseq_assembly_level.head(10)
refseq.iloc[:10, cols_level_name]
refseq_assembly_level.iloc[:10, cols_level_name]
refseq_assembly_level_NCBI_group.iloc[180:200, cols_level_name]
refseq_assembly_level_NCBI_group.iloc[180:200]
refseq.iloc[:20, cols_paired_name_group]

# To list the column names instead:
#list(refseq_assembly_level_NCBI_group.columns)

# Taxonomic groups present in the 'group' column, before and after filtering
refseq['group'].unique().tolist()
refseq_assembly_level_NCBI['group'].unique().tolist()
#Result: ['invertebrate','vertebrate_mammalian','plant','vertebrate_other','protozoa','fungi','bacteria','archaea','viral']

# Final list: first 20 rows, before and after the group filter
refseq_assembly_level_NCBI.iloc[:20, cols_level_name_group]
refseq_assembly_level_NCBI_group.iloc[:20, cols_level_name_group]

# Write the fully filtered table to disk (space-separated, index included).
eukaryotes_table_path = 'assembly_summary_refseq_Eukaryotes.csv'
refseq_assembly_level_NCBI_group.to_csv(eukaryotes_table_path, sep=" ")

# Species names of the filtered table, written twice:
# once with the row index (.csv) and once without it (.txt).
names = refseq_assembly_level_NCBI_group.loc[:, 'organism_name']
names.to_csv('organism_name.csv', sep=" ")
names.to_csv('organism_name.txt', index=False, sep=" ")




import copy
from io import StringIO

from Bio import Phylo
from Bio.Phylo.Applications import PhymlCommandline
from Bio.Phylo.PAML import codeml
from Bio.Phylo.PhyloXML import Phylogeny

# Parse the Newick tree and echo its structure.
tree = Phylo.read("phyliptree.phy", "newick")
print(tree)


def _leaf_names(clade):
    """Return the names of every terminal (leaf) node below ``clade``."""
    return [leaf.name for leaf in clade.get_terminals()]


# Ancestor clades are addressed by child position, so these indices are tied
# to the layout of this particular tree file. The intermediate names follow
# the printed tree (first child of the root is 'Metazoa', its sixth child is
# 'Chordata') -- re-check them if phyliptree.phy is regenerated.
metazoa = tree.clade[0]
chordata = metazoa[5]

common_ancestor_mammals = chordata[1]
common_ancestor_plants = tree.clade[1]
common_ancestor_arthropods = metazoa[7]
common_ancestor_aves = chordata[2]
common_ancestor_fish = chordata[7]

# Species (leaf) names under each ancestor clade.
term_names_mammals = _leaf_names(common_ancestor_mammals)
term_names_plants = _leaf_names(common_ancestor_plants)
term_names_arthropods = _leaf_names(common_ancestor_arthropods)
term_names_aves = _leaf_names(common_ancestor_aves)
term_names_fish = _leaf_names(common_ancestor_fish)




# Wrap each list of species names in a one-row frame (one name per column).
term_names_mammals_df = pd.DataFrame([term_names_mammals])
term_names_plants_df = pd.DataFrame([term_names_plants])
term_names_arthropods_df = pd.DataFrame([term_names_arthropods])
term_names_aves_df = pd.DataFrame([term_names_aves])
term_names_fish_df = pd.DataFrame([term_names_fish])

# Write every frame as a single tab-separated line, without index or header.
for names_frame, out_path in (
    (term_names_mammals_df, 'mammals.txt'),
    (term_names_plants_df, 'plants.txt'),
    (term_names_arthropods_df, 'arthropods.txt'),
    (term_names_aves_df, 'aves.txt'),
    (term_names_fish_df, 'fish.txt'),
):
    names_frame.to_csv(out_path, sep='\t', index=False, header=None)





#1. RefSeq Group & Remove duplicated species & Add a Taxonomy Group
def _select_taxon(assemblies, species_names, taxonomy_label):
    """Return one row per listed species, tagged with a ``Taxonomy`` label.

    Parameters
    ----------
    assemblies : pandas.DataFrame
        Table containing an ``organism_name`` column.
    species_names : list-like
        Organism names to keep (exact match against ``organism_name``).
    taxonomy_label : str
        Value written to the new ``Taxonomy`` column of every kept row.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``assemblies`` itself is not modified.
    """
    selected = assemblies[assemblies['organism_name'].isin(species_names)]
    # keep='last': when a species has several rows, the last one in table
    # order is the one that is retained.
    selected = selected.drop_duplicates(subset=['organism_name'], keep='last')
    # A scalar is broadcast to every row. The previous version built a helper
    # list in a variable called `list`, which shadowed the builtin for the
    # rest of the kernel session.
    return selected.assign(Taxonomy=taxonomy_label)


refseq_mammals = _select_taxon(refseq_assembly_level_NCBI, term_names_mammals, 'Mammals')
refseq_plants = _select_taxon(refseq_assembly_level_NCBI, term_names_plants, 'Plants')
refseq_arthropods = _select_taxon(refseq_assembly_level_NCBI, term_names_arthropods, 'Arthropods')
refseq_aves = _select_taxon(refseq_assembly_level_NCBI, term_names_aves, 'Birds')
refseq_fish = _select_taxon(refseq_assembly_level_NCBI, term_names_fish, 'Fish')

#Merge all eukaryotes in a file
refseq_eukaryotes = pd.concat(
    [refseq_mammals, refseq_aves, refseq_arthropods, refseq_fish, refseq_plants],
    axis=0,
)

#Save in a csv file (tab-separated, index included)
refseq_eukaryotes.to_csv('Refseq_Eukaryotes.csv', sep='\t')






#----------------Other Taxonomical Groups
# Group level: one frame per value of the 'group' column.
refseq_fungi = refseq[refseq['group'] == 'fungi']
refseq_protozoa = refseq[refseq['group'] == 'protozoa']
refseq_bacteria = refseq[refseq['group'] == 'bacteria']
refseq_archaea = refseq[refseq['group'] == 'archaea']


# Assembly level: keep rows whose assembly_level is one of `values`.
values = ['Chromosome','Complete Genome']
refseq_fungi_assembly = refseq_fungi[refseq_fungi['assembly_level'].isin(values)]
refseq_protozoa_assembly = refseq_protozoa[refseq_protozoa['assembly_level'].isin(values)]
refseq_bacteria_assembly = refseq_bacteria[refseq_bacteria['assembly_level'].isin(values)]
refseq_archaea_assembly = refseq_archaea[refseq_archaea['assembly_level'].isin(values)]

# Annotation provider.
# NOTE: the *_assembly_annotation frames are computed here but are not the
# ones written to disk below; the exported tables use the assembly-level
# filter only.
refseq_fungi_assembly_annotation = refseq_fungi_assembly  # no annotation filter applied to fungi

refseq_protozoa_assembly_annotation = refseq_protozoa_assembly[refseq_protozoa_assembly['annotation_provider'] == 'NCBI RefSeq']
# Inspection only: bare expression, result is discarded.
refseq_protozoa_assembly.iloc[:50, refseq.columns.isin(['annotation_name','annotation_provider'])]

# Name of the annotation pipeline required for bacteria and archaea.
pgap_annotation_name = 'NCBI Prokaryotic Genome Annotation Pipeline (PGAP)'

refseq_bacteria_assembly_annotation = refseq_bacteria_assembly[refseq_bacteria_assembly['annotation_provider'] == 'NCBI RefSeq']
refseq_bacteria_assembly_annotation = refseq_bacteria_assembly_annotation[refseq_bacteria_assembly_annotation['annotation_name'] == pgap_annotation_name]

refseq_archaea_assembly_annotation = refseq_archaea_assembly[refseq_archaea_assembly['annotation_provider'] == 'NCBI RefSeq']
refseq_archaea_assembly_annotation = refseq_archaea_assembly_annotation[refseq_archaea_assembly_annotation['annotation_name'] == pgap_annotation_name]


# Only the assembly-level filter is kept for the exported tables. Bacteria and
# archaea are capped at their first MAX_PROKARYOTE_ASSEMBLIES rows; fungi and
# protozoa are kept whole.
# NOTE(review): the previous comment said 200 elements, but the slice takes
# 400 -- TODO confirm the intended cap.
MAX_PROKARYOTE_ASSEMBLIES = 400
refseq_bacteria_assembly_f = refseq_bacteria_assembly.iloc[:MAX_PROKARYOTE_ASSEMBLIES]
refseq_archaea_assembly_f = refseq_archaea_assembly.iloc[:MAX_PROKARYOTE_ASSEMBLIES]
refseq_protozoa_assembly_f = refseq_protozoa_assembly
refseq_fungi_assembly_f = refseq_fungi_assembly


# Assign new taxonomical groups. A scalar is broadcast to every row, which
# avoids building a helper list in a variable named `list` (that shadowed the
# builtin for the rest of the kernel session).
refseq_fungi_assembly_f = refseq_fungi_assembly_f.assign(Taxonomy='Fungi')
refseq_protozoa_assembly_f = refseq_protozoa_assembly_f.assign(Taxonomy='Unicellular Eukaryotes')
refseq_bacteria_assembly_f = refseq_bacteria_assembly_f.assign(Taxonomy='Bacteria')
refseq_archaea_assembly_f = refseq_archaea_assembly_f.assign(Taxonomy='Archaea')


# Write each group to its own tab-separated file (index included).
refseq_fungi_assembly_f.to_csv('Refseq_Fungi.csv', sep='\t')
refseq_protozoa_assembly_f.to_csv('Refseq_UnicellularEukaryotes.csv', sep='\t')
refseq_bacteria_assembly_f.to_csv('Refseq_Bacteria.csv', sep='\t')
refseq_archaea_assembly_f.to_csv('Refseq_Archaea.csv', sep='\t')



Tree(rooted=False, weight=1.0)
    Clade(branch_length=4.0, name='Eukaryota')
        Clade(branch_length=4.0, name='Metazoa')
            Clade(branch_length=4.0, name='Gordionus sp. m RMFG-2023')
            Clade(branch_length=4.0, name='Bolinopsis microptera')
            Clade(branch_length=4.0, name='Watersipora subatra')
            Clade(branch_length=4.0, name='Lineus longissimus')
            Clade(branch_length=4.0, name='Symsagittifera roscoffensis')
            Clade(branch_length=4.0, name='Chordata')
                Clade(branch_length=4.0, name='Polypteriformes')
                    Clade(branch_length=4.0, name='Polypterus senegalus')
                    Clade(branch_length=4.0, name='Erpetoichthys calabaricus')
                Clade(branch_length=4.0, name='Mammalia')
                    Clade(branch_length=4.0, name='Manis pentadactyla')
                    Clade(branch_length=4.0, name='Cynocephalus volans')
                    Clade(branch_length=4.0, name='Dromici

In [206]:
# "prueba" = test: write the first two rows of the bacterial assembly table
# to a tab-separated file (index included).
prueba = refseq_bacteria_assembly.head(2)
prueba.to_csv('prueba.csv', sep='\t')
