In [99]:
import pandas as pd
import numpy as np

In [100]:
# Load the per-orthogroup gene-count table (tab-separated); read_table defaults to sep='\t'.
df = pd.read_table('Orthogroups.GeneCount.tsv')

In [170]:
# Column names of the GRC genomes and of the core genomes in the gene-count table.
grcs = ['bcop_grc', 'bimp_grc', 'ling_grc']
cores = ['aaphi','bcop_core','bimp_core','contarinia','dmel','ling_core','orobi']#, 'phyg']

# An orthogroup is flagged single-copy (SCO) for a genome set when every listed genome has exactly one gene.
df['core_SCO'] = df[cores].eq(1).all(axis=1)
df['all_SCO'] = df[cores + grcs].eq(1).all(axis=1)

# Number of GRC genomes holding at least one gene of the orthogroup.
df['grc_genome_presence_count'] = df[grcs].gt(0).sum(axis=1)

In [174]:
# Report how the core-genome single-copy orthogroups (SCOs) are represented in the GRC genomes.
# Reads `df`, `grcs` and the 'core_SCO', 'all_SCO' and 'grc_genome_presence_count' columns
# that the preceding cells create.
core_sco = df[df['core_SCO']]  # orthogroups flagged single-copy in every core genome

print(f"There are {df.shape[0]} orthogroups")
print(f"There are {df['core_SCO'].sum()} core genome SCOs")
print(f"There are {df['all_SCO'].sum()} SCOs across all genomes")

print("\n")

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
# Each line reports the core SCOs whose presence count equals `n_genomes` GRC genomes.
for n_genomes, n_orthogroups in core_sco['grc_genome_presence_count'].value_counts().sort_index().items():
    print(f"There are {n_orthogroups} core genome SCOs with at least 1 ortholog in {n_genomes} GRC genome(s)")

print("\n")

# Pairwise GRC comparisons, in the order they are reported.
grc_pairs = [(grcs[0], grcs[1]), (grcs[1], grcs[2]), (grcs[0], grcs[2])]

# .all(axis=1).sum() counts the rows where both genomes satisfy the condition. Unlike
# .sum(axis=1).value_counts()[2] it gives 0 instead of raising KeyError when no row qualifies.
for first_grc, second_grc in grc_pairs:
    n_present_in_both = (core_sco[[first_grc, second_grc]] > 0).all(axis=1).sum()
    print(f"There are {n_present_in_both} core genome SCOs"
          f" with at least one ortholog in both '{first_grc}' and '{second_grc}'")

print("\n")

for grc in grcs:
    n_single_copy = (core_sco[grc] == 1).sum()
    print(f"There are {n_single_copy} core genome SCOs"
          f" that have 1 copy in '{grc}'")

print("\n")

for first_grc, second_grc in grc_pairs:
    n_single_copy_in_both = (core_sco[[first_grc, second_grc]] == 1).all(axis=1).sum()
    print(f"There are {n_single_copy_in_both} core genome SCOs"
          f" that are also SCOs in both '{first_grc}' and '{second_grc}'")

There are 18984 orthogroups
There are 3888 core genome SCOs
There are 122 SCOs across all genomes


There are 767 core genome SCOs with at least 1 ortholog in 0 GRC genome(s)
There are 1549 core genome SCOs with at least 1 ortholog in 1 GRC genome(s)
There are 1268 core genome SCOs with at least 1 ortholog in 2 GRC genome(s)
There are 304 core genome SCOs with at least 1 ortholog in 3 GRC genome(s)


There are 710 core genome SCOs with at least one ortholog in both 'bcop_grc' and 'bimp_grc'
There are 399 core genome SCOs with at least one ortholog in both 'bimp_grc' and 'ling_grc'
There are 1071 core genome SCOs with at least one ortholog in both 'bcop_grc' and 'ling_grc'


There are 1477 core genome SCOs that have 1 copy in 'bcop_grc'
There are 854 core genome SCOs that have 1 copy in 'bimp_grc'
There are 1166 core genome SCOs that have 1 copy in 'ling_grc'


There are 341 core genome SCOs that are also SCOs in both 'bcop_grc' and 'bimp_grc'
There are 276 core genome SCOs that are als

In [172]:
# Leave-one-out check: how many orthogroups are single-copy in all core genomes but one?
for drop_genome in cores:
    remaining_cores = [genome for genome in cores if genome != drop_genome]
    # True where every remaining core genome has exactly one gene in the orthogroup.
    count = df[remaining_cores].eq(1).all(axis=1)
    print(f"There are {count.sum()} core genome SCOs excluding {drop_genome}")

There are 4085 core genome SCOs excluding aaphi
There are 4168 core genome SCOs excluding bcop_core
There are 4111 core genome SCOs excluding bimp_core
There are 4128 core genome SCOs excluding contarinia
There are 4514 core genome SCOs excluding dmel
There are 4095 core genome SCOs excluding ling_core
There are 4294 core genome SCOs excluding orobi


In [173]:
# Leave-one-out check that additionally leaves out 'phyg'.
# NOTE(review): if 'phyg' is not in `cores`, the extra exclusion has no effect and these
# counts equal the plain leave-one-out counts.
for drop_genome in cores:
    remaining_cores = [genome for genome in cores if genome != drop_genome and genome != 'phyg']
    # True where every remaining core genome has exactly one gene in the orthogroup.
    count = df[remaining_cores].eq(1).all(axis=1)
    print(f"There are {count.sum()} core genome SCOs excluding {drop_genome} and 'phyg'")

There are 4085 core genome SCOs excluding aaphi and 'phyg'
There are 4168 core genome SCOs excluding bcop_core and 'phyg'
There are 4111 core genome SCOs excluding bimp_core and 'phyg'
There are 4128 core genome SCOs excluding contarinia and 'phyg'
There are 4514 core genome SCOs excluding dmel and 'phyg'
There are 4095 core genome SCOs excluding ling_core and 'phyg'
There are 4294 core genome SCOs excluding orobi and 'phyg'


In [166]:
# I think including Pseudolycoriella hygida (phyg) is not helping: excluding it takes us up to 3888, and it's internal,
# so it's probably just a poor genome.
# Excluding dmel takes us up to ~4500, which makes sense because it's the deepest split and it's the outgroup.
# Have a quick look at analyse_alignments_v2.R, since it's run before this step.
# This lines up with Fede saying:
# Regarding completeness, we can see much lower quality gene predictions for GRCs and Pseudolycoriella.
# But we are interested in GRCs, are we interested in Pseudolycoriella?

1246

In [176]:
# Display the orthogroup identifier column.
df.loc[:, 'Orthogroup']

0        OG0000000
1        OG0000001
2        OG0000002
3        OG0000003
4        OG0000004
           ...    
18979    OG0018979
18980    OG0018980
18981    OG0018981
18982    OG0018982
18983    OG0018983
Name: Orthogroup, Length: 18984, dtype: object