In [3]:
#import libraries
import pandas as pd
import glob
import os

#table reader function (with filtering: one best hit per Busco id)
def tsv_reader_filter(path):
    """Read one tab-separated table and keep a single best row per Busco id.

    The table must provide 'Busco id', 'Status' and 'Sequence' columns
    (plus 'Score' when coverage cannot be used for ranking).

    Steps:
      1. Rows whose Status is "Missing" or "Fragmented" are dropped.
      2. For every 'Busco id' only one row is kept:
         * if every remaining 'Sequence' value has the form
           '<node>_cov_<number>', the row with the highest coverage wins;
         * otherwise the row with the highest 'Score' wins.
         Ties are resolved in favour of the first row in file order.
      3. A 'Sample' column holding the file name (without directory and
         extension) is appended.

    Parameters
    ----------
    path : str
        Path to the tab-separated table.

    Returns
    -------
    pandas.DataFrame
        The original columns plus 'Sample', ordered by 'Busco id', with a
        fresh 0..n-1 index. No helper columns are left behind.

    Side effects
    ------------
    Prints the sample name, as a progress indicator.
    """
    sample = pd.read_csv(path, sep="\t")
    samplename = os.path.splitext(os.path.basename(path))[0]  # file name without extension
    print(samplename)

    # Remove missing and fragmented hits; .copy() detaches the result from the
    # frame returned by read_csv so later assignments never act on a slice.
    unwanted = sample["Status"].isin(["Missing", "Fragmented"])
    sample = sample[~unwanted].copy()

    if not sample.empty:
        ranking = None

        # Coverage is trusted only when *every* row splits into exactly
        # '<node>' and '<numeric coverage>'; anything else falls back to Score.
        # The pieces are kept in a separate frame, so a failed parse cannot
        # leave stray 'Node'/'Cov' columns in the output.
        parts = sample["Sequence"].astype(str).str.split("_cov_", expand=True)
        if parts.shape[1] == 2:
            coverage = pd.to_numeric(parts[1], errors="coerce")
            if coverage.notna().all():
                ranking = coverage

        if ranking is None:
            ranking = sample["Score"]

        # Row label of the best-ranked hit for each Busco id (sorted by id).
        best_rows = ranking.groupby(sample["Busco id"]).idxmax()
        sample = sample.loc[best_rows.to_numpy()]

    sample = sample.reset_index(drop=True)
    sample["Sample"] = samplename  # tag every row with its sample of origin
    return sample

#table reader function (without filtering: duplicated and fragmented hits are kept)
def tsv_reader(path):
    """Read one tab-separated table without de-duplicating its rows.

    Parameters
    ----------
    path : str
        Path to a tab-separated table that contains a 'Status' column.

    Returns
    -------
    pandas.DataFrame
        Every row whose Status is not "Missing" ("Fragmented" rows are
        deliberately kept), with an added 'Sample' column holding the file
        name without directory and extension.
    """
    sample = pd.read_csv(path, sep="\t")
    # Take the sample name from the file name itself rather than from a
    # fixed-width slice of the path, so names of any length are handled and
    # the result matches what tsv_reader_filter produces for the same file.
    sample['Sample'] = os.path.splitext(os.path.basename(path))[0]
    sample = sample[sample.Status != "Missing"]  # remove missing hits only
    return sample


In [None]:
#import with filtration
#example directory: /content/drive/MyDrive/full_tables/selected/
sample_dir = input("Provide the path to the sample directory: ") #named sample_dir so the builtin dir() is not shadowed
allpaths = os.path.join(sample_dir, "*.txt") #glob pattern matching every table in the directory
allfiles = sorted(glob.glob(allpaths)) #glob order depends on the file system; sort so concat_df is reproducible
if not allfiles: #fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No .txt tables found in {sample_dir!r}")
concat_df = pd.concat(tsv_reader_filter(file) for file in allfiles) #read files, filter, add samplenames column and concatenate
concat_df


In [None]:
#import without(!!!) filtration
sample_dir = input("Provide the path to the sample directory: ") #named sample_dir so the builtin dir() is not shadowed
allpaths = os.path.join(sample_dir, "*.txt") #glob pattern matching every table in the directory
allfiles = sorted(glob.glob(allpaths)) #glob order depends on the file system; sort so concat_df is reproducible
if not allfiles: #fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No .txt tables found in {sample_dir!r}")
concat_df = pd.concat(tsv_reader(file) for file in allfiles) #read files, drop missing hits, add samplenames column and concatenate
concat_df


In [None]:
# processing of the concatenated table
concat_df = concat_df.rename(columns={'Busco id': 'Busco_id'}) #rename Busco id column (rebinding, no in-place mutation)
n_samples = concat_df['Sample'].nunique() #number of samples actually loaded (replaces a hard-coded count)
#keep only IDs whose number of rows equals the number of samples,
#i.e. IDs found once in every sample when hits have been de-duplicated
filtered_buscoids = concat_df.groupby('Busco_id').filter(lambda x: len(x) == n_samples)
unique_buscoids = filtered_buscoids.Busco_id.unique().tolist() #list of the retained unique IDs

with open("/content/unique_buscoids.list", "w") as txt_file: #save unique IDs, one per line under a header
    txt_file.write("Busco_id" + "\n")
    txt_file.writelines(f"{busco_id}\n" for busco_id in unique_buscoids)

filtered_buscoids.to_csv('/content/filtered_buscoids.tsv', sep='\t', index=False) #save output


oldgenes = pd.read_csv("/content/drive/MyDrive/full_tables/90genes.list") #import 90genes list
newgenes = pd.read_csv("/content/unique_buscoids.list") #re-import the filtered busco IDs list written above
compare = pd.merge(oldgenes, newgenes) #rows shared by both lists (merged on their common columns)
filtered_buscoids_90genes = pd.merge(concat_df, oldgenes) #keep only rows whose Busco ID is in the 90genes list
filtered_buscoids_90genes.to_csv('/content/filtered_buscoids_90genes.tsv', sep='\t', index=False) #save output


In [None]:
#number of distinct Busco IDs from the 90genes list found in each sample
concat_df_filtered = concat_df.loc[
    concat_df['Busco_id'].isin(oldgenes['Busco_id'])
]
gene_counts = (
    concat_df_filtered
    .groupby('Sample')['Busco_id']
    .nunique()
)
gene_counts.to_csv('/content/genes90persample.tsv', sep='\t')

In [None]:
#per-sample counts of each Status for orthologs from the 90genes list (duplications included)
counttab_wdups = pd.crosstab(
    index=filtered_buscoids_90genes["Sample"],
    columns=filtered_buscoids_90genes["Status"],
).transpose() #rows: Status, columns: Sample
counttab_wdups.to_csv('/content/counttab_wdups.tsv', sep='\t')

In [None]:
#save per-sample counts of orthologs from the 90genes list without duplications
filtered_buscoids_90genes_nodups = pd.merge(filtered_buscoids, compare) #rows of filtered_buscoids that also match `compare`
#count rows per Status (rows) and Sample (columns); computed once and kept
counttab_nodups = pd.crosstab(filtered_buscoids_90genes_nodups["Sample"], filtered_buscoids_90genes_nodups["Status"]).T
counttab_nodups.to_csv('/content/counttab_nodups.tsv', sep='\t')


In [None]:
#show how many distinct Busco IDs remain in filtered_buscoids
unique_buscoids_90genes_nodups = filtered_buscoids['Busco_id'].unique().tolist()
len(unique_buscoids_90genes_nodups)


In [None]:
#display the orthologs common to all samples (the Busco_id table re-read from unique_buscoids.list)
newgenes


In [None]:
#orthologs from the 90genes list that were also found in all samples
compare = oldgenes.merge(newgenes) #inner join on the columns the two tables share
compare.to_csv('/content/compare.tsv', index=False, sep='\t')
compare
