In [1]:
import pandas as pd
import os

In [2]:
file = 'NRPS_clustering_c0.30.tsv'

In [3]:
df = pd.read_csv(file, sep='\t', header=0)

In [4]:
df

Unnamed: 0,#BGC Name,Family Number
0,BGC0000302,282
1,BGC0000308,288
2,Bacterial_SRR17413116_c00608_SRR1741...region001,288
3,BGC0001028,1887
4,BGC0002061,1887
...,...,...
13354,Fungal_SRR9610076_c00080_SRR9610...region001,55229
13355,Fungal_SRR9610076_c00315_SRR9610...region001,55235
13356,Fungal_SRR9610076_c00600_SRR9610...region001,55240
13357,Fungal_SRR9610076_c01032_SRR9610...region001,55244


In [5]:
base_name = os.path.splitext(os.path.basename(file))[0]

In [6]:
type = base_name.split("_")[0]

In [7]:
type

'NRPS'

In [8]:
df['Type'] = type

In [9]:
df

Unnamed: 0,#BGC Name,Family Number,Type
0,BGC0000302,282,NRPS
1,BGC0000308,288,NRPS
2,Bacterial_SRR17413116_c00608_SRR1741...region001,288,NRPS
3,BGC0001028,1887,NRPS
4,BGC0002061,1887,NRPS
...,...,...,...
13354,Fungal_SRR9610076_c00080_SRR9610...region001,55229,NRPS
13355,Fungal_SRR9610076_c00315_SRR9610...region001,55235,NRPS
13356,Fungal_SRR9610076_c00600_SRR9610...region001,55240,NRPS
13357,Fungal_SRR9610076_c01032_SRR9610...region001,55244,NRPS


In [10]:
df.rename(columns={'#BGC Name': 'BGC_name', 'Family Number':'raw_GCF'}, inplace=True)

In [11]:
df['Origin'] = df.BGC_name.str.split('_', expand=True)[0]

In [12]:
df.loc[df['Origin'].str.startswith('BGC'), 'Origin'] = 'Reference'

In [13]:
df['Sample'] = df.BGC_name.str.split('_', expand=True)[1]

In [14]:
df.Sample.fillna('Reference', inplace = True)

In [15]:
df

Unnamed: 0,BGC_name,raw_GCF,Type,Origin,Sample
0,BGC0000302,282,NRPS,Reference,Reference
1,BGC0000308,288,NRPS,Reference,Reference
2,Bacterial_SRR17413116_c00608_SRR1741...region001,288,NRPS,Bacterial,SRR17413116
3,BGC0001028,1887,NRPS,Reference,Reference
4,BGC0002061,1887,NRPS,Reference,Reference
...,...,...,...,...,...
13354,Fungal_SRR9610076_c00080_SRR9610...region001,55229,NRPS,Fungal,SRR9610076
13355,Fungal_SRR9610076_c00315_SRR9610...region001,55235,NRPS,Fungal,SRR9610076
13356,Fungal_SRR9610076_c00600_SRR9610...region001,55240,NRPS,Fungal,SRR9610076
13357,Fungal_SRR9610076_c01032_SRR9610...region001,55244,NRPS,Fungal,SRR9610076


In [16]:
df = df.astype({'raw_GCF':'str'})

In [17]:
df['GCF'] = df['Type'] + "_"+df['raw_GCF']

In [18]:
df

Unnamed: 0,BGC_name,raw_GCF,Type,Origin,Sample,GCF
0,BGC0000302,282,NRPS,Reference,Reference,NRPS_282
1,BGC0000308,288,NRPS,Reference,Reference,NRPS_288
2,Bacterial_SRR17413116_c00608_SRR1741...region001,288,NRPS,Bacterial,SRR17413116,NRPS_288
3,BGC0001028,1887,NRPS,Reference,Reference,NRPS_1887
4,BGC0002061,1887,NRPS,Reference,Reference,NRPS_1887
...,...,...,...,...,...,...
13354,Fungal_SRR9610076_c00080_SRR9610...region001,55229,NRPS,Fungal,SRR9610076,NRPS_55229
13355,Fungal_SRR9610076_c00315_SRR9610...region001,55235,NRPS,Fungal,SRR9610076,NRPS_55235
13356,Fungal_SRR9610076_c00600_SRR9610...region001,55240,NRPS,Fungal,SRR9610076,NRPS_55240
13357,Fungal_SRR9610076_c01032_SRR9610...region001,55244,NRPS,Fungal,SRR9610076,NRPS_55244


In [19]:
import pandas as pd
import os

final_df = pd.DataFrame(columns=['BGC_name','raw_GCF','Type','Origin','Sample','GCF'])

files = [file for file in os.listdir("./") if file.endswith("_clustering_c0.30.tsv")]

for file in files:
    df = pd.read_csv(file, sep='\t', header=0)
    base_name = os.path.splitext(os.path.basename(file))[0]
    type = base_name.split("_")[0]
    df['Type'] = type
    df.rename(columns={'#BGC Name': 'BGC_name', 'Family Number':'raw_GCF'}, inplace=True)
    df['Origin'] = df.BGC_name.str.split('_', expand=True)[0]
    df.loc[df['Origin'].str.startswith('BGC'), 'Origin'] = 'Reference'
    df.loc[df['Origin'].str.startswith('Umbi'), 'Origin'] = 'Reference'
    df['Sample'] = df.BGC_name.str.split('_', expand=True)[1]
    df.Sample.fillna('Reference', inplace = True)
    df.loc[df['Sample'].str.startswith('Umbi'), 'Sample'] = 'Reference'
    df = df.astype({'raw_GCF':'str'})
    df['GCF'] = df['Type'] + "_"+df['raw_GCF']
    final_df = pd.concat([final_df, df], ignore_index=True, axis=0)

In [20]:
final_df

Unnamed: 0,BGC_name,raw_GCF,Type,Origin,Sample,GCF
0,BGC0000194,181,PKSother,Reference,Reference,PKSother_181
1,Bacterial_SRR14722161_c00718_SRR1472...region001,181,PKSother,Bacterial,SRR14722161,PKSother_181
2,BGC0000201,216,PKSother,Reference,Reference,PKSother_216
3,BGC0000202,216,PKSother,Reference,Reference,PKSother_216
4,BGC0000203,216,PKSother,Reference,Reference,PKSother_216
...,...,...,...,...,...,...
58794,Fungal_SRR9610076_c00080_SRR9610...region001,55229,NRPS,Fungal,SRR9610076,NRPS_55229
58795,Fungal_SRR9610076_c00315_SRR9610...region001,55235,NRPS,Fungal,SRR9610076,NRPS_55235
58796,Fungal_SRR9610076_c00600_SRR9610...region001,55240,NRPS,Fungal,SRR9610076,NRPS_55240
58797,Fungal_SRR9610076_c01032_SRR9610...region001,55244,NRPS,Fungal,SRR9610076,NRPS_55244


In [21]:
summary_df = final_df.pivot_table(index='GCF', columns='Origin', aggfunc='size', fill_value=0)

In [22]:
summary_df

Origin,Bacterial,Fungal,Reference
GCF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NRPS_10003,1,0,0
NRPS_10005,1,0,0
NRPS_10006,1,0,0
NRPS_10007,1,0,0
NRPS_10009,1,0,0
...,...,...,...
Terpene_9968,1,0,0
Terpene_9970,1,0,0
Terpene_9972,1,0,0
Terpene_9973,1,0,0


In [23]:
summary_df['Total'] = summary_df[['Bacterial', 'Fungal', 'Reference']].sum(axis=1)

In [31]:
summary_df = summary_df.sort_values(by=['Reference'], ascending=False)

In [32]:
summary_df

Origin,Bacterial,Fungal,Reference,Total
GCF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PKSother_216,0,0,13,13
PKSI_55263,0,2,8,10
PKSother_1279,0,0,7,7
PKS-NRP_958,2,0,6,8
PKSother_42882,0,55,6,61
...,...,...,...,...
Terpene_29543,1,0,0,1
Terpene_29288,1,0,0,1
Terpene_29299,1,0,0,1
Terpene_29331,1,0,0,1


In [26]:
GCF_by_sample = final_df.pivot_table(index='GCF', columns='Sample', aggfunc='size', fill_value=0)

In [27]:
GCF_by_sample

Sample,DRR077248,DRR200332,ERR3588817,ERR3588818,ERR3588819,ERR3588820,ERR3588821,ERR3588822,ERR4179389,ERR4179391,...,SRR7346467,SRR7346468,SRR7346472,SRR8446868,SRR9278453,SRR9610072,SRR9610073,SRR9610074,SRR9610076,SRR9610078
GCF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NRPS_10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NRPS_10005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NRPS_10006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NRPS_10007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NRPS_10009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Terpene_9968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Terpene_9970,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Terpene_9972,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Terpene_9973,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
GCF_by_sample['Total'] = GCF_by_sample.sum(axis=1)

In [29]:
GCF_by_sample.sort_values(by=['Total'], ascending=False)

Sample,DRR077248,DRR200332,ERR3588817,ERR3588818,ERR3588819,ERR3588820,ERR3588821,ERR3588822,ERR4179389,ERR4179391,...,SRR7346468,SRR7346472,SRR8446868,SRR9278453,SRR9610072,SRR9610073,SRR9610074,SRR9610076,SRR9610078,Total
GCF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Terpene_32736,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,133
Terpene_36860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,114
PKSI_35802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,84
Terpene_34580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,73
Terpene_3485,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Others_54107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Others_54089,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Others_54085,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Others_5405,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [30]:
summary_df.to_csv('GCF_summary.txt', sep='\t', index=True)
GCF_by_sample.to_csv('GCF_by_sample.txt', sep='\t', index=True)
final_df.to_csv('BGC_per_GCF.txt', sep='\t', index=True)