In [1]:
import os
from os.path import join
import pickle
import numpy as np
import pandas as pd
from Bio import SeqIO
from collections import Counter
from scipy.stats import binom_test
from statsmodels.stats.multitest import fdrcorrection
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
"""
Imports and merges NanoARG Full Table and assembly information (length and read-count).
Calculates quantities and relative abundances of scaffolds with ARGs, MRGs, MGEs and 
pathogen scaffols. 

"""

'\nImports and merges NanoARG Full Table and assembly information (length and read-count).\nCalculates quantities and relative abundances of scaffolds with ARGs, MRGs, MGEs and \npathogen scaffols. \n\n'

In [3]:
def remove_overlap_not_ARG_strand_selection(df, 
                                            max_overlap=250, 
                                            verbose=False, 
                                            strand_selection='no'):
    """ 
    Rearrange open reading frames according to position.
    Find open reading frames identified as ARGs that overlap with other 
    (MGEs, MRGs, Functional Genes) open reading frames and 
    remove them. 
    It is necessary to run this before many of the other functions.
    For example, the same region identified as ARG and MGE can result in 
    misidentification of contigs with both ARG and MGE. Overlapping 
    open reading frame identifications also make it more difficult to analyze 
    contiguous regions.
    
    Example
    -------
          
    s1 ----ARGs------- e1 
         s2 --------- e2 removed if overlap > max_overlap
                  s3-------------e3 keep if overlap < max_overlap
                           s4 ------ e4 
    
    Parameters
    ----------
    df : pd.DataFrame
        data frame with NanoARG table
    max_overlap : int
        maximum allowed overlap between consequitive orfs
    
    strand_selection : string
        choose 'no','yes'
        
        default run with 'no', preferred for subsequent data analysis
        ignore +/- strand
        (11) scaffold_10 4879-6615 ARGs
           (12) 5947-6558  removing Functional Genes! overlap 611
        (27) scaffold_10 44937-45410 ARGs
        (30) scaffold_10 47395-49008 ARGs
           (31) 47395-48969  removing MRGs! overlap 1574
           (32) 47443-48996  removing Functional Genes! overlap 1553
        (36) scaffold_10 52079-52978 ARGs
        (37) scaffold_10 56260-57171 ARGs
           (38) 56260-57168  removing Functional Genes! overlap 908
        (42) scaffold_10002 34-753 ARGs
           (43) 40-768  removing Functional Genes! overlap 713
           
        run with 'yes'
        only remove if orfs have same 5'-> 3' direction
        
        (11) scaffold_10 4879-6615 ARGs
           (12) 5947-6558 strand + + removing Functional Genes! overlap 611
        (27) scaffold_10 44937-45410 ARGs
        (30) scaffold_10 47395-49008 ARGs
           (31) 47395-48969 strand + - overlap 1574
           (32) 47443-48996 strand + - overlap 1553
        (36) scaffold_10 52079-52978 ARGs
        (37) scaffold_10 56260-57171 ARGs
           (38) 56260-57168 strand + - overlap 908
        (42) scaffold_10002 34-753 ARGs
           (43) 40-768 strand + + removing Functional Genes! overlap 713
        

    verbose : bool
        print debug info
        
    Returns
    -------
    Dataframe with entries removed
    
    
   
    """
    if verbose: 
        print("sorting...")
        
    group_column = 2

    D_sorted = df.groupby(df.index).apply(
        lambda x: x.sort_values('start', ascending=True))
    
    Ddrop = D_sorted.droplevel(level=0).copy()

    if verbose: 
        print("searching")

    for i in range(len(Ddrop)-1):
        if Ddrop.group.iloc[i] == 'ARGs':
            scaffold = Ddrop.index[i]

            if verbose: 
                print("({}) {} {}-{} {}".format(
                    i, scaffold, Ddrop.start.iloc[i], Ddrop.end.iloc[i], Ddrop.group.iloc[i]))

            j = i + 1
            while (j<len(Ddrop) and (Ddrop.start.iloc[j] <= Ddrop.end.iloc[i])
                   and (Ddrop.index[j] == scaffold)):

                if verbose:
                    print("   ({}) {}-{} ".format(
                        j, Ddrop.start.iloc[j], Ddrop.end.iloc[j]), end='')

                overlap = np.min([Ddrop.end.iloc[j], Ddrop.end.iloc[i]]) - Ddrop.start.iloc[j]
                if overlap >= max_overlap:
                    if Ddrop.group.iloc[j] != 'ARGs':
                        if strand_selection == 'yes':
                            if verbose:
                                print("strand {} {}".format(
                                    Ddrop.strand.iloc[i], Ddrop.strand.iloc[j]), end = '')
                               
                            if Ddrop.strand.iloc[j] == Ddrop.strand.iloc[i]:
                                if verbose:
                                    print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                                Ddrop.iat[j, group_column] = np.nan
                        elif strand_selection == 'no':
                            if verbose:
                                print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                            Ddrop.iat[j, group_column] = np.nan
                else:

                    if verbose:
                        print("keeping {}!".format(Ddrop.group.iloc[j]), end='')

                if verbose:
                    print(" overlap {}".format(overlap))

                j = j + 1      
                
    for i in range(1, len(Ddrop)):
        if Ddrop.group.iloc[i] == 'ARGs':
            scaffold = Ddrop.index[i]

            if verbose: 
                print("({}) {} {}-{} {}".format(
                    i, scaffold, Ddrop.start.iloc[i], Ddrop.end.iloc[i], Ddrop.group.iloc[i]))

     
            j = i - 1
            while (j>=0 and (Ddrop.start.iloc[i] <= Ddrop.end.iloc[j])
                   and (Ddrop.index[j] == scaffold)):

                if verbose:
                    print("   ({}) {}-{} ".format(
                        j, Ddrop.start.iloc[j], Ddrop.end.iloc[j]), end='')

                overlap = np.min([Ddrop.end.iloc[j], Ddrop.end.iloc[i]]) - Ddrop.start.iloc[i]
                if overlap >= max_overlap:
                    if Ddrop.group.iloc[j] != 'ARGs':
                        if strand_selection == 'yes':
                            if verbose:
                                print("strand {} {}".format(
                                    Ddrop.strand.iloc[i], Ddrop.strand.iloc[j]), end='')
                            if Ddrop.strand.iloc[j] == Ddrop.strand.iloc[i]:
                                if verbose:
                                    print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                                Ddrop.iat[j, group_column] = np.nan
                        elif strand_selection == 'no':
                            if verbose:
                                print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                            Ddrop.iat[j, group_column] = np.nan
                    
                else:

                    if verbose:
                        print("keeping {}!".format(Ddrop.group.iloc[j]), end='')

                if verbose:
                    print(" overlap {}".format(overlap))

                j = j - 1
                
    df_remove_overlap = Ddrop.dropna(subset=['group']) 
    
    return df_remove_overlap       

In [4]:
"""
Necessary to run 'remove_overlap_not_ARG_strand_selection' before. 
Calculates quantities and relative abundances of scaffolds with ARGs,
MGEs and scaffolds identified as pathogens.
"""

def count_scaffolds_with_MGEs_and_ARGs_normalized(df, min_score=40000, 
                                                  min_length = 500):
    """
    Count
        ARG_counts
        scaffold_ARGs
        scaffold_ARGs_normalized
        scaffold_MGEs
        scaffold_MGEs_with_ARGs
        scaffold_MGEs_with_ARGs_normalized
        %_scaffold_MGEs_with_ARGs/scaffold_ARGs
        scaffold_pathogens
        scaffold_pathogens_with_args
        scaffold_pathogens_with_mges
        scaffold_pathogens_with_args_mges
        scaffold_has_pathogens_ARGs_MGEs_normalized
    
    
    Parameters
    ----------
    df : pd.dataframe
        dataframe with scaffold, ARGs and MGEs data
    min_score : int
        taxa_centrifuge_score cutoff for taxonomy classification
    min_length : int
        minimum length of scaffold to include in analysis
    
    Returns
    -------
    dict:
        dictionary with ratio and counts
        
        
    
    
    """
    
    df_length_filtered = df[df['length'] > min_length]
    
    scaffold_has_args_normalized = 0
    scaffold_has_args=0
    old_index_arg = 0
    for i in range(len(df_length_filtered)):
        if df_length_filtered.group.iloc[i] == 'ARGs':
            if df_length_filtered.index[i] != old_index_arg:
                scaffold_has_args += 1
                scaffold_has_args_normalized  += (
                    df_length_filtered.normalized_read_count_per_kb.iloc[i])
                old_index_arg = df_length_filtered.index[i]
    
    
    scaffold_has_mges = df_length_filtered.groupby(level=0).apply(
        lambda x: (x['group']=='MGEs').any()
    ).sum()
    
    scaffold_has_mges_with_args=0
    scaffold_has_mges_with_args_norm=0
    same_index=0
    for scaffold, data in df_length_filtered.groupby(level=0):
        groups = np.array(data.group)
        normalized_read_count_per_kbs = np.array(data.normalized_read_count_per_kb)
        indices = np.array(data.index)
        for i in range(len(groups)):
            if (groups[i] == 'ARGs'):
                for j in range(len(groups)):
                    if (groups[j] == 'MGEs'):
                        if indices[j] != same_index:
                            scaffold_has_mges_with_args += 1
                            scaffold_has_mges_with_args_norm += normalized_read_count_per_kbs[j]
                            same_index = indices[j]
        
    
    scaffold_has_pathogens = df_length_filtered.groupby(level=0).apply(
        lambda x: 
            (x['is_pathogen']==1).any() 
            and (x['taxa_centrifuge_score']>min_score).any()
    ).sum()
    
    scaffold_has_pathogens_args = df_length_filtered.groupby(level=0).apply(
        lambda x: 
            (x['is_pathogen']==1).any() 
            and (x['taxa_centrifuge_score']>min_score).any() 
            and (x['group']=='ARGs').any()
    ).sum()
    
    scaffold_has_pathogens_mges = df_length_filtered.groupby(level=0).apply(
        lambda x: 
            (x['is_pathogen']==1).any() 
            and (x['taxa_centrifuge_score']>min_score).any()
            and (x['group']=='MGEs').any() 
    ).sum()
    
    df_length_taxa_filtered = df_length_filtered[df_length_filtered['taxa_centrifuge_score'] > min_score]
    scaffold_has_pathogens_args_mges=0
    scaffold_has_pathogens_args_mges_norm=0
    same_index_pathogen=0
    for scaffold, data in df_length_taxa_filtered.groupby(level=0):
        groups = np.array(data.group)
        normalized_read_count_per_kbs = np.array(data.normalized_read_count_per_kb)
        indices = np.array(data.index)
        pathogens = np.array(data.is_pathogen)
        for i in range(len(groups)):
            if (pathogens[i] == 1):
                for j in range(len(groups)):
                    if (groups[j] == 'ARGs'):
                        for k in range(len(groups)):
                            if (groups[k] == 'MGEs'):
                                if indices[k] != same_index_pathogen:
                                    scaffold_has_pathogens_args_mges += 1
                                    scaffold_has_pathogens_args_mges_norm += normalized_read_count_per_kbs[k]
                                    same_index_pathogen = indices[k]
    
    ARG_counts = len(df_length_filtered[df_length_filtered['group'] == 'ARGs'])
    
    
    return {
        'ARG_counts' : ARG_counts,
        'scaffold_ARGs': scaffold_has_args,
        'scaffold_ARGs_normalized' : scaffold_has_args_normalized,
        'scaffold_MGEs': scaffold_has_mges,
        'scaffold_MGEs_with_ARGs': scaffold_has_mges_with_args,
        'scaffold_MGEs_with_ARGs_normalized' :scaffold_has_mges_with_args_norm,
        '%_scaffold_MGEs_with_ARGs/scaffold_ARGs': (
            scaffold_has_mges_with_args / scaffold_has_args * 100
        ),
        'scaffold_pathogens' : scaffold_has_pathogens,
        'scaffold_pathogens_with_ARGs' : scaffold_has_pathogens_args,
        'scaffold_pathogens_with_MGEs' : scaffold_has_pathogens_mges,
        'scaffold_pathogens_with_ARGs_MGEs' : scaffold_has_pathogens_args_mges,
        'scaffold_has_pathogens_ARGs_MGEs_normalized' : scaffold_has_pathogens_args_mges_norm
    }

In [5]:
"""
Returns quantity and relative abundance of desired ARG subtype scaffold. 
"""

def ARG_subtype_count(df, min_bitscore=50, 
                      min_id_arg=25, min_coverage_arg=0.4, arg_name = 'sul1'):
    
    """
    Count
        arg_subtype_count = count # of ARG subtype occurance
        scaffold_arg_subtype = count # of scaffolds with ARG_subtype
        scaffold_has_arg_subtype_normalized = scaffolds with ARG_subtype in RPKM
    
    
    Parameters
    ----------
    df : pd.dataframe
        dataframe with scaffold, ARGs and MGEs data
    min_bitscore : int
        minimum bitscore, default is 50
    min_id_arg : int
        minimum identity of ARG, default is 25    
    min_coverage_arg : int
        minimum coverage of ARG, default is 0.4 in NanoARG
    arg_name : name of ARG subtype to be counted
    
    Returns
    -------
    dict:
        dictionary with counts   
    """
    
    
    
    df_bitscore_adjusted = df[df['bitscore'] > min_bitscore]
    
    df_bit_arg_adjusted = df_bitscore_adjusted.drop(df_bitscore_adjusted[ 
        (df_bitscore_adjusted['group'] == 'ARGs') 
        & ((df_bitscore_adjusted['identity'] < min_id_arg)
            | (df_bitscore_adjusted['coverage'] < min_coverage_arg))].index)
    
    
    arg_subtype_count = len(df_bit_arg_adjusted[df_bit_arg_adjusted['gene_name'] == arg_name])
    
    scaffold_has_arg_subtype = df_bit_arg_adjusted.groupby(level=0).apply(
        lambda x: (x['gene_name']==arg_name).any()
    ).sum()
    
    
    scaffold_has_arg_subtype_normalized = 0
    old_index = 0
    for i in range(len(df_bit_arg_adjusted)):
        if df_bit_arg_adjusted.gene_name.iloc[i] == arg_name:
            if df_bit_arg_adjusted.index[i] != old_index:              
                scaffold_has_arg_subtype_normalized  += (
                    df_bit_arg_adjusted.normalized_read_count_per_kb.iloc[i])
                old_index = df_bit_arg_adjusted.index[i]
                
    return {
        'arg_subtype_count' : arg_subtype_count,
        'scaffold_arg_subtype' : scaffold_has_arg_subtype,
        'scaffold_has_arg_subtype_normalized' : scaffold_has_arg_subtype_normalized
    }

In [6]:
def count_scaffolds(df, min_length=500):
    """    
    Count number of scaffolds longer than min_length 
    
    Parameters
    ----------
    df : pd.dataframe
        dataframe with scaffold length and read-count
    min_length : int
        minimum length of scaffold to include in analysis
    
    Returns
    -------
    dict:
        dictionary with counts
           
    """
    
    df_length_filtered = df[df['length'] > min_length]
    scaffold_counts = len(df_length_filtered.groupby(level=0))
    
    return {
        '# scaffolds' : scaffold_counts,
    }

In [7]:
"""
INPUT REQUIRED. IMPORTING NANOARG DATA.
------------------------------------------------------

Import multiple NanoARG Full Tables (.tsv) and save in a dictionary.

Input required:
1. Path to the directory with NanoARG full tables

Output:
1. Disctionary called all_full_tables, each sample is saved 
    in a different dataframe with sample name.
 
"""



"""
***provide PATH to the directory with files*********

"""
path = '/Users/sudeshnaghosh/Dropbox/VT/test2/'


#------------------------------------------------


DATA_DIR = path

all_full_tables = {}

for file in os.listdir(DATA_DIR):
    df_name = file.split('.')[0]
    # printing sample name indicates progress
    print(df_name)
    all_full_tables[df_name] = pd.read_csv(DATA_DIR + file, sep=',', header=0, index_col=0)
    
del all_full_tables[""]


After_GAC_1
Before_GAC_1
HCL0_W2
HCL0_W3


In [8]:
"""
INPUT REQUIRED. IMPORTING ASSEMBLY INFORMATION.
------------------------------------------------------

Extract length and read_count information about each scaffold 
in a sample and store as dataframe. Store dataframes in a dictionary.


Input:

Input file format, from IDBA-UD

>scaffold_0 length_285165 read_count_39166
CGGGCCCTACCGTAGCAGCCGCTACGGTAGGGCCCGTGTCCAGT......
>scaffold_1 length_297291 read_count_43258
GAGAGCGTCATCGATCATCGCGACGCAGGTTAACAGCGATTGCCGATGTTTACAACC........

Output:
    
Dictionary called assembly


"""

"""
***provide PATH to the directory with IDBA-UD files**********

"""

path = '/Users/sudeshnaghosh/Dropbox/VT/assembly_test'
#---------------------------------------------------


DATA_DIR = path

assembly = {}


for file in os.listdir(DATA_DIR):
    if file.endswith(".fa"):
        file_path = join(DATA_DIR, file)
        print(file_path)
        df_name = file.split('.')[0]
        scaffold = []
        length = []
        read_count = []
        
        for record in SeqIO.parse(file_path, "fasta"):
            scaffold.append(record.description.split()[0])
            length.append(record.description.split()[1].split('_')[1])
            read_count.append(record.description.rsplit()[2].split('_')[2])
            
            
        assembly[df_name] = pd.DataFrame({
            'read':scaffold,
            'length':length,
            'read_count':read_count
        })
        assembly[df_name].set_index('read', inplace=True)

/Users/sudeshnaghosh/Dropbox/VT/assembly_test/After_GAC_1.fa
/Users/sudeshnaghosh/Dropbox/VT/assembly_test/Before_GAC_1.fa
/Users/sudeshnaghosh/Dropbox/VT/assembly_test/HCL0_W2.fa
/Users/sudeshnaghosh/Dropbox/VT/assembly_test/HCL0_W3.fa


In [9]:
"""
For each scaffold, calculate rpm and rpkm 
-----------------------------------------


For each scaffold, calculate rpm and rpkm and add to the disctionary with 
length and read_count.

normalized_read_count (rpm) = 
read_count of scaffold / (total # of read_counts in a sample / 1,000,000)
normalized_read_count_per_kb (rpkm) = 
normalized_read_count (rpm) / length of scaffold in kb

The updated dataframes are stored in the dictionary called assembly.


"""


for name, keys in assembly.items():    
    df_name = name
    # printing sample name indicates progress
    print(df_name)
    SUM = ((assembly[df_name]['read_count']).astype(float)).sum()
    assembly[df_name] = assembly[df_name].astype(float)
    assembly[df_name]['normalized_read_count'] = (
        (assembly[df_name]['read_count'] * 1000000) / SUM)
    assembly[df_name]['normalized_read_count_per_kb'] = (
        (assembly[df_name]['normalized_read_count'] * 1000)/
        assembly[df_name]['length'])

After_GAC_1
Before_GAC_1
HCL0_W2
HCL0_W3


In [10]:
assembly['After_GAC_1'].head()

Unnamed: 0_level_0,length,read_count,normalized_read_count,normalized_read_count_per_kb
read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
scaffold_0,709020.0,97243.0,6486.089388,9.147964
scaffold_1,551495.0,312878.0,20868.902395,37.840601
scaffold_2,954535.0,669486.0,44654.587376,46.781509
scaffold_3,303743.0,175890.0,11731.829155,38.624196
scaffold_4,302455.0,32823.0,2189.287784,7.238392


In [11]:
"""
COMBINING NANOARG FULL TABLES AND ASSEMBLY INFORMATION
------------------------------------------------------

Add scaffold length, read_count and normalized scaffold read_count 
(rpm and rpkm) to NanoARG table.
Store dataframes in a dictionary called all_full_tables_assembly_data.

"""

all_full_tables_assembly_data = {}

for name, keys in all_full_tables.items():    
    df_name = name
    # printing sample name indicates progress
    print(df_name)
    all_full_tables_assembly_data[df_name] = (
        pd.concat([all_full_tables[df_name], assembly[df_name]], 
                  axis=1, sort=False, join='inner'))

After_GAC_1
Before_GAC_1
HCL0_W2
HCL0_W3


In [12]:
all_full_tables_assembly_data['After_GAC_1'].head()

Unnamed: 0_level_0,gene_id,gene_name,group,category,start,end,strand,identity,bitscore,evalue,NCBI_taxa_id,taxa_centrifuge_score,species,coverage,is_pathogen,length,read_count,normalized_read_count,normalized_read_count_per_kb
read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
scaffold_16508,WP_007306505.1,IS1634 family transposase [Crocosphaera watsonii],MGEs,transposase,1058,1459,+,35.1,102.4,6.5e-21,undefined,0.0,undefined,0.724324,0,1743.0,74.0,4.935786,2.831776
scaffold_9236,YP_002467958,PBP-1B,ARGs,beta-lactam,326,1816,+,26.6,137.9,1.3000000000000001e-32,undefined,0.0,undefined,0.718421,0,2392.0,81.0,5.402684,2.258647
scaffold_9236,UniRef90_Q07259,Cluster: Putative transglycosylase H16 A0665,Functional Genes,Cluster: Putative transglycosylase H16 A0665,395,862,+,37.2,107.8,1.5e-22,undefined,0.0,undefined,0.675325,0,2392.0,81.0,5.402684,2.258647
scaffold_16501,BAC0162,Gallium,MRGs,Gallium,15,659,+,31.0,102.8,9.4e-24,1486262,196.0,Martelella endophytica,0.606742,0,1743.0,41.0,2.734692,1.568957
scaffold_16501,BAC0604,Molybdenum,MRGs,Molybdenum,778,1440,+,30.0,102.8,9.5e-24,1486262,196.0,Martelella endophytica,0.648256,0,1743.0,41.0,2.734692,1.568957


In [13]:
"""
SAVE DICTIONARY
---------------
save dictionary in all_full_tables_assembly_data_test.p

dictionary can be loaded in new notebook for analysis

with open('all_full_tables_remove_overlap_test.p', 'rb') as fp:
    all_full_tables_remove_overlap = pickle.load(fp)
"""

with open('all_full_tables_assembly_data_test.p', 'wb') as fp:
    pickle.dump(all_full_tables_assembly_data, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
"""
Running 'remove_overlap_not_ARG_strand_selection'
------------------------------------------------
rearranging with ascending start
and storing updated dataframes in 
a dictionary called
all_full_tables_remove_overlap.

Set the max_overlap to desired value.

IMPORTANT: this step is necessary for running
many of the other functions.

"""
all_full_tables_remove_overlap = {}
for name, keys in all_full_tables_assembly_data.items():    
    df_name = name
    # printing sample name indicates progress
    print(df_name)
    all_full_tables_remove_overlap[df_name] = (
        remove_overlap_not_ARG_strand_selection(all_full_tables_assembly_data[df_name], 
                                            max_overlap=250, 
                                            strand_selection='no',
                                            verbose = False))
       

After_GAC_1
Before_GAC_1
HCL0_W2
HCL0_W3


In [15]:
"""
SAVE DICTIONARY
---------------

Helpful if you have a lot of samples
"""


with open('all_full_tables_remove_overlap_test.p', 'wb') as fp:
    pickle.dump(all_full_tables_remove_overlap, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
"""
runs 'count_scaffolds'
---------------------

Returns dataframe with number of scaffolds in a sample.


"""

scaffold_count = pd.DataFrame(dtype=float)
for name, df in assembly.items():    
    count = count_scaffolds(df, min_length = 0)
    count_series = pd.Series(count)
    count_rename = count_series.rename(name)
    
    scaffold_count = scaffold_count.append(count_rename)

In [17]:
scaffold_count

Unnamed: 0,# scaffolds
After_GAC_1,304503.0
Before_GAC_1,257626.0
HCL0_W2,216141.0
HCL0_W3,235136.0


In [18]:
"""
runs 'count_scaffolds_with_MGEs_and_ARGs_normalized'
----------------------------------------------------

Necessary to run 'remove_overlap_not_ARG_strand_selection' and 'count_scaffolds' before

Estimate: 5 samples run in ~ 2 mins to run. More samples will take longer. 


Output
------
df : pd.dataframe
    dataframe with counts, ratios and percent values
    
    ARG_counts : Number of ARGs in a sample 
    scaffold_ARGs : Number of scaffolds with at least one ARG 
    scaffold_ARGs_normalized : Scaffold with at least one ARG 
                               rel. abundance (rpkm)
    scaffold_MGEs : Number of scaffolds with at least one MGE
    scaffold_MGEs_with_ARGs : Number of scaffolds with at least one AGR 
                              and at least one MGE rel. abundance (rpkm)
    scaffold_MGEs_with_ARGs_normalized : Scaffold with at least one each 
                                         of ARG and MGE
    %_scaffold_MGEs_with_ARGs/scaffold_ARGs : (scaffold_MGEs_with_ARGs x 100)/scaffold_ARGs
    scaffold_pathogens : Number of scaffolds identified as pathogen
    scaffold_pathogens_with_ARGs : Number of scaffolds identified as pathogen with 
                                at least one ARG
    scaffold_pathogens_with_MGEs : Number of scaffolds identified as pathogen with 
                                at least one MGE
    scaffold_pathogens_with_ARGs_MGEs : Number of scaffolds identified 
                                        as pathogen with at least one AGR 
                                        and at least one MGE
    scaffold_has_pathogens_ARGs_MGEs_normalized: Rel abundance of scaffolds identified 
                                        as pathogen with at least one AGR 
                                        and at least one MGE (rpkm)
    # scaffolds :  Number of scaffolds in a sample
    scaffold_MGEs/#scaffolds : scaffold_MGEs/# scaffolds
    %scaffold_ARGs : (scaffold_ARGs  x 100) / # scaffolds
    %scaffold_MGEs_with_ARGs : (scaffold_MGEs_with_ARGs  x 100) / # scaffolds 
    %pathogens : (scaffold_pathogens x 100) / # scaffolds 
    %pathogens_with_ARGs : (scaffold_pathogens_with_ARGs x 100) / # scaffolds
    %pathogens_with_MGEs : (scaffold_pathogens_with_MGEs x 100) / # scaffolds
    %pathogens_with_ARGs_MGEs : (scaffold_pathogens_with_ARGs_MGEs x 100) / # scaffolds 
    %scaffold_MGEs : (scaffold_MGEs x 100) / # scaffolds

"""

columns = [
           'ARG_counts', 'scaffold_ARGs',
           'scaffold_ARGs_normalized',
           'scaffold_MGEs',
           'scaffold_MGEs_with_ARGs',
           'scaffold_MGEs_with_ARGs_normalized',
           '%_scaffold_MGEs_with_ARGs/scaffold_ARGs', 'scaffold_pathogens',
           'scaffold_pathogens_with_ARGs', 'scaffold_pathogens_with_MGEs',
           'scaffold_pathogens_with_ARGs_MGEs',
           'scaffold_has_pathogens_ARGs_MGEs_normalized']
scaffolds_with_MGEs_and_ARGs = pd.DataFrame(columns=columns, dtype=float)
for name, df in all_full_tables_remove_overlap.items(): 
    print(name)
    count = count_scaffolds_with_MGEs_and_ARGs_normalized(df, min_length = 500)
    count_series = pd.Series(count)
    count_rename = count_series.rename(name)
    
    scaffolds_with_MGEs_and_ARGs = scaffolds_with_MGEs_and_ARGs.append(count_rename)
        

scaffolds_with_MGEs_and_ARGs_store = scaffolds_with_MGEs_and_ARGs

MGEs_and_ARGs_pathogens_normalized = (
    pd.concat([scaffolds_with_MGEs_and_ARGs_store, scaffold_count], 
              axis=1, join='inner'))

MGEs_and_ARGs_pathogens_normalized['scaffold_MGEs/#scaffolds']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_MGEs']/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

MGEs_and_ARGs_pathogens_normalized['%scaffold_ARGs']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_ARGs']*100/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

MGEs_and_ARGs_pathogens_normalized['%scaffold_MGEs_with_ARGs']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_MGEs_with_ARGs']*100/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

MGEs_and_ARGs_pathogens_normalized['%pathogens']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_pathogens']*100/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

MGEs_and_ARGs_pathogens_normalized['%pathogens_with_ARGs']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_pathogens_with_ARGs']*100/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

MGEs_and_ARGs_pathogens_normalized['%pathogens_with_MGEs']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_pathogens_with_MGEs']*100/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

MGEs_and_ARGs_pathogens_normalized['%pathogens_with_ARGs_MGEs']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_pathogens_with_ARGs_MGEs']*100/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

MGEs_and_ARGs_pathogens_normalized['%scaffold_MGEs']=\
MGEs_and_ARGs_pathogens_normalized['scaffold_MGEs']*100/\
MGEs_and_ARGs_pathogens_normalized['# scaffolds']

After_GAC_1
Before_GAC_1
HCL0_W2
HCL0_W3


In [19]:
"""
Prints result.
Save this file and plot using desired software
"""

MGEs_and_ARGs_pathogens_normalized

Unnamed: 0,ARG_counts,scaffold_ARGs,scaffold_ARGs_normalized,scaffold_MGEs,scaffold_MGEs_with_ARGs,scaffold_MGEs_with_ARGs_normalized,%_scaffold_MGEs_with_ARGs/scaffold_ARGs,scaffold_pathogens,scaffold_pathogens_with_ARGs,scaffold_pathogens_with_MGEs,...,scaffold_has_pathogens_ARGs_MGEs_normalized,# scaffolds,scaffold_MGEs/#scaffolds,%scaffold_ARGs,%scaffold_MGEs_with_ARGs,%pathogens,%pathogens_with_ARGs,%pathogens_with_MGEs,%pathogens_with_ARGs_MGEs,%scaffold_MGEs
After_GAC_1,5068.0,3974.0,13837.251645,3075.0,389.0,3212.788846,9.788626,3.0,1.0,2.0,...,0.0,304503.0,0.010098,1.305077,0.127749,0.000985,0.000328,0.000657,0.0,1.009842
Before_GAC_1,4130.0,3424.0,17877.630338,2425.0,205.0,3148.961065,5.98715,17.0,9.0,10.0,...,72.295621,257626.0,0.009413,1.329058,0.079573,0.006599,0.003493,0.003882,0.000776,0.941287
HCL0_W2,6325.0,4894.0,19977.884597,3447.0,463.0,5422.032241,9.460564,31.0,7.0,22.0,...,22.399722,216141.0,0.015948,2.264263,0.214212,0.014342,0.003239,0.010179,0.000925,1.594792
HCL0_W3,7336.0,5662.0,20610.972909,3938.0,547.0,5527.301233,9.660897,30.0,5.0,25.0,...,26.447524,235136.0,0.016748,2.407968,0.232631,0.012759,0.002126,0.010632,0.000851,1.674775


In [20]:
"""
runs 'ARG_subtype_count'
-----------------------


Run 'scaffold_count' before with desired scaffold length threshhold.

Returns a dataframe with count of ARG subtype.
"""

columns = ['arg_subtype_count', 'scaffold_arg_subtype', 
           'scaffold_has_arg_subtype_normalized']
ARG_subtypes = pd.DataFrame(columns=columns, dtype=float)
for name, df in all_full_tables_assembly_data.items():    
    count = ARG_subtype_count(df, min_bitscore=50, 
                      min_id_arg=50, min_coverage_arg = 0.5, arg_name = 'sul1')
    count_series = pd.Series(count)
    count_rename = count_series.rename(name)
    
    ARG_subtypes = ARG_subtypes.append(count_rename)
        

ARG_subtypes_store = ARG_subtypes 


ARG_subtypes_scaffold_count = \
pd.concat([ARG_subtypes_store, scaffold_count], axis=1, join='inner')

ARG_subtypes_scaffold_count['%scaffold_arg_subtype']=\
ARG_subtypes_scaffold_count['scaffold_arg_subtype']*100/\
ARG_subtypes_scaffold_count['# scaffolds']

In [21]:
ARG_subtypes_scaffold_count

Unnamed: 0,arg_subtype_count,scaffold_arg_subtype,scaffold_has_arg_subtype_normalized,# scaffolds,%scaffold_arg_subtype
After_GAC_1,0.0,0.0,0.0,304503.0,0.0
Before_GAC_1,1.0,1.0,13.291695,257626.0,0.000388
HCL0_W2,1.0,1.0,17.712378,216141.0,0.000463
HCL0_W3,1.0,1.0,14.809571,235136.0,0.000425
