In [1]:
import os
from os.path import join
import pickle
import numpy as np
import pandas as pd
from Bio import SeqIO
from collections import Counter
from scipy.stats import binom_test
from statsmodels.stats.multitest import fdrcorrection
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
"""
Explore associations between ARGs and MGEs.
Set different parameter values for data exploration.

Execute merge_nanoARG_full_table_assembly_length_read_count_calculate_abundance.ipynb before
to generate 'all_full_tables_remove_overlap_test.p'

"""

"\nExplore associations between ARGs and MGEs.\nSet different parameter values for data exploration.\n\nExecute merge_nanoARG_full_table_assembly_length_read_count_calculate_abundance.ipynb before\nto generate 'all_full_tables_remove_overlap_test.p'\n\n"

In [3]:
def remove_overlap_not_ARG_strand_selection(df, 
                                            max_overlap=250, 
                                            verbose=False, 
                                            strand_selection='no'):
    """ 
    Rearrange open reading frames according to position.
    Find open reading frames identified as ARGs that overlap with other 
    (MGEs, MRGs, Functional Genes) open reading frames and 
    remove them. 
    It is necessary to run this before many of the other functions.
    For example, the same region identified as ARG and MGE can result in 
    misidentification of contigs with both ARG and MGE. Overlapping 
    open reading frame identifications also make it more difficult to analyze 
    contiguous regions.
    
    Example
    -------
          
    s1 ----ARGs------- e1 
         s2 --------- e2 removed if overlap > max_overlap
                  s3-------------e3 keep if overlap < max_overlap
                           s4 ------ e4 
    
    Parameters
    ----------
    df : pd.DataFrame
        data frame with NanoARG table
    max_overlap : int
        maximum allowed overlap between consequitive orfs
    
    strand_selection : string
        choose 'no','yes'
        
        default run with 'no', preferred for subsequent data analysis
        ignore +/- strand
        (11) scaffold_10 4879-6615 ARGs
           (12) 5947-6558  removing Functional Genes! overlap 611
        (27) scaffold_10 44937-45410 ARGs
        (30) scaffold_10 47395-49008 ARGs
           (31) 47395-48969  removing MRGs! overlap 1574
           (32) 47443-48996  removing Functional Genes! overlap 1553
        (36) scaffold_10 52079-52978 ARGs
        (37) scaffold_10 56260-57171 ARGs
           (38) 56260-57168  removing Functional Genes! overlap 908
        (42) scaffold_10002 34-753 ARGs
           (43) 40-768  removing Functional Genes! overlap 713
           
        run with 'yes'
        only remove if orfs have same 5'-> 3' direction
        
        (11) scaffold_10 4879-6615 ARGs
           (12) 5947-6558 strand + + removing Functional Genes! overlap 611
        (27) scaffold_10 44937-45410 ARGs
        (30) scaffold_10 47395-49008 ARGs
           (31) 47395-48969 strand + - overlap 1574
           (32) 47443-48996 strand + - overlap 1553
        (36) scaffold_10 52079-52978 ARGs
        (37) scaffold_10 56260-57171 ARGs
           (38) 56260-57168 strand + - overlap 908
        (42) scaffold_10002 34-753 ARGs
           (43) 40-768 strand + + removing Functional Genes! overlap 713
        

    verbose : bool
        print debug info
        
    Returns
    -------
    Dataframe with entries removed
    
    
   
    """
    if verbose: 
        print("sorting...")
        
    group_column = 2

    D_sorted = df.groupby(df.index).apply(
        lambda x: x.sort_values('start', ascending=True))
    
    Ddrop = D_sorted.droplevel(level=0).copy()

    if verbose: 
        print("searching")

    for i in range(len(Ddrop)-1):
        if Ddrop.group.iloc[i] == 'ARGs':
            scaffold = Ddrop.index[i]

            if verbose: 
                print("({}) {} {}-{} {}".format(
                    i, scaffold, Ddrop.start.iloc[i], Ddrop.end.iloc[i], Ddrop.group.iloc[i]))

            j = i + 1
            while (j<len(Ddrop) and (Ddrop.start.iloc[j] <= Ddrop.end.iloc[i])
                   and (Ddrop.index[j] == scaffold)):

                if verbose:
                    print("   ({}) {}-{} ".format(
                        j, Ddrop.start.iloc[j], Ddrop.end.iloc[j]), end='')

                overlap = np.min([Ddrop.end.iloc[j], Ddrop.end.iloc[i]]) - Ddrop.start.iloc[j]
                if overlap >= max_overlap:
                    if Ddrop.group.iloc[j] != 'ARGs':
                        if strand_selection == 'yes':
                            if verbose:
                                print("strand {} {}".format(
                                    Ddrop.strand.iloc[i], Ddrop.strand.iloc[j]), end = '')
                               
                            if Ddrop.strand.iloc[j] == Ddrop.strand.iloc[i]:
                                if verbose:
                                    print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                                Ddrop.iat[j, group_column] = np.nan
                        elif strand_selection == 'no':
                            if verbose:
                                print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                            Ddrop.iat[j, group_column] = np.nan
                else:

                    if verbose:
                        print("keeping {}!".format(Ddrop.group.iloc[j]), end='')

                if verbose:
                    print(" overlap {}".format(overlap))

                j = j + 1      
                
    for i in range(1, len(Ddrop)):
        if Ddrop.group.iloc[i] == 'ARGs':
            scaffold = Ddrop.index[i]

            if verbose: 
                print("({}) {} {}-{} {}".format(
                    i, scaffold, Ddrop.start.iloc[i], Ddrop.end.iloc[i], Ddrop.group.iloc[i]))

     
            j = i - 1
            while (j>=0 and (Ddrop.start.iloc[i] <= Ddrop.end.iloc[j])
                   and (Ddrop.index[j] == scaffold)):

                if verbose:
                    print("   ({}) {}-{} ".format(
                        j, Ddrop.start.iloc[j], Ddrop.end.iloc[j]), end='')

                overlap = np.min([Ddrop.end.iloc[j], Ddrop.end.iloc[i]]) - Ddrop.start.iloc[i]
                if overlap >= max_overlap:
                    if Ddrop.group.iloc[j] != 'ARGs':
                        if strand_selection == 'yes':
                            if verbose:
                                print("strand {} {}".format(
                                    Ddrop.strand.iloc[i], Ddrop.strand.iloc[j]), end='')
                            if Ddrop.strand.iloc[j] == Ddrop.strand.iloc[i]:
                                if verbose:
                                    print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                                Ddrop.iat[j, group_column] = np.nan
                        elif strand_selection == 'no':
                            if verbose:
                                print(" removing {}!".format(Ddrop.group.iloc[j]), end='')
                            Ddrop.iat[j, group_column] = np.nan
                    
                else:

                    if verbose:
                        print("keeping {}!".format(Ddrop.group.iloc[j]), end='')

                if verbose:
                    print(" overlap {}".format(overlap))

                j = j - 1
                
    df_remove_overlap = Ddrop.dropna(subset=['group']) 
    
    return df_remove_overlap       

In [4]:
"""
It is necessary to run 'remove_overlap_not_ARG_strand_selection' before.
Finds likely association between ARG subtype and MGE.

"""

def ARG_MGE_association(df, min_bitscore=50, min_id_arg=25, min_id_mge=25,
                         f=0.01, max_fdr=0.05, min_coverage_arg=0.4,
                       min_coverage_mge=0.4, min_length=500):

    """
    
    Parameters
    ----------
    df : pd.dataframe
        dataframe with scaffold, ARGs and MGEs data
    min_bitscore : int
        minimum bitscore, default is 50
    min_id_arg : int
        minimum identity of ARG, default is 25    
    min_id_mge : int
        minimum identity of MGE, default is 25
    min_coverage_arg : int
        minimum coverage of ARG, default is 0.4 in NanoARG
    min_coverage_mge : int
        minimum coverage of MGE, default is 0.4 in NanoARG
    f : float
        ratio of scaffolds with MGEs and total number of scaffolds
        set f = 0 to bypass binom test with FDR (Benjamini/Hochberg) correction.
        Binom test checks if fraction of ARG subtype scaffold with MGEs 
        exceeds fraction of scaffolds with MGEs in the sample, in other words,
        if ARG subtype is associated with an MGE more often than expected 
    max_fdr : float
        maximum FDR value to include in output 
    min_length : int
        minimum length of scaffold to include in analysis
        
    Returns
    -------
    fraction_genes_with_mge_test_count: pd.DataFrame()
        Dataframe with ratio and counts
        fdr : corrected p-value
        fraction_genes_with_mge : ARG subtype scaffold with MGE / ARG subtype scaffold
        count_genes : count of ARG subtype
        
    
    """
    df_length_filtered = df[df['length'] > min_length]
    
    df_bitscore_adjusted = df_length_filtered[df_length_filtered['bitscore'] > min_bitscore]

    df_bit_arg_adjusted = df_bitscore_adjusted.drop(df_bitscore_adjusted[ 
        (df_bitscore_adjusted['group'] == 'ARGs') 
        & ((df_bitscore_adjusted['identity'] < min_id_arg)
            | (df_bitscore_adjusted['coverage'] < min_coverage_arg))].index)

    df_adjusted = df_bit_arg_adjusted.drop(df_bit_arg_adjusted[ 
        (df_bit_arg_adjusted['group'] == 'MGEs') 
        & ((df_bit_arg_adjusted['identity'] < min_id_mge)
            | (df_bit_arg_adjusted['coverage'] < min_coverage_mge))].index)
    
    D = df_adjusted[df_adjusted.group.isin(['ARGs','MGEs'])]
    D_args = df_adjusted[df_adjusted.group.isin(['ARGs'])]
    count_genes = Counter(D_args['gene_name'])
    scaffolds = D_args.index.unique()

    count_genes_with_mge = Counter()
    for i, scaffold in enumerate(scaffolds):
        print("{:.2f}%".format((i+1)/len(scaffolds)*100.), end='\r')

        in_scaffold = D[D.index==scaffold]
        if 'MGEs' in D[D.index==scaffold]['group'].values:

            for gene in count_genes:
                if gene in in_scaffold['gene_name'].values:
                    count_genes_with_mge[gene]+=1
    print("           ", end="\r")

    count_genes_once_per_scaffold = Counter()
    for i, scaffold in enumerate(scaffolds):
        print("{:.2f}%".format((i+1)/len(scaffolds)*100.), end='\r')
        in_scaffold = D[D.index==scaffold]
        for gene in count_genes:
            if gene in in_scaffold['gene_name'].values:
                count_genes_once_per_scaffold[gene]+=1
    
    

    fraction_genes_with_mge = {}
    for gene in count_genes_with_mge:
        fraction_genes_with_mge[gene]=count_genes_with_mge[gene]/count_genes_once_per_scaffold[gene]
    
    fraction_genes_with_mge_df = pd.DataFrame.from_dict(fraction_genes_with_mge, orient='index')
    
    fraction_genes_with_mge_df = fraction_genes_with_mge_df.rename(columns={0: "fraction_genes_with_mge"})
    
    count_genes_df = pd.DataFrame.from_dict(count_genes, orient='index')
    count_genes_df = count_genes_df.rename(columns={0: "count_genes"})

    test_enriched = {}
    for gene, count in count_genes_with_mge.items(): 
        test_enriched[gene] = binom_test(count, n=count_genes_once_per_scaffold[gene], p=f, alternative='greater')
    
    test_enriched = pd.Series(test_enriched)
    
    rejected, pvalue_corrected = (
        fdrcorrection(test_enriched, alpha=max_fdr, method='indep', is_sorted=False))

    test_enriched_fdr = {
        'rejected': rejected.tolist(), 
        'fdr': pvalue_corrected.tolist(), 
        'gene_name': test_enriched.index.tolist()
    }
    test_enriched_fdr_df = pd.DataFrame(test_enriched_fdr)
    test_enriched_fdr_df.index = test_enriched_fdr_df['gene_name']

    fraction_genes_with_mge_test = (
        pd.concat([test_enriched_fdr_df, fraction_genes_with_mge_df], axis=1, join='inner'))
    
    fraction_genes_with_mge_test = (
        fraction_genes_with_mge_test.sort_values(by=['fraction_genes_with_mge'], ascending=False))
    
    fraction_genes_with_mge_test_count = (
        pd.concat([fraction_genes_with_mge_test, count_genes_df], axis=1, join='inner'))
    
    fraction_genes_with_mge_test_count = (
        fraction_genes_with_mge_test_count[fraction_genes_with_mge_test_count['rejected']])
    
    fraction_genes_with_mge_test_count.drop(['rejected','gene_name'], axis=1, inplace=True)
    return fraction_genes_with_mge_test_count

In [5]:
"""
It is necessary to run 'remove_overlap_not_ARG_strand_selection' before.
Returns stretches of scaffolds with ARGs and MGEs.

"""

def neighbor_stretch(df, min_ARG_count=1, max_distance=5000, 
                     number_neighboring_orf=4, 
                     sliding_window_step_increase=4, min_bitscore=50, 
                     min_id_arg=25, min_id_mge=25, min_coverage_arg=0.4, 
                     min_coverage_mge=0.4, verbose=False):
    
    """
    Search for MGE and specified number of ARGs in a desired
    stretch of scaffold window. This window slides forward at a 
    specified step.
    Reports a dataframe of neighboring orfs with MGEs and ARGs.
    
    Exp. number_neighboring_orf=4, sliding_window_step_increase=1
    max_distance = maximum distance between s and e
    min_ARG_count=1
    
    s1-ARG--->  <-----   ------> --MGE-->e1    report
              s2<-----   ------> --MGE--> <------e2    no ARG, do not report  
              
    
    
    Parameters
    ----------
    df : pd.dataframe
        dataframe with scaffold, ARGs, MGEs and start end 
        gene data
    
    max_distance : int
        maximum length of scaffold segment with 
        neighboring MGEs and ARGs to explore
        
    min_ARG_count : int
        minimum number of ARGs in the scaffold segment with 
        neighboring MGEs and ARGs
        
    number_neighboring_orf : int
        maximum number of neighboring orfs to consider
        
    sliding_window_step_increase : int
        int between 1 and number_neighboring_orf
    
    min_bitscore : int
        minimum bitscore, default is 50
        
    min_id_arg : int
        minimum aa identity of ARG, default is 25
        
    min_id_mge : int
        minimum aa identity of MGE, default is 25
        
    min_coverage_arg : int
        minimum coverage of ARG, default is 0.4 in NanoARG
        
    min_coverage_mge : int
        minimum coverage of MGE, default is 0.4 in NanoARG
        
    verbose : bool
        print progress and debug information
    
    Returns
    -------
    neighbor_genes : dataframe
        dataframe with neighboring ARGs and MGEs
    
    """
    neighbor_genes = {}
    
    output_columns = ['gene_id', 'gene_name', 'group', 'category', 'start', 'end', 'strand',
       'identity', 'bitscore', 'evalue', 'NCBI_taxa_id',
       'taxa_centrifuge_score', 'species', 'coverage', 'is_pathogen']
    
    for column in output_columns:
        neighbor_genes[column] = []
        
    
    neighbor_genes['read'] = []

    df_bitscore_adjusted = df[df['bitscore'] > min_bitscore]
    
    df_bit_arg_adjusted = df_bitscore_adjusted.drop(df_bitscore_adjusted[ 
        (df_bitscore_adjusted['group'] == 'ARGs') 
        & ((df_bitscore_adjusted['identity'] < min_id_arg)
            | (df_bitscore_adjusted['coverage'] < min_coverage_arg))].index)

    df_adjusted = df_bit_arg_adjusted.drop(df_bit_arg_adjusted[ 
        (df_bit_arg_adjusted['group'] == 'MGEs') 
        & ((df_bit_arg_adjusted['identity'] < min_id_mge)
            | (df_bit_arg_adjusted['coverage'] < min_coverage_mge))].index)
    
    
    if verbose:
        print("\ndf1:", df_bitscore_adjusted)
        print("\ndf2:", df_bit_arg_adjusted)
        print("\ndf3:", df_adjusted)
    
    
    for scaffold, data in df_adjusted.groupby(level=0):
        starts = np.array(data.start)
        ends = np.array(data.end)
        groups = np.array(data.group)
        
        if verbose:
            print("-----------------")
            print("\nscaffold:", scaffold)
            print("\nstarts:", starts)
            print("\nends:", ends)
            print("\ngroups:", groups)
            print("-----------------")
            print(len(groups))
        
        for i in range(0, len(groups)-number_neighboring_orf+1, sliding_window_step_increase):
            max_neighbor_orfs = min(number_neighboring_orf, len(groups) - i)
            count = Counter(groups[i:i+max_neighbor_orfs] == 'ARGs')
            
            
            if count[True] >= min_ARG_count and (groups[i:i+max_neighbor_orfs] == 'MGEs').any():
            
                if (ends[i+max_neighbor_orfs-1] - starts[i]) < max_distance:
                                
                        neighbor_genes['read'] += [
                            data.index.to_list()[i + d] for d in range(max_neighbor_orfs)
                        ]
                        
                        for column in output_columns:
                            neighbor_genes[column] += [
                                data[column].to_list()[i+d] for d in range(max_neighbor_orfs)
                            ]
    neighbor_genes = pd.DataFrame(neighbor_genes)
    neighbor_genes.index = neighbor_genes['read']
    neighbor_genes = neighbor_genes.drop(['read'], axis =1)
    
    return neighbor_genes

In [6]:
def gene_names(df,  min_bitscore=50,
               arg_name='sul1',
               min_id_arg=25, 
               min_id_mge=25,
               min_coverage_arg=0.4,
               min_coverage_mge=0.4):
    
    """
    Generate a dataframe of scaffolds containing desired ARG subtype 
    
    Parameters
    ----------
    df : pd.dataframe
        dataframe with scaffold, ARGs, MGEs and start end 
        gene data
        
    arg_name : string
        name of gene as it appears in gene_name column of NanoARG
    
    min_bitscore : int
        minimum bitscore, default is 50
        
    min_id_arg : int
        minimum aa identity of ARG, default is 25
        
    min_id_mge : int
        minimum aa identity of MGE, default is 25
        
    min_coverage_arg : int
        minimum coverage of ARG, default is 0.4 in NanoARG
        
    min_coverage_mge : int
        minimum coverage of MGE, default is 0.4 in NanoARG
        
    
    Returns
    -------
    neighbor_genes : dataframe
        dataframe of scaffolds containing desired ARG subtype
    
    """
    arg_scaffold = {}
    
    output_columns = ['gene_id', 'gene_name', 'group', 'category', 'start', 'end', 'strand',
       'identity', 'bitscore', 'evalue', 'NCBI_taxa_id',
       'taxa_centrifuge_score', 'species', 'coverage', 'is_pathogen']
    
    for column in output_columns:
        arg_scaffold[column] = []
        
    
    arg_scaffold['read'] = []

    df_bitscore_adjusted = df[df['bitscore'] > min_bitscore]
    
    df_bit_arg_adjusted = df_bitscore_adjusted.drop(df_bitscore_adjusted[ 
        (df_bitscore_adjusted['group'] == 'ARGs') 
        & ((df_bitscore_adjusted['identity'] < min_id_arg)
            | (df_bitscore_adjusted['coverage'] < min_coverage_arg))].index)

    df_adjusted = df_bit_arg_adjusted.drop(df_bit_arg_adjusted[ 
        (df_bit_arg_adjusted['group'] == 'MGEs') 
        & ((df_bit_arg_adjusted['identity'] < min_id_mge)
            | (df_bit_arg_adjusted['coverage'] < min_coverage_mge))].index)
    
    
    for scaffold, data in df_adjusted.groupby(level=0):
        starts = np.array(data.start)
        ends = np.array(data.end)
        gene_names = np.array(data.gene_name)
                   
        if (gene_names == arg_name).any():   
                                           
                        arg_scaffold['read'] += [
                            data.index.to_list()
                        ]
                        
                        for column in output_columns:
                            arg_scaffold[column] += [
                                data[column].to_list()
                            ]

                            
    arg_scaffold = pd.DataFrame(arg_scaffold)
    arg_scaffold.index = arg_scaffold['read']
    arg_scaffold = arg_scaffold.drop(['read'], axis =1)
    
    return arg_scaffold

In [7]:
"""
It is necessary to run 'remove_overlap_not_ARG_strand_selection' before 
Returns stretches of scaffolds with ARGs, MRGs and MGEs.

"""



def neighbor_stretch_with_MRG(df, min_ARG_count=1, max_distance=5000, 
                     number_neighboring_orf=4, 
                     sliding_window_step_increase=4, min_bitscore=50, 
                     min_id_arg=25, min_id_mge=25, min_coverage_arg=0.4, 
                              min_coverage_mge=0.4, verbose=False):
    
    """
    Search for MGE, MRG and specified number of ARGs in a desired
    stretch of scaffold window. This window slides forward at a 
    specified step.
    Reports a dataframe of neighboring orfs with MGEs and ARGs.
    
    Exp. number_neighboring_orf=4, sliding_window_step_increase=1
    max_distance = maximum distance between s and e
    min_ARG_count=1
    
    s1-ARG--->  <----- ------> --MGE-->e1    no MRG, do not report
              s2<----- ------> --MGE--> <------e2    no ARG, do not report  
                     s3------> --MGE--> <------ --ARG--> no MRG, do not report
                             s4--MGE--> <------ --ARG--> --MRG--> report
              
    
    
    Parameters
    ----------
    df : pd.dataframe
        dataframe with scaffold, ARGs, MGEs and start end 
        gene data
    
    max_distance : int
        maximum length of scaffold segment with 
        neighboring MGEs and ARGs
        
    min_ARG_count : int
        minimum number of ARGs in the scaffold segment with 
        neighboring MGEs and ARGs
        
    number_neighboring_orf : int
        maximum number of neighboring orfs to consider
        
    sliding_window_step_increase : int
        int between 1 and number_neighboring_orf
    
    min_bitscore : int
        minimum bitscore, default is 50
        
    min_id_arg : int
        minimum aa identity of ARG, default is 25
        
    min_id_mge : int
        minimum aa identity of MGE, default is 25
        
    min_coverage_arg : int
        minimum coverage of ARG, default is 0.4 in NanoARG
        
    min_coverage_mge : int
        minimum coverage of MGE, default is 0.4 in NanoARG
        
    verbose : bool
        print progress and debug information
    
    Returns
    -------
    neighbor_genes : dataframe
        dataframe with neighboring ARGs and MGEs
    
    """
    
    neighbor_genes = {}
    
    output_columns = ['gene_id', 'gene_name', 'group', 'category', 'start', 'end', 'strand',
       'identity', 'bitscore', 'evalue', 'NCBI_taxa_id',
       'taxa_centrifuge_score', 'species', 'coverage', 'is_pathogen']
    
    for column in output_columns:
        neighbor_genes[column] = []
        
    
    neighbor_genes['read'] = []

    df_bitscore_adjusted = df[df['bitscore'] > min_bitscore]
    
    df_bit_arg_adjusted = df_bitscore_adjusted.drop(df_bitscore_adjusted[ 
        (df_bitscore_adjusted['group'] == 'ARGs') 
        & ((df_bitscore_adjusted['identity'] < min_id_arg)
            | (df_bitscore_adjusted['coverage'] < min_coverage_arg))].index)

    df_adjusted = df_bit_arg_adjusted.drop(df_bit_arg_adjusted[ 
        (df_bit_arg_adjusted['group'] == 'MGEs') 
        & ((df_bit_arg_adjusted['identity'] < min_id_mge)
            | (df_bit_arg_adjusted['coverage'] < min_coverage_mge))].index)

        
    for scaffold, data in df_adjusted.groupby(level=0):
        starts = np.array(data.start)
        ends = np.array(data.end)
        groups = np.array(data.group)
        
        if verbose:
            print("-----------------")
            print("\nscaffold:", scaffold)
            print("\nstarts:", starts)
            print("\nends:", ends)
            print("\ngroups:", groups)
            print("-----------------")
            print(len(groups))
        
        for i in range(0, len(groups)-number_neighboring_orf+1, sliding_window_step_increase):
            max_neighbor_orfs = min(number_neighboring_orf, len(groups) - i)
            count = Counter(groups[i:i+max_neighbor_orfs] == 'ARGs')
            
            
            if count[True] >= min_ARG_count and (groups[i:i+max_neighbor_orfs] == 'MGEs').any() and (groups[i:i+max_neighbor_orfs] == 'MRGs').any():
            
                if (ends[i+max_neighbor_orfs-1] - starts[i]) < max_distance:
                                
                        neighbor_genes['read'] += [
                            data.index.to_list()[i + d] for d in range(max_neighbor_orfs)
                        ]
                        
                        for column in output_columns:
                            neighbor_genes[column] += [
                                data[column].to_list()[i+d] for d in range(max_neighbor_orfs)
                            ]
    neighbor_genes = pd.DataFrame(neighbor_genes)
    neighbor_genes.index = neighbor_genes['read']
    neighbor_genes = neighbor_genes.drop(['read'], axis =1)
    
    return neighbor_genes

In [8]:
"""
INPORTING DATA WITH NANOARG FULL TABLES, ASSEMBLY READ-COUNT AND LENGTH
-----------------------------------------------------------------------
Execute merge_nanoARG_full_table_assembly_length_read_count_calculate_abundance.ipynb before
to generate 'all_full_tables_remove_overlap_test.p'

"""

with open('all_full_tables_remove_overlap_test.p', 'rb') as fp:
    all_full_tables_remove_overlap = pickle.load(fp)

In [9]:
"""
Step necessary before merging samples
-------------------------------------

Adds sample names (exp. 'HCL0_W2') to 
scaffold numbers in index (exp. scaffold_0).
The modified index will have both sample name 
and scaffold number.
Dataframes will be updated with the new combined index
and stored in the same dictionary called
all_full_tables_remove_overlap.

"""

for name, df in all_full_tables_remove_overlap.items():
    df['file_name'] = name
    df['index'] = df.index
    df['read'] = df['file_name'] + df['index']
    df.index = df['read']
    df = df.drop(['read', 'file_name', 'index'], axis=1 )

In [10]:
all_full_tables_remove_overlap['HCL0_W2'].head()

Unnamed: 0_level_0,gene_id,gene_name,group,category,start,end,strand,identity,bitscore,evalue,...,species,coverage,is_pathogen,length,read_count,normalized_read_count,normalized_read_count_per_kb,file_name,index,read
read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCL0_W2scaffold_0,UniRef90_P45315,Cluster: Probable protease SohB,Functional Genes,Cluster: Probable protease SohB,160,864,-,58.3,289.3,3.2000000000000003e-75,...,Pseudomonas sp. UW4,0.665722,0,212693.0,249046.0,12021.241714,56.519216,HCL0_W2,scaffold_0,HCL0_W2scaffold_0
HCL0_W2scaffold_0,UniRef90_Q9Z985,Cluster: UvrABC system protein A,Functional Genes,Cluster: UvrABC system protein A,7366,9897,-,30.9,376.3,2e-101,...,Pseudomonas sp. UW4,0.513143,0,212693.0,249046.0,12021.241714,56.519216,HCL0_W2,scaffold_0,HCL0_W2scaffold_0
HCL0_W2scaffold_0,BAC0164,Cobalt,MRGs,Cobalt,13133,13804,+,31.3,105.1,2.3000000000000003e-22,...,Pseudomonas sp. UW4,0.913725,0,212693.0,249046.0,12021.241714,56.519216,HCL0_W2,scaffold_0,HCL0_W2scaffold_0
HCL0_W2scaffold_0,UniRef90_Q7VNG4,Cluster: Spermidine/putrescine import ATP-bind...,Functional Genes,Cluster: Spermidine/putrescine import ATP-bind...,13139,13855,+,61.5,288.9,4.2e-75,...,Pseudomonas sp. UW4,0.644205,0,212693.0,249046.0,12021.241714,56.519216,HCL0_W2,scaffold_0,HCL0_W2scaffold_0
HCL0_W2scaffold_0,BAC0599,Molybdenum,MRGs,Molybdenum,15299,15895,+,31.6,71.2,3.7e-12,...,Pseudomonas sp. UW4,0.922747,0,212693.0,249046.0,12021.241714,56.519216,HCL0_W2,scaffold_0,HCL0_W2scaffold_0


In [11]:
"""
MERGING SAMPLES TO DESIRED GROUPS
---------------------------------


Combine samples to be analyzed as a group
and store as pd.DataFrame.

For example,
HCL0_W2 and HCL0_W3 are two NanoARG tables, 
chlorine_unfiltered is the combined table.

"""


chlorine_unfiltered = pd.concat([
                                all_full_tables_remove_overlap['HCL0_W2'],
                                all_full_tables_remove_overlap['HCL0_W3']
                                ])

In [12]:
"""
Running 'ARG_MGE_association'
-----------------------------


Change 'chlorine_unfiltered' to your sample name.
Change parameters as desired. Parameter not specified runs with default.

Set the ARG aa idendity higher and lower, change max_fdr etc. to explore the data.
For example, setting high identity and coverage can return ARGs 
known to be associated with mobile genetic elements.

Import/deduce 'f' from dataframe 'MGEs_and_ARGs_pathogens_normalized'. 

Group samples differently for data exploration. 

"""


chlorine_unfiltered_ARG_MGE_association = ARG_MGE_association(
    chlorine_unfiltered, min_id_arg=50, 
    f = 0.01,
    min_coverage_arg=0.4)

100.00%    

In [13]:
chlorine_unfiltered_ARG_MGE_association

Unnamed: 0,fdr,fraction_genes_with_mge,count_genes
macB,0.031508,1.0,1
ADC-8,0.000475,1.0,2
vgaC,0.000475,1.0,2
mphD,0.031508,0.5,2
msrE,0.031508,0.5,2
sul1,0.031508,0.5,2
mtrD,0.031508,0.5,2
adeI,0.031508,0.5,2
macA,0.031508,0.5,2
acrB,0.000157,0.266667,15


In [14]:
"""
Running 'neighbor_stretch'
--------------------------
Change 'chlorine_unfiltered' to your sample name.
Change parameters as desired. 
For example, setting max_distance higher looks at a longer stretch of the scaffold.

"""


neighbor_stretch_unfiltered_chlorine = neighbor_stretch(
    chlorine_unfiltered, min_ARG_count=1, max_distance =5000, 
                     number_neighboring_orf=4, 
                     sliding_window_step_increase=4, min_bitscore=50, 
                     min_id_arg=80, min_id_mge=25, min_coverage_arg=0.5, verbose=False)

In [15]:
neighbor_stretch_unfiltered_chlorine

Unnamed: 0_level_0,gene_id,gene_name,group,category,start,end,strand,identity,bitscore,evalue,NCBI_taxa_id,taxa_centrifuge_score,species,coverage,is_pathogen
read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
HCL0_W2scaffold_3375,ANP63073.1,mphD,ARGs,MLS,4070,4882,+,100.0,548.1,1.5999999999999999e-155,48296,302771.0,Acinetobacter pittii,0.921769,0
HCL0_W2scaffold_3375,YP_724476.1,msrE,ARGs,MLS,4941,6413,+,100.0,943.0,2.3e-274,48296,302771.0,Acinetobacter pittii,1.0,0
HCL0_W2scaffold_3375,UniRef90_P55373,Cluster: Putative transposase y4bF,Functional Genes,Cluster: Putative transposase y4bF,6968,8221,+,46.9,376.3,9.2e-103,48296,302771.0,Acinetobacter pittii,0.923414,0
HCL0_W2scaffold_3375,WP_067143567.1,transposase [Oceanivirga salmonicida],MGEs,transposase,7331,8017,+,30.9,100.1,1.7999999999999998e-19,48296,302771.0,Acinetobacter pittii,0.682493,0


In [16]:
"""
Running 'gene_names'
-------------------
Change 'chlorine_unfiltered' to your sample name.
Change parameters as desired. 

"""

gene_names_unfiltered_chlorine = gene_names(
    chlorine_unfiltered, arg_name = 'sul1', min_id_arg = 80, min_id_mge = 25, 
    min_coverage_arg=0.6)

In [17]:
gene_names_unfiltered_chlorine

Unnamed: 0_level_0,gene_id,gene_name,group,category,start,end,strand,identity,bitscore,evalue,NCBI_taxa_id,taxa_centrifuge_score,species,coverage,is_pathogen
read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
"[HCL0_W2scaffold_35234, HCL0_W2scaffold_35234]","[YP_009163984.1, Q49184]",[resolvase/recombinase (plasmid) [Citrobacter ...,"[MGEs, ARGs]","[recombinase, sulfonamide]","[1, 1037]","[450, 1645]","[+, +]","[100.0, 97.0]","[301.6, 395.2]","[6.899999999999999e-81, 2.999999999999999e-110]","[61648, 61648]","[1096209.0, 1096209.0]","[Kluyvera intermedia, Kluyvera intermedia]","[0.78125, 0.717314487633]","[0, 0]"
[HCL0_W3scaffold_49022],[Q49184],[sul1],[ARGs],[sulfonamide],[92],[925],[+],[100.0],[547.4],[2.6999999999999996e-156],[506],[829921.0],[Alcaligenaceae],[0.982332155477],[0]
