In [1]:
import pandas as pd
import numpy as np

# Thoughts
Can a contig still be a contaminant even if its relative coverage is lower in a control than in the sample?

# Preprocessing

In [119]:
# from sample_name, contigs db, and profile db, generate needed input files
# also take as input, a list of controls sample names

# Script

In [2]:
# Name of the sample column; import_table renames this column to 's'.
sample_name = 'AWTP_2_RO2_bulk_3'
# Tab-separated input tables (the commented-out contig table cell is the only reader of contig_data_file).
contig_data_file = '/Users/rosekantor/data/awtp2_metagenomics/decontamination/KNLK_58_contigs_basic_info.txt'
# Mean coverage and coverage standard deviation tables, both loaded with import_table.
cov_means_file = '/Users/rosekantor/data/awtp2_metagenomics/decontamination/KNLK_58_mean_coverage_Q2Q3_contigs.txt'
cov_stds_file = '/Users/rosekantor/data/awtp2_metagenomics/decontamination/KNLK_58_std_coverage_contigs.txt'
# NOTE(review): this value ends with '/', so it is a directory path rather than a file name,
# although it is later handed to DataFrame.to_csv as the output target.
out_file = '/Users/rosekantor/data/awtp2_metagenomics/KNLK_58_contaminants/'

In [10]:
def import_table(anvio_file, sample=None):
    """Read a tab-separated per-split table and index it by split name.

    The column called ``contig`` in the file actually holds split names, so it
    is renamed to ``split`` and becomes the index. The ``__parent__`` column is
    discarded. The column of the sample under study is renamed to ``s`` so that
    downstream code can address it without knowing the sample name; all other
    columns are left untouched.

    Parameters
    ----------
    anvio_file : str or path-like
        Path to the tab-delimited table.
    sample : str, optional
        Name of the column to rename to ``s``. When omitted, the notebook-level
        ``sample_name`` is used, so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The table indexed by ``split``.

    Raises
    ------
    KeyError
        If the file has no ``__parent__`` column or no ``contig`` column.
    """
    if sample is None:
        # Only reach for the notebook-wide setting when the caller did not choose.
        sample = sample_name

    table = pd.read_csv(anvio_file, sep='\t')
    table = table.rename(columns={'contig': 'split', sample: 's'})
    # Values are kept per split, so the parent-contig column is not needed.
    table = table.drop(columns='__parent__')
    return table.set_index('split')

In [45]:
def score_contigs(cov_means, cov_stds):
    """Label every row x control combination as contaminant or not.

    Both tables must share the same index and the same columns: one column
    named ``s`` (the sample) plus one column per control.

    Each control casts two votes per row:

    * its relative coverage (coverage divided by the column total) is lower
      than the sample's, and
    * its coefficient of variation (std / mean, with NaN and +inf treated as
      0) is higher than the sample's.

    The number of votes is the score: 0 -> 'contam', 1 -> 'unknown',
    2 -> 'noncontam'. Two overrides are applied afterwards:

    * a control with zero coverage on a row scores 2 for that row;
    * a row with zero coverage in the sample scores 0 for every control
      (this rule wins when both sample and control are zero).

    Parameters
    ----------
    cov_means : pandas.DataFrame
        Mean coverage per row, with an ``s`` column.
    cov_stds : pandas.DataFrame
        Standard deviation of coverage, same layout as ``cov_means``.

    Returns
    -------
    pandas.DataFrame
        One ``score_<control>`` column per control holding the labels
        'contam', 'unknown' or 'noncontam'.
    """
    # Relative coverage: scale every column by its own total.
    rel_cov = cov_means.div(cov_means.sum(axis=0), axis=1)
    lower_in_control = rel_cov.lt(rel_cov['s'], axis=0)

    # Coefficient of variation; 0/0 and x/0 are treated as "no variation".
    coeff_var = (cov_stds / cov_means).replace([np.nan, np.inf], 0)
    more_variable_in_control = coeff_var.gt(coeff_var['s'], axis=0)

    # True&True = 2, mixed = 1, False&False = 0.
    score = lower_in_control.astype(int) + more_variable_in_control.astype(int)

    # Overrides are assigned outright instead of adding/subtracting 1, so the
    # score can never leave the 0..2 range that the label map covers.
    control_absent = rel_cov.eq(0)
    sample_absent = rel_cov['s'].eq(0)
    score = score.mask(control_absent, 2)
    score.loc[sample_absent] = 0  # applied last: takes precedence over the control rule

    labels = score.drop(columns='s').add_prefix('score_')
    return labels.replace({0: 'contam', 1: 'unknown', 2: 'noncontam'})

In [46]:
# Load the mean and std coverage tables, then label each row against every control.
cov_means, cov_stds = (import_table(path) for path in (cov_means_file, cov_stds_file))
iscontam = score_contigs(cov_means=cov_means, cov_stds=cov_stds)

In [111]:
# Keep every split that at least one control did not clear: only rows labelled
# 'noncontam' in all control columns are dropped.
cleared_by_all_controls = iscontam.eq('noncontam').all(axis=1)
possible_contams = iscontam.loc[~cleared_by_all_controls]
possible_contams.to_csv(
    '~/data/awtp2_metagenomics/KNLK_58_contaminants/contam/KNLK_58_contam_scores.txt',
    sep='\t',
)

In [140]:
# One row per flagged split, every one assigned to a bin called 'contam'.
collection = possible_contams.index.to_frame(index=False).assign(bin='contam')
collection

Unnamed: 0,split,bin
0,KNLK_58_100189_split_00001,contam
1,KNLK_58_100346_split_00001,contam
2,KNLK_58_10037_split_00001,contam
3,KNLK_58_100456_split_00001,contam
4,KNLK_58_100490_split_00001,contam
...,...,...
1673,KNLK_58_99127_split_00001,contam
1674,KNLK_58_99127_split_00002,contam
1675,KNLK_58_99370_split_00001,contam
1676,KNLK_58_99632_split_00001,contam


In [123]:
# trying to get contig column for testing
# Strip the "_split_<digits>" suffix to recover the contig name, then collapse
# rows of the same contig that carry identical scores.
t = (
    possible_contams
    .reset_index()
    .assign(contig=lambda frame: frame['split'].str.extract(r'(.+)_split_\d+', expand=False))
    .drop(columns='split')
    .drop_duplicates()
    .set_index('contig')
)

# Long format: one row per (contig, control) pair.
iscontamm = t.reset_index().melt(
    id_vars=['contig'],
    var_name='control',
    value_name='contamination',
)

# Contigs that at least one control did not label 'noncontam'.
flagged_contigs = iscontamm.loc[iscontamm['contamination'].ne('noncontam'), 'contig']
p = pd.DataFrame({'contigs': flagged_contigs.unique()})
p

Unnamed: 0,contigs
0,KNLK_58_100346
1,KNLK_58_100456
2,KNLK_58_100490
3,KNLK_58_100836
4,KNLK_58_100999
...,...
1613,KNLK_58_87401
1614,KNLK_58_90394
1615,KNLK_58_92561
1616,KNLK_58_93295


In [180]:
# questions
# what fraction of negative controls is from positive control contam
# what should rule be: 1) if any control says the contig is a contam, then it is contam (what about cross-contam of samples into just one control?)
#                      2) if any control says contig is noncontam, then it is noncontam (some controls won't have enough info)

In [120]:
## import contig table
# contig_data = pd.read_csv(contig_data_file, sep='\t')
# contig_data = contig_data.set_index('contig')

In [212]:
# Collect every contig that at least one control did not clear.
# score_contigs returns string labels, so comparing against the number 2 (as this
# cell used to) is always True and would bin every contig as 'contam'. The numeric
# code 2 is still accepted in case the column holds raw scores.
not_cleared = ~iscontamm['contamination'].isin(['noncontam', 2])
possible_contams = pd.DataFrame(iscontamm.loc[not_cleared, 'contig'].unique(), columns=['contigs'])
possible_contams['bin'] = 'contam'

In [213]:
# Preview the first rows of the contig -> bin table.
possible_contams.head(n=5)

Unnamed: 0,contigs,bin
0,KNLK_23_314861,contam
1,KNLK_23_63001,contam
2,KNLK_23_223930,contam
3,KNLK_23_279903,contam
4,KNLK_23_237933,contam


In [214]:
# Write the contig -> bin table as headerless, tab-separated text.
# `out_file` may be configured as a directory (trailing '/'); to_csv cannot write
# to a directory, so in that case a default file name inside it is used. A value
# that already names a file is used unchanged.
collection_path = out_file + 'KNLK_58_contam_collection.txt' if out_file.endswith('/') else out_file
possible_contams.to_csv(collection_path, header=False, index=False, sep='\t')