In [32]:
import numpy as np
import pandas as pd
from IPython.display import display
from notebook_importer import *

## Simulate PCR

We begin with approximately 100-500 thousand DMS modified molecules. Right now, we can DMS modify molecules of length 1000-2000nt. 

We perform PCR to produce approximately 100 billion molecules (from the original 100-500). Currently looking for sophisticated tools for simulating the PCR process, but in the meanwhile, using a naive sample and duplicate algorithm.



In [33]:
def naive_duplication(molecules, max_molecules):
    '''
    Naively simulate PCR by randomly sampling a molecule from the list of molecules and duplicating this molecule.Naively simulate PCR by randomly sampling a molecule from the list of molecules and duplicating this molecule.
    @param: molecules - list of molecules, where each molecule is the Query_name corresponding to an original molecule in the pool to be amplified
    @param: max_molecules - desired total number of molecules after PCR
    
    @return: molecules -  a numpy array of all the molecule query names after PCR''' 
    
    indices = [i for i in range(len(molecules))]
    while len(indices)<max_molecules:
        dup = np.random.choice(indices)
        indices.append(dup)

    indices.sort()
    return molecules[indices]
    
def naive_PCR(basevector_fname, max_molecules):
    '''
    Naively simulate PCR by randomly sampling a molecule from the list of molecules and duplicating this molecule.
    Repeat this naive amplification until we have the desired number of molecules.
    @param: basevector_fname - name of the basevector file containing molecules you wish to amplify
    @param: max_molecules - desired total number of molecules after PCR
    
    @param: molecules -  a numpy array of all the molecules after PCR'''
     
    
    df = pd.read_csv(basevector_fname, sep='\t',index_col='Query_name')
    display(df.head())
    
    N_occur = list(df['N_occur'])
    molecules = np.repeat(list(df.index), N_occur)
    print('Loaded {0} molecules.'.format(len(molecules)))

    proportions_before = N_occur/np.sum(N_occur)
    
    amplified_molecules = naive_duplication(molecules, max_molecules)
    query_names, amplified_counts = np.unique(amplified_molecules,return_counts=True)
    
    proportions_after = amplified_counts/np.sum(amplified_counts)
    
    # Plot the proportions before and after to double check that the amplification is reasonably uniform
    #TODO
    
    # Create a dictionary with a N_occur column in dataframe to contain the amplified counts
    N_df = pd.DataFrame({'Query_name':query_names,'N_occur':amplified_counts})
    N_df.set_index('Query_name', inplace=True)
    
    # Merge original datafram with new counts
    df.drop(['N_occur'],axis=1,inplace=True)
    
    df = pd.merge(df, N_df, left_index=True, right_index=True)
    df = df[['Bases_vector','Molecules','N_occur','N_mutations','N_deletions','Coverage','Reference','Index']]
    display(df.head())
    
    df.to_csv(basevector_fname[:-4]+'_PCR.txt',sep='\t')
    
    return df

In [34]:
PCR_df = naive_PCR('LongTestFile_1_reads.txt',2000)

Unnamed: 0_level_0,Bases_vector,Molecules,N_occur,N_mutations,N_deletions,Coverage,Reference,Index
Query_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
1,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
2,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
3,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
4,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,


Loaded 1488 molecules.


Unnamed: 0_level_0,Bases_vector,Molecules,N_occur,N_mutations,N_deletions,Coverage,Reference,Index
Query_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
1,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
2,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
3,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
4,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,2,0,0,987,struc_1,


<hr>
## Fragmentation & Size Selection

Before sequencing the PCR products, they must be fragmented and size selected to a certain range. We simulate the fragmentation of PCR products by choosing two (?) random points to break up the PCR product and then select all resulting fragments within the appropriate size.

We keep track of the Query_name of the original PCR product which generated the fragment (not the original molecule, since we don't care which copy each fragment belongs to). All molecules that have identical modifications are given the same Query_name. 

In [42]:
def fragment_random(reads_fname, molecule_fname, num_frags, size_range, paired_end=True, seq_length=300):
    '''
    Simulate random fragmentation on each molecule before sequencing. Generate a bitvector file of fragments as well as a 'FASTA' file.
    In the case of producing paired end reads, generates one bitvector file and two fasta files (one for each direction).
    Fasta file format example:
    >SRR041655.1 HWI-EAS284_61BKE:6:1:2:1735/1
    NAAATCAGACAAATCTCCGTTATTGGTATATACTTTGGGAGTGTTATGGAATTGCACACCCATTTCGAACATGAAGCCAATTCGTTTCTTAGGAATCGCT.
    
    @param: reads_fname - name of file containing DataFrame of reads
    @param: num_frags - number of random cuts to make per molecule
    @param: size_range -  tuple (smallest, biggest) defining the range of PCR fragments to select for 'sequencing'
    @param: paired_end - True if reads are to be sequenced using paired end technology
    @param: seq_length - Maximum length that the sequencer can read. Used only when paired_end is True, otherwise assume that size_range is chosen to capture the region of interest.
    
    @output: *_frag.csv file containing the basevectors for the fragmented regions
    @output: *.fasta - fasta file containing the products of sequencing. If paired end reads, produces two corresponding fasta files.
    
    @return: None
    '''
    df = pd.read_csv(reads_fname, sep='\t',index_col='Query_name')
    display(df.head(10))
    
    N_occur = list(df['N_occur'])
    n_basevectors = np.sum(N_occur)
    basevectors = np.repeat(list(df.Bases_vector), N_occur)
    molecules = np.repeat(list(df.Molecules), N_occur)
    query_names = np.repeat(list(df.index), N_occur)
    print(query_names)
    mol_size = len(basevectors[0])
    
    print('Loaded {0} basevectors.'.format(len(basevectors)))
    
    # Generate random fragmentation points
    frag_points = np.random.randint(mol_size, size=[n_basevectors,num_frags])
    frag_points.sort(axis=1)
    end_points = np.array([mol_size]*n_basevectors).reshape(-1,1)
    start_points = np.array([0]*n_basevectors).reshape(-1,1)
    frag_points = np.hstack((start_points,frag_points, end_points))

    # Compute the corresponding fragment lengths for each break point
    frag_lengths = np.diff(frag_points) #for example [[l_11,l_12,l_13],[l_21,l_22,l_23],...]

    # For each molecule, perform "size selection". If a fragment is of the appropriate size, it can be sequenced.
    valid_lengths = np.logical_and(frag_lengths>=size_range[0], frag_lengths<=size_range[1]) #for example [[False,True,True],[True,False,False],...]
  
    # Keep track of the results of sequencing
    seq_query = []
    seq_ampl = []
    seq_base_vec = []
    seq_start = [] #The start of coverage of that read
    seq_end = [] #The end of coverage of that read
    
    seq_mol = [] #The original bases fragmented and size selected
    
    uncovered = '.'*mol_size
    for (i, basevector) in enumerate(basevectors):
        q_name = query_names[i]     
        valid = np.where(valid_lengths[i]==True)[0] #
        frags = frag_points[i]
        for j in valid:
            start = frags[j]
            end = frags[j+1]
            bv = uncovered[:start]+basevector[start:end]+uncovered[end:]
            seq_query.append(q_name)
            seq_base_vec.append(bv)
            seq_mol.append(molecules[i][start:end])
            seq_start.append(start)
            seq_end.append(end)
            seq_ampl.append(i)
    
    df_dict = {'Amplicon':seq_ampl,'Query':seq_query,'Molecule':seq_base_vec,'Start':seq_start,'End':seq_end}
    df = pd.DataFrame(df_dict, columns=['Amplicon','Query','Molecule','Start','End'])

    df = df.groupby(df.columns.tolist()).size().reset_index().rename(columns={0:'count'})
    display(df.head())
    df.to_csv(reads_fname[:-4]+'_frag.csv')
    print('Saved fragmented molecules dataframe.')
    
    fasta_1, fasta_2 = ([],[])
    query_1, query_2 = ([],[])
    if paired_end:
        for i,molecule in enumerate(seq_mol):
            fasta_1.append(molecule[:seq_length])  #Sequence from the beginning of the read
            fasta_2.append(molecule[-seq_length:]) #Sequence from the end of the read
            query_1.append('{0}/1'.format(seq_query[i]))
            query_2.append('{0}/2'.format(seq_query[i]))
    with open(reads_fname[:-4]+'_1.fasta','w+') as f:
        for i, molecule in enumerate(fasta_1):
            f.write('>{}\n'.format(query_1[i]))
            f.write('{}\n'.format(molecule))
            
    with open(reads_fname[:-4]+'_2.fasta','w+') as f:
        for i, molecule in enumerate(fasta_2):
            f.write('>{}\n'.format(query_2[i]))
            f.write('{}\n'.format(molecule))
    
    print('Saved fasta files.')
            
    return None

fragment_random('LongTestFile_1_reads_PCR.txt','LongTestFile_1_basevector_PCR.txt',2,(500,600))

Unnamed: 0_level_0,Bases_vector,Molecules,N_occur,N_mutations,N_deletions,Coverage,Reference,Index
Query_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
1,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
2,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
3,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
4,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,2,0,0,987,struc_1,
5,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
6,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,2,0,0,987,struc_1,
7,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
8,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,
9,0000000000000000000000000000000000000000000000...,GGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCT...,1,0,0,987,struc_1,


[   0    1    2 ..., 1485 1486 1487]
Loaded 2000 basevectors.
0
[]
[False False False]
[  0 158 276 987]
[158 118 711]
____
1
[2]
[False False  True]
[  0  94 423 987]
[ 94 329 564]
____
2
[]
[False False False]
[  0 747 852 987]
[747 105 135]
____
3
[]
[False False False]
[  0 624 628 987]
[624   4 359]
____
4
[]
[False False False]
[  0 485 547 987]
[485  62 440]
____
4
[]
[False False False]
[  0 361 853 987]
[361 492 134]
____
5
[]
[False False False]
[  0  55 889 987]
[ 55 834  98]
____
6
[2]
[False False  True]
[  0 262 391 987]
[262 129 596]
____
6
[]
[False False False]
[  0 139 965 987]
[139 826  22]
____
7
[]
[False False False]
[  0 467 634 987]
[467 167 353]
____
8
[]
[False False False]
[  0 146 778 987]
[146 632 209]
____
9
[]
[False False False]
[  0 787 913 987]
[787 126  74]
____
10
[2]
[False False  True]
[  0 245 438 987]
[245 193 549]
____
11
[]
[False False False]
[  0 171 362 987]
[171 191 625]
____
12
[]
[False False False]
[  0 469 837 987]
[469 368 150]
____
12

Unnamed: 0,Amplicon,Query,Molecule,Start,End,count
0,1,1,.................................................,423,987,1
1,7,6,.................................................,391,987,1
2,12,10,.................................................,438,987,1
3,16,13,.................................................,481,987,1
4,18,14,.................................................,447,981,1


Saved fragmented molecules dataframe.
Saved fasta files.
