In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from Bio import SeqIO, AlignIO, Seq

In [2]:
# COMPUTATION OF THE FRAMED SEQUENCE THAT MINIMIZES THE NUMBER OF STOP CODONS

# Slice bounds for each reading frame, indexed as bounds[len(seq) % 3][frame].
# Every entry is [start, end_offset]; the framed sequence is
# seq[start : end_offset + len(seq)]. `start` skips `frame` leading bases and
# `end_offset` (zero or negative) trims the trailing bases that would not fill
# a complete codon, so the slice length is always a multiple of 3.
bounds = [
    [[frame, -((length_mod3 - frame) % 3)] for frame in range(3)]
    for length_mod3 in range(3)
]

# Stop codons as strings and as a (3, 3) array of single characters, one row
# per codon, so they can be compared row-wise against reshaped sequences.
stop_codons = ['TGA', 'TAG', 'TAA']
stop_codons_arr = np.array(list(map(list, stop_codons)))


def check_if_productive(candidate_seq, min_prod_length):
    """
    Tell whether an in-frame sequence contains a long enough stop-free stretch.

    The sequence is read codon by codon from its first base; a trailing
    partial codon, if any, is ignored. Stop codons are the rows of the
    module-level `stop_codons_arr`.

    Parameters
    ----------
    candidate_seq : str
        Nucleotide sequence, already in the reading frame of interest.
    min_prod_length : int
        Minimum length, in bases, of the longest run of consecutive codons
        that contains no stop codon.

    Returns
    -------
    bool
        True if the longest stop-free run is at least `min_prod_length`
        bases long, False otherwise.
    """
    n_codons = len(candidate_seq) // 3
    if n_codons == 0:
        # No complete codon: the longest stop-free run has length 0.
        return min_prod_length <= 0

    codons = np.array(list(candidate_seq[:n_codons * 3])).reshape((n_codons, 3))
    is_stop = np.zeros(n_codons, dtype=bool)
    for stop_c in stop_codons_arr:
        is_stop |= np.all(codons == stop_c, axis=1)

    # Sentinels one position before the first codon and one past the last, so
    # the leading and trailing stretches are measured like the inner ones.
    boundaries = np.concatenate(([-1], np.flatnonzero(is_stop), [n_codons]))
    # Codons strictly between two boundaries: the stop codons themselves are
    # not part of the productive stretch.
    largest_productive_len = int((np.diff(boundaries) - 1).max()) * 3
    return bool(largest_productive_len >= min_prod_length)
    
    
def get_framed_seqs(seqs_dict, min_prod_length=400):
    """
    Put each sequence in the reading frame that contains the fewest stop codons.

    For every sequence the three frames given by the module-level `bounds`
    table are scanned and the stop codons (rows of `stop_codons_arr`) are
    counted. The frame with the smallest count is kept; on ties np.argmin
    picks the lowest frame index. Sequences whose best frame still holds more
    than one stop codon are dropped.

    Parameters
    ----------
    seqs_dict : dict
        Maps sequence id -> nucleotide string.
    min_prod_length : int
        Not used at the moment: the productive-length check
        (`check_if_productive`) is disabled in this function.

    Returns
    -------
    framed_seqs : dict
        Maps sequence id -> sequence trimmed to the selected frame.
    n_codons : dict
        Maps sequence id -> number of stop codons in the selected frame.
    """
    framed_seqs, n_codons = {}, {}
    for seq_id, seq in seqs_dict.items():
        frame_bounds = bounds[len(seq) % 3]

        # Count the stop codons found in each of the three frames.
        stop_counts = np.zeros(3)
        for frame, (start, end_offset) in enumerate(frame_bounds):
            letters = np.array(list(seq[start:end_offset + len(seq)]))
            codons = letters.reshape((len(letters) // 3, 3))
            stop_counts[frame] = sum(
                np.sum(np.all(codons == stop_c, axis=1)) for stop_c in stop_codons_arr
            )

        best_frame = np.argmin(stop_counts)
        fewest_stops = min(stop_counts)
        # Keep the sequence only if its best frame has at most one stop codon.
        if fewest_stops <= 1:
            start, end_offset = frame_bounds[best_frame]
            n_codons[seq_id] = fewest_stops
            framed_seqs[seq_id] = seq[start:end_offset + len(seq)]

    return framed_seqs, n_codons

In [3]:
def read_fasta(path):
    """
    Parse the FASTA file at `path` into a dict mapping record id -> sequence string.

    If several records share the same id, the last one read is kept.
    """
    return {record.id: str(record.seq) for record in SeqIO.parse(path, "fasta")}


def import_fasta(directory, samples=[]):
    """
    Load the per-sample FASTA files found in `directory`.

    Only files whose name splits on '_' into exactly three fields, the last
    being 'hiv.fasta', are read; the sample id is the first two fields joined
    by '_'.

    Parameters
    ----------
    directory : str
        Folder to scan.
    samples : sequence of str
        If non-empty, only these sample ids are loaded. The default list is
        only read, never modified.

    Returns
    -------
    dict
        Maps sample id -> {record id -> sequence string} as returned by
        `read_fasta`.
    """
    loaded = {}
    for file_name in os.listdir(directory):
        fields = file_name.split('_')
        if len(fields) != 3 or fields[2] != 'hiv.fasta':
            continue
        sample_id = '_'.join(fields[:2])
        if len(samples) == 0 or sample_id in samples:
            loaded[sample_id] = read_fasta(directory+"/"+file_name)
    return loaded


def write_seqs_fasta(path, seqs):
    """
    Write sequences to `path` in FASTA format, overwriting any existing file.

    Parameters
    ----------
    path : str
        Destination file.
    seqs : dict
        Maps sequence id -> sequence string. Each entry is written as a
        '>id' header line followed by the sequence on a single line.
    """
    # The context manager closes the file even if one of the writes raises.
    with open(path, 'w') as f:
        for k, v in seqs.items():
            f.write('>'+k+'\n')
            f.write(v+'\n')

In [4]:
# Load every '<x>_<y>_hiv.fasta' file of ../hiv_seqs2 as sample id -> {record id -> sequence}.
samples = import_fasta('../hiv_seqs2')
# Per-sample table, indexed by the 'id' column of the TSV.
seq_info = pd.read_csv('../hiv_seqs_info.tsv', sep='\t', index_col='id')
# Cell output: number of samples loaded.
len(samples)

104

In [5]:
# Put the sequences of every sample in frame, record in seq_info how many were
# kept and how many were discarded, and save the kept ones to ../hiv_framed_seqs.
samples_framed = dict()
for name, seqs in samples.items():
    framed_seqs, n_codons = get_framed_seqs(seqs)
    seq_info.at[name, 'N_unique_seqs'] = len(framed_seqs)
    seq_info.at[name, 'Nu_discarded_frame'] = len(seqs)-len(framed_seqs)
    if len(framed_seqs) == 0:
        # Nothing survived the framing step: no file is written for this sample.
        continue

    samples_framed[name] = framed_seqs
    # write_seqs_fasta puts the '>' marker in front of each id, which FASTA
    # parsers (including read_fasta) require, and always closes the file.
    write_seqs_fasta('../hiv_framed_seqs/'+name+'_hiv.fasta', framed_seqs)

In [10]:
# Save the updated table; this overwrites the same TSV that seq_info was read from.
seq_info.to_csv('../hiv_seqs_info.tsv', sep='\t')

In [7]:
def join_samples_of_patient(pat, all_seqs):
    """
    Merge the sequences of all the time points of one patient into one dict.

    Sample ids are split on '_': the first field minus its first three
    characters is the patient label, the second field minus its first three
    characters is parsed as the integer time of the sample.

    Parameters
    ----------
    pat : str
        Patient label to select (e.g. "3").
    all_seqs : dict
        Maps sample id -> {sequence id -> sequence}.

    Returns
    -------
    all_seq_dict : dict
        Maps "<sequence id>_<time index>" -> sequence, where the time index
        is the rank of the sample's time (0 for the earliest sample). With
        sequence ids of the form "seqid_counts" the keys read
        "seqid_counts_timeindex". Empty dict if the patient has no samples.
    times : numpy.ndarray
        The times of the patient's samples in increasing order.
    """
    patient_samples = []
    for sample_id, sample_seqs in all_seqs.items():
        fields = sample_id.split('_')
        if fields[0][3:] != pat:
            continue
        patient_samples.append((int(fields[1][3:]), sample_seqs))

    # Chronological order; list.sort is stable, so equal times keep input order.
    patient_samples.sort(key=lambda item: item[0])
    times = np.array([time for time, _ in patient_samples], dtype=int)

    all_seq_dict = {}
    for time_index, (_, sample_seqs) in enumerate(patient_samples):
        for seq_id, seq in sample_seqs.items():
            all_seq_dict[seq_id + '_' + str(time_index)] = seq
    return all_seq_dict, times

In [8]:
import re
import subprocess

def build_msa(fa_path_in, fa_path_out, verbose=True):
    """
    Align the protein sequences of a FASTA file with the `clustalo` command.

    Parameters
    ----------
    fa_path_in : str
        FASTA file with the sequences to align.
    fa_path_out : str
        File the aligner is asked to write the alignment to.
    verbose : bool
        If True, print what the aligner wrote to stdout and, when not empty,
        to stderr.

    Returns
    -------
    dict
        The alignment read back from `fa_path_out` with `read_fasta`
        (record id -> aligned sequence).

    Raises
    ------
    subprocess.CalledProcessError
        If `clustalo` exits with a non-zero status. Without this check a
        failed run could go unnoticed and a pre-existing `fa_path_out` would
        be returned as if it were the new alignment.
    """
    command = ['clustalo', '-i', fa_path_in, '-t', 'Protein', '-o', fa_path_out]
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = completed.stdout.decode(), completed.stderr.decode()
    if verbose:
        print(stdout)
        if stderr != '':
            print(stderr)
    if completed.returncode != 0:
        raise subprocess.CalledProcessError(
            completed.returncode, command, output=stdout, stderr=stderr
        )
    return read_fasta(fa_path_out)


def _insert_codon_gaps(nt_seq, aa_seq_msa):
    """Return `nt_seq` with '---' inserted for every '-' of its aligned translation."""
    msa_seq = nt_seq
    # Gap columns are visited left to right, so all the gaps on the left of
    # column g are already in the string and column g sits at offset 3*g.
    for gap_i, residue in enumerate(aa_seq_msa):
        if residue == '-':
            msa_seq = msa_seq[:gap_i*3] + '---' + msa_seq[gap_i*3:]
    return msa_seq


def _remove_if_exists(path):
    """Delete `path`; a missing file is not an error (same effect as `rm -f`)."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def build_msa_with_codon_structure(framed_seqs, out_path):
    """
    Build a nucleotide alignment that preserves the codon structure.

    The in-frame sequences are translated, the amino-acid sequences are
    aligned with `build_msa`, and every gap of the protein alignment is
    mapped back to a three-base gap in the nucleotide sequence.

    Parameters
    ----------
    framed_seqs : dict
        Maps sequence id -> in-frame nucleotide sequence.
    out_path : str
        FASTA file the nucleotide alignment is written to.

    Returns
    -------
    dict
        Maps sequence id -> aligned nucleotide sequence, in the order of
        `framed_seqs`.

    Raises
    ------
    ValueError
        If `framed_seqs` is empty.
    """
    if len(framed_seqs) == 0:
        raise ValueError('build_msa_with_codon_structure needs at least one sequence')

    # Amino-acid translation of every framed sequence
    aa_seqs = {seq_id: Seq.translate(nt_seq) for seq_id, nt_seq in framed_seqs.items()}

    # Performing msa on the amino-acid sequences through temporary files in
    # the working directory
    tmp_in, tmp_out = 'tmp_aaseqs.fasta', 'tmp_msa_aaseqs.fasta'
    # A leftover output of an interrupted run must not be mistaken for the new
    # alignment. NOTE(review): Clustal Omega is understood to refuse to
    # overwrite an existing output unless --force is given - confirm.
    _remove_if_exists(tmp_out)
    try:
        write_seqs_fasta(tmp_in, aa_seqs)
        msa_aa = build_msa(tmp_in, tmp_out)
    finally:
        # Synchronous removal: the files are gone before the next call writes
        # them again, and they are cleaned up even when the alignment fails.
        _remove_if_exists(tmp_in)
        _remove_if_exists(tmp_out)

    # Getting back the alignment on the nucleotide sequences
    framed_seqs_msa = {
        seq_id: _insert_codon_gaps(nt_seq, msa_aa[seq_id])
        for seq_id, nt_seq in framed_seqs.items()
    }

    write_seqs_fasta(out_path, framed_seqs_msa)
    return framed_seqs_msa

In [9]:
# For patients 1..10: join the framed sequences of all the time points, build
# the codon-preserving alignment and save it as pat<N>_hiv_msa.fasta. The
# sorted times of each patient go on one tab-separated line of times.txt.
out_dir='../hiv_msa/'
# The context manager flushes and closes times.txt even if an alignment fails.
with open(out_dir+'/times.txt', 'w') as time_file:
    for i_pat in range(10):
        pat = str(i_pat+1)
        all_seq_dict, times = join_samples_of_patient(pat, samples_framed)
        for t in times:
            time_file.write(str(t)+'\t')
        time_file.write('\n')
        msa_seqs = build_msa_with_codon_structure(all_seq_dict, out_dir+'/pat'+pat+'_hiv_msa.fasta')





















