alignTranscripts1.0

#!/usr/bin/env python

import os
import sys
import argparse
import string
import tempfile
import subprocess
from math import floor

LASTZ = 'lastz'
BEDTOOLS = 'bedtools'
FASTAFROMBED = 'fastaFromBed'
SHUFFLEBED = 'shuffleBed'
STOP_CODON = [
	'TAA',
	'TAG',
	'TGA'
	]
def checkDependencies():
	#check bedtools version
	cmd = [BEDTOOLS, '--version']
	try:
		out = subprocess.check_output(cmd)
	except:
		sys.exit("ERROR: bedtools not installed! You must have bedtools v2.17.0 or higher in your path! Exiting...")

	out = out.split()
	version = out[1][1:].split(".")
	if int(version[0]) > 2 or int(version[1]) < 17:
		sys.exit("ERROR: The version of bedtools you have installed is too old. You must have bedtools v2.17.0 or higher in your path! Exiting...")

	#check lastz
	cmd = [LASTZ, '--version']
	try:
		p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
	except:
		sys.exit("ERROR: lastz not installed! You must have lastz in your path! Exiting...")

	out = p.stdout.read()
	out = out.split()
	if out[0].strip() != "lastz":
		sys.exit("ERROR: lastz not installed! You must have lastz in your path! Exiting...")

def intArray(array):
	for i in range(len(array)):
		if array[i] != "": array[i] = int(array[i])
	return array

def convertToGeneCoords(base, exonSizes, exonStarts, numExons):
	block = 0
	for i in range(numExons):
		if base <= block + exonSizes[i]:
			newBase = exonStarts[i] + base - block
			break
		block += exonSizes[i]
	return newBase

def inExon(base, start, exonStarts, exonSizes, numExons, strand):
	#print strand
	for i in range(numExons):
		if (base > start + exonStarts[i] and base <= (start + exonStarts[i] + exonSizes[i])):
			
			#get position for splicing, 1 for first base in exon; -1 for last base in exon
			if strand == "+":
				posAbs = base - (start + exonStarts[i])
			else:
				posAbs = (start + exonStarts[i] + exonSizes[i]) - base + 1

			if (posAbs == 1): pos = 1
			elif (posAbs == exonSizes[i]): pos = -1
			else: pos = 0

			
			if strand == "+": return i+1, pos
			else: return numExons-i, pos

	return -1, 0

def inIntron(base, exonStarts, exonSizes, numExons, strand, geneSize):
	#print strand
	for i in range(numExons-1):
		if (base > (exonStarts[i] + exonSizes[i]) and base < exonStarts[i+1]):
			return exonStarts[i+1] - (exonStarts[i] + exonSizes[i])
	return -1


def writePaddedBed(gene, pad, genomeDict):
	chr = gene[0]

	gene[1] = max(0, int(gene[1])-pad)
	gene[2] = min(int(gene[2])+pad, genomeDict[chr])
		
	tmpFd, tmpPath = tempfile.mkstemp()
	tmp = os.fdopen(tmpFd, 'w')
	tmp.write("%s\t%i\t%i\t%s\t%s\t%s\n" % (gene[0], gene[1], gene[2], gene[3], gene[4], gene[5]))
	return tmpPath

def writeBed(gene):
	tmpFd, tmpPath = tempfile.mkstemp()
	tmp = os.fdopen(tmpFd, 'w')
	tmp.write("%s" % gene[0])
	for i in range(11):
		tmp.write("\t%s" % gene[i+1])
	tmp.write("\n")
	return tmpPath

def getFa(bed, fasta):
	tmpFd, tmpPath = tempfile.mkstemp()
	#cmd = "%s -fi %s -bed %s -fo %s -name" % (FASTAFROMBED, fasta, bed, tmpPath)
	cmd = [FASTAFROMBED, "-s", "-fi", fasta, "-bed", bed, "-fo", tmpPath, "-name"]
	subprocess.check_call(cmd)
	return tmpPath

def alignFa(paddedAFaPath, paddedBFaPath, unmask, gap_open, gap_extend):
	tmpFd, tmpPath = tempfile.mkstemp()
	if unmask:
		cmd = [LASTZ, '--strand=plus', paddedAFaPath+"[unmask]", paddedBFaPath+"[unmask]", '--format=maf', '--output=%s' % tmpPath, '--chain', '--gap=%d,%d' %(gap_open, gap_extend)]
	else:
		cmd = [LASTZ, '--strand=plus', paddedAFaPath, paddedBFaPath, '--format=maf', '--output=%s' % tmpPath, '--chain', '--gap=%d,%d' %(gap_open, gap_extend)]

	nullFd, nullPath = tempfile.mkstemp()
	null = open(nullPath, 'w')
	subprocess.check_call(cmd, stderr = null)
	null.close()
	cmd = ['rm', nullPath]
	subprocess.check_call(cmd)
	return tmpPath

code = {
	'TTT': 'Phe',
	'TTC': 'Phe',
	'TTA': 'Leu',
	'TTG': 'Leu',
	'TCT': 'Ser',
	'TCC': 'Ser',
	'TCA': 'Ser',
	'TCG': 'Ser',
	'TAT': 'Tyr',
	'TAC': 'Tyr',
	'TAA': 'Stop',
	'TAG': 'Stop',
	'TGT': 'Cys',
	'TGC': 'Cys',
	'TGA': 'Stop',
	'TGG': 'Trp',
	'CTT': 'Leu',
	'CTC': 'Leu',
	'CTA': 'Leu',
	'CTG': 'Leu',
	'CCT': 'Pro',
	'CCC': 'Pro',
	'CCA': 'Pro',
	'CCG': 'Pro',
	'CAT': 'His',
	'CAC': 'His',
	'CAA': 'Gln',
	'CAG': 'Gln',
	'CGT': 'Arg',
	'CGC': 'Arg',
	'CGA': 'Arg',
	'CGG': 'Arg',
	'ATT': 'Ile',
	'ATC': 'Ile',
	'ATA': 'Ile',
	'ATG': 'Met',
	'ACT': 'Thr',
	'ACC': 'Thr',
	'ACA': 'Thr',
	'ACG': 'Thr',
	'AAT': 'Asn',
	'AAC': 'Asn',
	'AAA': 'Lys',
	'AAG': 'Lys',
	'AGT': 'Ser',
	'AGC': 'Ser',
	'AGA': 'Arg',
	'AGG': 'Arg',
	'GTT': 'Val',
	'GTC': 'Val',
	'GTA': 'Val',
	'GTG': 'Val',
	'GCT': 'Ala',
	'GCC': 'Ala',
	'GCA': 'Ala',
	'GCG': 'Ala',
	'GAT': 'Asp',
	'GAC': 'Asp',
	'GAA': 'Glu',
	'GAG': 'Glu',
	'GGT': 'Gly',
	'GGC': 'Gly',
	'GGA': 'Gly',
	'GGG': 'Gly'
}

def kNkS(seqA, seqB):
	totalN = 0
	totalS = 0
	for i in range(3, len(seqB)-3):
		if i%3 == 0:
			codonA = seqA[i:i+3].upper()
			codonB = seqB[i:i+3].upper()
		elif i%3 == 1: 
			codonA = seqA[i-1:i+2].upper()
			codonB = seqB[i-1:i+2].upper()
		else: 
			codonA = seqA[i-2:i+1].upper()
			codonB = seqB[i-2:i+1].upper()

		if codonA.find("-") != -1 or codonB.find("-") != -1: continue
		
		for j in ['A', 'T', 'C', 'G']:
			if j == seqB[i].strip(): continue
			newcodon = list(codonB)
			newcodon[(i%3)] = j
			newcodon = "".join(newcodon)

			if code[codonB] == code[newcodon]: totalS += 1
			else: totalN += 1
	
	kN = 0
	kS = 0
	for i in range(3, len(seqB)-3):
		if seqB[i] != seqA[i]:
			if i%3 == 0:
				codonA = seqA[i:i+3].upper()
				codonB = seqB[i:i+3].upper()
			elif i%3 == 1: 
				codonA = seqA[i-1:i+2].upper()
				codonB = seqB[i-1:i+2].upper()
			else: 
				codonA = seqA[i-2:i+1].upper()
				codonB = seqB[i-2:i+1].upper()

			if codonA.find("-") != -1 or codonB.find("-") != -1: continue

			if code[codonA] == code[codonB]: kS += 1
			else: kN += 1
	
	kN = kN * 1.0 / totalN
	kS = kS * 1.0 / totalS
	
	return kN, kS

def checkForOrfs(stringA, stringB, baseA, baseB, pad, geneAStart, geneAEnd, exonStartsA, exonSizesA, numExonsA, strandA, geneBStart, geneBEnd, exonStartsB, exonSizesB, numExonsB, strandB):
	orfs = []

	starts = []
	startBaseA = []
	startBaseB = []
	
	if strandA == "+": baseAGen = geneAStart + baseA - pad + 1
	else: baseAGen = geneAEnd - (baseA - pad) 

	if strandB == "+": baseBGen = geneBStart - pad + baseB + 1
	else: baseBGen = geneBEnd - (baseB - pad)
	
	#get all possible start codons
	for i in range(len(stringA)):
		exonA, pos = inExon(baseAGen, geneAStart, exonStartsA, exonSizesA, numExonsA, strandA) 
		exonB, pos = inExon(baseBGen, geneBStart, exonStartsB, exonSizesB, numExonsB, strandB) 

		
		if exonA != -1 and exonB != -1:
			if stringA[i:i+3].upper() == "ATG" and stringB[i:i+3].upper() == "ATG":
				#print baseAGen
				starts.append(i)
				startBaseA.append(baseAGen)
				startBaseB.append(baseBGen)

		if stringA[i] !="-": 
			if strandA == "+": baseAGen += 1
			else: baseAGen -= 1

		if stringB[i] !="-": 
			if strandB == "+": baseBGen += 1
			else: baseBGen -= 1

	for i in range(len(starts)):
		valid = True
		baseAGen = startBaseA[i]
		baseBGen = startBaseB[i]
		orfA = ""
		orfB = ""

		#extract orf sequences
		for j in range(starts[i], len(stringA)):

			exonA, pos = inExon(baseAGen, geneAStart, exonStartsA, exonSizesA, numExonsA, strandA) 
			exonB, pos = inExon(baseBGen, geneBStart, exonStartsB, exonSizesB, numExonsB, strandB)
		
			if exonA != -1 and exonB != -1: 
				orfA += stringA[j]
				orfB += stringB[j]
			
			if stringA[j] !="-": 
				if strandA == "+": baseAGen += 1
				else: baseAGen -= 1

			if stringB[j] !="-":
				if strandB == "+" : baseBGen += 1
				else: baseBGen -= 1

		
		#truncate at first stop codon

		truncOrfA = ""
		truncOrfB = ""
		
		
		for j in range(min(len(orfA), len(orfB))):

			if len(truncOrfA.replace("-", "")) % 3 == 0 and len(truncOrfB.replace("-", "")) %3 == 0 and orfA[j:j+3] in STOP_CODON and orfB[j:j+3] in STOP_CODON:
					truncOrfA += orfA[j:j+3]
					truncOrfB += orfB[j:j+3]
					break
			else:
				truncOrfA += orfA[j]
				truncOrfB += orfB[j]

		if truncOrfA[-3:] not in STOP_CODON and truncOrfB[-3:] not in STOP_CODON: valid = False

	
		#check indels
		if valid:
			indel = False
			indelSize = 0
			for j in range(len(truncOrfA)):
				if truncOrfA[j] == "-":
					if not indel: indel = True
					indelSize += 1
				else:
					if indel: indel = False
					if indelSize %3 != 0: 
						valid = False
						break
			indel = False
			indelSize = 0
			for j in range(len(truncOrfB)):
				if truncOrfB[j] == "-":
					if not indel: indel = True
					indelSize += 1
				else:
					if indel: indel = False
					if indelSize %3 != 0: 
						valid = False
						break
		
		if valid and len(truncOrfA.replace("-", "")) > 29 and len(truncOrfB.replace("-", ""))> 29:
			kN, kS = kNkS(truncOrfA, truncOrfB)
			
			orfs.append([truncOrfA, truncOrfB, baseAGen, baseAGen+len(truncOrfA.replace("-", "")), len(truncOrfA.replace("-", "")), kN, kS])
	
	sorted(orfs, key=lambda x: x[4])
	return orfs
def main():
	parser = argparse.ArgumentParser(description='''
		Wrapper script for lastz to align two transcripts and report back exonic and sequence identity.\n
		If aligning non-coding genes, set --noncoding flag for more sensitive alignment.
	''')
	parser.add_argument('bedA', type=file,  help='bed file A')
	parser.add_argument('genomeFastaA', type=str, help='fasta file A')
	parser.add_argument('bedB', type=file, help='bed file B')
	parser.add_argument('genomeFastaB', type=str, help='fasta file B')
	parser.add_argument('out_prefix', type=str, help='out prefix')
	parser.add_argument('--geneA', type=str, help='if bedA has more than one entry, specify which gene to align')
	parser.add_argument('--geneB', type=str, help='if bedB has more than one entry, specify which gene to align')
	parser.add_argument('--pad', type=int, default=15000)
	parser.add_argument('--gap_open', type=int, default=200)
	parser.add_argument('--gap_extend', type=int, default=40)
	parser.add_argument('--unmask', action='store_true', help='unmask repeats when aligning')
	parser.add_argument('--orf', action='store_true', help='flag for checking for orfs in alignment')
	parser.add_argument('--bedtools_path', type=str)
	parser.add_argument('--lastz_path', type=str)
	parser.add_argument('--shuffle_bg', action='store_true', help='Aligns genes to random background and removes nonsignificant alignments')
	parser.add_argument('--pvalue', type=float, default=0.05)
	args = parser.parse_args()

	if args.bedtools_path is not None:
		global BEDTOOLS
		global FASTAFROMBED
		global SHUFFLEBED
		BEDTOOLS  = args.bedtools_path+"/bedtools"
		FASTAFROMBED = args.bedtools_path+"/fastaFromBed"
		SHUFFLEBED = args.bedtools_path+"shuffleBed"

	if args.lastz_path is not None:
		global LASTZ
		LASTZ = args.lastz_path+"/lastz"
	
	checkDependencies()

	genomeA = {}
	if os.path.exists(args.genomeFastaA+".fai"):
		genome = open(args.genomeFastaA+".fai", 'r')
		for line in genome.readlines():
			line = line.split()
			genomeA[line[0].strip()] = int(line[1])
	else:
		print "Fasta index for %s does not exist.  Please run samtools faidx %s. Exiting..." % (args.genomeFastaA, args.genomeFastaA)
		sys.exit(1)

	genomeB = {}
	if os.path.exists(args.genomeFastaB+".fai"):
		genome = open(args.genomeFastaB+".fai", 'r')
		for line in genome.readlines():
			line = line.split()
			genomeB[line[0].strip()] = int(line[1])
	else:
		print "Fasta index for %s does not exist.  Please run samtools faidx %s. Exiting..." % (args.genomeFastaB, args.genomeFastaB)
		sys.exit(1)

	#read in bed file A into array geneA
	geneA = []
	if args.geneA is None:
		geneA = args.bedA.readline().split()
	else:
		for line in args.bedA.readlines():
			if line.strip() == "": continue
			line = line.split()
			if (line[3].strip() == args.geneA.strip()): 
				geneA = line
				break
		if (len(geneA)==0):
			print "geneA %s not found! exiting..." % args.geneA
			sys.exit(1)

	#read in bed file B
	geneB = []
	if args.geneB is None:
		geneB = args.bedB.readline().split()
	else:
		for line in args.bedB.readlines():
			if line.strip() == "": continue
			line = line.split()
			if (line[3].strip() == args.geneB.strip()): 
				geneB = line
				break
		if (len(geneB)==0):
			print "geneB %s not found! exiting..." % args.geneB
			sys.exit(1)

	paddedABedPath = writePaddedBed(geneA, args.pad, genomeA)
	paddedBBedPath = writePaddedBed(geneB, args.pad, genomeB)

	paddedAFaPath = getFa(paddedABedPath, args.genomeFastaA)
	paddedBFaPath = getFa(paddedBBedPath, args.genomeFastaB)

	#ALIGN GENES OF INTEREST

	alignments = []
	mafs = []
	mafHeader = []
	finalOrfs = []

	mafPath = alignFa(paddedAFaPath, paddedBFaPath, args.unmask, args.gap_open, args.gap_extend)
	mafFile = open(mafPath, 'r')
	maf = mafFile.readlines()
	mafFile.close()


	if len(maf) > 14:
		chrA = geneA[0].strip()
		geneAStart = int(geneA[1])+args.pad
		geneAEnd = int(geneA[2])-args.pad
		geneAName = geneA[3]
		strandA = geneA[5].strip()
		exonStartsA = intArray(geneA[11].split(','))
		exonSizesA = intArray(geneA[10].split(','))
		numExonsA = int(geneA[9])
		geneASize = geneAEnd - geneAStart
		exonASize = 0
		for i in range(numExonsA):
			exonASize += exonSizesA[i]
		intronASize = 0
		for i in range(numExonsA):
			intronASize += exonStartsA[i]

		chrB = geneB[0].strip()
		geneBStart = int(geneB[1])+args.pad
		geneBEnd = int(geneB[2])-args.pad
		geneBName = geneB[3]
		strandB = geneB[5].strip()
		if strandB == "*": strandB = "+"
		exonStartsB = intArray(geneB[11].split(','))
		exonSizesB = intArray(geneB[10].split(','))
		numExonsB = int(geneB[9])
		geneBSize = geneBEnd - geneBStart
		exonBSize = 0
		for i in range(numExonsB):
			exonBSize += exonSizesB[i]
		
		#CRAWL THROUGH ALIGNMENTS
		
		id = 0
		exonId = 0
		intronId = 0
		maxIntronId = 0
		maxIntronNum = 0
		stringB = ""
		stringA = ""
		baseB = -1
		baseA = -1
		truncA = ""
		truncB = ""
		truncAStart = -1
		truncBStart = -1
		exonsAlignedA = []
		exonsAlignedB = []
		spliceStart = [0]*(numExonsA-1)
		spliceEnd = [0]*(numExonsA-1)
		counter = 0
		indelRate = 0
		indelRateIntron = 0
		inIndel = False
		inIndelIntron = False
		seen = 0
		seenIntron = 0
		for line in maf:
			line = line.strip()
			if line.startswith("#"): 
				mafHeader.append(line)

			if line.startswith("a"):
				counter = counter + 1
				a = line
				score = int(float(line.split("=")[1]))
			if line.startswith("s"):
				if stringA == "":
					sA = line.split()
					stringA = sA[6].strip()
					baseA = int(sA[2])
				else:
					sB = line.split()
					stringB = sB[6].strip()
					baseB = int(sB[2])

			#at line break, reset
			if line == "" and stringA !="":
				if seen == 0: indelRateNorm = 'NA'
				else: indelRateNorm = indelRate / (seen * 1.0)
				
				if seenIntron == 0: indelRateIntronNorm = 'NA'
				else: indelRateIntronNorm = indelRateIntron / (seenIntron * 1.0)
			
				#print counter	
				#print spliceStart
				#print spliceEnd
				
				numSpliceConserved = 0.0
				for i in range(numExonsA-1):
					if spliceStart[i] == spliceEnd[i] - 1: numSpliceConserved += 1
					elif spliceStart[i]> 0 or spliceEnd[i] > 0: numSpliceConserved += 0.5
				#print "numSpliceConserved: ", numSpliceConserved
				alignments.append([counter, geneAName, geneBName, exonId / (exonASize*1.0), id / (geneASize*1.0), exonId / (exonBSize*1.0), id / (geneBSize*1.0), indelRateNorm, indelRateIntronNorm, exonsAlignedA, exonsAlignedB, score, numSpliceConserved])
					

				#write maf
				if (strandA == "-"): truncAStart = genomeA[chrA] - truncAStart
				if (strandB == "-"): truncBStart = genomeB[chrB] - truncBStart
						
						
				mafStr = a+"\n"
				mafStr += "s %s %d %d %s %d %s\n" % (chrA, truncAStart, len(truncA.replace("-", "")), strandA, genomeA[chrA], truncA)
				mafStr += "s %s %d %d %s %d %s\n" % (chrB, truncBStart, len(truncB.replace("-", "")), strandB, genomeB[chrB], truncB)
				mafStr += ("\n")	
				mafs.append(mafStr)
					

				#check for small orfs
				if args.orf and exonId != 0:
					orfs = checkForOrfs(stringA, stringB, baseA, baseB, args.pad, geneAStart, geneAEnd, exonStartsA, exonSizesA, numExonsA, strandA, geneBStart, geneBEnd, exonStartsB, exonSizesB, numExonsB, strandB)
					for i in range(len(orfs)):
						eclipsed = False
						for j in range(i):
							if orfs[i][2] >= orfs[j][2] and orfs[i][2] <= orfs[j][3] and orfs[i][2] % 3 == orfs[j][2] % 3:
								eclipsed = True
								break
						if not eclipsed:
							curOrf = orfs[i]
							curOrf.append(score)
							curOrf.append(geneAName)
							curOrf.append(geneBName)
							finalOrfs.append(curOrf)

				#reset stats
				exonId = 0
				intronId = 0
				maxIntronId = 0
				maxIntronNum = 0
				id = 0
				score = 0
				stringB = ""
				stringA = ""
				truncA = ""
				truncB = ""
				truncAStart = -1
				truncBStart = -1
				exonsAlignedA = []
				exonsAlignedB = []
				spliceStart = [0]*(numExonsA-1)
				spliceEnd = [0]*(numExonsA-1)
				indelRate = 0
				indelRateIntron = 0
				inIndel = False	
				inIndelIntron = False
				seen = 0
				seenIntron = 0

			#print "*****"
			#if stringA and stringB populated, crawl alignments
			if stringB != "" and stringA !="":
				if strandA == "+": baseAGen = geneAStart + baseA - args.pad + 1

				else: baseAGen = geneAEnd - (baseA - args.pad) 

				if strandB == "+": baseBGen = geneBStart - args.pad + baseB + 1
				else: baseBGen = geneBEnd - (baseB - args.pad)
				
				for i in range(len(stringA)):
					
					if baseAGen >= geneAStart - 1000 and baseAGen <= geneAEnd+1000 and baseBGen >=geneBStart-1000 and baseBGen <= geneBEnd+1000:
						if truncA == "":
							truncAStart = baseAGen
							truncBStart = baseBGen
						truncA += stringA[i]
						truncB += stringB[i]


					exonA, exonAPos = inExon(baseAGen, geneAStart, exonStartsA, exonSizesA, numExonsA, strandA) 
					exonB, exonBPos = inExon(baseBGen, geneBStart, exonStartsB, exonSizesB, numExonsB, strandB)
					
					#if (exonAPos == -1 or exonAPos == 1): print exonA, exonAPos, stringA[i]
					#splice start
					if (exonAPos == -1 and exonBPos == -1): 
						#print (exonA-0), "splice start", exonAPos, exonBPos, stringA[i-20:i+20], stringB[i-20:i+20]
						if (exonA < numExonsA): spliceStart[exonA-1] = exonB
					
					if (exonAPos == 1 and exonBPos == 1): 
						#print (exonA-1), "splice end", exonAPos, exonBPos, stringA[i-20:i+20], stringB[i-20:i+20]
						if (exonA > 1): spliceEnd[exonA-2] = exonB

					#if its inside gene
					if baseAGen >= geneAStart and baseAGen < geneAEnd and baseBGen >= geneBStart and baseBGen < geneBEnd:
						if stringB[i].upper()==stringA[i].upper():
							id += 1

						#if it's inside exons
						if exonA != -1 and exonB != -1:
							seen += 1
							
							#reset inIndelIntron
							if inIndelIntron: inIndelIntron = False

							#if in exons and '-' at alignment
							if stringA[i] == "-" or stringB[i] == "-":
								if not inIndel:
									inIndel = True
									indelRate += 1
							else:
								inIndel = False
							#if stringA == stringB, increase exonID
							if stringB[i].upper()==stringA[i].upper():
								exonId += 1
								if exonA not in exonsAlignedA: exonsAlignedA.append(exonA)
								if exonB not in exonsAlignedB: exonsAlignedB.append(exonB)


						#else if inside intron-intron alignment
						elif exonA == -1 and exonB == -1:
							seenIntron += 1

							#reset inIndel(exons)
							if inIndel: inIndel = False
							
							#if in introns and '-' at alignment
							if stringA[i] == "-" or stringB[i] == "-":
								if not inIndelIntron:
									inIndelIntron = True
									indelRateIntron += 1
							else:
								inIndelIntron = False
						
						#else if at exon/intron alignment, just reset inIndel
						else:
							if inIndel: inIndel = False
							if inIndelIntron: inIndelIntron = False

					if stringA[i] !="-": 
						if strandA == "+": baseAGen += 1
						else: baseAGen -= 1

					if stringB[i] !="-":
						if strandB == "+" : baseBGen += 1
						else: baseBGen -= 1
	
	#if len(alignments)==0 : print "no alignment!"
	#else: print alignments

	#MAKE BACKGROUND
	if args.shuffle_bg and len(alignments)>0:
		shuffleScores = []
		bedB = writeBed(geneB)
		for i in range(25):
			#shuffle geneA
			cmd = [SHUFFLEBED, '-i', bedB, '-g', args.genomeFastaB+".fai"]
			shuffle = subprocess.check_output(cmd)
			shuffle = shuffle.split()
			paddedShuffleBed = writePaddedBed(shuffle, args.pad, genomeB)
			paddedShuffleFa = getFa(paddedShuffleBed, args.genomeFastaB)
			
			#align geneA to shuffled
			shuffleMafPath = alignFa(paddedAFaPath, paddedShuffleFa, args.unmask, args.gap_open, args.gap_extend)
			mafFile = open(shuffleMafPath, 'r')
			maf = mafFile.readlines()
			if len(maf) <=14: 
				shuffleScores.append(0)
			else:
				for line in maf:
					if line.startswith("a"): 
						shuffleScores.append(int(line.split("=")[1]))
			cmd = ['rm', shuffleMafPath, paddedShuffleBed, paddedShuffleFa]
			subprocess.check_call(cmd)

		bedA = writeBed(geneA)
		for i in range(25):
			#shuffle geneB
			cmd = [SHUFFLEBED, '-i', bedA, '-g', args.genomeFastaA+".fai"]
			shuffle = subprocess.check_output(cmd)
			shuffle = shuffle.split()
			paddedShuffleBed = writePaddedBed(shuffle, args.pad, genomeA)
			paddedShuffleFa = getFa(paddedShuffleBed, args.genomeFastaA)
			
			#align geneB to shuffled
			shuffleMafPath = alignFa(paddedBFaPath, paddedShuffleFa, args.unmask, args.gap_open, args.gap_extend)
			mafFile = open(shuffleMafPath, 'r')
			maf = mafFile.readlines()
			if len(maf) <=1: shuffleScores.append(0)
			else:
				for line in maf:
					if line.startswith("a"): shuffleScores.append(int(line.split("=")[1]))
			cmd = ['rm', shuffleMafPath, paddedShuffleBed, paddedShuffleFa]
			subprocess.check_call(cmd)

		cmd = ['rm', bedB, bedA]
		subprocess.check_call(cmd)
		
		shuffleScores.sort()
		maxShuffleScore = shuffleScores[int(floor(len(shuffleScores)* (1.0-args.pvalue)))]
		maxScore = maxShuffleScore
		for a in alignments:
			if a[11] > maxScore: 
				maxScore = a[11]
				break
	
	cmd = ['rm', paddedABedPath, paddedBBedPath, paddedAFaPath, paddedBFaPath, mafPath]
	subprocess.check_call(cmd)	
	
	if len(alignments) > 0 and (not args.shuffle_bg or maxScore > maxShuffleScore):
		alignment = open(args.out_prefix+".alignment_identity.txt", 'w')
		mafout = open(args.out_prefix+".maf", 'w')
		if len(finalOrfs) > 0: 
			orfout = open(args.out_prefix+".orfs.txt", 'w')
			orfout.write("#alignmentID\tgeneA\tgeneB\tlengthOrfA\tlengthOrfB\tkN\tkS\tkN/kS\torfA\torfB\n")

			for orf in finalOrfs:
				if orf[6] != 0: orfout.write("%d\t%s\t%s\t%d\t%d\t%.2f\t%.2f\t%.3f\t%s\t%s\n" % (orf[7], orf[8], orf[9], len(orf[0].replace("-", "")), len(orf[1].replace("-", "")), orf[5], orf[6], orf[5] / orf[6], orf[0].upper(), orf[1].upper()))
				else: orfout.write("%d\t%s\t%s\t%d\t%d\t%.2f\t%.2f\tinf\t%s\t%s\n" % (orf[7], orf[8], orf[9], len(orf[0].replace("-", "")), len(orf[1].replace("-", "")), orf[5], orf[6],orf[0].upper(), orf[1].upper()))
		
		for line in mafHeader:
			mafout.write(line+"\n")
		

		alignment.write("#alignmentID\talignmentScore\tgeneA\tgeneB\texonID_A\tgeneID_A\texonID_B\tgeneID_B\tindelRate(exons)\tindelRate(introns)\texonsAlignedA\texonsAlignedB\tnumSpliceSitesAligned\tnumTotalSpliceSites\n")
		counter = -1
		

		for i in range(len(alignments)):
			alignments[i].append(mafs[i])
		alignments.sort(key = lambda x: (x[3], x[4], x[10]), reverse=True)
		
		bestExonID = alignments[0][3]
		bestLociID = alignments[0][4]
		written = False
		for a in alignments:
			counter += 1
			if (not args.shuffle_bg or a[11] > maxShuffleScore):
				if (a[4] == 0.0 and not written):
					written = True
					alignment.write("%d\t%d\t%s\t%s\t%0.3f\t%0.3f\t%0.3f\t%0.3f\t" % (a[0],a[11], a[1], a[2], a[3], a[4], a[5], a[6]))
					
					#write indelRate
					if str(a[7]) != "NA": alignment.write("%0.3f\t" % a[7])
					else: alignment.write("NA\t")

					if str(a[8]) != "NA": alignment.write("%0.3f\t" % a[8])
					else: alignment.write("NA\t")

					a[9].sort()
					a[10].sort()
					if len(a[9]) > 0:
						for exon in a[9]:
							alignment.write("%d," % exon)
					else:
						alignment.write("NA")
					alignment.write("\t")
					if len(a[10]) > 0:
						for exon in a[10]:
							alignment.write("%d," % exon)
					else:
						alignment.write("NA")

					alignment.write("\t%.1f\t%d" % (a[12], min(numExonsA, numExonsB)-1))
					alignment.write("\n")
					if (float(a[4] > 0.0)):
						mafout.write(a[13])
				elif (a[4] != 0.0):
					if ((bestExonID > 0.0 and a[3] > 0.0) or (bestExonID == 0)):
						written = True
						alignment.write("%d\t%d\t%s\t%s\t%0.3f\t%0.3f\t%0.3f\t%0.3f\t" % (a[0], a[11], a[1], a[2], a[3], a[4], a[5], a[6]))
					
						#write indelRate
						if str(a[7]) != "NA": alignment.write("%0.3f\t" % a[7])
						else: alignment.write("NA\t")

						if str(a[8]) != "NA": alignment.write("%0.3f\t" % a[8])
						else: alignment.write("NA\t")
	
						a[9].sort()
						a[10].sort()
						if len(a[9]) > 0:
							for exon in a[9]:
								alignment.write("%d," % exon)
						else:
							alignment.write("NA")
						alignment.write("\t")
						if len(a[10]) > 0:
							for exon in a[10]:
								alignment.write("%d," % exon)
						else:
							alignment.write("NA")

						alignment.write("\t%.1f\t%d" % (a[12], min(numExonsA, numExonsB)-1))
						alignment.write("\n")
						if (float(a[4] > 0.0)):
							mafout.write(a[13])

		alignment.close()

if __name__ == "__main__":
	main()