# Sparse Alignment columns

In [15]:
class Contig:
    def __init__(self, name, seq):
        self.name = name
        self.seq = seq
        
    def __repr__(self):
        return '< "%s" %i nucleotides>' % (self.name, len(self.seq))

def read_contigs(input_file_path):
    contigs = []
    current_name = ""
    seq_collection = []

    # Pre-read generates an array of contigs with labels and sequences
    with open(input_file_path, 'r') as streamFASTAFile:
        for read in streamFASTAFile.read().splitlines():
            if read == "":
                continue
            if read[0] == ">":
                # If we have sequence gathered and we run into a second (or more) block
                if len(seq_collection) > 0:
                    sequence = "".join(seq_collection)
                    seq_collection = []  # clear
                    contigs.append(Contig(current_name, sequence))
                current_name = read[1:]  # remove >
            else:
                # collects the sequence to be stored in the contig, constant time performance don't concat strings!
                seq_collection.append(read.upper())

    # add the last contig to the list
    sequence = "".join(seq_collection)
    contigs.append(Contig(current_name, sequence))
    return contigs


In [47]:
from collections import Counter

species = read_contigs('9927_alignment.fasta')
informative_columns = {}
consensus_sequence = []
for col in range(len(species[0].seq)):
    letters = []
    for entry in species:
        letters.append(entry.seq[col])
    column_seq = ''.join(letters)
    consensusing = Counter(column_seq)
    consensus_sequence.append(consensusing.most_common()[0][0])
    if column_seq != letters[0] * len(species) and col > 200 and col < 1500:
        informative_columns[col] = column_seq
        print(column_seq, col+1)
species.append(Contig('Consensus', ''.join(consensus_sequence)))

CCCCCCCCCTTCCCCACCCAACCCCCCCC 210
GGGGGGGGGGGGGGGGGGGGGAGGGGGGG 211
CCCCCCCCCCCGCCCCCCCCCCCCCCCCC 212
AAATAAAAAAAAAAAAAAAAAAAAAAGGA 216
GGGGGGGGGGGAGGGGTTGGGGTGGGGGG 223
GGGGGAGGGGGGGGGGGGGGGGGGGGGGG 226
TTTTTTTTTTTAAAAAAATTTTTTTTTTT 229
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 232
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 233
CCCCCCCCCCCCCCCCCCC-CCCCCCCCC 234
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 235
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 236
GGGGGGGGGGGGGGGGGGC-GGGGGGGGG 237
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 238
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 239
CCCCCCCCCCCCCCCCCCC-CCCCCCCCC 240
CCCCCCCCCCCCCCCACCC-CCCCCCCCC 241
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 242
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 243
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 244
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 245
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 246
TTTTTTTTTTTTCTTTTTT-TTTCTTTTT 247
GGGGGGGGGGGGGGGGGGG-AGGGGGGGG 248
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 249
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 250
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 251
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 252
AAAGAAAAAAAAAAAAAAAAAAAAAAAAA 280
CCCCCCCCCCCCCC

* Generate a fasta with informative columns
* Majority vote consensus sequence, but it includes gaps
* transpose?
* CSV file write

In [49]:
with open('9927_informative_positions.csv', 'w') as csv_out:
    csv_out.write('Positions,' + ','.join([str(x+1) for x in sorted(informative_columns.keys())]))
    csv_out.write('\n')
    for entry in species:
        csv_out.write(entry.name[:6] + ",")
        for col in range(len(species[0].seq)):
            if col in informative_columns:
                csv_out.write(entry.seq[col] + ",")
        csv_out.write('\n')
            

'A'

In [11]:
base_command = "java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats "
data_directory = './Data/'

In [12]:
from glob import glob

for filename in glob(data_directory + '*'):
    print(base_command + filename)

java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG100_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG133_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG149_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG237_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG267_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.ja

trans = {'AGC': 'Y', 'TTC': 'A'}
for codon in ['']

* Iterate over all the sequences at the same time
* for each position, how many species can you differentiate
* keep of list of species
* 

In [1]:
for line in open('9927_differences_table.csv', 'r'):
    print(line)

FRAX21_FRAEX388Â  241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC,c 398,c 446,indel aag 453,a 445,t 603,t 682,c 743,c 868,,,,,,,

FRAX19_FRAEX388Â  241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC,t 350,t 443,t 463,c 502,c 521,t 542,g 590,a 611,g 648,a 734,c 743,a 758,a 920,,

FRAX20_FRAEX388Â  241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC,t 350,t 443,t 463,c 502,c 521,t 525,g 558,g 590,a 611,g 648,a 732,a 734,c 743,a 758,a 920

FRAX31_FRAEX388Â  241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATGGACCCATATGGATATTTGCC 1,g 281,a 486,a 518,c 524,c 527,t 589,c 728,c 743,a 773,c 809,a 940,t 945,,,

FRAX28_FRAEX388Â  241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC,c 473,a 540,c 578,g 603,t 680,c 809,g 881,,,,,,,,

FRAX12_FRAEX388Â  241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC,c 515,a 553,c 578,a 704,c 809,t 945,,,,,,,,,

FRAX29_FRAEX388Â  241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC,c 512,c 

## Terms Used
* a c t g
* indel

In [2]:
consensus = '-' * 1000
consensus

'---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
species_indicators = {}
unique_columns = set()
for line in open('9927_differences_table.csv', 'r'):
#     print(line)
    cells = line.split(',')
    species = cells[0][:6]
    print(cells)
    for cell in cells[1:]:
        if cell and cell != '\n' and 'indel' not in cell:
            unique_columns.add(int(cell[2:]))


['FRAX21_FRAEX388Â\xa0 241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC', 'c 398', 'c 446', 'indel aag 453', 'a 445', 't 603', 't 682', 'c 743', 'c 868', '', '', '', '', '', '', '\n']
['FRAX19_FRAEX388Â\xa0 241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC', 't 350', 't 443', 't 463', 'c 502', 'c 521', 't 542', 'g 590', 'a 611', 'g 648', 'a 734', 'c 743', 'a 758', 'a 920', '', '\n']
['FRAX20_FRAEX388Â\xa0 241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC', 't 350', 't 443', 't 463', 'c 502', 'c 521', 't 525', 'g 558', 'g 590', 'a 611', 'g 648', 'a 732', 'a 734', 'c 743', 'a 758', 'a 920\n']
['FRAX31_FRAEX388Â\xa0 241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATGGACCCATATGGATATTTGCC 1', 'g 281', 'a 486', 'a 518', 'c 524', 'c 527', 't 589', 'c 728', 'c 743', 'a 773', 'c 809', 'a 940', 't 945', '', '', '\n']
['FRAX28_FRAEX388Â\xa0 241 CAGTAGTGTGGATTGGAAGGCAGAGTTTCTAGGGGAGATAGACCCATATGGATATTTGCC', 'c 473', 'a 540', 'c 578', 'g 603', 't 680', 'c 809'

In [10]:
unique_columns

{281,
 284,
 294,
 312,
 327,
 329,
 330,
 342,
 350,
 351,
 373,
 374,
 398,
 443,
 445,
 446,
 457,
 463,
 465,
 473,
 486,
 488,
 502,
 512,
 515,
 518,
 521,
 524,
 525,
 527,
 538,
 540,
 542,
 546,
 550,
 551,
 553,
 558,
 561,
 578,
 584,
 589,
 590,
 600,
 603,
 611,
 620,
 645,
 648,
 680,
 682,
 687,
 693,
 704,
 711,
 717,
 728,
 732,
 734,
 743,
 758,
 765,
 773,
 794,
 809,
 833,
 839,
 858,
 863,
 868,
 881,
 884,
 888,
 920,
 922,
 934,
 940,
 945,
 974,
 982,
 983,
 994,
 1009}

In [11]:
len(unique_columns)

83

In [4]:
a = consensus
a[12] = 'X'
a

TypeError: 'str' object does not support item assignment