In [1]:
def printMatrix(mat):
    """Write a 2-D matrix to stdout, one row per line.

    Every cell is followed by a single space (including the last cell of a
    row) and every row ends with a newline.

    Args:
        mat: Sequence of rows, each row being a sequence of printable values.
    """
    for row in mat:
        # The trailing space after the last cell is part of the output format.
        print("".join(str(cell) + " " for cell in row))

def constructMatrix(str1, str2):
    """Allocate a zero-filled scoring matrix for two sequences.

    Args:
        str1: Sequence laid out along the columns.
        str2: Sequence laid out along the rows.

    Returns:
        A list of ``len(str2) + 1`` independent rows, each holding
        ``len(str1) + 1`` zeros.
    """
    n_cols = len(str1) + 1
    n_rows = len(str2) + 1
    # Build each row separately so that rows never alias one another.
    return [[0] * n_cols for _ in range(n_rows)]

def printStringMatrix(matrix, str1, str2):
    """Print a matrix with sequence characters as row and column labels.

    The header line is two spaces, an underscore for the leading empty
    column, then the characters of ``str1``. Each following line starts with
    its row label (underscore for row 0, otherwise ``str2[row - 1]``) and is
    followed by the row's cells. Every printed item is followed by one space.

    Args:
        matrix: 2-D list of printable values.
        str1: Sequence used for the column labels.
        str2: Sequence used for the row labels.
    """
    # Header: indent, placeholder for the empty prefix, then the column labels.
    print("  _ " + "".join(str(symbol) + " " for symbol in str1))

    for row_idx, row in enumerate(matrix):
        label = "_" if row_idx == 0 else str2[row_idx - 1]
        cells = "".join(str(value) + " " for value in row)
        print(str(label) + " " + cells)

def initMatrix(matrix, gapPenalty):
    """Fill the first column and first row with cumulative gap penalties.

    Cell ``[r][0]`` becomes ``r * gapPenalty`` and cell ``[0][c]`` becomes
    ``c * gapPenalty``. The matrix is modified in place.

    Args:
        matrix: 2-D list to initialise; must contain at least one row.
        gapPenalty: Score added per gap position.

    Returns:
        The same ``matrix`` object that was passed in.
    """
    # First column: one more gap for every row further down.
    for row_idx, row in enumerate(matrix):
        row[0] = row_idx * gapPenalty

    # First row: one more gap for every column further right.
    top_row = matrix[0]
    for col_idx in range(len(top_row)):
        top_row[col_idx] = col_idx * gapPenalty

    return matrix

def getMinPenalty(matrix, i, j, str1, str2,matchPenalty, mismatchPenalty, gapPenalty ):
    """Compute the score for cell ``(i, j)`` from its three neighbours.

    Note: despite the name, this returns the *maximum* of the three
    candidate scores.

    Args:
        matrix: Scoring matrix; cells above, left and diagonal of ``(i, j)``
            must already be filled.
        i: Row index; the row corresponds to ``str2[i - 1]``.
        j: Column index; the column corresponds to ``str1[j - 1]``.
        str1: Sequence along the columns.
        str2: Sequence along the rows.
        matchPenalty: Score added on the diagonal when the characters are equal.
        mismatchPenalty: Score added on the diagonal when they differ.
        gapPenalty: Score added when coming from the cell above or to the left.

    Returns:
        The largest of the top, left and diagonal candidate scores.
    """
    substitution = matchPenalty if str2[i - 1] == str1[j - 1] else mismatchPenalty

    from_top = matrix[i - 1][j] + gapPenalty
    from_left = matrix[i][j - 1] + gapPenalty
    from_diag = matrix[i - 1][j - 1] + substitution

    return max(from_top, from_left, from_diag)

def fillMatrix(matrix, str1, str2, matchPenalty, mismatchPenalty, gapPenalty):
    """Fill every interior cell of the scoring matrix in row-major order.

    Row 0 and column 0 are left untouched; they are expected to have been
    initialised beforehand. The matrix is modified in place and nothing is
    returned.

    Args:
        matrix: Scoring matrix to fill.
        str1: Sequence along the columns.
        str2: Sequence along the rows.
        matchPenalty: Score for equal characters.
        mismatchPenalty: Score for differing characters.
        gapPenalty: Score for a gap.
    """
    for row in range(1, len(matrix)):
        # Row-major order guarantees the top, left and diagonal neighbours
        # of each cell are already computed when the cell is visited.
        for col in range(1, len(matrix[0])):
            matrix[row][col] = getMinPenalty(
                matrix,
                row,
                col,
                str1,
                str2,
                matchPenalty=matchPenalty,
                mismatchPenalty=mismatchPenalty,
                gapPenalty=gapPenalty,
            )


def getPrevious(matrix, i, j,matchPenalty, mismatchPenalty, gapPenalty ):

    score = matrix[i][j]
    top = matrix[i-1][j]
    left = matrix[i][j-1]
    diag = matrix[i-1][j-1]

    maxPrevIndexI = 0
    maxPrevIndexJ = 0
    maxPrevDir = 0
    if diag + matchPenalty == score or diag + mismatchPenalty == score:
        maxPrevIndexI = i-1
        maxPrevIndexJ = j-1
        maxPrevDir = 0
    elif top + gapPenalty == score:
        maxPrevIndexI = i-1
        maxPrevIndexJ = j
        maxPrevDir = 1
    elif left + gapPenalty == score:
        maxPrevIndexI = i
        maxPrevIndexJ = j-1
        maxPrevDir = 2

    return maxPrevDir, maxPrevIndexI, maxPrevIndexJ


def backTrack(matrix, matchPenalty, mismatchPenalty, gapPenalty):
    """Trace a path from the bottom-right cell back towards the origin.

    Starting at the last cell, the walk repeatedly moves to the predecessor
    chosen by ``getPrevious`` and stops as soon as it reaches row 0 or
    column 0. The value of every visited cell, including the border cell
    where the walk stops, is added to the returned score.

    NOTE(review): the returned score is the sum of the matrix values along
    the path, not the value of the bottom-right cell alone; confirm this is
    the intended ranking metric.

    Args:
        matrix: Filled scoring matrix.
        matchPenalty: Score used for equal characters.
        mismatchPenalty: Score used for differing characters.
        gapPenalty: Score used for a gap.

    Returns:
        A tuple ``(directions, score)``. ``directions`` lists the move codes
        returned by ``getPrevious`` in start-to-end order; ``score`` is the
        accumulated sum of visited cell values.
    """
    row = len(matrix) - 1
    col = len(matrix[0]) - 1

    total = 0
    moves = []

    while row >= 0 and col >= 0:
        total += matrix[row][col]
        if row == 0 or col == 0:
            break
        move, row, col = getPrevious(
            matrix, row, col, matchPenalty, mismatchPenalty, gapPenalty
        )
        moves.append(move)

    # Moves were collected end-to-start; flip them into start-to-end order.
    moves.reverse()
    return moves, total


In [2]:
def alignSequences(sequence1, sequence2, matchPenalty = 1, mismatchPenalty = -1, gapPenalty = -2):
    """Score the global alignment of two sequences.

    Builds the scoring matrix, initialises its borders with gap penalties,
    fills the interior and runs the traceback.

    Args:
        sequence1: Sequence placed along the matrix columns.
        sequence2: Sequence placed along the matrix rows.
        matchPenalty: Score for equal characters (default 1).
        mismatchPenalty: Score for differing characters (default -1).
        gapPenalty: Score for a gap (default -2).

    Returns:
        The score component of the ``backTrack`` result, i.e. the sum of the
        matrix values visited during the traceback.
    """
    matrix = initMatrix(constructMatrix(sequence1, sequence2), gapPenalty)
    fillMatrix(matrix, sequence1, sequence2, matchPenalty, mismatchPenalty, gapPenalty)
    _directions, score = backTrack(matrix, matchPenalty, mismatchPenalty, gapPenalty)
    return score

In [6]:
# Sanity check: score two 24-character sequences with the default scoring parameters.
alignSequences("GGGGGCCATCGTAATGTCACCTTT", "AAGGGCCATAGGTGTGTTAACCTC")

72

In [3]:
from Bio import SeqIO

# Map every FASTA record ID to its sequence as a plain string.
sequences = {
    record.id: str(record.seq)
    for record in SeqIO.parse("sequences_24.fasta", "fasta")
}
print(len(sequences))

529


In [4]:
# Display the record-ID -> sequence mapping for visual inspection.
sequences

{'C001': 'GGGAGCCATCGTAATGTTTCCCCC',
 'C002': 'GGGGGCCACCGTAATGCCTCCCCC',
 'C003': 'GAGAGCCATAGGAGTGCCAACCTT',
 'C004': 'GAGGGCCATAGGAGAACTACTCTC',
 'C005': 'AGGGGCCATAGTAAAGCCAACTTC',
 'C006': 'GGGAGCGGTATTAATGTTACCTTT',
 'C007': 'GGGAGCGGTATTAATGTTACCTTT',
 'C008': 'GGGGGCGGTATGTATGTTACCTTC',
 'C009': 'GGGAGCCGTATGAATGTTACCTTT',
 'C010': 'GAGGGCCGTATGTATGCCAACCTT',
 'C011': 'GGGAGCGGTATGTATGTTACCTTT',
 'C012': 'GAAAGCCATAGGTGAGTTAACCTT',
 'C013': 'GGGGGCCACCGTAATGTCTCCCTC',
 'C014': 'GAAAGCGGTATGTGTACCAACTTT',
 'C015': 'GGGGGCCACAGTAATGTCTCCCCC',
 'C016': 'GAGAGCCATAGGTGTACCAACCCT',
 'C017': 'GAAGGCCATAGGTGTACCAACCTT',
 'C018': 'GAAAGCCATAGGTGTACCAACCTT',
 'C019': 'GGGAGCGGTATTAATGTTACCTTT',
 'C020': 'GGGAGCGGTAGTAATGTTACCTCT',
 'C021': 'GGGAGCGGTATTAATGCTACCTCT',
 'C022': 'GGGAGCGATCGGTATGTTACCTTT',
 'C023': 'GAGGGCCATAGGTGTACCAACCTT',
 'C024': 'GGAAGCGATCGTAATGTTAACTCT',
 'C025': 'GGGGGCGGTATGTATGTTACCTTT',
 'C026': 'GAAAGCCATAGGTGTACCAACCTT',
 'C027': 'GGGAGCGGTATGAATGTTACCTTC',
 

In [5]:
# Score every loaded sequence against the query using the default scoring
# parameters. A comprehension is used so that no loop variables leak into the
# notebook namespace; a plain `for k, v in ...` loop would leave `k` bound to
# a sequence ID, clashing with the `k` used as the top-k size further down.
querySequence = "AAGATCTNNNNNNAACCTTTC"
alignmentScore = {
    cultivar_id: alignSequences(querySequence, cultivar_seq)
    for cultivar_id, cultivar_seq in sequences.items()
}
print(alignmentScore)

{'C001': -148, 'C002': -245, 'C003': -163, 'C004': -169, 'C005': -173, 'C006': -219, 'C007': -219, 'C008': -261, 'C009': -219, 'C010': -195, 'C011': -219, 'C012': -210, 'C013': -249, 'C014': -182, 'C015': -253, 'C016': -151, 'C017': -168, 'C018': -168, 'C019': -219, 'C020': -223, 'C021': -191, 'C022': -187, 'C023': -208, 'C024': -164, 'C025': -261, 'C026': -168, 'C027': -219, 'C028': -168, 'C029': -179, 'C030': -219, 'C031': -219, 'C032': -143, 'C033': -168, 'C034': -250, 'C035': -168, 'C036': -180, 'C037': -219, 'C038': -201, 'C039': -265, 'C040': -223, 'C041': -219, 'C042': -261, 'C043': -219, 'C044': -261, 'C045': -219, 'C046': -219, 'C047': -184, 'C048': -210, 'C049': -210, 'C050': -210, 'C051': -195, 'C052': -168, 'C053': -210, 'C054': -250, 'C055': -215, 'C056': -151, 'C057': -168, 'C058': -223, 'C059': -223, 'C060': -191, 'C061': -219, 'C062': -219, 'C063': -162, 'C064': -210, 'C065': -219, 'C066': -237, 'C067': -180, 'C068': -219, 'C069': -219, 'C070': -139, 'C071': -157, 'C072

In [6]:
# Re-key the scores from highest to lowest. The sort is stable, so IDs with
# equal scores keep their original relative order.
ids_by_score_desc = sorted(alignmentScore, key=alignmentScore.get, reverse=True)
sorted_dict_by_value_desc = {
    seq_id: alignmentScore[seq_id] for seq_id in ids_by_score_desc
}
sorted_dict_by_value_desc

{'W038': -95,
 'W011': -103,
 'C138': -105,
 'C082': -121,
 'C074': -129,
 'C186': -129,
 'W122': -129,
 'C111': -133,
 'C137': -133,
 'W002': -133,
 'W004': -133,
 'W288': -133,
 'W008': -135,
 'W324': -137,
 'C070': -139,
 'C084': -141,
 'W067': -141,
 'W116': -141,
 'C107': -142,
 'C142': -142,
 'W054': -142,
 'W154': -142,
 'W172': -142,
 'W274': -142,
 'W291': -142,
 'W298': -142,
 'C032': -143,
 'W112': -145,
 'W007': -147,
 'W097': -147,
 'W127': -147,
 'W260': -147,
 'W326': -147,
 'C001': -148,
 'W022': -149,
 'W119': -149,
 'W143': -149,
 'C185': -150,
 'W021': -150,
 'C016': -151,
 'C056': -151,
 'C085': -151,
 'W125': -152,
 'W070': -153,
 'W173': -156,
 'C071': -157,
 'W003': -158,
 'W323': -158,
 'C089': -159,
 'C187': -159,
 'W013': -160,
 'W060': -160,
 'W249': -160,
 'W250': -160,
 'C101': -161,
 'C106': -161,
 'C144': -161,
 'C063': -162,
 'C120': -162,
 'C149': -162,
 'C172': -162,
 'C003': -163,
 'W128': -163,
 'C024': -164,
 'C113': -164,
 'C145': -164,
 'C174': -1

In [7]:
# Number of top-scoring sequences to keep from the ranked results.
k = 5

In [8]:
# IDs of the k best-scoring sequences, best first.
topk = list(sorted_dict_by_value_desc)[:k]
topk

['W038', 'W011', 'C138', 'C082', 'C074']

In [9]:
import pandas as pd

# Load the cultivar mapping table and store 'Subpopulation' with pandas'
# dedicated string dtype.
df = pd.read_csv("mapping_24.csv").astype({"Subpopulation": "string"})

In [10]:
# List the column names of the mapping table.
print(df.columns)

Index(['Cultivar ID', 'vg0112116426', 'vg0128525986', 'vg0130976864',
       'vg0131664768', 'vg0133440209', 'vg0135617816', 'vg0135642980',
       'vg0138418739', 'vg0138428840', 'vg0138608956', 'vg0138999212',
       'vg0405463422', 'vg0405463763', 'vg0603483061', 'vg0713178880',
       'vg0719727299', 'vg0719727339', 'vg0719834473', 'vg0819793460',
       'vg0904094998', 'vg0904282939', 'vg1019044175', 'vg1123563633',
       'vg1207667840', 'Subpopulation', 'Plant Height (cm)', 'sequence'],
      dtype='object')


In [15]:
# Show the subpopulation recorded for each of the top-k cultivars.
for cultivar in topk:
    is_cultivar = df['Cultivar ID'] == cultivar
    row = df.loc[is_cultivar]
    # .item() requires exactly one matching row and raises otherwise.
    subpopulation = row['Subpopulation'].item()
    print(cultivar, subpopulation)

W038 Temperate Japonica
W011 Temperate Japonica
C138 Temperate Japonica
C082 Temperate Japonica
C074 Temperate Japonica


In [14]:
import json

# Load each JSON file from the json/ directory into a same-named variable.
with open("json/clusters_dict.json", "r") as f:
    clusters_dict = json.load(f)

with open("json/sequence_cluster_dict.json", "r") as f:
    sequence_cluster_dict = json.load(f)

with open("json/sequences_dict.json", "r") as f:
    sequences_dict = json.load(f)

with open("json/unique_sequences_per_cluster.json", "r") as f:
    unique_sequences_per_cluster = json.load(f)


In [17]:
# Collect the cluster and subpopulation of every top-k cultivar, in rank order.
clusters, subpops = [], []
for cultivar in topk:
    row = df.loc[df['Cultivar ID'] == cultivar]
    # .item() requires exactly one matching row and raises otherwise.
    subpopulation = row['Subpopulation'].item()
    cluster = sequence_cluster_dict[cultivar]
    print(cultivar, subpopulation, cluster)
    clusters.append(cluster)
    subpops.append(subpopulation)

W038 Temperate Japonica 37
W011 Temperate Japonica 25
C138 Temperate Japonica 37
C082 Temperate Japonica 36
C074 Temperate Japonica 39


In [28]:
from collections import Counter

# Majority vote over the clusters of the top-k sequences.
# Counter.most_common() orders equal counts by first occurrence, so a tie is
# resolved in favour of the better-ranked sequence instead of depending on
# set iteration order. It also replaces the repeated list.count() scans.
cluster_set = set(clusters)  # distinct clusters; kept in case another cell references it
cluster_freq = dict(Counter(clusters).most_common())
# Raises IndexError when `clusters` is empty, i.e. when there is nothing to vote on.
cluster_belong = list(cluster_freq)[0]
cluster_belong

37

In [31]:
from collections import Counter

# Majority vote over the subpopulations of the top-k sequences.
# The labels are strings, and the iteration order of a set of strings can
# change between interpreter runs (hash randomisation), which would make the
# winner of a tie non-reproducible. Counter.most_common() orders equal counts
# by first occurrence, so a tie always goes to the better-ranked sequence.
subpop_set = set(subpops)  # distinct labels; kept in case another cell references it
subpop_freq = dict(Counter(subpops).most_common())
# Raises IndexError when `subpops` is empty, i.e. when there is nothing to vote on.
subpop_belong = list(subpop_freq)[0]
subpop_belong

'Temperate Japonica'