# Analysing for 24 chromosomes

merging data

In [None]:
import pandas as pd

In [None]:
# Load Cultivars.csv into a DataFrame and preview its first rows.
cultivars = pd.read_csv("Cultivars.csv")
cultivars.head()

In [None]:
# Load Height.csv into a DataFrame and preview its first rows.
heights = pd.read_csv("Height.csv")
heights.head()

In [None]:
# Read the genotype table; its 'Variation ID' column is renamed to
# 'Cultivar ID', the key the later merge with the height table uses.
genotype = pd.read_csv("Genotype_24chr.csv").rename(
    columns={'Variation ID': 'Cultivar ID'}
)
genotype.head()

In [None]:
# (rows, columns) of the genotype table as loaded.
genotype.shape

In [None]:
# Count the "DEL" calls in every SNP column. The first column and the
# first six rows are left out of the count, exactly as in a cell-by-cell scan.

del_hits = (genotype.iloc[6:, 1:] == "DEL").sum()
delCount = [(snp, int(hits)) for snp, hits in del_hits.items()]

# Columns with the most "DEL" calls come first.
print(sorted(delCount, key=lambda x : x[1], reverse=True))

In [None]:
# Count the "N" calls in every SNP column. The first column and the
# first six rows are left out of the count.
# NOTE: the result is still bound to the name `delCount`, although it now
# holds "N" counts rather than "DEL" counts.

n_hits = (genotype.iloc[6:, 1:] == "N").sum()
delCount = [(snp, int(hits)) for snp, hits in n_hits.items()]

# Columns with the most "N" calls come first.
print(sorted(delCount, key=lambda x : x[1], reverse=True))

In [None]:
# Preview the first rows of the genotype table again before splitting it.
genotype.head()

In [None]:
# Keep the first six rows of the genotype table aside as a header block.
# NOTE(review): rows 4 and 5 of that block (0-based) are presumably the
# primary and secondary allele of each SNP — confirm against the CSV layout.
genotypeHeader = genotype.iloc[:6, :]
primaryAlleleRow = 4
secondaryAlleleRow = 5
genotypeHeader

In [None]:
# Keep only the data rows, i.e. everything after the six header rows.
# .copy() makes this an independent frame: later cells modify `genotype`
# in place, and writing into a slice of the original table raises
# SettingWithCopyWarning and is not guaranteed to take effect.
genotype = genotype.iloc[6:, :].copy()
genotype

In [None]:
# Resolve the special calls in every SNP column (the first column, which
# holds the IDs, is skipped):
#   "DEL" -> the value in the secondary-allele row of that column
#   "N"   -> the value in the primary-allele row of that column

# Work on an independent frame so the assignments below neither warn about
# nor depend on `genotype` being a slice of another DataFrame.
genotype = genotype.copy()

for i in range(1, genotype.shape[1]):
    primaryAllele = genotypeHeader.iloc[primaryAlleleRow, i]
    secondaryAllele = genotypeHeader.iloc[secondaryAlleleRow, i]
    col = genotype.iloc[:, i]
    # Write the result back explicitly: an in-place replace on the extracted
    # column is chained assignment and does not reach `genotype` once pandas
    # Copy-on-Write is active. The two replacements stay sequential
    # ("DEL" first, then "N") to keep the original substitution order.
    genotype.iloc[:, i] = col.replace("DEL", secondaryAllele).replace("N", primaryAllele)

genotype

In [None]:
def extractVariantID(cultivarID):
    """Return the part of *cultivarID* that comes before its first space.

    The whole string is returned when it contains no space.
    """
    head, _, _ = cultivarID.partition(" ")
    return head

# Reduce every 'Cultivar ID' to the token before its first space.
genotype['Cultivar ID'] = genotype['Cultivar ID'].map(extractVariantID)
genotype

In [None]:
# Attach columns 1-3 of the height table to each genotype row, matching on
# 'Cultivar ID' (default inner join: only IDs present in both tables remain).
mapping = genotype.merge(heights.iloc[:, 1:4], on='Cultivar ID')

In [None]:
# Display the merged genotype/height table.
mapping

In [None]:
# Join the values of each row into one string, leaving out the first
# column and the last two columns, and store it as 'sequence'.
mapping['sequence'] = mapping.iloc[:, 1:-2].apply(''.join, axis=1)

In [None]:
# Display the table again, now including the 'sequence' column.
mapping

In [None]:
# Number of distinct values in the 'sequence' column (a missing value, if
# any, counts as one distinct value).
mapping['sequence'].nunique(dropna=False)

In [None]:
# Save the merged table (with the 'sequence' column) without the row index.
mapping.to_csv('mapping_24.csv', index=False)

In [None]:
# Copy of the merged table without the two listed SNP columns.
mapping_dropped = mapping.drop(columns=["vg0128525986", "vg0131664768"])

In [None]:
# Rebuild 'sequence' from the remaining columns: the first column and the
# last three (which include the previous 'sequence') are left out.
mapping_dropped['sequence'] = mapping_dropped.iloc[:, 1:-3].apply(''.join, axis=1)

In [None]:
# Save the reduced table without the row index.
mapping_dropped.to_csv('mapping_22.csv', index=False)

In [None]:
# Pair the first column of every row (the cultivar ID) with its last
# column (the joined sequence); keep both the list and a lookup by ID.
sequences = list(zip(mapping.iloc[:, 0], mapping.iloc[:, -1]))
sequences_dict = dict(sequences)
sequences_dict

In [None]:
filename = 'sequences_24.fasta'

# Write one FASTA record per (cultivar ID, sequence) pair: a '>' header
# line carrying the ID, followed by the sequence on its own line.
with open(filename, 'w') as f:
    for cultivar_id, sequence in sequences:
        print(f'>{cultivar_id}\n{sequence}', file=f)

In [None]:
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import Phylo
# script to run muscle command line

import subprocess

# Set the input and output file names
input_file = "sequences_24.fasta"
output_file = "aligned_24.fasta"

# MUSCLE command as an argument list (executed directly, without a shell)
muscle_command = ["muscle", "-align", input_file, "-output", output_file]

# Run MUSCLE, wait for it to finish and capture both output streams
process = subprocess.run(muscle_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.stdout, process.stderr

# Show what MUSCLE reported; undecodable bytes are replaced rather than
# raising, so the diagnostics are always visible
print(stdout.decode(errors="replace"))
print(stderr.decode(errors="replace"))

# Stop here on a non-zero exit status (raises subprocess.CalledProcessError)
# so that a failed alignment is not followed by reading a stale or missing
# output file
process.check_returncode()


In [None]:
# Read the aligned sequences produced by MUSCLE
alignment = AlignIO.read("aligned_24.fasta", "fasta")

# Pairwise distance matrix under the 'identity' model
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(alignment)

# Build the UPGMA tree from the matrix computed above.
# constructor.build_tree(alignment) would recompute the same distances from
# scratch; passing `dm` to upgma() yields the same tree without that work.
constructor = DistanceTreeConstructor(calculator, 'upgma')
tree = constructor.upgma(dm)

# Visualize the tree
# print(Phylo.draw_ascii(tree))

In [None]:
# Plot the tree with Bio.Phylo's drawing helper.
Phylo.draw(tree)

In [None]:
# Save the tree to tree_24.nwk in Newick format.
Phylo.write(tree, "tree_24.nwk", "newick")

In [None]:
def get_tuple(clade):
    """Return the nested-tuple representation of the subtree rooted at *clade*.

    A terminal clade is represented by its ``name``. An internal clade is
    represented by a tuple holding the representation of each of its
    children, in order; for a bifurcating node this is a 2-tuple.

    *clade* must provide ``is_terminal()``, ``name`` and ``clades``.
    """
    if clade.is_terminal():
        return clade.name
    # Recurse into every child, not just the first two, so that a node with
    # more than two children does not silently lose branches.
    return tuple(get_tuple(child) for child in clade.clades)

# Convert the whole tree, starting at its root clade, and show the result.
tree_tuple = get_tuple(tree.clade)

print(tree_tuple)

In [None]:
# Cut the textual form of the nested tuple into groups of leaf names.
# A group starts at an opening parenthesis and runs until the next opening
# parenthesis that follows at least one non-parenthesis character (or to
# the end of the string); the names inside it become one cluster.
s = str(tree_tuple).replace(" ", "").replace("'", "")
non_characters = ["(", ")", "'"]
clusters = []
i = 0
while i < len(s):
    if s[i] != "(":
        # Always make progress: without this step the loop never ends when
        # the text does not begin with "(" (e.g. a tree that is one leaf).
        i += 1
        continue
    clusterOpen = i
    flag = 0  # becomes 1 once a non-parenthesis character has been seen
    i += 1
    while i < len(s):
        if s[i] not in non_characters:
            flag = 1
        if s[i] == "(" and flag == 1:
            break
        i += 1
    # Leave out the character just before the stop position: the comma in
    # front of the next group, or the final closing parenthesis.
    clusterClose = i - 1
    cluster = s[clusterOpen:clusterClose].replace("(", "").replace(")", "").split(",")
    clusters.append(cluster)

clusters

In [None]:
# Number of clusters extracted from the tree.
len(clusters)

In [None]:
# Index the clusters by their position in the list (0, 1, 2, ...).
clusters_dict = dict(enumerate(clusters))
clusters_dict

In [None]:
# Reverse lookup: cultivar name -> index of the cluster it belongs to.
# A name listed in several clusters keeps the index of the last one.
sequence_cluster_dict = {
    cultivar: index
    for index, members in clusters_dict.items()
    for cultivar in members
}
sequence_cluster_dict

In [None]:
# Distinct sequences found among the members of each cluster.
# The sequences are sorted because the iteration order of a set of strings
# changes between interpreter runs, which would make the JSON export of
# this dictionary differ from run to run.
unique_sequences_per_cluster = {
    k: sorted({sequences_dict[cult] for cult in v})
    for k, v in clusters_dict.items()
}
unique_sequences_per_cluster

In [None]:
# Display the cultivar ID -> sequence lookup.
sequences_dict

In [None]:
import json

listX = [sequences_dict, clusters_dict, sequence_cluster_dict, unique_sequences_per_cluster]
listN = ['sequences_dict', 'clusters_dict', 'sequence_cluster_dict', 'unique_sequences_per_cluster']

# Write every dictionary to "<name>.json" and echo what was written.
# The data comes from listX; indexing the builtin `list` instead makes
# json.dump raise TypeError after the output file has been truncated.
for name, data in zip(listN, listX):
    print(name + ".json")
    with open(name + ".json", "w") as f:
        json.dump(data, f)
    print(data)
    print()


# for file in listN:
#     with open(str(file)+".json", "w") as f:
#         # Serialize the dictionary to a JSON string and write it to the file
#         json.dump(file, f)


In [46]:
import json
# Serialise the cultivar -> cluster-index lookup to JSON text and store it.
# NOTE(review): the target file name carries no ".json" extension.
with open("sequence_cluster_dict", "w") as f:
    f.write(json.dumps(sequence_cluster_dict))