In [3]:
import numpy as np
import pandas as pd


In [4]:
def build_markov_matrix(sequence):
    """Estimate a first-order Markov transition matrix from a DNA sequence.

    Every pair of adjacent symbols (curr, next) in which both symbols are
    one of a/c/g/t contributes one count to the transition curr -> next.
    Pairs that contain any other symbol (e.g. 'n', gaps, whitespace) are
    skipped. Matching is case-insensitive, so 'ACGT' and 'acgt' give the
    same result.

    Parameters
    ----------
    sequence : str or sequence of single-character str
        The nucleotide sequence to analyse.

    Returns
    -------
    pandas.DataFrame
        4x4 frame indexed (rows) by the current nucleotide and labelled
        (columns) by the next nucleotide, both in the order a, c, g, t.
        Each row with at least one observed transition sums to 1; a row
        with no observed transitions is left as all zeros.
    """
    # Define nucleotide indices
    nucleotides = ['a', 'c', 'g', 't']
    index = {nuc: i for i, nuc in enumerate(nucleotides)}

    # Lower-case string symbols so upper- or mixed-case input is counted
    # instead of being silently dropped by the lowercase-only lookup.
    symbols = [s.lower() if isinstance(s, str) else s for s in sequence]

    # Count transitions between each adjacent pair of symbols
    counts = np.zeros((4, 4), dtype=int)
    for curr, next_ in zip(symbols, symbols[1:]):
        if curr in index and next_ in index:
            counts[index[curr], index[next_]] += 1

    # Normalize each row to probabilities; `where` leaves rows with a zero
    # total untouched (all zeros) rather than dividing by zero.
    transition_matrix = counts.astype(float)
    row_sums = transition_matrix.sum(axis=1, keepdims=True)
    np.divide(transition_matrix, row_sums, out=transition_matrix, where=row_sums > 0)

    # Wrap in a labelled DataFrame (rows: current base, columns: next base)
    return pd.DataFrame(transition_matrix, columns=nucleotides, index=nucleotides)


The random DNA sequence below was generated with: https://www.bioinformatics.org/sms2/random_dna.html

In [5]:
# Example input: a lowercase DNA sequence over the alphabet a/c/g/t.
dna_seq = "ttgaatccctgtacgttaagtatatcacagtgttgtatgtcgagttgggtcgtagccaatacgtgcctccgtacagaggtctattttaactagtaggctcatttacttgagggactaatgtccaactcatattagcgttgggacgcgtaatggacggagccagcctaaggcgaaccgatggcatcaaatacggttgacgtccttatggggaagctcagggtagaagacagttttaacagatccctacggggcgccccttggcattagccagacctcggtgcaacatcagacttgttgggtttcaaataagtaccccgcctgtaaactcccgcgagccatgccgggtggagttactgcgttttgcggctcggagtataatgcctataaacgtctaccgcaaaatgaggatatgagggatctcaacctcgactctattaagcccagacgacgtgaaacaggggctactctctgatagccccatcgacatatagttcccgattaatattttaatttctatatagatcctcgggaagccgcctcgcgtcggttgcaggcattccaagagtatcccgctgtcagagatatgaggtggtgatatcattgacctatacttcgcaacggggatagacgttggcgggctcgcaccaagtcgattacatcaccgacccggatcagagcccgcgataacccataatatgcagagtcgactcacattcaggagccgtagcatcatcatcggctggcaatcgtacaaccccggggatcctaagccatccgttgctatcgagttatttgcgttcgacaataattgctgcttagtacggaacgtggacccgtagctgggaataatcagagtttccctttacgctcgatcgtgtgttcgacacgcgttgattgatgattactacaggaagagccaccgcgacaatcgcgtggctgcttctgcgcgcatcacgaggctaggataagcaaacatctacgcgatttttgcctgcgcgga"
# Estimate the 4x4 transition-probability table (rows: current base, columns: next base).
markov_df = build_markov_matrix(dna_seq)
# Print the table as plain text.
print(markov_df)


          a         c         g         t
a  0.205645  0.233871  0.250000  0.310484
c  0.232932  0.257028  0.301205  0.208835
g  0.267717  0.259843  0.251969  0.220472
t  0.290323  0.245968  0.213710  0.250000
