### Question 2: Building Markov transition matrix
You have learned the important differences between a simple dinucleotide frequency model and a Markov model. Your goal is to build a Markov model from the input sequence. In the Jupyter notebook, you should print the transition matrix.

In [23]:
# Imports
from collections import defaultdict
import numpy as np
import pandas as pd

# Read and flatten the DNA sequence from singleline_output.fasta
def read_input(path):
    """Load a FASTA file and return its sequence as one string of A/C/G/T.

    Lines that begin with '>' are skipped. Every other line is stripped of
    surrounding whitespace and upper-cased, and only the characters A, C, G
    and T are kept. The surviving characters of all lines are concatenated
    in file order, so bases on either side of a dropped character (or of a
    skipped '>' line) end up adjacent in the result.

    Parameters
    ----------
    path : path to the FASTA file, opened in text mode.

    Returns
    -------
    str
        The concatenated, upper-case A/C/G/T characters; '' if none remain.
    """
    valid_bases = 'ACGT'
    cleaned_lines = []
    with open(path, 'r') as fasta:
        for raw_line in fasta:
            # '>' marks a line to skip; it contributes nothing to the output.
            if raw_line.startswith('>'):
                continue
            normalized = raw_line.strip().upper()
            cleaned_lines.append(
                ''.join(base for base in normalized if base in valid_bases)
            )
    return ''.join(cleaned_lines)
sequence = read_input('singleline_output.fasta')

# Build transition counts: transition_counts[a][b] is the number of times
# base b immediately follows base a in the sequence.
nucleotides = 'ACGT'
transition_counts = defaultdict(lambda: defaultdict(int))

# Pair every position with its successor; the last base has no successor,
# so a sequence of length n yields n - 1 pairs (none when n < 2).
for current_base, next_base in zip(sequence, sequence[1:]):
    transition_counts[current_base][next_base] += 1

# Convert to normalized transition matrix: row i holds the distribution of
# the base that follows nucleotides[i], i.e. count(from -> to) / count(from -> any).
transition_matrix = np.zeros((4, 4))

for row, from_base in enumerate(nucleotides):
    outgoing = transition_counts[from_base]
    row_total = sum(outgoing.values())
    # A base that never occurs as a predecessor keeps its all-zero row;
    # skipping it also avoids dividing by zero.
    if row_total == 0:
        continue
    for col, to_base in enumerate(nucleotides):
        transition_matrix[row, col] = outgoing[to_base] / row_total

 ### Properties of transition matrices:
      1. tij represents the entry in row i and column j
      2. tij = the probability of moving from the state represented by row i to the state represented by column j in a single transition.
      3. tij is a conditional probability which we can write as tij = P(next state is the state in column j | current state is the state in row i)
      4. Each row adds to 1.
      5. All entries are between 0 and 1, inclusive, because they are probabilities.
      6. The transition matrix represents change over one transition period.

In [24]:
# Display as Markov Transition Matrix
# Row labels are the "from" base and column labels are the "to" base; both
# axes use the same A, C, G, T labels.
base_labels = list('ACGT')
df = pd.DataFrame(transition_matrix, index=base_labels, columns=base_labels)

print("Markov Transition Matrix (rows = from, columns = to):")
# Round only for display; transition_matrix itself keeps full precision.
print(df.round(4))

Markov Transition Matrix (rows = from, columns = to):
        A       C       G       T
A  0.2509  0.2426  0.2648  0.2417
C  0.2465  0.2544  0.2449  0.2541
G  0.2518  0.2474  0.2556  0.2452
T  0.2709  0.2374  0.2502  0.2414
