In [4]:
import numpy as np
import pandas as pd

In [5]:
# Define tags for Adverb, Noun and To (the preposition) , respectively
tags = ['RB', 'NN', 'TO']

In [6]:
transition_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200
}

In [7]:
num_tags = len(tags)

# Initialize a 3X3 numpy array with zeros
transition_matrix = np.zeros((num_tags, num_tags))

# Print matrix
transition_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [8]:
transition_matrix.shape

(3, 3)

In [10]:
sorted_tags = sorted(tags)
sorted_tags

['NN', 'RB', 'TO']

In [11]:
for i in range(num_tags):
    # Loop columns
    for j in range(num_tags):
        # Define tag pair
        tag_tuple = (sorted_tags[i], sorted_tags[j])
        # Get frequency from transition_counts dict and assign to (i, j) position in the matrix
        transition_matrix[i, j] = transition_counts.get(tag_tuple)

# Print matrix
transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [12]:
# Define 'print_matrix' function
def print_matrix(matrix):
    print(pd.DataFrame(matrix, index=sorted_tags, columns=sorted_tags))

In [13]:
print_matrix(transition_matrix)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [15]:
transition_matrix = transition_matrix/10

print_matrix(transition_matrix)

        NN     RB     TO
NN  162.41  24.31  52.56
RB    3.58  22.63   8.55
TO    7.34   2.00   0.02


In [19]:
rows_sum = transition_matrix.sum(axis=1, keepdims=True)
rows_sum

array([[239.28],
       [ 34.76],
       [  9.36]])

In [21]:
import math

# Copy transition matrix for for-loop example
t_matrix_for = np.copy(transition_matrix)

# Copy transition matrix for numpy functions example
t_matrix_np = np.copy(transition_matrix)

print(t_matrix_np)

[[1.6241e+02 2.4310e+01 5.2560e+01]
 [3.5800e+00 2.2630e+01 8.5500e+00]
 [7.3400e+00 2.0000e+00 2.0000e-02]]


In [24]:
# Save diagonal in a numpy array
d = np.diag(t_matrix_np)

print(d)
# Print shape of diagonal
d.shape

[1.6241e+02 2.2630e+01 2.0000e-02]


(3,)

In [25]:
# Perform the vectorized operation
d = d + np.vectorize(math.log)(rows_sum)

# Use numpy's 'fill_diagonal' function to update the diagonal
np.fill_diagonal(t_matrix_np, d)

# Print the matrix
print_matrix(t_matrix_np)

            NN         RB         TO
NN  167.887634  24.310000  52.560000
RB    3.580000  28.107634   8.550000
TO    7.340000   2.000000   5.497634


In [28]:
# Check for equality
t_matrix_for == t_matrix_np

array([[False,  True,  True],
       [ True, False,  True],
       [ True,  True, False]])