# Text segmentation using a Hidden Markov Model

## Rémi JAYLET & Hugues RENE-BAZIN

### Imports

In [27]:
from matplotlib import pyplot as plt
import numpy as np
import os

### Question 1

The initial vector is $\pi = \begin{pmatrix}1 & 0\end{pmatrix}$, since every mail is assumed to begin in state 1 (hypothesis).

### Question 2

The probability of moving from state 1 to state 2 is:
$A[1,2] = 0.000781921964187974$.


The probability of remaining in state 2 is:
$A[2,2] = 1$.

The lowest probability is the transition from state 2 to state 1: it is always 0, because once state 2 is reached the model can never go back to state 1.

The highest probability is the transition from state 2 to state 2: it is always 1.





### Question 3

In [6]:
# Read the table stored in P.dat (rows: characters, columns: states).
P = np.loadtxt("PerlScriptAndModel/PerlScriptAndModel/P.dat")

In [7]:
# Report the dimensions of the table loaded from P.dat.
print(f"The size of B is {np.shape(P)} (number of caracters, number of states)")

The size of B is (256, 2) (number of caracters, number of states)


### Question 4

#### Initialization

In [10]:
# Initial state distribution: all the probability mass is on the first state.
Pi0 = np.array([1, 0])

# Transition matrix (row = current state, column = next state).
A = np.array([
    [0.999218078035812, 0.000781921964187974],
    [0, 1],
])

# Tables read from the model directory; B is the one handed to viterbi() later on.
P = np.loadtxt("PerlScriptAndModel/PerlScriptAndModel/P.dat")
B = np.loadtxt("PerlScriptAndModel/PerlScriptAndModel/P.text")

#### Implementation of Viterbi function

In [15]:
def viterbi(X,Pi0,A,P):
    """
        Viterbi algorithm (log domain): most likely hidden-state sequence of an HMM.

        Arguments:
            - X: 1-D sequence of T integer observations; each value is used as a
              row index into P
            - Pi0: vector of the N initial state probabilities
            - A: (N, N) transition matrix, A[i, j] = probability of moving from
              state i to state j
            - P: emission probability matrix, P[x, j] = probability of observing
              symbol x while in state j
        Returns:
            - logl: (T, N) array, logl[t, j] = log-probability of the best state
              sequence that ends in state j at time t
            - path: (T,) integer array with the most likely state sequence
              (0-based state indices)
    """

    # Add the smallest positive double so that log(0) never yields -inf.
    realmin = np.finfo(np.double).tiny
    A = np.log(np.asarray(A, dtype=float) + realmin)
    Pi0 = np.log(np.asarray(Pi0, dtype=float) + realmin)
    P = np.log(np.asarray(P, dtype=float) + realmin)

    X = np.atleast_1d(X)  # also accepts a list or a single (0-d) observation
    T = X.shape[0]        # number of observations
    N = Pi0.shape[0]      # number of states in the model

    # Nothing to decode: return empty results instead of failing on X[0].
    if T == 0:
        return np.zeros((0, N)), np.zeros(0, dtype=int)

    # Initialisation
    logl = np.zeros((T, N))
    # bcktr[t, j] = best predecessor (at time t) of state j at time t+1.
    # Stored as integers because it is used to index states during backtracking.
    bcktr = np.zeros((T - 1, N), dtype=int)

    logl[0, :] = Pi0 + P[X[0], :]

    # Forward recursion. It must start at t = 0: starting at t = 1 would leave
    # logl[1, :] and bcktr[0, :] at their zero initial value, discarding the
    # initial distribution and the first emission.
    for t in range(T - 1):
        # temp[i, j] = score of being in state i at time t and moving to state j
        temp = A + logl[t, :].reshape((N, 1))
        bcktr[t, :] = np.argmax(temp, axis=0)
        logl[t + 1, :] = P[X[t + 1], :] + np.amax(temp, axis=0)

    # Backtracking from the best final state
    path = np.zeros(T, dtype=int)
    path[T - 1] = np.argmax(logl[T - 1, :])

    for t in range(T - 2, -1, -1):
        path[t] = bcktr[t, path[t + 1]]

    return logl, path

#### Results

In [16]:
# Decode mails 11 to 30: keep each observation sequence and its state path.
mails = []
paths = []
for i in range(11, 31):
    mails.append(np.loadtxt(f"dat/dat/mail{i}.dat", dtype=int))
    # viterbi returns (logl, path); only the path of the mail just loaded is kept
    paths.append(viterbi(mails[-1], Pi0, A, B)[1])

#### Visualizing segmentation

In [29]:
print("id :", 15)  # choose a mail id
path = paths[15-11]  # paths[0] corresponds to mail 11
path_file = "path.txt"

# Dump the path as one digit per observation, with states numbered from 1.
with open(path_file, "w") as file:
    file.writelines(str(value + 1) for value in path)

# Report every position where the decoded state differs from the previous one.
for i, (previous, current) in enumerate(zip(path[:-1], path[1:]), start=1):
    if previous != current:
        print("Body starts at :", i)

id : 15
Body starts at : 2183


In [30]:
# Run the provided Perl script on the mail text and on the state path written
# to path_file; its standard output is redirected to mail.txt.
command = (
    "perl PerlScriptAndModel/PerlScriptAndModel/segment.pl dat/dat/mail"
    + str(15) + ".txt " + path_file + "> mail.txt"
)
status = os.system(command)

# os.system reports failure only through its return value; without this check
# a failed run would silently display an empty or incomplete mail.txt.
if status != 0:
    raise RuntimeError(f"segment.pl failed (exit status {status}): {command}")

with open("mail.txt", "r") as file:
    print(file.read())


