# Language Detection

In [1]:
from glob import glob #importing library to load files

#reading texts from files and storing as lists
languages = ['A', 'B', 'C'] #languages
A = [open(f).read() for f in glob('symbol/language-training-langA*')] #train set
B = [open(f).read() for f in glob('symbol/language-training-langB*')] #train set
C = [open(f).read() for f in glob('symbol/language-training-langC*')] #train set
test = [open(f).read() for f in glob('symbol/language-test-*')] #test set

In [2]:
#extracting all the unique letters in training languages
letters = list({l for word in A+B+C for l in word})
letters

['e', 'g', 'k', 'o', 't', 'p', 'A']

Breaking down Bayes theorem components for our classifier

In [3]:
def posterior(lang, txt): #posterior, P(LANGUAGE|TEXT)
    return likelihood(txt, lang) * prior(lang) / marginalization(txt)

In [4]:
def likelihood(txt, lang): #likelihood, P(TEXT|LANGUAGE)
    likelihood = prob(txt[0], lang) #prior for the first letter
    for i in range(len(txt)-1):
        cur, nxt = txt[i], txt[i+1] #combo of two continuos letters at a time
        likelihood *= prob(cur + nxt, lang) #P(COMBO|LANGUAGE)
    return likelihood

In [5]:
def prior(language): #prior, P(LANGUAGE)
    return 1/len(languages)

In [6]:
def marginalization(txt): #marginalization, P(TEXT), denominator
    m = 0
    for lang in languages: m += likelihood(txt, lang)
    return m

In [7]:
def prob(txt, lang): #P(LETTER|LANGUAGE) or P(2 LETTERS|LANGUAGE) [markov]
    if len(txt) == 1: #letter occurence/occurence of all letters
        if lang == 'A': return tmA[txt].sum()/tmA.values.sum()
        elif lang == 'B': return tmB[txt].sum()/tmB.values.sum()
        elif lang == 'C': return tmC[txt].sum()/tmC.values.sum()
        else: print("Language not found!")
    else: #normalized value from transition matrices
        if lang == 'A': return tmA[txt[1]][txt[0]]
        elif lang == 'B': return tmB[txt[1]][txt[0]]
        elif lang == 'C': return tmC[txt[1]][txt[0]]
        else: print("Language not found!")

In [8]:
import pandas as pd #loading library
#markov model using current state to next state probabilities
def transitionMatrix(txts): #transition matrix generator for a list of texts
    tm = pd.DataFrame(index=letters,columns=letters)
    tm = tm.fillna(0) #empty matrix initialization
    for txt in txts:
        for i in range(len(txt)-1):
            cur, nxt = txt[i], txt[i+1]
            tm[nxt][cur] += 1
    return tm/tm.sum()

In [9]:
def classify(txt): #posterior distribution and classification
    posts=[]
    print("Text: ", txt)
    for lang in languages:
        post = (posterior(lang, txt))
        posts.append(post)
        print("Posterior for language", lang, ": " , post)
    print("Language Class:", languages[posts.index(max(posts))], '\n')

In [10]:
#training, transition matrices for all languages
tmA = transitionMatrix(A)
tmB = transitionMatrix(B)
tmC = transitionMatrix(C)

In [11]:
for txt in test: #classifying all the test cases
    classify(txt)

Text:  pppooootgAookggggtttopAtttkkkeeggeeeeAAAgtkoAkkkkkooppppttppppppgppteoooooottkttttkkkktAAApgookkkkkp
Posterior for language A :  2.1395507310550748e-61
Posterior for language B :  0.0
Posterior for language C :  0.3333333333333333
Language Class: C 

Text:  gooooAAAAAAAAAkkkkkkooooAAAeppppppgeeeeepAAppeektetttgggogptttttttkppAAAApetAeegggtttteetttttppAAAAA
Posterior for language A :  1.161719418110752e-68
Posterior for language B :  0.0
Posterior for language C :  0.3333333333333333
Language Class: C 

Text:  ekogoAgkepokogoppAttpAttgeekApegepApotpAAtpetgAtpopAttpAppAtkokettkgAttggokoogApppepogeApopogetpokog
Posterior for language A :  0.3333333333333333
Posterior for language B :  0.0
Posterior for language C :  3.7046111214919513e-35
Language Class: A 

Text:  AtAgegegegAgegetoAtetAogAooAoeAtegAgeotAoAgoetAteAteoegoeogetekoAoegAoegAtegAgeotAtAoAtetAtgeggAtAto
Posterior for language A :  7.248250988955608e-47
Posterior for language B :  0.33333333333333326
Posterior for languag

There is a weird pattern of obtaining 0.333... posterior for classified class

In [12]:
import numpy as np
file = open('speaker.txt').read() #loading file
phonemes = list({p for word in file for p in word}) #extracting unique
n_phonemes = len(phonemes)
print(phonemes)

['e', 'g', 'k', 'o', 't', 'p', 'A']


In [13]:
speakers = ['A', 'B', 'C']
n_speakers = len(speakers)
#random intial probablities distributed for speakers
initialProb = [0.3, 0.3, 0.4] 
print("Initial Probabilties: ", initialProb) #starting probabilities

Initial Probabilties:  [0.3, 0.3, 0.4]


In [14]:
#initializing transition matrix
#assuming uniform priors for interruption
transitionMatrix = np.full((n_speakers, n_speakers), 0.1)
for i in range(n_speakers):
    transitionMatrix[i][i] = 0.8
print("Transition Matrix: \n", transitionMatrix)

Transition Matrix: 
 [[0.8 0.1 0.1]
 [0.1 0.8 0.1]
 [0.1 0.1 0.8]]


In [15]:
emissionMatrix = np.zeros((n_speakers, n_phonemes)) #initializing emission matrix
for i in range(n_speakers):
    p = np.random.dirichlet(np.ones(n_phonemes)) #random & normalized
    emissionMatrix[i,:]= p[0]
print("Emission Matrix: \n", emissionMatrix)

Emission Matrix: 
 [[0.21835967 0.21835967 0.21835967 0.21835967 0.21835967 0.21835967
  0.21835967]
 [0.24487286 0.24487286 0.24487286 0.24487286 0.24487286 0.24487286
  0.24487286]
 [0.02955606 0.02955606 0.02955606 0.02955606 0.02955606 0.02955606
  0.02955606]]


In [28]:
data = []
#extracting characters and storing their ASCII value, data prep for HMM model
for i in range(len(file)):
    data.append(ord(file[i]))
#I got an error and jupyter suggested me to reshape data and it worked
data = np.asarray(data).reshape(-1,1)

In [30]:
from hmmlearn import hmm #loading library
model = hmm.MultinomialHMM(n_components=n_speakers)
model.startprob = initialProb #inital probability
model.transmat = transitionMatrix #transition matrix
model.emissionprob = emissionMatrix #emission matrix
model.fit(data) #data
print(model.predict(data))

[2 2 0 0 0 1 1 1 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 1 1
 0 1 1 1 1 1 1 1 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 0 1 1 1 1 1 0 0 0 2 2 2 2 2 2 2 0 1
 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1
 0 2 2 2 2 2 2 2 0 0 1 1 1 0 2 2 2 2 2 2 2 0 1 1 1 1 1 0 0 2 2 2 2 2 2 2 2
 2 2 2 2 2 0 1 1 1 1 1 1 1 0 0 2 2 2 2 2 2 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 2 2 2 2 0 0 1 1 0 1 1 1
 1 1 0 2 2 2 2 2 2 2 2 2 0 0 0 1 1 0 2 2 2 2 2 2 2 2 2 2 2 0 1 1 0 1 1 0 0
 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 1 0 0 2 2 2
 2 2 2 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 0 0 2 2 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0
 2 2 2 2 2 2 0 0 0 0 0 0 

Reference: https://github.com/madsbk/hidden-markov-model-example