In [1]:
!pip install -U scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
!tar -xvf languageID.tgz

languageID/
languageID/e0.txt
languageID/e10.txt
languageID/e11.txt
languageID/e12.txt
languageID/e13.txt
languageID/e14.txt
languageID/e15.txt
languageID/e16.txt
languageID/e17.txt
languageID/e18.txt
languageID/e19.txt
languageID/e1.txt
languageID/e2.txt
languageID/e3.txt
languageID/e4.txt
languageID/e5.txt
languageID/e6.txt
languageID/e7.txt
languageID/e8.txt
languageID/e9.txt
languageID/j0.txt
languageID/j10.txt
languageID/j11.txt
languageID/j12.txt
languageID/j13.txt
languageID/j14.txt
languageID/j15.txt
languageID/j16.txt
languageID/j17.txt
languageID/j18.txt
languageID/j19.txt
languageID/j1.txt
languageID/j2.txt
languageID/j3.txt
languageID/j4.txt
languageID/j5.txt
languageID/j6.txt
languageID/j7.txt
languageID/j8.txt
languageID/j9.txt
languageID/s0.txt
languageID/s1.txt
languageID/s10.txt
languageID/s11.txt
languageID/s12.txt
languageID/s13.txt
languageID/s14.txt
languageID/s15.txt
languageID/s16.txt
languageID/s17.txt
languageID/s18.txt
languageID/s19.txt
languageID/s2.txt
lang

In [3]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

**Prior Probability:**

Treating texts 0-9 from each language as training set, we have 30 documents in total. So $\hat{p}(y=e)=\frac{1}{3}$, $\hat{p}(y=s)=\frac{1}{3}$ and $\hat{p}(y=j)=\frac{1}{3}$. Using additive smoothing, we have the probabilities as, 

$\hat{p}(y=e)= \frac{10 + 0.5}{30 + 3*0.5}=\frac{11.5}{31.5}=0.33$
$\hat{p}(y=s)= \frac{10 + 0.5}{30 + 3*0.5}=\frac{11.5}{31.5}=0.33$
$\hat{p}(y=j)= \frac{10 + 0.5}{30 + 3*0.5}=\frac{11.5}{31.5}=0.33$

In [10]:
character_list = [' ','a', 'b', 'c', 'd', 'e', 'f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
char_probs_dict_en = {}
char_probs_dict_jap = {}
char_probs_dict_s = {}

char_count_test_en = {}
char_count_test_jap = {}
char_count_test_s = {}

for char in character_list:
  char_probs_dict_en[char] = 0
  char_probs_dict_jap[char] = 0
  char_probs_dict_s[char] = 0

  char_count_test_en[char] = 0
  char_count_test_jap[char] = 0
  char_count_test_s[char] = 0


In [11]:
def reinitialize_dict(char_dict):
  for key in char_dict.keys():
    char_dict[key] = 0

def additive_smoothing(char_dict, char_count_sum):
  for key in char_dict.keys():
    if char_dict[key] == 0.0:
      char_dict[key] = 0.5/(char_count_sum+13.5)

#Get character count in set of files


In [12]:
def get_char_count(file_list):
  char_count = {}
  for filename in file_list:
    with open(filename) as f:
      while True:
        c = f.read(1)
        if not c:
          break
        if c!='\n':
          if c not in char_count:
            char_count[c] = 1
          else:
            char_count[c] +=1
  return char_count

#Class Conditional Probability of English


In [13]:
en_file_list = []
for i in range(0,10):
   en_file_list.append("languageID/e"+str(i)+".txt")
print(en_file_list)


['languageID/e0.txt', 'languageID/e1.txt', 'languageID/e2.txt', 'languageID/e3.txt', 'languageID/e4.txt', 'languageID/e5.txt', 'languageID/e6.txt', 'languageID/e7.txt', 'languageID/e8.txt', 'languageID/e9.txt']


#Use vectorizer to get frequency count and features

In [16]:
vectorizer_en = CountVectorizer(analyzer='char',input="filename")
char_vector_en = vectorizer_en.fit_transform(en_file_list)
print(vectorizer_en.get_feature_names_out())
char_array_en = char_vector_en.toarray()
char_count_dict = get_char_count(en_file_list)
char_count_en = char_array_en.sum(axis=0)
char_count_en[0]=char_count_dict[' ']
char_probs_en = (char_count_en+0.5) / (char_count_en.sum()+13.5)



[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


**`Get class conditional probabilities`**

In [17]:
for char, char_prob in zip(list(vectorizer_en.get_feature_names_out()), char_probs_en):
  char_probs_dict_en[char] = char_prob
additive_smoothing(char_probs_dict_en,char_count_en.sum())
char_probs_en = list(char_probs_dict_en.values())

In [18]:
char_probs_dict_en

{' ': 0.1792499586981662,
 'a': 0.0601685114819098,
 'b': 0.011134974392863043,
 'c': 0.021509995043779945,
 'd': 0.021972575582355856,
 'e': 0.1053692383941847,
 'f': 0.018932760614571286,
 'g': 0.017478936064761277,
 'h': 0.047216256401784236,
 'i': 0.055410540227986124,
 'j': 0.001420783082768875,
 'k': 0.0037336857756484387,
 'l': 0.028977366595076822,
 'm': 0.020518751032545846,
 'n': 0.057921691723112505,
 'o': 0.06446390219725756,
 'p': 0.01675202378985627,
 'q': 0.0005617049396993227,
 'r': 0.053824549810011564,
 's': 0.06618205848339666,
 't': 0.08012555757475633,
 'u': 0.026664463902197257,
 'v': 0.009284652238559392,
 'w': 0.015496448042293078,
 'x': 0.001156451346439782,
 'y': 0.013844374690236246,
 'z': 0.0006277878737815959}

# Class conditional probabilities of Japanese

In [19]:
jap_file_list = []
for i in range(0,10):
   jap_file_list.append("languageID/j"+str(i)+".txt")
print(jap_file_list)



['languageID/j0.txt', 'languageID/j1.txt', 'languageID/j2.txt', 'languageID/j3.txt', 'languageID/j4.txt', 'languageID/j5.txt', 'languageID/j6.txt', 'languageID/j7.txt', 'languageID/j8.txt', 'languageID/j9.txt']


#Use vectorizer to get frequency count and features

In [20]:
vectorizer_jap = CountVectorizer(analyzer='char',input="filename")
char_vector_jap = vectorizer_jap.fit_transform(jap_file_list)
print(vectorizer_jap.get_feature_names_out())
char_array_jap = char_vector_jap.toarray()
char_count_dict = get_char_count(jap_file_list)
char_count_jap = char_array_jap.sum(axis=0)
char_count_jap[0] = char_count_dict[' ']
char_probs_jap = (char_count_jap+ 0.5)  / (char_count_jap.sum() + 13.5)



[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'y' 'z']


**`Get class conditional probabilities`**

In [21]:
for char, char_prob in zip(list(vectorizer_jap.get_feature_names_out()), char_probs_jap):
  char_probs_dict_jap[char] = char_prob
additive_smoothing(char_probs_dict_jap,char_count_en.sum())
char_probs_jap = list(char_probs_dict_jap.values())

In [22]:
char_probs_dict_jap

{' ': 0.12344945665466997,
 'a': 0.1317656102589189,
 'b': 0.010866906600510151,
 'c': 0.005485866033054963,
 'd': 0.01722631818022992,
 'e': 0.06020475907613823,
 'f': 0.003878542227191726,
 'g': 0.014011670568503443,
 'h': 0.03176211607673224,
 'i': 0.09703343932352633,
 'j': 0.0023411020650616725,
 'k': 0.05740941332681086,
 'l': 0.001432614696530277,
 'm': 0.03979873510604843,
 'n': 0.05671057688947902,
 'o': 0.09116321324993885,
 'p': 0.0008735455466648031,
 'q': 0.00010482546559977637,
 'r': 0.04280373178657535,
 's': 0.0421747789929767,
 't': 0.056990111464411755,
 'u': 0.07061742199238269,
 'v': 0.0002445927530661449,
 'w': 0.01974212935462455,
 'x': 3.3041467041136624e-05,
 'y': 0.01415143785596981,
 'z': 0.00772214263251686}

# Class conditional probabilities of Spanish


In [23]:
span_file_list = []
for i in range(0,10):
   span_file_list.append("languageID/s"+str(i)+".txt")
print(span_file_list)



['languageID/s0.txt', 'languageID/s1.txt', 'languageID/s2.txt', 'languageID/s3.txt', 'languageID/s4.txt', 'languageID/s5.txt', 'languageID/s6.txt', 'languageID/s7.txt', 'languageID/s8.txt', 'languageID/s9.txt']


#Use vectorizer to get frequency count and features

In [24]:
vectorizer_s = CountVectorizer(analyzer='char',input="filename")
char_vector_s = vectorizer_s.fit_transform(span_file_list)
print(vectorizer_s.get_feature_names_out())
char_array_s = char_vector_s.toarray()
char_count_dict = get_char_count(span_file_list)
char_count_s = char_array_s.sum(axis=0)
char_count_s[0] = char_count_dict[' ']
char_probs_s = (char_count_s+0.5) / (char_count_s.sum() + 13.5)



[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


Get class conditional probabilities



In [25]:
for char, char_prob in zip(list(vectorizer_s.get_feature_names_out()), char_probs_s):
  char_probs_dict_s[char] = char_prob
additive_smoothing(char_probs_dict_s,char_count_en.sum())
char_probs_s = list(char_probs_dict_s.values())

In [26]:
char_probs_dict_s

{' ': 0.16826493170115014,
 'a': 0.10456045141993771,
 'b': 0.008232863618143134,
 'c': 0.03752582405722919,
 'd': 0.039745922111559924,
 'e': 0.1138108599796491,
 'f': 0.00860287996053159,
 'g': 0.0071844839813758445,
 'h': 0.0045327001942585795,
 'i': 0.049859702136844375,
 'j': 0.006629459467793161,
 'k': 0.0002775122567913416,
 'l': 0.052943171656748174,
 'm': 0.02580863988159477,
 'n': 0.054176559464709693,
 'o': 0.07249236841293824,
 'p': 0.02426690512164287,
 'q': 0.007677839104560451,
 'r': 0.05929511886774999,
 's': 0.06577040485954797,
 't': 0.03561407295488884,
 'u': 0.03370232185254849,
 'v': 0.00588942678301625,
 'w': 9.250408559711388e-05,
 'x': 0.0024976103111220747,
 'y': 0.007862847275754679,
 'z': 0.0026826184823163022}

#Prediction for test file - e10.txt

##Using vectorizer to get features and frequency count


In [27]:

char_vector_en_test = vectorizer_en.transform(["languageID/e10.txt"])
print(vectorizer_en.get_feature_names_out())
char_array_en_test = char_vector_en_test.toarray()
char_count_dict = get_char_count(["languageID/e10.txt"])
char_count_en_test = char_array_en_test.sum(axis=0)
char_count_en_test[0] = char_count_dict[' ']
for char, char_count in zip(list(vectorizer_en.get_feature_names_out()), char_count_en_test):
  char_count_test_en[char] = char_count


[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


###Calculate $\hat{p}(x \mid y=e)$

In [28]:
total_prob_en = 0
for char_prob, char_count in zip(char_probs_en,list(char_count_test_en.values())):
  total_prob_en += char_count * np.log(char_prob)
  # print(np.log(char_prob))
print(total_prob_en)

-7841.865447060634


###Calculate $\hat{p}(x \mid y=j)$

In [29]:
total_prob_jap = 0
for char_prob, char_count in zip(char_probs_jap,list(char_count_test_en.values())):
  total_prob_jap += char_count * np.log(char_prob)
  # print(np.log(char_prob))
print(total_prob_jap)

-8771.65676346074


###Calculate $\hat{p}(x \mid y=s)$

In [30]:
total_prob_s = 0
for char_prob, char_count in zip(char_probs_s,list(char_count_test_en.values())):
  total_prob_s += char_count * np.log(char_prob)
  # print(np.log(char_prob))
print(total_prob_s)

-8467.282044010557


**Posterior Probabilities**

Calculate $\hat{p}(y=e \mid x)$

In [31]:
log_prior_en = np.log(1/3)
log_posterior_en = total_prob_en +log_prior_en
print(log_posterior_en)

-7842.964059349302


Calculate $\hat{p}(y=j \mid x)$

In [32]:
log_prior_jap = np.log(1/3)
log_posterior_jap = total_prob_jap +log_prior_jap
print(log_posterior_jap)

-8772.755375749408


Calculate $\hat{p}(y=s \mid x)$

In [33]:
log_prior_s = np.log(1/3)
log_posterior_s = total_prob_s +log_prior_s
print(log_posterior_s)

-8468.380656299225


Predicted label is "english"

In [34]:
max(log_posterior_en,log_posterior_jap,log_posterior_s)

-7842.964059349302

**Log to base 10**

In [35]:
total_prob_en = 0
for char_prob, char_count in zip(char_probs_en,list(char_count_test_en.values())):
  total_prob_en += char_count * np.log10(char_prob)
  # print(np.log(char_prob))
print(total_prob_en)

-3405.6788914862113


In [36]:
total_prob_jap = 0
for char_prob, char_count in zip(char_probs_jap,list(char_count_test_en.values())):
  total_prob_jap += char_count * np.log10(char_prob)
  # print(np.log(char_prob))
print(total_prob_jap)

-3809.482129520336


In [37]:
total_prob_s = 0
for char_prob, char_count in zip(char_probs_s,list(char_count_test_en.values())):
  total_prob_s += char_count * np.log10(char_prob)
  # print(np.log(char_prob))
print(total_prob_s)

-3677.2938684322726


In [38]:
log_prior_en = np.log10(1/3)
log_posterior_en = total_prob_en +log_prior_en
print(log_posterior_en)

-3406.156012740931


In [39]:
log_prior_jap = np.log10(1/3)
log_posterior_jap = total_prob_jap +log_prior_jap
print(log_posterior_jap)

-3809.959250775056


In [40]:
log_prior_s = np.log10(1/3)
log_posterior_s = total_prob_s +log_prior_s
print(log_posterior_s)

-3677.7709896869924


In [41]:
max(log_posterior_en,log_posterior_jap,log_posterior_s)

-3406.156012740931

# Get Classification Performance - Confusion Matrix


In [42]:
def compute_en_prob(char_probs_en,char_count_en):
  total_prob_en = 0
  for char_prob, char_count in zip(char_probs_en,char_count_en):
    total_prob_en += char_count * np.log10(char_prob)
    # print(np.log(char_prob))
  
  print("Log likelihood is" + str(total_prob_en))
  log_prior_en = np.log10(1/3)
  log_posterior_en = total_prob_en +log_prior_en
  print("Log posterior is "+str(log_posterior_en))
  return log_posterior_en

In [43]:
def compute_jap_prob(char_probs_jap, char_count_jap):
  total_prob_jap = 0
  for char_prob, char_count in zip(char_probs_jap,char_count_jap):
    total_prob_jap += char_count * np.log10(char_prob)
    # print(np.log(char_prob))
  print("Log likelihood is" + str(total_prob_jap))
  log_prior_jap = np.log10(1/3)
  log_posterior_jap = total_prob_jap +log_prior_jap
  print("Log posterior is "+str(log_posterior_jap))
  return log_posterior_jap
  

In [44]:
def compute_spanish_prob(char_probs_s, char_count_s):
  total_prob_s = 0
  for char_prob, char_count in zip(char_probs_s,char_count_s):
    total_prob_s += char_count * np.log10(char_prob)
  print("Log likelihood is" + str(total_prob_s))
  log_prior_s = np.log10(1/3)
  log_posterior_s = total_prob_s +log_prior_s
  print("Log posterior is "+str(log_posterior_s))
  return log_posterior_s



##Performance on English test set

In [45]:
en_test_files = []
count_en = 0
count_jap = 0
count_span = 0
for i in range(10,20):
  log_posteriors = []
  reinitialize_dict(char_count_test_en)
  char_vector_en_test = vectorizer_en.transform(["languageID/e"+str(i)+".txt"])
  print(vectorizer_en.get_feature_names_out())
  char_array_en_test = char_vector_en_test.toarray()
  char_count_en_test = char_array_en_test.sum(axis=0)
  for char, char_count in zip(list(vectorizer_en.get_feature_names_out()), char_count_en_test):
    char_count_test_en[char] = char_count

  char_count_en_test = list(char_count_test_en.values())



  print("English")
  log_posteriors.append(compute_en_prob(char_probs_en,char_count_en_test))
  print("Japanese")
  log_posteriors.append(compute_jap_prob(char_probs_jap,char_count_en_test))
  print("Spanish")
  log_posteriors.append(compute_spanish_prob(char_probs_s,char_count_en_test))
  print(np.argmax(log_posteriors))
  if np.argmax(log_posteriors) == 0:
    count_en+=1
  elif np.argmax(log_posteriors) == 1:
    count_jap+=1
  else:
    count_span+=1
  








[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
English
Log likelihood is-3401.1996458718436
Log posterior is -3401.6767671265634
Japanese
Log likelihood is-3804.031064617054
Log posterior is -3804.508185871774
Spanish
Log likelihood is-3672.6498301136667
Log posterior is -3673.1269513683865
0
[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
English
Log likelihood is-4057.515162165417
Log posterior is -4057.992283420137
Japanese
Log likelihood is-4518.057491168433
Log posterior is -4518.534612423153
Spanish
Log likelihood is-4365.826788920477
Log posterior is -4366.3039101751965
0
[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
English
Log likelihood is-2297.698961567596
Log posterior is -2298.176082822316
Japanese
Log likelihood is-2537.498548748493
Log posterior is -2537.9756700032126
Spanish
Log li

In [46]:
print(count_en)
print(count_jap)
print(count_span)

10
0
0


**All english test documents are labeled "English"**

##Performance on Japanese test set

In [47]:


count_en = 0
count_jap = 0
count_span = 0
for i in range(10,20):
  log_posteriors = []
  reinitialize_dict(char_count_test_jap)
  char_vector_jap_test = vectorizer_jap.fit_transform(["languageID/j"+str(i)+".txt"])
  print(vectorizer_jap.get_feature_names_out())
  char_array_jap_test = char_vector_jap_test.toarray()
  char_count_jap_test = char_array_jap_test.sum(axis=0)

  for char, char_count in zip(list(vectorizer_jap.get_feature_names_out()), char_count_jap_test):
    char_count_test_jap[char] = char_count

  char_count_jap_test = list(char_count_test_jap.values())


  print("English")
  log_posteriors.append(compute_en_prob(char_probs_en,char_count_jap_test))
  print("Japanese")
  log_posteriors.append(compute_jap_prob(char_probs_jap,char_count_jap_test))
  print("Spanish")
  log_posteriors.append(compute_spanish_prob(char_probs_s,char_count_jap_test))
  print(np.argmax(log_posteriors))
  if np.argmax(log_posteriors) == 0:
    count_en+=1
  elif np.argmax(log_posteriors) == 1:
    count_jap+=1
  else:
    count_span+=1
  








[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'r' 's'
 't' 'u' 'v' 'w' 'y' 'z']
English
Log likelihood is-1980.5736122302806
Log posterior is -1981.0507334850001
Japanese
Log likelihood is-1803.9286991222164
Log posterior is -1804.405820376936
Spanish
Log likelihood is-2180.9279016744276
Log posterior is -2181.4050229291474
1
[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'm' 'n' 'o' 'r' 's' 't'
 'u' 'w' 'y' 'z']
English
Log likelihood is-2005.4666282260337
Log posterior is -2005.9437494807532
Japanese
Log likelihood is-1793.3103962363818
Log posterior is -1793.7875174911014
Spanish
Log likelihood is-2202.608825036029
Log posterior is -2203.085946290749
1
[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'm' 'n' 'o' 'r' 's' 't'
 'u' 'w' 'y' 'z']
English
Log likelihood is-1664.5049729603352
Log posterior is -1664.9820942150548
Japanese
Log likelihood is-1499.0076728675476
Log posterior is -1499.4847941222672
Spanish
Log likelihood is-1818.0878168022778
Log posterior 

In [48]:
print(count_en)
print(count_jap)
print(count_span)

0
10
0


**All files in Japanese test set are labeled Japanese**

##Performance on Spanish test set

In [49]:
vectorizer_test_s = CountVectorizer(analyzer='char',input="filename")

count_en = 0
count_jap = 0
count_span = 0
for i in range(10,20):
  log_posteriors = []
  reinitialize_dict(char_count_test_en)
  char_vector_s_test = vectorizer_test_s.fit_transform(["languageID/s"+str(i)+".txt"])
  print(vectorizer_test_s.get_feature_names_out())
  char_array_s_test = char_vector_s_test.toarray()
  char_count_s_test = char_array_s_test.sum(axis=0)

  for char, char_count in zip(list(vectorizer_s.get_feature_names_out()), char_count_s_test):
    char_count_test_s[char] = char_count
  
  char_count_s_test = list(char_count_test_s.values())

  
  print("English")
  log_posteriors.append(compute_en_prob(char_probs_en,char_count_s_test))
  print("Japanese")
  log_posteriors.append(compute_jap_prob(char_probs_jap,char_count_s_test))
  print("Spanish")
  log_posteriors.append(compute_spanish_prob(char_probs_s,char_count_s_test))
  print(np.argmax(log_posteriors))
  if np.argmax(log_posteriors) == 0:
    count_en+=1
  elif np.argmax(log_posteriors) == 1:
    count_jap+=1
  else:
    count_span+=1
  








[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'x' 'y' 'z']
English
Log likelihood is-2148.8904120418783
Log posterior is -2149.367533296598
Japanese
Log likelihood is-2417.077772191373
Log posterior is -2417.5548934460926
Spanish
Log likelihood is-2050.1735065543876
Log posterior is -2050.6506278091074
2
[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'l' 'm' 'n' 'o' 'p' 'q' 'r'
 's' 't' 'u' 'v' 'x' 'y' 'z']
English
Log likelihood is-879.9636967825646
Log posterior is -880.4408180372843
Japanese
Log likelihood is-909.3125388149
Log posterior is -909.7896600696197
Spanish
Log likelihood is-869.9701404630113
Log posterior is -870.447261717731
2
[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'l' 'm' 'n' 'o' 'p' 'q' 'r'
 's' 't' 'u' 'v' 'x' 'y' 'z']
English
Log likelihood is-2794.590115006536
Log posterior is -2795.067236261256
Japanese
Log likelihood is-2946.6848327038565
Log posterior is -2947.1619539585763
Spanish
Log likelihood is-2723.1922236

In [50]:
print(count_en)
print(count_jap)
print(count_span)

0
0
10


**All Spanish documents are labeled Spanish**

##Prediction on jumbled test document - e10.txt

In [51]:
import random
lines_jumbled = ''
lines = open('languageID/e10.txt').readlines()
for line in lines:
  l = list(line)
  random.shuffle(l)
  lines_jumbled+=''.join(l)

open('languageID/jumblede10.txt', 'w').writelines(lines_jumbled)

In [52]:

char_vector_en_test = vectorizer_en.transform(["languageID/jumblede10.txt"])
print(vectorizer_en.get_feature_names_out())
char_array_en_test = char_vector_en_test.toarray()
char_count_dict = get_char_count(["languageID/jumblede10.txt"])
char_count_en_test = char_array_en_test.sum(axis=0)
char_count_en_test[0] = char_count_dict[' ']
for char, char_count in zip(list(vectorizer_en.get_feature_names_out()), char_count_en_test):
  char_count_test_en[char] = char_count


[' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [53]:
total_prob_en = 0
for char_prob, char_count in zip(char_probs_en,list(char_count_test_en.values())):
  total_prob_en += char_count * np.log(char_prob)
  # print(np.log(char_prob))
print(total_prob_en)

-7841.865447060634


In [54]:
total_prob_jap = 0
for char_prob, char_count in zip(char_probs_jap,list(char_count_test_en.values())):
  total_prob_jap += char_count * np.log(char_prob)
  # print(np.log(char_prob))
print(total_prob_jap)

-8771.65676346074


In [55]:
total_prob_s = 0
for char_prob, char_count in zip(char_probs_s,list(char_count_test_en.values())):
  total_prob_s += char_count * np.log(char_prob)
  # print(np.log(char_prob))
print(total_prob_s)

-8467.282044010557


In [56]:
log_prior_en = np.log(1/3)
log_posterior_en = total_prob_en +log_prior_en
print(log_posterior_en)

-7842.964059349302


In [57]:
log_prior_jap = np.log(1/3)
log_posterior_jap = total_prob_jap +log_prior_jap
print(log_posterior_jap)

-8772.755375749408


In [58]:
log_prior_s = np.log(1/3)
log_posterior_s = total_prob_s +log_prior_s
print(log_posterior_s)

-8468.380656299225


In [59]:
max(log_posterior_en,log_posterior_jap,log_posterior_s)

-7842.964059349302

Predicion is **"English"** - NO change in prediction