In [495]:
#!/usr/bin/env python3


import sys
import operator
import numpy as np
from sklearn import svm
from past import autotranslate
autotranslate(['litcm'])
from litcm import LIT
import re
from sklearn.ensemble import RandomForestClassifier


lit = LIT(labels=['hin', 'eng'], transliteration=False)
#   Dictionary and associated files
en_dict_flname = "en_only"      # English words only
hi_dict_flname = "hi_only"      # Hindi words only
en_2ch_w_flname = "en_2.uniq"   # Most frequent 2-char seq in EN
en_3ch_w_flname = "en_3.uniq"   # Most frequent 3-char seq in EN
hi_2ch_w_flname = "hi_2.uniq"   # Most frequent 2-char seq in HI-roman
hi_3ch_w_flname = "hi_3.uniq"   # Most frequent 3-char seq in HI-roman
max_len =30

# Initialize dictionaries and lists to empty
en_dict = set ()
hi_dict = set ()
en_2ch_w_dict = {}
en_2ch_lst = []
en_3ch_w_dict = {}
en_3ch_lst = []
hi_2ch_w_dict = {}
hi_2ch_lst = []
hi_3ch_w_dict = {}
hi_3ch_lst = []
def load_dict ():
    with open (en_dict_flname) as fl:
        for l in fl:
            en_dict.add (l.strip ())
    
    with open (hi_dict_flname) as fl:
        for l in fl:
            hi_dict.add (l.strip ())

    with open (en_2ch_w_flname) as fl:
        for l in fl:
            arr = [ w for w in l.strip ().split (' ') ]
            en_2ch_w_dict[arr[0]] = float (arr[1])
            en_2ch_lst.append (arr[0])
    
    with open (en_3ch_w_flname) as fl:
        for l in fl:
            arr = [ w for w in l.strip ().split (' ') ]
            en_3ch_w_dict[arr[0]] = float (arr[1])
            en_3ch_lst.append (arr[0])
    
    with open (hi_2ch_w_flname) as fl:
        for l in fl:
            arr = [ w for w in l.strip ().split (' ') ]
            hi_2ch_w_dict[arr[0]] = float (arr[1])
            hi_2ch_lst.append (arr[0])
    
    with open (hi_3ch_w_flname) as fl:
        for l in fl:
            arr = [ w for w in l.strip ().split (' ') ]
            hi_3ch_w_dict[arr[0]] = float (arr[1])
            hi_3ch_lst.append (arr[0])
# Given a sentence 's' return the corresponding feature vector
def mk_vect (s):
    p = s.split(' ')
    res=[]
    t = max_len-len(p)
    if t < 0 :
        t=0
    s = s.strip ()
    s_tagged = lit.identify (s)
   
    ct_en = len (re.findall ('\\Eng', s_tagged))
    ct_hi = len (re.findall ('\\Hin', s_tagged))
    ct_all = ct_en + ct_hi
    
    if ct_all > 0:
        res = [ ct_en/ct_all, ct_hi/ct_all ]
    else:
        res = [ 0.00, 0.00 ]
    ar1 = re.findall(r'\Hin|\Eng',s_tagged)
    v1_en = [2 if w == "Hin" else -2 for w in ar1 ] + [0 for _ in range(max_len- len(ar1)) ]
    v1_en = v1_en[:max_len]
    
    v_en = [ 2 if w in en_dict else -1 for w in p ] 
    v_hi = [ 2 if w in hi_dict else -1 for w in p ]
    for i in range(len(v_en)):
        if(v_en[i] == v_hi[i]):
            v_en[i] = 1
            v_hi[i] = 1
    v_en = v_en + [0 for _ in range(t) ]
    v_hi = v_hi + [0 for _ in range(t) ]
    v_en = v_en[:max_len]
    v_hi = v_hi[:max_len]
    # print (v_en)
    # print (v_hi)
    res.append (sum (v_en) / len (v_en))
    res.append (sum (v_hi) / len (v_hi))

    for i in range(len(v_en)):
        res.append(v1_en[i])
        res.append(v_en[i])
        res.append(v_hi[i])
#     for i in range(max_len-len(p)):
#         res.append(0)
#         res.append(0)
    arr = [ s[i+0] + s[i+1] for i in range(len(s) - 1) ]
    dct = {}
    ct = 0
    for w in arr:
        if w in dct:
            dct[w] += 1
        else:
            dct[w] = 1
        ct += 1

    for k in en_2ch_lst:
        if k in dct:
            res.append (dct[k] * en_2ch_w_dict[k] / ct)
        else:
            res.append (0.00)
    
    arr = [ s[i+0] + s[i+1] for i in range(len(s) - 1) ]
    dct = {}
    ct = 0
    for w in arr:
        if w in dct:
            dct[w] += 1
        else:
            dct[w] = 1
        ct += 1

    for k in hi_2ch_lst:
        if k in dct:
            res.append (dct[k] * hi_2ch_w_dict[k] / ct)
        else:
            res.append (0.00)
    
    arr = [ s[i+0] + s[i+1] + s[i+2] for i in range(len(s) - 2) ]
    dct = {}
    ct = 0
    for w in arr:
        if w in dct:
            dct[w] += 1
        else:
            dct[w] = 1
        ct += 1

    for k in en_3ch_lst:
        if k in dct:
            res.append (dct[k] * en_3ch_w_dict[k] / ct)
        else:
            res.append (0.00)
    
    arr = [ s[i+0] + s[i+1] + s[i+2] for i in range(len(s) - 2) ]
    dct = {}
    ct = 0
    for w in arr:
        if w in dct:
            dct[w] += 1
        else:
            dct[w] = 1
        ct += 1

    for k in hi_3ch_lst:
        if k in dct:
            res.append (dct[k] * hi_3ch_w_dict[k] / ct)
        else:
            res.append (0.00)
    
    return np.array(res)


# Loads sentences in 'x_flname' and labels in 'y_flname'
# Converts sentences to feature vectors and returns it in X (ndarray)
# Labels are returned in Y (ndarray)
def load_data (x_flname, y_flname):
    x_orig = []
    y_orig = []
    with open (x_flname) as fl:
        for l in fl:
            x_orig.append (l.strip ())

    with open (y_flname) as fl:
        for l in fl:
            y_orig.append (int (l.strip ()))

    X = []
    for x in x_orig:
        X.append (mk_vect (x))

    X  = np.array (X)
    Y  = np.array (y_orig)

    return (X, Y, x_orig)

print ("[ ] Loading dictionaries ...", end="")
load_dict ()
print ("\r%-79s" % "[x] Dictionaries loaded.")




[x] Dictionaries loaded.                                                       


In [496]:
print ("[ ] Loading and converting Training/Testing data ...", end="")
# x_flname1 = "all.x"
# y_flname1= "all.y"
x_flname = "x_f.txt"
y_flname = "y_f.txt"
X_mixed, Y_mixed, X_mixed_orig = load_data ("code_mixed.x", "code_mixed.y")
X, Y, X_orig = load_data (x_flname, y_flname)
# X1,Y1,X_orig1 = load_data(x_flname1,y_flname1)
print ("\r%-79s" % "[x] Data loaded and converted.")

[x] Data loaded and converted.                                                 


In [497]:
x_mix=[]
y_mix=[]
x_eng =[]
y_eng =[]
x_eng_orig =[]
p = len(Y)
arr= []
arr1=[]
for i in range(p):
    if(Y[i] == 2):
        x_mix.append(X[i])
        y_mix.append(2)
        arr.append(i)
    if(Y[i] == 0):
        x_eng.append(X[i])
        y_eng.append(0)
        x_eng_orig.append(X_orig[i])
        arr1.append(i)
        
# X = np.delete(X,arr)
# Y = np.delete(Y,arr)
print(len(x_mix))
for name in [x_mix,y_mix,x_eng,y_eng,x_eng_orig]:
    name =np.array(name)
X= np.delete(X,arr+arr1,0)
Y= np.delete(Y,arr+arr1,0)

2201


In [501]:
x_mix = np.array(x_mix)
y_mix = np.array(y_mix)
x_eng = np.array(x_eng)
y_eng = np.array(y_eng)
print(X.shape)
print(y_mix.shape)
print(X_mixed.shape)
X1.shape
print(x_eng.shape)
# print(X_mixed)

(12289, 244)
(2201,)
(450, 244)
(4193, 244)


In [503]:

# Training (600 samples): Nearly 2/3 English, 1/3 Code-Mixed
X_tr = np.concatenate ((x_eng[:2500],X[:2800],x_mix), axis=0)
Y_tr = np.concatenate ((y_eng[:2500],Y[:2800],y_mix), axis=0)    
# X_tr = X[:14000]
# Y_tr = Y[:14000]
# test_no = 1
# if len (sys.argv) > 1:
#     if sys.argv[1] in ('1', '2', '3'):
#         test_no = int (sys.argv[1])

# if test_no == 1:
    # Nearly 50-50 mix of code-mixed and english (500 samples)
X_te = np.concatenate ((X[5000:5550],x_eng[2500:3100], X_mixed), axis=0)
Y_te = np.concatenate ((Y[5000:5550],y_eng[2500:3100], Y_mixed), axis=0)
# X_te = X[14000:]
# Y_te = Y[14000:]
X_sent = X_orig[5000:5550] + x_eng_orig[2500:3100]+ X_mixed_orig
# elif test_no == 2:
#     # All code-mixed (250 samples)
#     X_te = np.concatenate ((X1[1400:], X_mixed[200:]), axis=0)
#     Y_te = np.concatenate ((Y1[1400:], Y_mixed[200:]), axis=0)
#     X_sent = X_orig1[1400:] + X_mixed_orig[200:]
# elif test_no == 3:
#     # Nearly all english (1000 samples)
#     X_te = np.concatenate ((X1[400:], X_mixed[450:]), axis=0)
#     Y_te = np.concatenate ((Y1[400:], Y_mixed[450:]), axis=0)
#     X_sent = X_orig1[400:] + X_mixed_orig[450:]

def result():
    print ("    For a total of %d tweets in test-set: " % Y_te.shape[0])
    en=0
    en_hi=0
    en_mi = 0
    mi = 0
    mi_en=0
    mi_hi=0
    hi = 0
    hi_en = 0
    hi_mi=0
    for i in range(Y_te.shape[0]):
        a = Y_pr[i]
        b = Y_te[i]

        if(Y_pr[i] == Y_te[i]):
            if(Y_pr[i] == 0):
                en +=1
            elif(Y_pr[i] == 1):
                hi +=1
            elif(Y_pr[i] == 2):
                mi +=1
        else:
            if(a== 1 and b==0):
                en_hi += 1
            elif(a == 2 and b == 0):
                en_mi +=1
            elif(a == 0 and b == 1):
                hi_en +=1
            elif(a==2 and b==1):
                hi_mi +=1
            elif(a == 0 and b == 2):
                mi_en +=1
            elif(a == 1 and b==2):
                mi_hi +=1
    print("eng   hi   mix   True_values")
    print(en,end ="  ")
    print(en_hi, end = "  ")
    print(en_mi,end="     ")
    print((en+en_hi+en_mi))
    print(hi_en,end ="  ")
    print(hi, end= "  ")
    print(hi_mi,end = "     ")
    print(hi_en+hi+hi_mi)
    print(mi_en,end = "  ")
    print(mi_hi,end="  ")
    print(mi,end = "     ") 
    print(mi_en+mi_hi+mi)
    print((en+hi_en+mi_en),end = "  ")
    print((hi+en_hi+mi_hi),end= "  ")
    print((mi+en_mi+hi_mi),end= "    ")
    print(mi+en+hi+en_hi+en_mi+hi_en+hi_mi+mi_en+mi_hi)
    print("accuracy : ",end = "")
    print((en+hi+mi)/(Y_te.shape[0])) 
    print("----stats----")
    print("eng :",(en/(en+en_hi+en_mi)))
    print("hindi :",(hi/(hi_en+hi+hi_mi)))
    print("mixed :",(mi/(mi_en+mi_hi+mi)))
    mix = 0
    hi = 0
    en =0
    for i in range(len(Y_tr)):
        if(Y_tr[i] == 2):
            mix+=1
        elif(Y_tr[i] == 1):
            hi+=1
        elif(Y_tr[i] == 0):
            en+=1

    print("------Training--------")
    print("en:",en)
    print("hi:",hi)
    print("mix:",mix)
        

In [504]:
# Train the SVM
print ("[ ] Training SVM ...", end="")
sclf = svm.SVC (kernel = 'rbf')
sclf.fit (X_tr, Y_tr)
print ("\r%-79s" % "[x] Training complete.")
print ("    Trained on %d samples." % X_tr.shape[0])


# Test the learned model
print ("[ ] Testing ...", end="")
Y_pr = sclf.predict (X_te)
print ("\r%-79s" % "[x] Testing complete.")
Y_pr1 = sclf.predict(X_mixed)


# Print accuracy
print ("[ ] Calculating accuracy ...", end="")
acc = sum ((Y_te == Y_pr).astype ('int64'))/Y_te.shape[0]
print ("\r%-79s" % "[x] Accuracy calculated.")
print ("    For a total of %d tweets in test-set: " % Y_te.shape[0], end="")
print ("Accuracy = %.2f %%" % (acc * 100.0))
print ("[ ] Calculating accuracy code mixed...", end="")
acc = sum ((Y_mixed == Y_pr1).astype ('int64'))/Y_mixed.shape[0]
print ("\r%-79s" % "[x] Accuracy calculated code mixed.")
print ("    For a total of %d tweets in test-set: " % Y_mixed.shape[0], end="")
print ("Accuracy = %.2f %%" % (acc * 100.0))
result()

[x] Training complete.                                                         
    Trained on 7501 samples.
[x] Testing complete.                                                          
[x] Accuracy calculated.                                                       
    For a total of 1600 tweets in test-set: Accuracy = 81.44 %
[x] Accuracy calculated code mixed.                                            
    For a total of 450 tweets in test-set: Accuracy = 88.89 %
    For a total of 1600 tweets in test-set: 
eng   hi   mix   True_values
537  44  19     600
64  366  116     546
24  26  400     450
625  436  535    1596
accuracy : 0.814375
----stats----
eng : 0.895
hindi : 0.6703296703296703
mixed : 0.8888888888888888
------Training--------
en: 2500
hi: 2788
mix: 2201


In [505]:
# Train the randomForest
print ("[ ] Training Random ...", end="")
clf = RandomForestClassifier(n_jobs=-1,max_depth = 10,criterion = "gini")
clf.fit (X_tr, Y_tr)
print ("\r%-79s" % "[x] Training complete.")
print ("    Trained on %d samples." % X_tr.shape[0])


# Test the learned model
print ("[ ] Testing ...", end="")
Y_pr = clf.predict (X_te)
Y_pr1 = clf.predict(X_mixed)


# Print accuracy
print ("[ ] Calculating accuracy ...", end="")
acc = sum ((Y_te == Y_pr).astype ('int64'))/Y_te.shape[0]
print ("\r%-79s" % "[x] Accuracy calculated.")
print ("    For a total of %d tweets in test-set: " % Y_te.shape[0], end="")
print ("Accuracy = %.2f %%" % (acc * 100.0))
print ("[ ] Calculating accuracy code mixed...", end="")
acc = sum ((Y_mixed == Y_pr1).astype ('int64'))/Y_mixed.shape[0]
print ("\r%-79s" % "[x] Accuracy calculated code mixed.")
print ("    For a total of %d tweets in test-set: " % Y_mixed.shape[0], end="")
print ("Accuracy = %.2f %%" % (acc * 100.0))
result()

[x] Training complete.                                                         
    Trained on 7501 samples.
[x] Accuracy calculated.                                                       
    For a total of 1600 tweets in test-set: Accuracy = 78.12 %
[x] Accuracy calculated code mixed.                                            
    For a total of 450 tweets in test-set: Accuracy = 84.00 %
    For a total of 1600 tweets in test-set: 
eng   hi   mix   True_values
530  36  34     600
64  342  140     546
23  49  378     450
617  427  552    1596
accuracy : 0.78125
----stats----
eng : 0.8833333333333333
hindi : 0.6263736263736264
mixed : 0.84
------Training--------
en: 2500
hi: 2788
mix: 2201


In [487]:
lbl_map = ["English", "Hindi", "Code-Mixed", "Unknown" ]
ch = input ("Do you want per sample results [y/n]: ")
if ch == 'y':
    for l in zip(X_sent, range(Y_te.shape[0])):
        s = l[0]
        act = Y_te[l[1]]
        pr = Y_pr[l[1]]
        if(act != pr):
            print ("[%3d] %s" % (l[1], s))
            print ("    Predicted: %-12s\t Actual: %-12s" % (lbl_map[pr], lbl_map[act]))
            if input ("") == 'q':
                break


Do you want per sample results [y/n]: y
[  4] acha
    Predicted: Code-Mixed  	 Actual: Hindi       

[  5] jab bf ko pta h n he is still wid u to ab konse future ka darr h tuje
    Predicted: Code-Mixed  	 Actual: Hindi       

[  8] erosnowsouth shamitabh releases today really liked the movie gr8 acting and storyline
    Predicted: English     	 Actual: Hindi       

[ 11] Its 4pm sir
    Predicted: Code-Mixed  	 Actual: Hindi       

[ 15] bhaitu kaha hain
    Predicted: Code-Mixed  	 Actual: Hindi       

[ 16] go hug her n say sory  tel her hw much u love her   thats it  go 4 a date wid her
    Predicted: Hindi       	 Actual: Unknown     

[ 21] prestige puncture ho gayi guddu ki
    Predicted: Code-Mixed  	 Actual: Hindi       

[ 23] gurmeetramrahim blockbustermsg same to u papa g nd thanksss for ur such a lovely wiahes happy mahashivratri papa g
    Predicted: Code-Mixed  	 Actual: Hindi       

[ 28] Sick mntlity  anyone dnt hve any rght to laugh upon smeones dressng sense or

KeyboardInterrupt: 

[ ] Calculating accuracy ...[x] Accuracy calculated.                                                       
    For a total of 50 tweets in test-set: Accuracy = 72.00 %


In [514]:
sent = "things have a way"
print(lbl_map[clf.predict(mk_vect(sent))])

English


  from ipykernel import kernelapp as app


'marks is all i have'

In [401]:
s = "gupta is all i have mausam bada"
s_tagged = lit.identify (s.strip())
print(s_tagged)
ar1 = re.findall(r'\Hin|\Eng',s_tagged)
print(ar1)
v1_en = [2 if w == "Hin" else -2 for w in ar1 ]
print(v1_en)
# print(map(lambda x: x.group(),re.finditer("\\Eng",s_tagged)))

gupta\Hin is\Eng all\Eng i\Eng have\Hin mausam\Hin bada\Hin 
['Hin', 'Eng', 'Eng', 'Eng', 'Hin', 'Hin', 'Hin']
[2, -2, -2, -2, 2, 2, 2]
