## Language detection by 2-grams(letter n-grams), k=150

In [1]:
import pandas as pd
import seaborn as sns #Seaborn is a library that uses Matplotlib underneath to plot graphs. It will be used to visualize random distributions.
import matplotlib.pyplot as plt
import math
import string
import re
import numpy as np# multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays


In [2]:
lang_list = ['Danish', 'Dutch', 'English', 'French', 'German', 'Greek','Italian', 'Portugeese', 'Spanish', 'Sweedish', 'Turkish']

In [3]:
data = pd.read_csv('language_detection.csv',sep=';')   #store big data sets is to use CSV files
				                    #need to import pd library
				                    #stores dataset.csv to data

In [4]:
data.shape

(7614, 2)

In [5]:
data.head() #displays the first five rows of the dataframe by default.

Unnamed: 0,Text,language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


### Getting rid of punctuation

In [6]:
characters = string.punctuation+"0"+"1"+"2"+"3"+"4"+"5"+"6"+"7"+"8"+"9"
characters

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0123456789'

In [7]:
#funtion to remove punctuation and all smallcase letters#
def remove_pun(text):
    for pun in characters:
        text=text.replace(pun,"")
    text = text.lower()
    return(text)

In [8]:
data['Text']=data['Text'].apply(remove_pun)

In [9]:
data.head()

Unnamed: 0,Text,language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,the word nature is borrowed from the old fren...,English


### Split data for training and testing

In [10]:
y=data['language'].values
print("y.shape: ",y.shape)
x=data['Text'].values
print("x.shape: ",x.shape)

y.shape:  (7614,)
x.shape:  (7614,)


In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [12]:
print("x_train.shape: ",x_train.shape)
print("y_train.shape: ",y_train.shape)
print("x_test.shape: ",x_test.shape)
print("y_test.shape: ",y_test.shape)

x_train.shape:  (6091,)
y_train.shape:  (6091,)
x_test.shape:  (1523,)
y_test.shape:  (1523,)


In [13]:
data['language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Sweedish       676
Dutch          546
Turkish        474
German         470
Danish         428
Greek          365
Name: language, dtype: int64

### Display TrainDataset

In [14]:
df1=pd.DataFrame(x_train)
df1=df1.rename(columns={0:'Text'})
df2=pd.DataFrame(y_train)
df2=df2.rename(columns={0:'Language'})
df_train=pd.concat([df1,df2],axis=1)
df_train.head()

Unnamed: 0,Text,Language
0,ne le mentionnez pas,French
1,spinoza reviendra sur les propos de descartes ...,French
2,ve bu videoda size günlük kullanım için ve bu ...,Turkish
3,foi classificada como a revista científica ma...,Portugeese
4,quando senti una frase esercitati a dirla dopo...,Italian


In [15]:
df_train['Language'].value_counts()

English       1105
French         804
Spanish        669
Portugeese     591
Italian        558
Sweedish       524
Dutch          443
Turkish        385
German         382
Danish         338
Greek          292
Name: Language, dtype: int64

In [16]:
df_train.shape

(6091, 2)

### Display Test Dataset

In [17]:
df3=pd.DataFrame(x_test)
df3=df3.rename(columns={0:'Text'})
df4=pd.DataFrame(y_test)
df4=df4.rename(columns={0:'Language'})
df_test=pd.concat([df3,df4],axis=1)
df_test.head()

Unnamed: 0,Text,Language
0,a qualquer momento,Portugeese
1,​ en marzo de jimmy wales creó nupedia un pro...,Spanish
2,pensava di essere troppo grande per fare quals...,Italian
3,αν κάτι σας εκπλήσσει μπορείτε επίσης να πείτε...,Greek
4,om het woord catch te gebruiken,Dutch


In [18]:
df_test.shape

(1523, 2)

In [19]:
df_test['Language'].value_counts()

English       280
French        210
Sweedish      152
Spanish       150
Portugeese    148
Italian       140
Dutch         103
Danish         90
Turkish        89
German         88
Greek          73
Name: Language, dtype: int64

### generate_N_grams function
Generates n-grams for a given text. Returns list thats elements are like (n-gram,frequency)

In [20]:
def generate_N_grams(text,n):
    words = re.sub('\s+', ' ', text).strip() # replaces multiple spaces, newline tabs with a single space
    words = words.replace(' ','_')# so that we can visualise spaces easily
    #print(words)
    #grams= [words[i:i+n] for i in range(len(words)-n+1)] 
    grams = {}
    for i in range(len(words)-n):
        current = words[i:i+n] 
        #we are check how many times current shows up in a sentence
        if current in grams:
            grams[current] += 1 # increase if the n-gram showed up in the sentence
        else:
            grams[current] = 1  #if the sentence
    total = len(words) - n + 1 # total number of n-grams that will be generated
    #print(grams.keys())
    for key in grams.keys():
        grams[key] = round(math.log(grams[key] / (total)), 3) #normalizing by dividing by total no of n-grams for that corpus and taking log                                             
    grams = sorted(grams.items(), key= lambda x : x[1], reverse = True) 
    #print (grams)
    final_grams = [] # contains a list of top k n-grams in a given language 
    log_probs = [] # contains logprobs corresponding to each n-gram
    k= 150 if len(grams)>=150 else len(grams)
    for i in range(k):#get top k grams#len(grams)
            final_grams.append(grams[i][0])
            log_probs.append(grams[i][1])
    return final_grams, log_probs
    
    #return grams

###  Generate Lang Profiles(2-grams)

In [21]:
english_str=""
total_len=0
for text in df_train.loc[df_train['Language'] == 'English', 'Text']:
    total_len=total_len+len(text)
    english_str=english_str+" "+ text


In [22]:
print(total_len)
len(english_str)

143669


144774

In [23]:
print(generate_N_grams(english_str,2))
len(generate_N_grams(english_str,2))

(['e_', 's_', '_t', '_a', 'th', 'in', 'n_', 'd_', 'he', 't_', 'er', '_i', 'an', 're', '_o', 'on', 'ed', 'at', '_s', 'ti', 'y_', '_w', 'or', 'es', 'en', 'te', 'ar', 'r_', '_c', 'a_', 'al', 'ng', 'nd', 'it', 'is', '_m', 'nt', 'di', 'le', 'o_', 'to', 'f_', 'se', 'g_', 'of', '_p', 'ou', 'ic', 'as', '_b', '_f', 'st', 'io', 'ea', 'ha', '_e', 'co', 'me', 've', 'l_', '_r', '_l', 'pe', '_d', 'ne', 'de', 'ro', 'ma', 'ia', 'li', 'ri', 'h_', 'ra', 'wi', '_h', 'om', 'ce', 'ca', 'la', 'ch', 'si', 'hi', '_n', 'll', 'ns', 'ni', 'fo', 'ta', 'el', 'ac', 'ec', 'pr', 'ly', 'be', 'ge', 'us', 'ki', 'ur', 'so', 'rt', 'ik', 'rs', 'ct', 'et', 'nc', 'tr', '_g', 'm_', '_u', 'ot', '_y', 'ut', 'no', 'ip', 'yo', 'un', 'ts', 'im', 'ie', 'ss', 'mo', 'cl', 'lo', 'na', 'il', 'su', 'wa', 'em', 'ee', 'ci', 'rn', 'ai', 'vi', 'am', 'if', 'iv', 'ol', 'tu', 'mi', 'mp', 'po', 'u_', 'ho', 'pl', 'ul', 'fi', 'ow', 'pa', 'wh', 'we'], [-3.528, -3.728, -3.814, -3.846, -3.98, -4.057, -4.199, -4.201, -4.263, -4.321, -4.352, -4.353, -

2

In [24]:
#creating language corpuses
lang_corpus={}
str=""
for lang in lang_list:
    str=""
    for text in df_train.loc[df_train['Language'] == lang, 'Text']:
        str = str+' '+text
    lang_corpus[lang] = str
#print(lang_corpus)

In [25]:
print(lang_corpus['Danish'])

 er en meget fransk måde at sige møde eller dato på for eksempel har jeg et møde med mine venner over frokosten på lørdag eller jeg planlægger at møde med mine venner i indkøbscentret for film at møde kan bruges både som substantiv eller som verbum for eksempel hej hvorfor hænger du ikke med os du kan sige til dem jeg er ked af det men jeg er oversvømmet med arbejde eller jeg er oversvømmet med nogle andre ord som du kan bruge i stedet for oversvømmet hvis jeg har for meget at gøre nej slet ikke ikke meget åh fjollet han er rig og smuk hvad mere kan jeg ønske mig forfærdelse jeg accepterer dit forslag  i  modtog nature prince of asturias award for communications and humanity sammen med science desværre er jeg nødt til at sige nej med krydsede fingre jeg har det godt undskyld så hvis du lige er ved at starte din rejse det ville være rart ved hans egen refleksion i spejlet eller turisten blev flummoxed af de forskellige skikke der jeg har en rendezvous med mine venner i aften eller jeg v

In [26]:
print(lang_corpus['Turkish'])

 ve bu videoda size günlük kullanım için ve bu videonun sonunda  akıllı kelime vereceğim bu kişiyi bir kez daha tekrar etmeye sevk ettiği anlamına gelir pekala başlayalım normalde bir eserin yazarı o yazının telif hakkını elinde bulundurur ve bu hak diğer insanların bu yazıyı değiştirmesini ve kopyalamasını engeller biri gerçekten iyi gidiyorsa bunu yapabilirsin onlara iyi işi sürdürmelerini senin kadar iyi gibi iyi işlere devam etmelerini söyleyebilirsin ve iyi işi sürdürmeye devam et başka bir deyim olmaya devam et ve bu da amerikalılar söyle emeğiniz için teşekkür ederiz alexaya göre vikipedi ziyaretçilerinin i i̇ngilizce vikipedi sitesini ziyaret etmektedir özgür görünmesine rağmen bilgiyi yazan kişi telif hakkına sahiptir ancak diğer insanların bu bilgilerden yararlanmasına ve geliştirmesine izin vermektedir ve bununla birlikte arabasına bindi ve annesiyle vedalaşmadan melekle birlikte yolculuk ederken araba aniden büyülü bir geçide girdi ve arabanın muazzam bir şekilde çarptığı f

In [27]:
print(lang_corpus['English'])

 its main figures were bomis ceo jimmy wales and larry sanger editorinchief for nupedia and later wikipedia sanger coined its name as a portmanteau of wiki and encyclopedia  wikipedia has a volunteer response team that uses the otrs system to handle queries without having to reveal the identities of the involved parties complete bans from wikipedia are generally limited to instances of impersonation and antisocial behavior for example a microsystem can be a stone and all the life under it it also has nonsexual photographs of nude children  though the various language editions are held to global policies such as neutral point of view they diverge on some points of policy and practice most notably on whether images that are not licensed freely may be used under a claim of fair use  percent of the earths surface is covered by saltwater oceans apologizing  though the english wikipedia reached three million articles in august  the growth of the edition in terms of the numbers of new article

In [28]:
for lang in lang_corpus.keys():
    print(lang)

Danish
Dutch
English
French
German
Greek
Italian
Portugeese
Spanish
Sweedish
Turkish


In [29]:
bi_grams = {}
for lang in lang_corpus.keys():
    bi_grams[lang] = generate_N_grams(lang_corpus[lang],2) 

In [30]:
print(bi_grams)
print(len(bi_grams["Danish"][0]))

{'Danish': (['e_', 'r_', 'er', 'de', 't_', '_d', 'n_', 'g_', 'en', '_s', 'et', '_e', '_h', '_a', 'ge', '_o', '_m', 'd_', 're', 'or', 'ig', '_f', 'an', '_v', 'og', 'ed', '_i', 'ar', 'ke', 'te', 'le', 'at', 'me', 'u_', 'el', 'in', '_t', 'du', 'nd', 'ti', 'eg', '_n', 've', '_k', '_b', 'ne', 'vi', 'je', 'ik', 'fo', 'st', '_g', 'l_', '_j', 'm_', 'å_', 's_', 'il', 'sk', 'li', 'di', 'se', 'i_', 'om', 'ng', 'is', 'ka', 'ha', '_p', 'kk', 'hv', 'si', 'be', 'll', '_l', 'id', 'rd', 'al', 'mi', 'es', 'ta', 'k_', 'va', 'rt', 'no', 'em', 'so', 'ri', 'ad', 'un', 'pe', 'ær', 'la', 'mm', 'af', 'he', 'ag', 'så', '_u', 'a_', 'da', 'ma', 'gt', 'på', '_r', 'dt', 'tt', 'tr', 'on', 'nt', 'ni', 'f_', 'ej', 'rs', 'av', 'ra', 'br', 'nn', 'ru', 'sa', 'ld', 'na', 'ør', 'ds', 'ek', 'vo', 'bl', 'od', 'år', 'dr', 'ns', 'lt', 'op', 'væ', 'iv', 'sp', 'fa', 'ro', 'ug', 'ls', 'mo', 'kl', 'ko', 'am', 'ud', 'pr', '_w', 'ss', 'p_', 'ød'], [-3.356, -3.514, -3.618, -3.628, -3.633, -3.722, -3.971, -3.973, -3.98, -4.007, -4.177

In [31]:
#create lang profile as a dataset
df_langprofiles = pd.DataFrame()
bigrams= []
#trigrams=[]
#quadgrams=[]
df_langprofiles['Language'] = ''
df_langprofiles['bigrams'] = ''
for lang in lang_list:
    bigrams= bi_grams[lang]
    df_langprofiles.loc[len(df_langprofiles.index)] = [lang,bigrams]
display(df_langprofiles)


Unnamed: 0,Language,bigrams
0,Danish,"([e_, r_, er, de, t_, _d, n_, g_, en, _s, et, ..."
1,Dutch,"([n_, en, e_, t_, er, _d, de, _e, et, an, s_, ..."
2,English,"([e_, s_, _t, _a, th, in, n_, d_, he, t_, er, ..."
3,French,"([e_, s_, _d, es, t_, _l, en, on, de, _p, le, ..."
4,German,"([n_, en, e_, ch, t_, er, _s, _d, s_, r_, ie, ..."
5,Greek,"([α_, ι_, ς_, _τ, ε_, _π, να, _σ, _κ, ν_, ο_, ..."
6,Italian,"([e_, o_, i_, a_, _d, _s, di, _c, on, _p, en, ..."
7,Portugeese,"([o_, a_, e_, s_, _e, _d, _a, de, _p, es, m_, ..."
8,Spanish,"([e_, a_, s_, o_, _e, en, n_, es, _d, de, _p, ..."
9,Sweedish,"([r_, t_, a_, n_, er, ar, _s, en, _a, de, _d, ..."


### Distance function

In [32]:
def distance(test_grams, train_grams, n): # n helps us know whether it is bigram, trigram or etc.
    dist = {lang: 0 for lang in lang_list} # distance corresponding to each language
    for gram in test_grams[0]: 
        for lang in train_grams.keys():
            idx_2 = test_grams[0].index(gram)
            if gram in train_grams[lang][0] : 
                idx = train_grams[lang][0].index(gram)
                dist[lang] += abs(train_grams[lang][1][idx] - test_grams[1][idx_2]) 
            else: # gram is not present in that language's corpus
                dist[lang] += abs(test_grams[1][idx_2])
                # penalty term: gram's corresponding probabaility value in test_gram dict
    return dist   

In [33]:
test='bir nesnenin başka bir nesne üzerine uyguladığı yükün yüzey altında oluşturduğu gerilmeler o nesnenin gerilme direncinin üzerine çıkarak yüzey altında çatlaklar oluşturular bu çatlaklar yüzey altında ilerleyip yüzeye ulaştıklarında yüzey parçacıklar kavkıma görünümünde serbest kalırlar bu aşınma yöntemi'
test_bigram = generate_N_grams(test,2)
print(test_bigram)

# for i in range (0,len(test_bigram)):
#     print(test_bigram[i][0])

(['la', 'r_', 'ne', 'ar', 'a_', 'üz', 'ze', 'er', '_y', 'in', 'e_', 'yü', 'ey', 'ın', 'nd', 'es', 'n_', '_b', 'ka', 'ri', 'y_', '_a', 'al', 'tı', 'da', 'kl', 'ir', '_n', 'sn', 'en', 'ni', 'aş', 'ul', 'ün', 'lt', '_o', 'şt', 'u_', '_g', 'il', 'le', '_ç', 'ık', 'ak', 'ça', 'bi', '_ü', '_u', 'ol', 'lu', 'uş', 'tu', 'ur', 'ge', 'lm', 'me', 'at', 'tl', 'bu', 'rl', '_k', 'ma', 'ba', 'şk', 'uy', 'yg', 'gu', 'ad', 'dı', 'ığ', 'ğı', 'ı_', 'ük', 'kü', 'rd', 'du', 'uğ', 'ğu', 'el', 'o_', '_d', 'di', 're', 'nc', 'ci', 'çı', 'ra', 'k_', 'ru', '_i', 'yi', 'ip', 'p_', 'ye', 'rı', '_p', 'pa', 'rç', 'ac', 'cı', 'av', 'vk', 'kı', 'ım', 'gö', 'ör', 'rü', 'nü', 'üm', 'mü', 'de', '_s', 'se', 'rb', 'be', 'st', 't_', 'lı', 'ır', 'şı', 'nm', 'yö', 'ön', 'nt', 'te', 'em'], [-3.411, -3.634, -3.634, -3.634, -3.768, -3.768, -3.768, -3.768, -3.768, -3.922, -3.922, -3.922, -3.922, -4.104, -4.104, -4.327, -4.327, -4.327, -4.327, -4.327, -4.327, -4.327, -4.327, -4.327, -4.327, -4.327, -4.615, -4.615, -4.615, -4.615, 

In [34]:
a=distance(test_bigram,bi_grams,2)
print(a)

{'Danish': 415.36499999999995, 'Dutch': 435.80699999999985, 'English': 411.12199999999984, 'French': 422.06599999999986, 'German': 408.78699999999986, 'Greek': 643.3840000000012, 'Italian': 427.67699999999996, 'Portugeese': 432.4879999999998, 'Spanish': 420.1679999999998, 'Sweedish': 411.7769999999998, 'Turkish': 307.47999999999996}


### Language detection function

In [35]:
def language_detection(test_text):
    #get rid of punctuation
    test_text=remove_pun(test_text)
    #print(test_text)
    #create n-grams of test_text
    test_bigram = generate_N_grams(test_text,2)
    #find distences of each n-gram group and sum all
    distance_2=distance(test_bigram,bi_grams,2)
    # print(distance_2)

    distance_score={}
    for lang in lang_list:
        distance_score[lang]=distance_2[lang]
    #print(distance_score)   
    return min(distance_score, key=distance_score.get)
    

In [36]:
test='bir!! nesnenin başka bir nesne üzerine uyguladığı yükün yüzey altında oluşturduğu gerilmeler o nesnenin gerilme direncinin üzerine çıkarak yüzey altında çatlaklar oluşturular bu çatlaklar yüzey altında ilerleyip yüzeye ulaştıklarında yüzey parçacıklar kavkıma görünümünde serbest kalırlar bu aşınma yöntemi'
language_detection(test)

'Turkish'

### Apply language_detection on df_test

In [37]:
predict=[]
for text in df_test['Text']:
    predict.append(language_detection(text))

In [38]:
data = {
  "Text": df_test["Text"],
  "Actual": df_test["Language"], #y_test=df_test["Language"]
  "Predicted": predict,
}
df_compare = pd.DataFrame(data)
df_compare

Unnamed: 0,Text,Actual,Predicted
0,a qualquer momento,Portugeese,Sweedish
1,​ en marzo de jimmy wales creó nupedia un pro...,Spanish,Spanish
2,pensava di essere troppo grande per fare quals...,Italian,Italian
3,αν κάτι σας εκπλήσσει μπορείτε επίσης να πείτε...,Greek,Greek
4,om het woord catch te gebruiken,Dutch,Dutch
...,...,...,...
1518,auf grausame weise wird manchmal in gefängniss...,German,German
1519,bilmiyorum diyor,Turkish,Portugeese
1520,ook werd in een platenhoes niet verwijderd n...,Dutch,Dutch
1521,el proyecto wikipedia se inició el de enero de,Spanish,Spanish


In [39]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(df_test["Language"],predict))
print(confusion_matrix(df_test["Language"],predict,labels=lang_list))

0.8811556139198949
[[ 77   1   2   0   0   8   0   0   0   2   0]
 [  2  89   3   2   1   4   0   1   0   0   1]
 [  4   1 262   1   0   5   5   1   0   1   0]
 [  2   0   5 186   1   9   3   3   1   0   0]
 [  2   1   0   1  81   1   0   0   1   0   1]
 [  6   0   0   0   0  67   0   0   0   0   0]
 [  0   0   5   0   0  10 119   0   5   1   0]
 [  2   0   2   0   0   6   4 128   4   2   0]
 [  1   0   1   2   1   3   4   4 131   0   3]
 [ 13   0   3   2   1   6   0   0   0 126   1]
 [  3   0   1   0   1   6   0   1   1   0  76]]


In [40]:
#F-SCORE
from sklearn.metrics import f1_score
f1_score(df_test["Language"], predict,labels=lang_list, average=None)

array([0.76237624, 0.91282051, 0.92907801, 0.92079208, 0.93103448,
       0.67676768, 0.86545455, 0.8951049 , 0.89419795, 0.88732394,
       0.88888889])

### GUI

In [41]:
import tkinter as tk

In [42]:
# Top level window
frame = tk.Tk()
frame.title("ngam_LD")
frame.geometry('400x200')
# Function for getting Input
# from textbox and printing it 
# at label widget
  
def printInput():
    inp = inputtxt.get(1.0, "end-1c")
    lbl.config(text = "Predicted Language: "+language_detection(inp))
  
# TextBox Creation
inputtxt = tk.Text(frame,
                   height = 5,
                   width = 40)
  
inputtxt.pack()
  
# Button Creation
printButton = tk.Button(frame,
                        text = "Predict", 
                        command = printInput)
printButton.pack()
  
# Label Creation
lbl = tk.Label(frame, text = "")
lbl.pack()
frame.mainloop()