In [1]:
import string 
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
eng_df=pd.read_csv("ngrams\\english.txt","utf-8",header=None, names=["English"],engine="python")
eng_df.head()

Unnamed: 0,English
0,Whereas recognition of the inherent dignity an...
1,Whereas disregard and contempt for human right...
2,Whereas it is essential if man is not to be co...
3,Whereas it is essential to promote the develop...
4,Whereas the peoples of the United Nations have...


In [3]:
ger_df=pd.read_csv("ngrams\\german.txt","utf-8",header=None, names=["German"],engine="python")
ger_df.head()

Unnamed: 0,German
0,Da die Anerkennung der angeborenen Wurde und d...
1,da die Nichtanerkennung und Verachtung der Men...
2,da es notwendig ist die Menschenrechte durch d...
3,da es notwendig ist die Entwicklung freundscha...
4,da die Volker der Vereinten Nationen in der Ch...


In [4]:
spa_df=pd.read_csv("ngrams\\spanish.txt","utf-8",header=None, names=["Spanish"],engine="python")
spa_df.head()

Unnamed: 0,Spanish
0,Considerando que la libertad la justicia y la ...
1,Considerando que el desconocimiento y el menos...
2,y que se ha proclamado como la aspiracion mas ...
3,Considerando esencial que los derechos humanos...
4,Considerando tambien esencial promover el desa...


In [5]:
fer_df=pd.read_csv("ngrams\\french.txt","utf-8",header=None, names=["French"],engine="python")
fer_df.head()

Unnamed: 0,French
0,Considerant que la reconnaissance de la dignit...
1,Considerant que la meconnaissance et le mepris...
2,Considerant quil est essentiel que les droits ...
3,Considerant quil est essentiel dencourager le ...
4,Considerant que dans la Charte les peuples des...


In [6]:
por_df=pd.read_csv("ngrams\\portuguese.txt","utf-8",header=None, names=["Portuguese"],engine="python")
por_df.head()

Unnamed: 0,Portuguese
0,Considerando que o reconhecimento da dignidade...
1,Considerando que o desconhecimento e o desprez...
2,Considerando que e essencial a protecao dos di...
3,Considerando que e essencial encorajar o desen...
4,Considerando que na Carta os povos das Nacoes ...


In [7]:
ita_df=pd.read_csv("ngrams\\italian.txt","utf-8",header=None, names=["Italian"],engine="python")
ita_df.head()

Unnamed: 0,Italian
0,Considerato che il riconoscimento della dignit...
1,Considerato che il disconoscimento e il dispre...
2,Considerato che e indispensabile che i diritti...
3,Considerato che e indispensabile promuovere lo...
4,Considerato che i popoli delle Nazioni Unite h...


In [8]:
for char in string.punctuation:
    print(char,end=" ")
translate_table=dict((ord(char),None) for char in string.punctuation)

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [9]:
data_eng=[]
lang_eng=[]
for i,line in eng_df.iterrows():
    line=line["English"]
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(translate_table)
        data_eng.append(line)
        lang_eng.append("English")

In [10]:
data_ger=[]
lang_ger=[]
for i,line in ger_df.iterrows():
    line=line["German"]
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(translate_table)
        data_ger.append(line)
        lang_ger.append("German")

In [11]:
data_spa=[]
lang_spa=[]
for i,line in spa_df.iterrows():
    line=line["Spanish"]
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(translate_table)
        data_spa.append(line)
        lang_spa.append("Spanish")

In [12]:
data_por=[]
lang_por=[]
for i,line in por_df.iterrows():
    line=line["Portuguese"]
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(translate_table)
        data_por.append(line)
        lang_por.append("Portuguese")

In [13]:
data_ita=[]
lang_ita=[]
for i,line in ita_df.iterrows():
    line=line["Italian"]
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(translate_table)
        data_ita.append(line)
        lang_ita.append("Italian")

In [14]:
data_fre=[]
lang_fre=[]
for i,line in fer_df.iterrows():
    line=line["French"]
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(translate_table)
        data_fre.append(line)
        lang_fre.append("French")

In [15]:
df=pd.DataFrame({"Text":data_eng+data_fre+data_spa+data_por+data_ita+data_ger,
                "Language":lang_eng+lang_fre+lang_ger+lang_spa+lang_ita+lang_por})
print(df.shape)

(436, 2)


In [16]:
X,y=df.iloc[:,0],df.iloc[:,1]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(348,)
(88,)
(348,)
(88,)


In [17]:
vectorizer=feature_extraction.text.TfidfVectorizer(ngram_range=(1,3),analyzer="char")
pipe_lr_r13=pipeline.Pipeline([
    ("vectorizer",vectorizer),
    ("clf",linear_model.LogisticRegression())
])

In [18]:
pipe_lr_r13.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

In [19]:
y_predicted=pipe_lr_r13.predict(X_test)

In [20]:
acc=(metrics.accuracy_score(y_test,y_predicted))*100

In [21]:
print(acc,"%")

98.86363636363636 %


In [22]:
matrix=metrics.confusion_matrix(y_test,y_predicted)
print("Confusion Matrix:\n",matrix)

Confusion Matrix:
 [[17  0  0  0  0  0]
 [ 0 13  0  0  0  0]
 [ 0  0 17  0  0  0]
 [ 0  0  0 16  1  0]
 [ 0  0  0  0 16  0]
 [ 0  0  0  0  0  8]]


In [38]:
def languageDetect(text):
    import numpy as np
    import string
    import re
    translate_table=dict((ord(char),None) for char in string.punctuation)
    text="".join(text.split())
    text=text.lower()
    text=re.sub(r"\d+","",text)
    text=text.translate(translate_table)
    pred=pipe_lr_r13.predict([text])
    prob=pipe_lr_r13.predict_proba([text])
    return pred[0]

In [43]:
languageDetect("Hello world")

'English'

In [44]:
languageDetect("Stefan kümmert sich um die Unterkunft. Auf Teneriffa gibt es sehr viele Hotels mit unterschiedlichen Preisen")

'Portuguese'

In [26]:
languageDetect("Mein Name ist Hallo Welt. Ich bin gut, wie geht es dir?")

'Portuguese'

In [27]:
languageDetect("mon nom est bonjour le monde, je suis bien comment vas-tu être de retour")

'French'

In [28]:
languageDetect("mi nombre es hola mundo estoy bien, ¿cómo estás?")

'Spanish'

In [29]:
languageDetect("i love pasta and garlic italian i am good how are you we are great come by soon bye")

'English'

In [30]:
languageDetect("amo la pasta e l'aglio italiano sono bravo come stai siamo fantastici vieni presto ciao")

'Italian'

In [42]:
languageDetect("My name is Stuti, how are you ?")

'English'

In [40]:
languageDetect("Bonjour mon ami je suis bien comment vas-tu être de retour")

'French'

In [41]:
languageDetect("bonjour j'aime les pâtes et l'ail italien je vais bien comment allez-vous nous sommes super venez bientôt au revoir")

'French'

In [45]:
languageDetect("1234567 my name is stuti good morning")

'Portuguese'

In [46]:
languageDetect("my name is stuti , welcome abroad i hope you are doing well !&3456")

'English'

In [47]:
languageDetect("meu nome é bem vindo bom dia tchau você está bem")

'Portuguese'

In [48]:
languageDetect("il mio nome è buon benvenuto buongiorno ciao stai bene")

'Italian'

In [49]:
languageDetect("mon nom est bon accueil bonjour au revoir es-tu bon")

'French'

In [50]:
languageDetect("меня зовут добро добро пожаловать доброе утро пока вы в порядке")

'Portuguese'

In [51]:
langaugeDetect("nomen mihi grata est, bonis bonum mane qui tibi bonam vale")

NameError: name 'langaugeDetect' is not defined

In [52]:
languageDetect("nomen mihi grata est, bonis bonum mane qui tibi bonam vale")

'Italian'