In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer

#### Understanding Ngrams and analyser in TFVectorizer

In [17]:
cv = CountVectorizer()
tft = TfidfTransformer()
tfv = TfidfVectorizer(ngram_range=(1,3),analyzer='char')#takes ngrams of chars as 1,2,3 and creates the vector names

In [4]:
sent = ['he is a good boy','she is a good girl','she and he both are good']

In [8]:
cv_fit = cv.fit_transform(sent)

In [11]:
cv.get_feature_names()

['and', 'are', 'both', 'boy', 'girl', 'good', 'he', 'is', 'she']

In [9]:
cv_fit.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 1],
       [1, 1, 1, 0, 0, 1, 1, 0, 1]], dtype=int64)

In [12]:
tf_fit = tft.fit_transform(cv_fit)

In [13]:
tf_fit.toarray()

array([[0.        , 0.        , 0.        , 0.63174505, 0.        ,
        0.37311881, 0.4804584 , 0.4804584 , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.63174505,
        0.37311881, 0.        , 0.4804584 , 0.4804584 ],
       [0.4711101 , 0.4711101 , 0.4711101 , 0.        , 0.        ,
        0.27824521, 0.35829137, 0.        , 0.35829137]])

In [18]:
tfc_fit = tfv.fit_transform(sent)

In [19]:
tfv.get_feature_names()

[' ',
 ' a',
 ' a ',
 ' an',
 ' ar',
 ' b',
 ' bo',
 ' g',
 ' gi',
 ' go',
 ' h',
 ' he',
 ' i',
 ' is',
 'a',
 'a ',
 'a g',
 'an',
 'and',
 'ar',
 'are',
 'b',
 'bo',
 'bot',
 'boy',
 'd',
 'd ',
 'd b',
 'd g',
 'd h',
 'e',
 'e ',
 'e a',
 'e b',
 'e g',
 'e i',
 'g',
 'gi',
 'gir',
 'go',
 'goo',
 'h',
 'h ',
 'h a',
 'he',
 'he ',
 'i',
 'ir',
 'irl',
 'is',
 'is ',
 'l',
 'n',
 'nd',
 'nd ',
 'o',
 'od',
 'od ',
 'oo',
 'ood',
 'ot',
 'oth',
 'oy',
 'r',
 're',
 're ',
 'rl',
 's',
 's ',
 's a',
 'sh',
 'she',
 't',
 'th',
 'th ',
 'y']

### Language Identification

In [24]:
data = pd.read_csv('Language Detection.csv')

In [25]:
len(data)

10337

##### Lets take only English,French,Hindi,Spanish

In [27]:
data_English = data[data['Language']=='English']

In [29]:
data_French = data[data['Language']=='French']
data_Hindi = data[data['Language']=='Hindi']
data_Spansih = data[data['Language']=='Spanish']

In [32]:
data_French

Unnamed: 0,Text,Language
3250,Si vous disposez d'ouvrages ou d'articles de r...,French
3251,Comment ajouter mes sources ?,French
3252,Cette page ou section est en train d'être trad...,French
3253,Vous pouvez aider au développement de Wikipédi...,French
3254,Le mot nature est un terme polysémique (c’est-...,French
...,...,...
4259,"quelle a été votre erreur, nous allons vous no...",French
4260,"narcisa a changé ses manières, elle a lutté au...",French
4261,Comment' Le narcissisme de s maintenant marian...,French
4262,a-t-elle je suppose qu'elle ne voudrait plus d...,French


In [30]:
data_lang = pd.concat([data_English,data_French,data_Hindi,data_Spansih])

In [34]:
data_lang.reset_index(inplace=True)

In [35]:
data_lang

Unnamed: 0,index,Text,Language
0,0,"Nature, in the broadest sense, is the natural...",English
1,1,"""Nature"" can refer to the phenomena of the phy...",English
2,2,"The study of nature is a large, if not the onl...",English
3,3,"Although humans are part of nature, human acti...",English
4,4,[1] The word nature is borrowed from the Old F...,English
...,...,...,...
3276,5624,¿Tiene ella?,Spanish
3277,5625,Supongo que no querría más pan de oro ahora ¿e...,Spanish
3278,5626,"Terry, en realidad te pareces un poco a ese án...",Spanish
3279,5627,¿Cómo pudiste ser él?,Spanish


In [71]:
string = 'He is a good boy 90 at age'

In [48]:
import re

In [82]:
remove_except_alphabets = re.sub(r"[^a-zA-Z]"," ",string)

In [83]:
remove_except_alphabets

'He is a good boy    at age'

In [84]:
removing_digits_1 = re.sub(r"[0-9]","",string)

In [85]:
removing_digits_1

'He is a good boy  at age'

In [93]:
removing_digits_2 = re.sub(r"\d","",string)

In [94]:
removing_digits_2

'He is a good boy  at age'

In [89]:
remove_white_space = re.sub(r"\s"," ",removing_digits_2)

In [90]:
remove_white_space

'He is a good boy  at age'

##### Cleaning of the data can be done to Hindi by removing the english alphabets if there are any

In [95]:
tf_vecotorizer = TfidfVectorizer(ngram_range=(1,3),analyzer='char')

In [97]:
def labeling(col):
    if col == 'English':
        return 1
    elif col == 'Hindi':
        return 2
    elif col == 'French':
        return 3
    else:
        return 4

In [98]:
data_lang['label'] = data_lang['Language'].apply(labeling)

In [102]:
data_lang

Unnamed: 0,index,Text,Language,label
0,0,"Nature, in the broadest sense, is the natural...",English,1
1,1,"""Nature"" can refer to the phenomena of the phy...",English,1
2,2,"The study of nature is a large, if not the onl...",English,1
3,3,"Although humans are part of nature, human acti...",English,1
4,4,[1] The word nature is borrowed from the Old F...,English,1
...,...,...,...,...
3276,5624,¿Tiene ella?,Spanish,4
3277,5625,Supongo que no querría más pan de oro ahora ¿e...,Spanish,4
3278,5626,"Terry, en realidad te pareces un poco a ese án...",Spanish,4
3279,5627,¿Cómo pudiste ser él?,Spanish,4


In [101]:
from sklearn.model_selection import train_test_split

In [103]:
X_train,X_test,y_train,y_test = train_test_split(data_lang['Text'],data_lang['label'],test_size=0.3,random_state=101)

In [105]:
X_train_vectorized = tf_vecotorizer.fit_transform(X_train)

In [106]:
from sklearn.linear_model import LogisticRegression

In [107]:
lr = LogisticRegression()

In [108]:
lr.fit(X_train_vectorized,y_train)

LogisticRegression()

In [109]:
X_test_vectorized = tf_vecotorizer.transform(X_test)

In [110]:
predictions = lr.predict(X_test_vectorized)

In [114]:
from sklearn.metrics import classification_report,confusion_matrix

In [113]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99       431
           2       1.00      1.00      1.00        15
           3       0.99      0.98      0.98       296
           4       0.99      1.00      0.99       243

    accuracy                           0.99       985
   macro avg       0.99      0.99      0.99       985
weighted avg       0.99      0.99      0.99       985



In [115]:
print(confusion_matrix(y_test,predictions))

[[428   0   3   0]
 [  0  15   0   0]
 [  4   0 290   2]
 [  0   0   1 242]]


In [116]:
test_sentence = '¿Cómo estás hoy y qué estás haciendo?'

In [118]:
lr.predict(tf_vecotorizer.transform([test_sentence]))[0]

4

In [122]:
def prediction():
    sentence = input("Enter your Sentence:")
    pred = lr.predict(tf_vecotorizer.transform([sentence]))
    if pred[0]==1:
        print("English Detected")
    elif pred[0]==2:
        print("Hindi Detected")
    elif pred[0]==3:
        print("French Detected")
    else:
        print("Spanish Detected")    

In [123]:
prediction()

Enter your Sentence: Comment allez-vous aujourd'hui et que faites-vous
French Detected


In [124]:
prediction()

Enter your Sentence: आज आप कैसे हैं और आप क्या कर रहे हैं
Hindi Detected


In [125]:
prediction()

Enter your Sentence:How are you today and what were you up to
English Detected


In [126]:
prediction()

Enter your Sentence:¿Cómo estás hoy y qué estabas haciendo?
Spanish Detected
