In [128]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

Loading Data

In [129]:
df = pd.read_csv(r"D:\projects\data\NLP_data\Language Detection.csv")

In [130]:
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


Checking null values

In [157]:
df.isnull().sum()

Text        0
Language    0
dtype: int64

Removing punctuation and convert text into lower words

In [132]:
try:
    def remove_punctuation(text):
        for punctuation in string.punctuation:
            text = text.replace(punctuation,"")
        text = text.lower()
        return text
except:
    print('e')    

In [133]:
df['Text']= df['Text'].apply(remove_punctuation)

In [134]:
df['Text']

0         nature in the broadest sense is the natural p...
1        nature can refer to the phenomena of the physi...
2        the study of nature is a large if not the only...
3        although humans are part of nature human activ...
4        1 the word nature is borrowed from the old fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

Data splitting into train and test 

In [135]:
from sklearn.model_selection import train_test_split

In [136]:
X = df.iloc[:,0]
Y = df.iloc[:,1]

In [137]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)

Convert data into a matrix of TF-IDF features

In [138]:
from sklearn import feature_extraction
vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2),analyzer='char')

In [139]:
vec

Create pipeline and build Logistic Regresssion model

In [140]:
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression

In [143]:
model_pipe = pipeline.Pipeline([('vec', vec), ('clf', linear_model.LogisticRegression())])

In [144]:
model_pipe.fit(X_train,Y_train)

In [158]:
y_pred = model_pipe.predict(X_test)

In [159]:
y_pred

array(['Portugeese', 'English', 'Portugeese', ..., 'Portugeese', 'Dutch',
       'English'], dtype=object)

In [147]:
from sklearn import metrics

Finding Accuracy

In [163]:
metrics.accuracy_score(Y_test, y_pred)*100

97.82398452611218

In [164]:
metrics.confusion_matrix(Y_test, y_pred)

array([[100,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  96,   1,   4,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   4,   0,   0],
       [  0,   1, 107,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0, 263,   0,   0,   0,   0,   1,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0, 210,   0,   0,   0,   0,   0,   0,   1,   0,
          1,   0,   0,   0],
       [  0,   1,   2,   2,   1,  99,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,  70,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  12,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   1,   1,   1,   0,   0,   0, 133,   0,   0,   0,   0,
          4,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  79,   0,   0,   0,
         

Testing our Model

In [150]:
model_pipe.predict(['My name is Shahin'])

array(['English'], dtype=object)

In [151]:
model_pipe.predict(['मेरा नाम शाहीन है'])

array(['Hindi'], dtype=object)

In [152]:
model_pipe.predict(['আমার নাম শাহিন'])

array(['Danish'], dtype=object)

In [153]:
model_pipe.predict(['l hábito no hace al monje.'])

array(['Spanish'], dtype=object)

In [154]:
model_pipe.predict(['De liefde is als de wind, je kunt het niet zien maar wel voelen'])

array(['Dutch'], dtype=object)

In [155]:
model_pipe.predict([' മലയാള പ്രചോദന ഉദ്ധരണികൾ'])

array(['Malayalam'], dtype=object)