In [147]:
import string
import pandas as pd
import numpy as np

In [148]:
# Load the dataset into a DataFrame. The path is relative, so the CSV must be in
# the notebook's working directory.
data = pd.read_csv("Language Detection.csv")

In [149]:
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [150]:
data.shape

(10337, 2)

In [151]:
# Text cleaning relies on the standard library's string.punctuation constant;
# display it to see exactly which characters will be stripped.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [152]:
# Translation table that deletes every character in string.punctuation.
# Built once so each call makes a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punc(text):
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Only the characters listed in ``string.punctuation`` are removed;
    punctuation outside that set (e.g. non-ASCII marks) is left in place.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        ``text`` with ASCII punctuation deleted, converted to lowercase.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

In [153]:
# Run remove_punc over every entry of the Text column and store the cleaned
# strings back into the same column.
data['Text'] = data['Text'].map(remove_punc)

In [154]:
data.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


In [155]:
data.columns

Index(['Text', 'Language'], dtype='object')

In [156]:
import sklearn
from sklearn.model_selection import train_test_split

In [157]:
# Select the features (text) and targets (language label) by column name so the
# assignment does not depend on the column order of the CSV.
X = data['Text']
Y = data['Language']

In [158]:
X

0         nature in the broadest sense is the natural p...
1        nature can refer to the phenomena of the physi...
2        the study of nature is a large if not the only...
3        although humans are part of nature human activ...
4        1 the word nature is borrowed from the old fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [159]:
Y

0        English
1        English
2        English
3        English
4        English
          ...   
10332    Kannada
10333    Kannada
10334    Kannada
10335    Kannada
10336    Kannada
Name: Language, Length: 10337, dtype: object

In [160]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and any metric computed from it, is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [161]:
X_train.shape

(8269,)

In [162]:
X_test.shape

(2068,)

In [163]:
Y_train.shape

(8269,)

In [164]:
Y_test.shape

(2068,)

In [165]:
#Vectorization of the dataset
from sklearn import feature_extraction

In [166]:
# Character-level TF-IDF features built from single characters and character pairs.
vec = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
)

In [167]:
from sklearn import pipeline
from sklearn import linear_model

In [168]:
# Chain the vectorizer with a default-configured logistic regression so that
# strings can be passed directly to fit/predict.
model_pipeline = pipeline.Pipeline(
    steps=[
        ('vec', vec),
        ('clf', linear_model.LogisticRegression()),
    ]
)

In [169]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split.
model_pipeline.fit(X_train,Y_train)

In [170]:
# Predict a language label for every text in the held-out test split.
predicted_values = model_pipeline.predict(X_test)

In [171]:
predicted_values

array(['Russian', 'Portugeese', 'Turkish', ..., 'English', 'Portugeese',
       'Kannada'], dtype=object)

In [172]:
from sklearn.metrics import accuracy_score

In [173]:
# Share of test samples whose predicted label matches the true label,
# displayed as a percentage.
model_accuracy = accuracy_score(y_true=Y_test, y_pred=predicted_values)
100 * model_accuracy

97.96905222437138

In [174]:
# The pipeline was fitted on text cleaned by remove_punc, so apply the same
# cleaning to ad-hoc inputs to keep inference consistent with training.
model_pipeline.predict([remove_punc('My name is Jay.')])


array(['English'], dtype=object)

In [175]:
# The pipeline was fitted on text cleaned by remove_punc, so apply the same
# cleaning to ad-hoc inputs to keep inference consistent with training.
model_pipeline.predict([remove_punc('वाक्य बड़े अक्षर से शुरू होता है.')])

array(['Hindi'], dtype=object)

In [176]:
#saving the trained model so the web-app can load it
import pickle

In [177]:
# Serialize the fitted pipeline to model.pckl. The context manager closes the
# file even if pickling raises, and keeps open/dump/close in a single cell so
# the handle cannot be left open by a partial run.
# NOTE: this stores only the pipeline; callers that load it must apply the same
# text cleaning (remove_punc) before calling predict.
with open('model.pckl','wb') as new_file:
    pickle.dump(model_pipeline,new_file)