In [1]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the labelled corpus from the CSV next to the notebook and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Language Detection.csv')
df.head(n=5)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [5]:
# Display the ASCII punctuation characters that remove_pun (defined in a later cell) strips out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# Translation table that deletes every character in string.punctuation.
# Built once here so it is not rebuilt for every row passed through remove_pun.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_pun(text):
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Only the characters in ``string.punctuation`` are removed; punctuation
    outside that set (for example ``¿``) is left in place.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without any ``string.punctuation`` characters, lowercased.

    Raises
    ------
    AttributeError
        If ``text`` is not a string (for example a float NaN from an empty cell).
    """
    # A single translate() pass replaces one str.replace() call per punctuation character.
    return text.translate(_PUNCT_TABLE).lower()

In [9]:
# Clean every row with remove_pun. This overwrites the 'Text' column, so the
# original (punctuated, mixed-case) text is no longer available from df afterwards.
df['Text'] = df['Text'].apply(remove_pun)

In [10]:
from sklearn.model_selection import train_test_split

In [12]:
# Select features and labels by column name rather than by position, so the
# assignment cannot silently pick up a different column if the CSV layout changes.
X = df['Text']        # text documents (already passed through remove_pun)
Y = df['Language']    # language label of each document

0         nature in the broadest sense is the natural p...
1        nature can refer to the phenomena of the physi...
2        the study of nature is a large if not the only...
3        although humans are part of nature human activ...
4        1 the word nature is borrowed from the old fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object
0        English
1        English
2        English
3        English
4        English
          ...   
10332    Kannada
10333    Kannada
10334    Kannada
10335    Kannada
10336    Kannada
Name: Language, Length: 10337, dtype: object


In [15]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and therefore the accuracy computed from it, is the same on every re-run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [16]:
from sklearn import feature_extraction

In [17]:
# TF-IDF features computed over character unigrams and bigrams.
vec = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
)

In [22]:
from sklearn import pipeline
from sklearn import linear_model

In [25]:
# Chain the vectorizer and a default-configured logistic regression into one
# estimator, so raw strings can be passed straight to fit() and predict().
model_pipe = pipeline.Pipeline(
    steps=[
        ('vec', vec),
        ('clf', linear_model.LogisticRegression()),
    ]
)

In [26]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split only.
model_pipe.fit(X_train,Y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

In [28]:
# List the class labels the fitted pipeline can predict.
# NOTE(review): spellings such as 'Portugeese' and 'Sweedish' in the output presumably
# come verbatim from the dataset's Language column -- confirm before relying on exact names.
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [30]:
# Predict a language label for every document in the held-out split.
predict_val = model_pipe.predict(X_test)

In [32]:
from sklearn import metrics

In [34]:
# Share of held-out documents whose predicted label matches the true one, as a percentage.
100 * metrics.accuracy_score(y_true=Y_test, y_pred=predict_val)

97.63056092843327

In [39]:
# Spot-check the fitted pipeline on a single hand-written English sentence.
model_pipe.predict(['My name is Shubham'])

array(['English'], dtype=object)

In [40]:
# Spot-check on a hand-written, German-looking variant of the same sentence.
model_pipe.predict(['Mine name ist Shubham'])

array(['German'], dtype=object)

In [41]:
import pickle

In [45]:
# Persist the fitted pipeline to disk. The context manager guarantees the file is
# flushed and closed even if pickle.dump raises part-way through.
# NOTE: only unpickle this file from a trusted location -- pickle.load can execute arbitrary code.
with open('language_detection_model.pckl', 'wb') as new_file:
    pickle.dump(model_pipe, new_file)