In [14]:
import string
import pickle
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import feature_extraction
from sklearn import pipeline
from sklearn import linear_model
from sklearn import metrics

In [15]:
df = pd.read_csv('Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


# **Preprocessing the data**

In [16]:
def remove_pun(text):
    for pun in string.punctuation:
        text = text.replace(pun,'')
    text.lower()
    return (text)
df['Text'].apply(remove_pun)

0         Nature in the broadest sense is the natural p...
1        Nature can refer to the phenomena of the physi...
2        The study of nature is a large if not the only...
3        Although humans are part of nature human activ...
4        1 The word nature is borrowed from the Old Fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [17]:
X = df.iloc[:,0]
y = df.iloc[:,1]

# **Splitting the data**

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

# **Traing the model**

In [19]:
vec = feature_extraction.text.TfidfVectorizer(ngram_range = (1,2), analyzer = 'char')

In [20]:
model_pip = pipeline.Pipeline([('vec',vec),('clf',linear_model.LogisticRegression())])

In [21]:
model_pip.fit(X_train,y_train)

In [22]:
model_pip.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [23]:
predicted_label = model_pip.predict(X_test)

In [24]:
metrics.accuracy_score(y_test,predicted_label)

0.9821083172147002

In [25]:
metrics.confusion_matrix(y_test,predicted_label)

array([[109,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  66,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   1, 118,   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0],
       [  0,   0,   0, 290,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   0,   0,   0, 195,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   3,   1,   1,  96,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   1],
       [  0,   0,   0,   0,   0,   0,  72,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  11,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   1,   0,   0,   0, 126,   0,   0,   0,   0,
          2,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  73,   0,   0,   0,
         

# **Saving the model**

In [26]:
new_file = open('model.pckl','wb')
pickle.dump(model_pip,new_file)

In [27]:
from joblib import dump
dump(model_pip,'model.joblib')

['model.joblib']