In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")

In [2]:
# import csv 
df = pd.read_csv("../datasets/Language Detection.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [4]:
# check for null values
df.isnull().sum()

Text        0
Language    0
dtype: int64

In [5]:
# check for duplicates
df.duplicated().sum()

66

In [6]:
# remove duplicates
df.drop_duplicates(inplace=True)

In [7]:
# check for duplicates
df.duplicated().sum()

0

In [8]:
# check for unique values
df['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [9]:
# lets see few entries per language
df[df['Language'] == 'English'].head(5)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [10]:

df[df['Language'] == 'French'].head(5)

Unnamed: 0,Text,Language
3250,Si vous disposez d'ouvrages ou d'articles de r...,French
3251,Comment ajouter mes sources ?,French
3252,Cette page ou section est en train d'être trad...,French
3253,Vous pouvez aider au développement de Wikipédi...,French
3254,Le mot nature est un terme polysémique (c’est-...,French


In [11]:

df[df['Language'] == 'Hindi'].head(5)

Unnamed: 0,Text,Language
1979,विकि-शब्दकोष (एक मुक्त शब्दकोष एवं समानांतर को...,Hindi
1980,"[42] अंत में, विकिपीडिया एक पक्ष नहीं लेता है।...",Hindi
1981,बोट्स नामक कंप्यूटर प्रोग्राम के निर्माण के बा...,Hindi
1982,"""""नहीं, हम नहीं जानते"", जिमी ने कहा.",Hindi
1983,[60] कुछ आलोचकों का दावा है कि विकिपीडिया की ख...,Hindi


In [12]:
X = df["Text"]
y = df["Language"]

In [13]:
# label encoding

le = LabelEncoder()
y = le.fit_transform(y)

le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [14]:
# preprocessing the text

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text) # remove punctuations
    text = re.sub(r'[[]]', ' ', text) # remove square brackets
    text = re.sub(r"\s+"," ",text) # remove extra spaces
    text = re.sub(r"\'s"," ",text) # remove 's
    return text

In [15]:

X = X.apply(preprocess)

In [16]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Bag of words on text data

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [18]:
X_train.shape, X_test.shape

((8216, 34527), (2055, 34527))

In [19]:
# Model building

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [20]:
# Model evaluation

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = rf.predict(X_test)

In [21]:
print('train accuracy: ', accuracy_score(y_train, rf.predict(X_train)))

print('train classification report: ', classification_report(y_train, rf.predict(X_train)))

train accuracy:  0.9982960077896786
train classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       411
           1       1.00      1.00      1.00       339
           2       1.00      1.00      1.00       434
           3       1.00      1.00      1.00      1092
           4       1.00      1.00      1.00       797
           5       1.00      1.00      1.00       379
           6       1.00      1.00      1.00       293
           7       1.00      1.00      1.00        54
           8       1.00      1.00      1.00       564
           9       0.96      1.00      0.98       296
          10       1.00      1.00      1.00       469
          11       1.00      1.00      1.00       597
          12       1.00      1.00      1.00       569
          13       1.00      1.00      1.00       656
          14       1.00      1.00      1.00       523
          15       1.00      0.98      0.99       378
          16   

In [22]:

print('test accuracy is {}', accuracy_score(y_test, y_pred))

print('test classification report is {}', classification_report(y_test, y_pred))

test accuracy is {} 0.9318734793187348
test classification report is {}               precision    recall  f1-score   support

           0       1.00      0.91      0.95       121
           1       0.95      0.93      0.94        85
           2       0.98      0.92      0.95       108
           3       0.98      0.97      0.98       290
           4       0.99      0.91      0.95       210
           5       0.99      0.93      0.96        86
           6       1.00      0.91      0.95        65
           7       1.00      0.88      0.93         8
           8       0.96      0.92      0.94       130
           9       0.41      1.00      0.58        70
          10       1.00      0.95      0.97       122
          11       0.98      0.92      0.95       139
          12       1.00      0.93      0.97       119
          13       0.91      0.93      0.92       160
          14       0.97      0.94      0.96       150
          15       1.00      0.98      0.99        86
         

In [23]:


def predict_language(text):
    text = preprocess(text)
    text = cv.transform([text])
    return le.inverse_transform(rf.predict(text))[0]

In [24]:
# Check for a random sentence
predict_language("Hello, how are you?")

'English'

In [25]:
predict_language("Bonjour, comment allez-vous?")

'French'

In [26]:
predict_language("आप कैसे हैं?")

'Hindi'

> seems pretty good so going ahead and saving to deploy

In [28]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('classifier', RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1))
])

pipeline.fit(X, y) # train on entire data

In [31]:
# quick check for a random sentence on the pipeline

sample_text = "आप कैसे हैं?"

#preprocess
sample_text = preprocess(sample_text)
le.inverse_transform(pipeline.predict([sample_text]))[0]


'Hindi'

In [32]:
# saving the pipeline locally. Rewrite if already exists

pickle.dump(pipeline, open("../app/model/language_detection_pipeline.pkl", "wb"))


In [33]:
# save the label encoder
pickle.dump(le, open("../app/model/label_encoder.pkl", "wb"))