In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")

In [146]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split

import tqdm

In [8]:
# import csv 
df = pd.read_csv("../datasets/Language Detection.csv")

In [13]:
df.shape, df.columns

((10337, 2), Index(['Text', 'Language'], dtype='object'))

In [26]:
df.Language.nunique()

17

In [68]:
# create a torch dataset.
class LanguageDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.text = df["Text"].values
        self.label = df["Language"].values
        self.label_encoder = LabelEncoder()
        self.label = self.label_encoder.fit_transform(self.label)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.text[idx]
        label = self.label[idx]
        return text, label

In [69]:

dataset_df = LanguageDataset(df)

# split the dataset into train and test
num_train = int(len(dataset_df) * 0.8)
split_train_, split_valid_ = \
    random_split(dataset_df, [num_train, len(dataset_df) - num_train])


In [70]:
len(split_train_), len(split_valid_), len(dataset_df)

(8269, 2068, 10337)

In [76]:
# check few entries of the dataset
for i in range(5):
    print(split_train_[i])
    

("L'apprendimento automatico e il data mining infatti si sovrappongono in modo significativo, ma mentre l'apprendimento automatico si concentra sulla previsione basata su proprietà note apprese dai dati, il data mining si concentra sulla scoperta di proprietà prima sconosciute nei dati.", 8)
('Ich habe versucht, Ihnen zu helfen, konnte aber das Ergebnis nicht erreichen, das Sie sagen können.', 5)
('Υπάρχουν τρία κύρια χαρακτηριστικά του σχεδίου της Βικιπαίδειας, τα οποία σε συνδυασμό ορίζουν τον ιστό του στο Διαδίκτυο (World Wide Web) και το κάνουν μοναδικό: H Βικιπαίδεια, όπως και η Nupedia (άλλο ένα σχέδιο ελεύθερης και δωρεάν εγκυκλοπαίδειας), υποστηρίζεται από τον υπέρμαχο του ελεύθερου και δωρεάν λογισμικού Ρίτσαρντ Στόλλμαν και το Ίδρυμα Ελεύθερου Λογισμικού.', 6)
('[29] Larry Sanger e Jimmy Wales fundaram a Wikipédia.', 11)
('In addition to performing linear classification, SVMs can efficiently perform a non-linear classification using what is called the kernel trick, implicitly

In [77]:
# preprocessing the text

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text) # remove punctuations
    text = re.sub(r'[[]]', ' ', text) # remove square brackets
    text = re.sub(r"\s+"," ",text) # remove extra spaces
    text = re.sub(r"\'s"," ",text) # remove 's
    return text

In [78]:
# NgramTokenizer

class NgramTokenizer:
    def __init__(self, ngram=1):
        self.ngram = ngram

    def __call__(self, text):
        #preprocess
        text = preprocess(text)
        # split the text into words
        words = text.split()
        # create ngrams
        ngrams = zip(*[words[i:] for i in range(self.ngram)])
        return [" ".join(ngram) for ngram in ngrams]

In [79]:
# unigram tokenizer
tokenizer = NgramTokenizer(ngram=1)
tokenizer("Hello world how are you")

['hello', 'world', 'how', 'are', 'you']

In [80]:
# create a vocabulary
from collections import Counter
from torchtext.vocab import vocab

def create_vocab(tokenizer, dataset, min_freq=1):
    counter = Counter()
    for text, _ in dataset:
        counter.update(tokenizer(text))

    print(f"Unique tokens in dataset: {len(counter)}")
    print('few of them: ', counter.most_common(10))
    return vocab(counter, min_freq=min_freq)

In [81]:
# create a vocab
vocab = create_vocab(tokenizer, split_train_,1)

Unique tokens in dataset: 45042
few of them:  [('de', 2781), ('a', 1514), ('the', 1298), ('en', 1235), ('la', 1189), (']', 1122), ('[', 1120), ('que', 1010), ('in', 974), ('of', 919)]


In [82]:
vocab_size = len(vocab) # vocab size

In [83]:
def encode(x,voc=None,unk=0,tokenizer=tokenizer):
    v = vocab if voc is None else voc
    return [v.get_stoi().get(s,unk) for s in tokenizer(x)]

In [84]:
#checkin the encode function
encode("Hello world how are you, लिए")

[27080, 62, 4063, 545, 542, 305]

In [117]:
def to_bow(text,bow_vocab_size=vocab_size):
    res = torch.zeros(bow_vocab_size,dtype=torch.long)
    for i in encode(text):
        if i<bow_vocab_size:
            res[i] += 1
    return res

print(f"sample text:\n{split_train_[0][0]}")
print(f"\nBoW vector:\n{to_bow(split_train_[0][0])}")


sample text:
L'apprendimento automatico e il data mining infatti si sovrappongono in modo significativo, ma mentre l'apprendimento automatico si concentra sulla previsione basata su proprietà note apprese dai dati, il data mining si concentra sulla scoperta di proprietà prima sconosciute nei dati.

BoW vector:
tensor([2, 2, 1,  ..., 0, 0, 0])


In [118]:
def bowify(b):
    # creates a bag of words representation of a batch of text
    return (
            torch.stack([to_bow(t[0]) for t in b]),
            # torch integer tensor of the labels
                torch.tensor([t[1] for t in b], dtype=torch.int64)
    )


In [157]:
# dataloader
data_loader_train = DataLoader(split_train_, batch_size=32, shuffle=True, collate_fn=bowify)
data_loader_valid = DataLoader(split_valid_, batch_size=32, shuffle=False, collate_fn=bowify)

In [158]:
# check the dataloader output
for text, label in data_loader_train:
    print(text)
    print(text.shape)
    print(label)
    print(len(text), len(label))
    break

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
torch.Size([32, 45042])
tensor([13,  1,  6,  1, 13,  8,  4, 14, 15,  4, 15, 14, 13,  6,  6, 13,  2, 10,
         8, 11, 11,  5, 13, 14, 15,  1,  3,  5,  4,  3,  8,  5])
32 32


In [159]:
vocab_size

45042

In [184]:
# lets build a pytorch lightning model for classification
class LanguageDetectionModel(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, num_classes):
        super(LanguageDetectionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, 10)
        self.fc2 = nn.Linear(10, num_classes)

    def forward(self, x):
        x = self.embedding(x) # embeddin 
        x = torch.mean(x, dim=1) # average pooling, dimension 1 is the batch dimension
        x = F.relu(self.fc1(x)) # relu activation
        x = self.fc2(x)
        return x

    def training_step(self, batch, batch_idx):
        text, label = batch
        label = label.long() # convert to long
        logits = self(text)
        loss = F.cross_entropy(logits, label) # calculate loss
        self.log("train_loss", loss, prog_bar=True) # log the loss
        return loss

    def validation_step(self, batch, batch_idx):
        text, label = batch
        label = label.long()
        logits = self(text)
        loss = F.cross_entropy(logits, label)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def train_dataloader(self):
        return data_loader_train

    def val_dataloader(self):
        return data_loader_valid

In [185]:
model = LanguageDetectionModel(vocab_size=vocab_size, embedding_dim=16, num_classes=17)


In [186]:
# one batch of data check
for text, label in data_loader_train:
    print(text.shape)
    print(label.shape)
    out = model(text)
    print(out.shape)
    #validation loss
    loss = F.cross_entropy(out, label.long())
    print(loss)
    break


torch.Size([32, 45042])
torch.Size([32])
torch.Size([32, 17])
tensor(2.8838, grad_fn=<NllLossBackward0>)


In [193]:
trainer = pl.Trainer(max_epochs=1, log_every_n_steps=5, enable_progress_bar=True, fast_dev_run=True)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [194]:
# training the model
trainer.fit(model)


  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 720 K 
1 | fc1       | Linear    | 170   
2 | fc2       | Linear    | 187   
----------------------------------------
721 K     Trainable params
0         Non-trainable params
721 K     Total params
2.884     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [4]:
# check for null values
df.isnull().sum()

Text        0
Language    0
dtype: int64

In [5]:
# check for duplicates
df.duplicated().sum()

66

In [6]:
# remove duplicates
df.drop_duplicates(inplace=True)

In [7]:
# check for duplicates
df.duplicated().sum()

0

In [8]:
# check for unique values
df['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [9]:
# lets see few entries per language
df[df['Language'] == 'English'].head(5)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [10]:

df[df['Language'] == 'French'].head(5)

Unnamed: 0,Text,Language
3250,Si vous disposez d'ouvrages ou d'articles de r...,French
3251,Comment ajouter mes sources ?,French
3252,Cette page ou section est en train d'être trad...,French
3253,Vous pouvez aider au développement de Wikipédi...,French
3254,Le mot nature est un terme polysémique (c’est-...,French


In [11]:

df[df['Language'] == 'Hindi'].head(5)

Unnamed: 0,Text,Language
1979,विकि-शब्दकोष (एक मुक्त शब्दकोष एवं समानांतर को...,Hindi
1980,"[42] अंत में, विकिपीडिया एक पक्ष नहीं लेता है।...",Hindi
1981,बोट्स नामक कंप्यूटर प्रोग्राम के निर्माण के बा...,Hindi
1982,"""""नहीं, हम नहीं जानते"", जिमी ने कहा.",Hindi
1983,[60] कुछ आलोचकों का दावा है कि विकिपीडिया की ख...,Hindi


In [12]:
X = df["Text"]
y = df["Language"]

In [13]:
# label encoding

le = LabelEncoder()
y = le.fit_transform(y)

le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [15]:

X = X.apply(preprocess)

In [16]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Bag of words on text data

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [18]:
X_train.shape, X_test.shape

((8216, 34527), (2055, 34527))

In [19]:
# Model building

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [20]:
# Model evaluation

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = rf.predict(X_test)

In [21]:
print('train accuracy: ', accuracy_score(y_train, rf.predict(X_train)))

print('train classification report: ', classification_report(y_train, rf.predict(X_train)))

train accuracy:  0.9982960077896786
train classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       411
           1       1.00      1.00      1.00       339
           2       1.00      1.00      1.00       434
           3       1.00      1.00      1.00      1092
           4       1.00      1.00      1.00       797
           5       1.00      1.00      1.00       379
           6       1.00      1.00      1.00       293
           7       1.00      1.00      1.00        54
           8       1.00      1.00      1.00       564
           9       0.96      1.00      0.98       296
          10       1.00      1.00      1.00       469
          11       1.00      1.00      1.00       597
          12       1.00      1.00      1.00       569
          13       1.00      1.00      1.00       656
          14       1.00      1.00      1.00       523
          15       1.00      0.98      0.99       378
          16   

In [22]:

print('test accuracy is {}', accuracy_score(y_test, y_pred))

print('test classification report is {}', classification_report(y_test, y_pred))

test accuracy is {} 0.9318734793187348
test classification report is {}               precision    recall  f1-score   support

           0       1.00      0.91      0.95       121
           1       0.95      0.93      0.94        85
           2       0.98      0.92      0.95       108
           3       0.98      0.97      0.98       290
           4       0.99      0.91      0.95       210
           5       0.99      0.93      0.96        86
           6       1.00      0.91      0.95        65
           7       1.00      0.88      0.93         8
           8       0.96      0.92      0.94       130
           9       0.41      1.00      0.58        70
          10       1.00      0.95      0.97       122
          11       0.98      0.92      0.95       139
          12       1.00      0.93      0.97       119
          13       0.91      0.93      0.92       160
          14       0.97      0.94      0.96       150
          15       1.00      0.98      0.99        86
         

In [23]:


def predict_language(text):
    text = preprocess(text)
    text = cv.transform([text])
    return le.inverse_transform(rf.predict(text))[0]

In [24]:
# Check for a random sentence
predict_language("Hello, how are you?")

'English'

In [25]:
predict_language("Bonjour, comment allez-vous?")

'French'

In [26]:
predict_language("आप कैसे हैं?")

'Hindi'

> seems pretty good so going ahead and saving to deploy

In [28]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('classifier', RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1))
])

pipeline.fit(X, y) # train on entire data

In [31]:
# quick check for a random sentence on the pipeline

sample_text = "आप कैसे हैं?"

#preprocess
sample_text = preprocess(sample_text)
le.inverse_transform(pipeline.predict([sample_text]))[0]


'Hindi'

In [32]:
# saving the pipeline locally. Rewrite if already exists

pickle.dump(pipeline, open("../app/model/language_detection_pipeline.pkl", "wb"))


In [33]:
# save the label encoder
pickle.dump(le, open("../app/model/label_encoder.pkl", "wb"))