<a href="https://colab.research.google.com/github/prajwal0210/language-detection/blob/main/Language_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Language Detection model.**

# Imporitng libraries

In [1]:
import string
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
df = pd.read_csv('/content/Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


# **Text cleaning**

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def remove_pun(text):
  for pun in string.punctuation:
    text = text.replace(pun,"")
  text = text.lower()
  return (text)

In [10]:
remove_pun('"Nature" can refer to the phenomena of the phy...')

'nature can refer to the phenomena of the phy'

**As you can see the text is cleaner now**

In [11]:
df['Text'] = df['Text'].apply(remove_pun)

In [12]:
df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


# **spliting the dataset**

In [13]:
x = df.iloc[:,0]
y = df.iloc[:,1]

In [14]:
x

0         nature in the broadest sense is the natural p...
1        nature can refer to the phenomena of the physi...
2        the study of nature is a large if not the only...
3        although humans are part of nature human activ...
4        1 the word nature is borrowed from the old fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [15]:
y

0        English
1        English
2        English
3        English
4        English
          ...   
10332    Kannada
10333    Kannada
10334    Kannada
10335    Kannada
10336    Kannada
Name: Language, Length: 10337, dtype: object

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [17]:
vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2), analyzer='char')

In [20]:
model_pipe = pipeline.Pipeline([('vec', vec), ('clf', linear_model.LogisticRegression())])

In [21]:
model_pipe.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1

In [22]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

**As you see there are this many languages in the dataset**

In [23]:
predicted_val = model_pipe.predict(x_test)

In [24]:
#accuracy score
metrics.accuracy_score(y_test, predicted_val)

0.9777562862669246

**our model is 97% accurate as you can see**

# **Testing of model**

In [26]:
model_pipe.predict(['Hello my name is prajwal'])

array(['English'], dtype=object)

In [27]:
model_pipe.predict(['ऐसे शब्दों के समूह को कहते हैं जो एक पूर्ण विचार को व्यक्त करता है। वाक्य चाहे कितना भी छोटा या बड़ा हो उससे एक पूर्ण विचार व्यक्त होना चाहिए। एक पूर्ण वाक्य बनने के लिए, वाक्य के भीतर कम से कम एक स्वतंत्र उपवाक्य होना चाहिए '])

array(['Hindi'], dtype=object)

In [28]:
model_pipe.predict(['வானம் முழுவதும் நட்சத்திரங்கள் இருக்கின்றன'])

array(['Tamil'], dtype=object)

In [29]:
model_pipe.predict(['Encantado de conocerte'])

array(['Spanish'], dtype=object)