# Import data

In [1]:
import pandas as pd

In [2]:
# Read the sentence/language table from the CSV next to the notebook,
# decoding the file as cp1252.
dataset = pd.read_csv(
    r'local lang.csv',
    encoding='cp1252',
)

In [3]:
dataset

Unnamed: 0,Sentences,Language
0,Oke oka lokam nuvve,Telugu
1,Lokamlona andham nuvve,Telugu
2,Andhaanike hrudhayam nuvve naake andhaave,Telugu
3,Ekaa eki kopam nuvve,Telugu
4,Kopamlona deepam nuvve,Telugu
...,...,...
448,The Jamaican mojitos are delicious.,English
449,Which are small and not worth the price.,English
450,the food is rich so order accordingly.,English
451,The shower area is outside so you can only rin...,English


# Text Preprocessing 

In [4]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

wc = WordNetLemmatizer()

# Compiled once instead of re-parsing the pattern for every sentence.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def clean_sentence(text):
    """Normalise one raw sentence into the form stored in `corpus`.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, each token is passed through the
    WordNet lemmatizer, and the tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The cleaned, space-separated sentence.
    """
    tokens = _NON_LETTERS.sub(' ', text).lower().split()
    return " ".join(wc.lemmatize(token) for token in tokens)


# Iterate over the column values directly rather than looking each row up by
# the integer label `i`; the label lookup only works when the DataFrame index
# happens to be exactly 0..n-1.
corpus = [clean_sentence(sentence) for sentence in dataset['Sentences']]

In [5]:
corpus

['oke oka lokam nuvve',
 'lokamlona andham nuvve',
 'andhaanike hrudhayam nuvve naake andhaave',
 'ekaa eki kopam nuvve',
 'kopamlona deepam nuvve',
 'deepam leni veluthuru nuvve',
 'pranaannilaa veliginchaave',
 'ninnu ninnugaa preminchanaa',
 'nannu nannugaa andhinchanaa',
 'anni velalaa thodundanaa',
 'janma janmalaa jantavvanaa',
 'kaalamanthaa neeke nenu kaavalundanaa',
 'oh kallathoti nithyam',
 'ninne kougilinchanaa',
 'nuvvu naatho maatlaadavaa',
 'nuvvu elaa naatho maatlaadavo choosthaa',
 'nuvvu phone cheyakunte nenu neetho maatlaadanu',
 'ninnane vacchaavu mallee eeroju endhuku vacchaavu',
 'nenu neetho maatlaadaalani anukunnaanu andhuke vacchaanu',
 'nuvvu em maatlaadaali',
 'athadu neeku dabbulu icchaadanta kadhaa athadu dabbulu thirigi iccheyamannaadu',
 'nenu dabbulu repu isthaanannaanani cheppu',
 'aahaarame oushadham meaning in english',
 'andhuke manchi aahaaram theesukunte hospital ki vellaalsina avasaram undadhu',
 'mandhulu vesukunnaavaa',
 'indhaake vesukunnaa',
 

# Feature Representation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over unigrams and bigrams of the cleaned sentences,
# materialised as a dense array.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split performed later in the notebook, so vocabulary/IDF
# statistics of the held-out rows leak into the features. Consider fitting
# on the training rows only.
tf = TfidfVectorizer(ngram_range=(1, 2))
x = tf.fit_transform(corpus).toarray()

# Binary target: 1 = Telugu, 0 = every other label (English in this dataset).
# The class is selected by name instead of by the position of a dummy column,
# so the encoding does not depend on how the labels sort, and the dtype is
# pinned to uint8 regardless of the dtype get_dummies would return.
y = (dataset['Language'] == 'Telugu').astype('uint8').values

In [7]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Split the data

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)

# Building a model using Naive Bayes Classifier

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training split.
lang_detect_model = MultinomialNB()
lang_detect_model.fit(x_train, y_train)

In [11]:
# Predicted labels for the held-out test rows.
y_pred = lang_detect_model.predict(x_test)

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
# Predicted labels for the rows the model was fitted on.
y_pred_train = lang_detect_model.predict(x_train)

In [14]:
# Accuracy on the training split, expressed as a percentage.
train_accuracy_pct = accuracy_score(y_train, y_pred_train) * 100
print("Train accuracy score:", train_accuracy_pct, "%")

Train accuracy score: 100.0 %


In [15]:
# Accuracy on the held-out test split, expressed as a percentage.
test_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Test Accuracy score:", test_accuracy_pct, "%")

Test Accuracy score: 97.82608695652173 %


In [16]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test split (scikit-learn puts true labels on the
# rows and predicted labels on the columns).
cm_test = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [17]:
cm_test

array([[23,  1],
       [ 0, 22]], dtype=int64)

# Real-Time Predictions

In [18]:
# Sentence to classify. An alternative sample is kept below; uncomment it
# (and comment out the active line) to try it instead.
# review="ekkada untunaru"
review = "she is a good girl"

In [19]:
# Apply the same cleaning that was used to build `corpus` (letters only,
# lower-case, WordNet lemmatization) before vectorizing, so the sample is
# normalised the same way as the sentences the model was trained on.
# NOTE: `review` is rebound from the raw string to its feature matrix because
# the next cell predicts on `review`; re-run the cell that defines the sample
# sentence before re-running this one.
review_tokens = re.sub('[^a-zA-Z]', ' ', review).lower().split()
review_clean = " ".join(wc.lemmatize(token) for token in review_tokens)
review = tf.transform([review_clean]).toarray()

In [20]:
# Predict the class of the vectorized sample; per the legend in the final
# cell, 1 means Telugu and 0 means English.
lang_detect_model.predict(review)

array([0], dtype=uint8)

In [None]:
# Label encoding used by the model:
#   1 -> Telugu
#   0 -> English