In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Language Detection.csv')

In [3]:
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
df['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [5]:
import nltk
import string
import re


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [6]:

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    
    stems = [stemmer.stem(word) for word in filtered_text]
    preprocessed_text = ' '.join(stems)
    return  preprocessed_text
 

In [7]:
df['Text'] =df['Text'].apply(preprocess)

In [8]:
df['Text'].head()

0    natur broadest sens natur physic materi world ...
1    natur refer phenomena physic world also life g...
2                         studi natur larg part scienc
3    although human part natur human activ often un...
4    word natur borrow old french natur deriv latin...
Name: Text, dtype: object

In [9]:
X = df['Text']
Y= df['Language']

In [11]:
from sklearn.model_selection import train_test_split as tts
x_train,x_test,y_train,y_test = tts(X,Y,test_size=0.3,random_state=42)

In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tv = TfidfVectorizer(max_features=8000)
x_train_tv = tv.fit_transform(x_train).toarray()
x_test_tv = tv.transform(x_test).toarray()

In [49]:
cv = CountVectorizer(max_features=8000)
x_train_cv = cv.fit_transform(x_train).toarray()
x_test_cv = cv.transform(x_test).toarray()

In [50]:
def model_maker(model):
    clf=model
    clf.fit(x_train_cv,y_train_encoded)
    y_pred=clf.predict(x_test_cv)
    print(accuracy_score(y_test_encoded,y_pred))
    print(f1_score(y_test_encoded,y_pred,average='micro'))

In [51]:
from sklearn.naive_bayes import GaussianNB , MultinomialNB , BernoulliNB

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score , precision_score , f1_score
gb = GaussianNB()
mb = MultinomialNB()
bb = BernoulliNB()
forest=RandomForestClassifier()

In [52]:
model_maker(mb)

0.9664732430689877
0.9664732430689877


In [53]:
model_maker(bb)

0.8020631850419084
0.8020631850419084


In [54]:
model_maker(gb)

0.9606705351386202
0.9606705351386202


In [55]:
model_maker(forest)

0.9200515796260477
0.9200515796260477


In [56]:
y_train.value_counts()

English       950
French        705
Spanish       573
Portugeese    534
Russian       492
Sweedish      483
Italian       481
Malayalam     410
Arabic        371
Dutch         371
German        335
Turkish       327
Tamil         324
Danish        305
Kannada       268
Greek         263
Hindi          43
Name: Language, dtype: int64

In [47]:
label_encoder.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [1]:
import pickle
pickle.dump(cv , open('vectorizer.pkl','wb'))

NameError: name 'cv' is not defined