In [10]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [50]:
df = pd.read_csv("Language Detection.csv")

In [51]:
df = df.iloc[:7000]

In [52]:
df.shape

(7000, 2)

In [53]:
df["Language"].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Russian        692
Malayalam      594
Dutch          546
Tamil          469
Greek          365
Danish         314
Hindi           63
Name: Language, dtype: int64

In [54]:
import re
ps = PorterStemmer()
corpus = []
for i in range(len(df["Text"])):
    text = re.sub("[^a-zA-Z]", " ", df["Text"][i])
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if word not in set(stopwords.words())]
    text = " ".join(text)
    corpus.append(text)

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
cv = CountVectorizer(max_features = 10000)

In [63]:
x = cv.fit_transform(corpus).toarray()

In [64]:
x.shape

(7000, 10000)

In [65]:
from sklearn.preprocessing import LabelEncoder

In [66]:
le = LabelEncoder()
y = le.fit_transform(df["Language"])

In [68]:
y

array([2, 2, 2, ..., 0, 0, 0])

In [69]:
le.classes_

array(['Danish', 'Dutch', 'English', 'French', 'Greek', 'Hindi',
       'Malayalam', 'Portugeese', 'Russian', 'Spanish', 'Tamil'],
      dtype=object)

In [76]:
data = pd.DataFrame(np.c_[corpus , y], columns = ["Text" , "Language"])

In [77]:
data.head()

Unnamed: 0,Text,Language
0,natur broadest natur physic materi world univers,2
1,natur refer phenomena physic world life gener,2
2,studi natur larg part scienc,2
3,human part natur human activ understood separ ...,2
4,word natur borrow french natur deriv latin wor...,2


In [78]:
data["Language"].value_counts()

2     1385
3     1014
9      819
7      739
8      692
6      594
1      546
10     469
4      365
0      314
5       63
Name: Language, dtype: int64

In [79]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2 , random_state = 42)

In [80]:
x_train.shape

(5600, 10000)

In [81]:
x_test.shape

(1400, 10000)

In [82]:
from sklearn.naive_bayes import GaussianNB

In [83]:
model = GaussianNB()
model.fit(x_train , y_train)

In [84]:
y_pred = model.predict(x_test)

In [85]:
y_pred

array([10,  7, 10, ...,  9,  2,  1])

In [86]:
y_test

array([8, 7, 5, ..., 9, 2, 1])

In [87]:
from sklearn.metrics import accuracy_score

In [88]:
score = accuracy_score(y_test , y_pred)

In [89]:
score

0.7014285714285714

In [90]:
predict_data = pd.DataFrame(np.c_[y_test , y_pred] , columns=["Actual", "Predicted"])

In [93]:
predict_data.sample(20)

Unnamed: 0,Actual,Predicted
1368,2,2
835,2,2
588,1,1
1195,0,0
798,3,3
1323,1,1
262,3,3
696,0,0
879,1,1
1170,2,2


In [96]:
import pickle
file = "model.pkl"
pickle.dump(model , open(file, "wb"), )