In [9]:
import pandas as pd
import numpy as np

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df= pd.read_csv("Language Detection.csv")

In [5]:
df["Language"].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [14]:
df.shape

(10337, 2)

In [7]:
df.isnull().sum()

Text        0
Language    0
dtype: int64

# LABEL ENCODING

In [10]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [12]:
df["Language"]= encoder.fit_transform(df["Language"])
mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
mapping

{'Arabic': 0,
 'Danish': 1,
 'Dutch': 2,
 'English': 3,
 'French': 4,
 'German': 5,
 'Greek': 6,
 'Hindi': 7,
 'Italian': 8,
 'Kannada': 9,
 'Malayalam': 10,
 'Portugeese': 11,
 'Russian': 12,
 'Spanish': 13,
 'Sweedish': 14,
 'Tamil': 15,
 'Turkish': 16}

In [13]:
# {'Arabic': 0,
#  'Danish': 1,
#  'Dutch': 2,
#  'English': 3,
#  'French': 4,
#  'German': 5,
#  'Greek': 6,
#  'Hindi': 7,
#  'Italian': 8,
#  'Kannada': 9,
#  'Malayalam': 10,
#  'Portugeese': 11,
#  'Russian': 12,
#  'Spanish': 13,
#  'Sweedish': 14,
#  'Tamil': 15,
#  'Turkish': 16}


In [15]:
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",3
1,"""Nature"" can refer to the phenomena of the phy...",3
2,"The study of nature is a large, if not the onl...",3
3,"Although humans are part of nature, human acti...",3
4,[1] The word nature is borrowed from the Old F...,3


# Now time for word to vector

In [16]:
porter_stemmer=PorterStemmer()

In [21]:
#function for stemming :

def stemming(content):
        st_content= re.sub('[^a-zA-Z]',' ',content)
        st_content= st_content.lower()
        
        #for -> [sd sad  as d dg]->[[sd],[sad],[as]]
        st_content=st_content.split()
        st_content= [porter_stemmer.stem(word) for word in st_content if not word in stopwords.words('english')]
        st_content=' '.join(st_content)
        return st_content

In [22]:
df.columns

Index(['Text', 'Language'], dtype='object')

In [23]:
df["Text"]=df["Text"].apply(stemming)

In [41]:
df.head()

Unnamed: 0,Text,Language
0,natur broadest sens natur physic materi world ...,3
1,natur refer phenomena physic world also life g...,3
2,studi natur larg part scienc,3
3,although human part natur human activ often un...,3
4,word natur borrow old french natur deriv latin...,3


In [42]:
x=df["Text"]
y=df["Language"]

# vectorizing

In [43]:
vectorizer= TfidfVectorizer()
vectorizer.fit(x)
x=vectorizer.transform(x)


In [44]:
x

<10337x21840 sparse matrix of type '<class 'numpy.float64'>'
	with 109388 stored elements in Compressed Sparse Row format>

## Splitting data

In [45]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=45)

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score,confusion_matrix

In [55]:
models= [LogisticRegression,SVC,DecisionTreeClassifier,RandomForestClassifier,MultinomialNB,KNeighborsClassifier]


In [56]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

In [57]:
for model in models :
    print(f"Evaluating model : {model.__name__}")

    classifier= model().fit(x_train,y_train)
    y_pred=classifier.predict(x_test)
    print(accuracy_score(y_test, y_pred))
    #validation
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))


Evaluating model : LogisticRegression
0.7292069632495164
Evaluating model : SVC


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.7098646034816247
Evaluating model : DecisionTreeClassifier


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.652321083172147
Evaluating model : RandomForestClassifier
0.7224371373307543
Evaluating model : MultinomialNB
0.6784332688588007
Evaluating model : KNeighborsClassifier


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.08800773694390715


In [58]:
#Testing Time

In [62]:
model = RandomForestClassifier()
classifier = model.fit(x_train,y_train)


texts= ["मुझे आज बहुत खुशी हो रही है","कल का दिन बहुत थकाने वाला था।","أنا سعيد جدًا اليوم.","Hello there fellow","Gestern war ein anstrengender Tag."]

#1stemming
# text= stemming(x)
#2 vectorizing
for text in texts:
    text= vectorizer.transform([text])
    a=model.predict(text)
    print(a)

[12]
[12]
[12]
[3]
[5]


In [None]:
# {'Arabic': 0,
#  'Danish': 1,
#  'Dutch': 2,
#  'English': 3,
#  'French': 4,
#  'German': 5,
#  'Greek': 6,
#  'Hindi': 7,
#  'Italian': 8,
#  'Kannada': 9,
#  'Malayalam': 10,
#  'Portugeese': 11,
#  'Russian': 12,
#  'Spanish': 13,
#  'Sweedish': 14,
#  'Tamil': 15,
#  'Turkish': 16}
