In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data from Kaggle

https://www.kaggle.com/code/martinkk5575/language-detection/input

In [20]:
data = pd.read_csv("/content/drive/MyDrive/Github Projects/Language Detection/Language Detection.csv")
data.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [21]:
data["language"].value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: language, dtype: int64

Training and Test split

In [22]:
x = np.array(data["Text"])
y = np.array(data["language"])

cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)

 Multinomial Naïve Bayes (for Multi Classification)

In [31]:
mnb_model = MultinomialNB()
mnb_model.fit(X_train,y_train)

In [33]:
y_pred = mnb_model.predict(X_test)

precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


Precision: 0.9610079823323606
Recall: 0.9524127300370536
Accuracy: 0.953168044077135
F1 Score: 0.9519871001356179


In [34]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = mnb_model.predict(data)
print(output)

Enter a Text: என்ன சொல்ல 
['Tamil']


Support Vector Machine (SVM) model

In [35]:
# Vectorize text data
cv = CountVectorizer()
X_count = cv.fit_transform(x)

In [38]:

# Transform count to tf-idf representation
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_count)

In [43]:
# Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

In [45]:

svm_precision = precision_score(y_test, svm_predictions, average='macro')
svm_recall = recall_score(y_test, svm_predictions, average='macro')
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_f1 = f1_score(y_test, svm_predictions, average='macro')

print("SVM Precision:", svm_precision)
print("SVM Recall:", svm_recall)
print("SVM Accuracy:", svm_accuracy)
print("SVM F1 Score:", svm_f1)


SVM Precision: 0.9652117820702254
SVM Recall: 0.9468141336358836
SVM Accuracy: 0.9466942148760331
SVM F1 Score: 0.9507783207444551


Random Forest Classifier model

In [46]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

In [47]:
rf_precision = precision_score(y_test, rf_predictions, average='macro')
rf_recall = recall_score(y_test, rf_predictions, average='macro')
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions, average='macro')

print("SVM Precision:", rf_precision)
print("SVM Recall:", rf_recall)
print("RF Accuracy:", rf_accuracy)
print("SVM F1 Score:", rf_f1)

SVM Precision: 0.9525275605017477
SVM Recall: 0.9188854107633241
RF Accuracy: 0.9201101928374655
SVM F1 Score: 0.9189747632934566
