In [7]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [6]:
!pip install requests



In [10]:
# Download dataset (language detection dataset)
url = "C:/Users/HP/Downloads/archive (3)/ld.csv"
df = pd.read_csv(url)

df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [11]:
X = df["Text"]
y = df["Language"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Character-level TF-IDF
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,4))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [12]:
model = MultinomialNB()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8926499032882012

Classification Report:

              precision    recall  f1-score   support

      Arabic       1.00      0.99      1.00       106
      Danish       1.00      0.34      0.51        73
       Dutch       1.00      0.56      0.72       111
     English       0.58      1.00      0.73       291
      French       0.98      0.98      0.98       219
      German       1.00      0.44      0.61        93
       Greek       1.00      0.99      0.99        68
       Hindi       1.00      1.00      1.00        10
     Italian       1.00      0.88      0.94       145
     Kannada       1.00      1.00      1.00        66
   Malayalam       1.00      0.99      1.00       121
  Portugeese       0.99      0.93      0.96       144
     Russian       1.00      1.00      1.00       136
     Spanish       0.99      0.96      0.97       160
    Sweedish       0.96      0.87      0.91       133
       Tamil       1.00      1.00      1.00        87
     Turkish       1.00    

In [14]:
def predict_language(text):
    text_vec = vectorizer.transform([text])
    prediction = model.predict(text_vec)
    return prediction[0]

# Take user input
user_text = input("Enter a word or sentence: ")
print("Predicted Language:", predict_language(user_text))

Enter a word or sentence:  ಗಳಿಂದ


Predicted Language: Kannada
