In [None]:
import pandas as pd # Importing the Pandas library and assigning an alias 'pd' for ease of use.
import numpy as np # Importing the NumPy library and assigning an alias 'np' for ease of use.
from sklearn.feature_extraction.text import TfidfVectorizer # Importing the TfidfVectorizer class from Scikit-learn's feature_extraction.text module. Converts a collection of raw text documents into a matrix of TF-IDF (term frequency-inverse document frequency) features
from sklearn.naive_bayes import MultinomialNB # Importing the MultinomialNB class from Scikit-learn's naive_bayes module. Classification with discrete features (e.g., word counts for text classification)
from sklearn.neighbors import KNeighborsClassifier # Importing the KNeighborsClassifier class from Scikit-learn's neighbors module. Algorithm that makes classifications based on data neighbors
from sklearn.metrics.pairwise import paired_distances # Importing the paired_distances function from Scikit-learn's metrics.pairwise module. Calculates distance between instances in a feature array
from sklearn.model_selection import train_test_split # Importing the train_test_split function from Scikit-learn's model_selection module. Split our data into train and test sets.
from sklearn.preprocessing import LabelEncoder # Importing the LabelEncoder class from Scikit-learn's preprocessing module.  Convert categorical variables into numerical form

In [None]:
# Fetch the labelled corpus straight from GitHub: a CSV with one text sample
# and its language name per row.
url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as a quick sanity check of the two columns.
print(data.head(5))

                                                Text  language
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil
4  de spons behoort tot het geslacht haliclona en...     Dutch


In [None]:
# Normalise the column names. Only "Text" needs renaming; "language" is
# already lower-case, so it is left out of the mapping.
# rename() returns a new frame that is bound back to `data`, instead of
# mutating in place the frame an earlier cell already displayed. Renaming a
# column that is no longer present is ignored, so re-running this cell is safe.
data = data.rename(columns={"Text": "text"})

# Split the data into features (text) and labels (language)
X = data["text"]  # 'X' holds the raw text samples (features)
y = data["language"]  # 'y' holds the language name of each sample

# Encode the language labels to numeric values. Keep this single fitted
# encoder around: it is needed later to map predicted integers back to names.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # integer code per sample

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training texts only, then reuse that same
# fitted vectorizer to transform the held-out texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Instantiate the two models that will be compared:
#   - Multinomial Naive Bayes with its default settings
#   - K-Nearest Neighbors voting over the 5 closest training samples
naive_bayes_classifier = MultinomialNB()
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Fit both models on the same TF-IDF training matrix and encoded labels.
naive_bayes_classifier.fit(X_train_tfidf, y_train)
knn_classifier.fit(X_train_tfidf, y_train)


In [None]:
# Score both fitted models on the held-out TF-IDF matrix and its labels.
accuracy_naive_bayes = naive_bayes_classifier.score(X_test_tfidf, y_test)
accuracy_knn = knn_classifier.score(X_test_tfidf, y_test)

# Report each score as a percentage with two decimals.
print(f"Accuracy of Naïve Bayes classifier: {accuracy_naive_bayes * 100:.2f}%")
print(f"Accuracy of K-Nearest Neighbors classifier: {accuracy_knn * 100:.2f}%")


Accuracy of Naïve Bayes classifier: 94.30%
Accuracy of K-Nearest Neighbors classifier: 93.57%


In [None]:
# Define a function to calculate Sum of Squared Differences (SSD)
def identify_language_ssd(text):
    """Identify the language of ``text`` by nearest neighbour under SSD.

    The text is projected into the fitted TF-IDF space and compared with every
    *training* sample using the Sum of Squared Differences (squared Euclidean
    distance). The language of the closest training sample is returned.

    The training split is used as the reference set (not the held-out split)
    because the vectorizer's vocabulary was built from it. A held-out text
    whose tokens are all missing from that vocabulary transforms to an
    all-zero row, which lies at SSD = ||query||^2 from *any* query and can
    therefore win against rows that genuinely share words with the query.

    Parameters
    ----------
    text : str
        The text whose language should be identified.

    Returns
    -------
    str or None
        The language name of the nearest training sample, or ``None`` when
        ``text`` contains no token known to the fitted vectorizer (its TF-IDF
        vector is all zeros, so every distance would be uninformative).
    """
    # One-row sparse matrix; kept sparse so nothing is densified to
    # (n_samples, n_features).
    query = tfidf_vectorizer.transform([text])

    query_sq_norm = query.multiply(query).sum()
    if query_sq_norm == 0:
        # No known vocabulary in the input: there is nothing to compare on.
        return None

    # ||r - q||^2 = ||r||^2 + ||q||^2 - 2 * (r . q), evaluated per reference
    # row r with sparse operations only.
    reference_sq_norms = np.asarray(
        X_train_tfidf.multiply(X_train_tfidf).sum(axis=1)
    ).ravel()
    cross_terms = (X_train_tfidf @ query.T).toarray().ravel()
    ssd = reference_sq_norms + query_sq_norm - 2.0 * cross_terms

    nearest_index = int(np.argmin(ssd))  # first row with the smallest SSD

    # Map the neighbour's encoded label back to its language name.
    predicted_label = np.asarray(y_train)[nearest_index]
    predicted_language = label_encoder.inverse_transform([predicted_label])[0]
    return predicted_language

def identify_language_knn(text):
    """Return the language the fitted KNN classifier predicts for ``text``.

    The text is vectorised as a one-document batch with the fitted TF-IDF
    vectorizer, classified by ``knn_classifier``, and the resulting encoded
    label is decoded back to a language name with ``label_encoder``.
    """
    encoded = knn_classifier.predict(tfidf_vectorizer.transform([text]))
    languages = label_encoder.inverse_transform(encoded)
    return languages[0]  # single input document -> single prediction

In [None]:
# Prompt the user for the text whose language should be identified.
user_input = input("Enter the text you want to identify the language for: ")

# NOTE: the label encoder is intentionally not re-created or re-fitted here.
# Predictions must be decoded with the encoder that produced the integer
# labels the classifiers were trained on, and the helper functions below read
# that same global, so rebinding it in this cell would only add hidden state.

# Using Naïve Bayes: vectorise the input, predict the encoded label, decode it.
identified_language_naive_bayes = label_encoder.inverse_transform(
    naive_bayes_classifier.predict(tfidf_vectorizer.transform([user_input]))
)[0]

# Using K-Nearest Neighbors
identified_language_knn = identify_language_knn(user_input)

# Using Sum of Squared Differences (SSD)
identified_language_ssd = identify_language_ssd(user_input)

# Report the SSD result, falling back to a message if the helper returned a
# falsy value (i.e. no language could be identified).
if identified_language_ssd:
    print(f"Identified language using Sum of Squared Differences (SSD): {identified_language_ssd}")
else:
    print("Unable to identify the language using the Sum of Squared Differences (SSD).")

# Print the identified language using Naïve Bayes and KNN classifiers
print(f"Identified language using Naïve Bayes: {identified_language_naive_bayes}")
print(f"Identified language using K-Nearest Neighbors: {identified_language_knn}")

Enter the text you want to identify the language for: ncium kakak kelasnya kyoko sejak yuuki meminta agar sakura merahasiakan hal tersebutlah keduanya menjadi akrab yuuki pun akhirnya menyukai sakura sayangnya tidak disadari oleh sakura dan hanya aiko yang tahu karena takut dibenci sakura yuuki akhirnya menganggap sakura ""sahabat spesia
Identified language using Sum of Squared Differences (SSD): Chinese
Identified language using Naïve Bayes: Indonesian
Identified language using K-Nearest Neighbors: Indonesian


Link to the language dataset being used in this project.
https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv