### 🚀 Task 1: Load Dataset

In [None]:
import pandas as pd

# Load the language detection dataset into the notebook-wide DataFrame `df`.
# The path is hard-coded; adjust it if the CSV lives somewhere else.
dataset_path = "/mnt/data/Language_Detection.csv"
df = pd.read_csv(dataset_path)

# Display the dataset.
# `ace_tools` may not be installed in every environment. If the import fails,
# expose a minimal stand-in under the same name and with the same
# `display_dataframe_to_user(name=..., dataframe=...)` entry point, so every
# later `tools.display_dataframe_to_user(...)` call keeps working.
try:
    import ace_tools as tools
except ImportError:
    from types import SimpleNamespace

    def _display_dataframe_to_user(name, dataframe):
        """Print a title line followed by the first rows of the frame."""
        print(f"--- {name} ---")
        print(dataframe.head())

    tools = SimpleNamespace(display_dataframe_to_user=_display_dataframe_to_user)

tools.display_dataframe_to_user(name="Language Detection Dataset", dataframe=df)

print("✅ Language Detection dataset loaded successfully!")


### 🚀 Task 2: Use Pretrained Models from Hugging Face

In [None]:
from transformers import pipeline

# Build the primary text-classification pipeline used for language detection.
# NOTE(review): confirm that this checkpoint can actually be loaded through
# `transformers.pipeline` -- its model card appears to describe a fastText
# model rather than a transformers one.
language_identifier = pipeline(
    task="text-classification",
    model="facebook/fasttext-language-identification",
)

# Function to detect language
def detect_language(text, *, identifier=None, max_chars=512):
    """Return the top language label predicted for ``text``.

    Args:
        text: String to classify. Only its first ``max_chars`` characters
            are passed to the model.
        identifier: Callable that takes a string and returns a sequence whose
            first element is a mapping with a ``"label"`` key. When omitted,
            the module-level ``language_identifier`` is used (looked up at
            call time).
        max_chars: Maximum number of characters sent to the model. The cut is
            made on characters, not tokens. Defaults to 512.

    Returns:
        The ``"label"`` value of the first prediction, or the string
        ``"Error: <message>"`` if anything raised along the way.
    """
    try:
        if identifier is None:
            # Resolved inside the try so a missing global is reported through
            # the same "Error: ..." channel as any other failure.
            identifier = language_identifier
        result = identifier(text[:max_chars])  # Bound the input size
        return result[0]["label"]  # Top prediction's label
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort a whole
        # DataFrame.apply run, so the failure is returned as the row's value.
        return f"Error: {e}"

# Apply the language detection model to every row of the "text" column.
# `astype(str)` guarantees detect_language always receives a string.
# NOTE(review): depending on the pandas version, missing values may become
# the literal text "nan" here and get classified like any other row -- confirm
# whether the dataset contains missing texts.
df["predicted_language"] = df["text"].astype(str).apply(detect_language)

# Display results
tools.display_dataframe_to_user(name="Predicted Language Results", dataframe=df)

print("✅ Language detection completed using Hugging Face model!")


### 🚀 Task 3: Compare with Other Models

In [None]:
# Build a second text-classification pipeline, backed by a different
# Hugging Face checkpoint, to compare against the primary model.
alt_language_identifier = pipeline(
    task="text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)

# Function to detect language using alternative model
def detect_language_alt(text, *, identifier=None, max_chars=512):
    """Return the top language label predicted for ``text`` by the alternative model.

    Args:
        text: String to classify. Only its first ``max_chars`` characters
            are passed to the model.
        identifier: Callable that takes a string and returns a sequence whose
            first element is a mapping with a ``"label"`` key. When omitted,
            the module-level ``alt_language_identifier`` is used (looked up
            at call time).
        max_chars: Maximum number of characters sent to the model. The cut is
            made on characters, not tokens. Defaults to 512.

    Returns:
        The ``"label"`` value of the first prediction, or the string
        ``"Error: <message>"`` if anything raised along the way.
    """
    try:
        if identifier is None:
            # Resolved inside the try so a missing global is reported through
            # the same "Error: ..." channel as any other failure.
            identifier = alt_language_identifier
        result = identifier(text[:max_chars])  # Bound the input size
        return result[0]["label"]  # Top prediction's label
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort a whole
        # DataFrame.apply run, so the failure is returned as the row's value.
        return f"Error: {e}"

# Apply the alternative model to every row of the "text" column and store the
# result next to the primary model's predictions.
# `astype(str)` guarantees detect_language_alt always receives a string.
df["predicted_language_alt"] = df["text"].astype(str).apply(detect_language_alt)

# Display alternative model results
tools.display_dataframe_to_user(name="Alternative Model Predictions", dataframe=df)

print("✅ Alternative language detection completed!")


### 🚀 Task 4: Evaluate Performance

In [None]:
from sklearn.metrics import accuracy_score

# Compute accuracy for both models against the labels in the "language" column.
# NOTE(review): accuracy_score counts exact label matches, so the values in
# df["language"] must be spelled the same way as each model's output labels --
# confirm this, or map the labels to a common vocabulary first.
# Rows where detection failed hold an "Error: ..." string in the prediction
# columns and therefore count as misses.
true_labels = df["language"]  # Assuming 'language' column contains true labels
accuracy_primary = accuracy_score(true_labels, df["predicted_language"])
accuracy_alternative = accuracy_score(true_labels, df["predicted_language_alt"])

print(f"✅ Accuracy of Facebook FastText Model: {accuracy_primary:.4f}")
print(f"✅ Accuracy of XLM-Roberta Model: {accuracy_alternative:.4f}")
