In [1]:
import pandas as pd

# Load the CSV without header
label_map = pd.read_csv("labels.csv", header=None)

# Check what it looks like
print(label_map.head())
print(label_map.columns)


                                                   0
0  Label;English;Wiki Code;ISO 369-3;German;Langu...
1   ace;Achinese;ace;ace;Achinesisch;Austronesian;;;
2    afr;Afrikaans;af;afr;Afrikaans;Indo-European;;;
3  als;Alemannic German;als;gsw;Alemannisch;Indo-...
4       amh;Amharic;am;amh;Amharisch;Afro-Asiatic;;;
Index([0], dtype='int64')


In [2]:
import pandas as pd

# Load CSV properly
data = pd.read_csv("labels.csv", sep=";", header=0)

# Check the first rows
print(data.head())


  Label           English Wiki Code ISO 369-3       German Language family  \
0   ace          Achinese       ace       ace  Achinesisch    Austronesian   
1   afr         Afrikaans        af       afr    Afrikaans   Indo-European   
2   als  Alemannic German       als       gsw  Alemannisch   Indo-European   
3   amh           Amharic        am       amh    Amharisch    Afro-Asiatic   
4   ang      Old English        ang       ang  Altenglisch   Indo-European   

  Writing system                        Remarks        Synonyms  
0            NaN                            NaN             NaN  
1            NaN                            NaN             NaN  
2            NaN  (ursprünglich nur Elsässisch)             NaN  
3            NaN                            NaN             NaN  
4            NaN                 (ca. 450-1100)  Angelsächsisch  


In [3]:
print(data.columns)


Index(['Label', 'English', 'Wiki Code', 'ISO 369-3', 'German',
       'Language family', 'Writing system', 'Remarks', 'Synonyms'],
      dtype='object')


In [4]:
# Load training and test data
with open("x_train.txt", "r", encoding="utf-8") as f:
    x_train = [line.strip() for line in f]

with open("y_train.txt", "r", encoding="utf-8") as f:
    y_train = [line.strip() for line in f]  # keep as string (ISO code)

with open("x_test.txt", "r", encoding="utf-8") as f:
    x_test = [line.strip() for line in f]

with open("y_test.txt", "r", encoding="utf-8") as f:
    y_test = [line.strip() for line in f]  # keep as string

# Quick check
print("Training samples:", len(x_train))
print("Test samples:", len(x_test))
print("First training example:", x_train[0], "->", y_train[0])


Training samples: 117500
Test samples: 117500
First training example: Klement Gottwaldi surnukeha palsameeriti ning paigutati mausoleumi. Surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke. 1962. aastal viidi ta surnukeha mausoleumist ära ja kremeeriti. Zlíni linn kandis aastatel 1949–1989 nime Gottwaldov. Ukrainas Harkivi oblastis kandis Zmiivi linn aastatel 1976–1990 nime Gotvald. -> est


In [5]:
import pandas as pd

# Load label CSV
label_map = pd.read_csv("labels.csv", sep=";", header=0)

# Create a dictionary: ISO code -> English language name
iso_to_name = dict(zip(label_map["ISO 369-3"], label_map["English"]))

# Quick test
print("ISO 'est' maps to:", iso_to_name["est"])


ISO 'est' maps to: Estonian


In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Improved vectorizer: captures more language patterns
vectorizer = HashingVectorizer(
    analyzer='char',
    ngram_range=(1, 4),  # original working range
    n_features=2**20,
    alternate_sign=False
)


# Pipeline with Naive Bayes
model = make_pipeline(vectorizer, MultinomialNB())

# Train
print("Training improved model...")
model.fit(x_train, y_train)
print("Training completed!")

# Evaluate accuracy
preds = model.predict(x_test)
accuracy = (preds == y_test).mean()
print(f"Test Accuracy: {accuracy:.4f}")


Training improved model...
Training completed!
Test Accuracy: 0.9262


In [7]:
# Function to detect language
def detect_language(text):
    iso_code = model.predict([text])[0]  # predict ISO code
    return iso_to_name.get(iso_code, "Unknown")  # map to English name


In [10]:
import os
from docx import Document
import PyPDF2

def read_document(file_path):
    """
    Reads text from .txt, .docx, or .pdf files.
    Returns the extracted text as a string.
    """
    ext = os.path.splitext(file_path)[1].lower()

    # TXT
    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    # DOCX
    elif ext == ".docx":
        doc = Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])

    # PDF
    elif ext == ".pdf":
        reader = PyPDF2.PdfReader(file_path)
        text = ""
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:
                text += extracted
        return text

    else:
        raise ValueError("Unsupported file format! Use .txt, .docx, or .pdf")


In [14]:
file_path = input("Enter file path: ").strip()

# Read text from any supported file format
text = read_document(file_path)

# Predict language
prediction = model.predict([text])[0]

print("\nUser Provided Document:", file_path)
print("Detected Language:", prediction)



User Provided Document: C:\Users\SU HAN\Desktop\urud1.txt
Detected Language: urd


In [15]:
import joblib

# Save the trained model
joblib.dump(model, "language_detector.pkl")
print("Model saved as 'language_detector.pkl'")


Model saved as 'language_detector.pkl'


In [16]:
import joblib

# Loading the trained model
model = joblib.load("language_detector.pkl")
print("Model loaded successfully!")


Model loaded successfully!


In [None]:
from IPython.display import clear_output

# Make sure detect_language() is defined
def detect_language(text):
    iso_code = model.predict([text])[0]   # predict ISO code
    return iso_to_name.get(iso_code, "Unknown")  # map to English name

# Interactive testing
while True:
    user_input = input("Enter text (or 'quit' to exit): ")
    if user_input.lower() == "quit":
        break
    
    language_name = detect_language(user_input)
    
    clear_output(wait=True)
    print(f"User entered: {user_input}")
    print(f"Detected Language: {language_name}")


User entered: quit 
Detected Language: Latin
