##Setting up the environment




In [42]:
!pip install fasttext

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


##Collecting and Processing data

In [30]:
import pandas as pd
import fasttext
import re

In [31]:
#download and extract Tatoeba dataset
!wget https://downloads.tatoeba.org/exports/sentences.tar.bz2
!tar -xvjf sentences.tar.bz2
#load dataset
df= pd.read_csv("sentences.csv", sep="\t", names=['SentenceID', "Language", "Text"])
#filter for top ten most common languages globally
eight_lang = ["eng", "spa", "ita", "ara", "epo", "por", "rus", "deu"]
filtered_df=df[df["Language"].isin(eight_lang)]
#set a limit for the number of sentences per language
max_sentences=20000
balanced_df=filtered_df.groupby("Language", group_keys=False).apply(lambda x: x.sample(n=min(len(x), max_sentences), random_state=42)).reset_index(drop=True)
balanced_df["Text"]= balanced_df["Text"].str.lower()
#remove all punctuation except ¿ and ¡ (for spanish)
balanced_df["Text"]=balanced_df["Text"].apply(lambda x: re.sub(r"[^\w\s¿¡]", "", x))
#normalize whitespace
balanced_df["Text"]= balanced_df["Text"].str.replace("\s+", " ", regex=True).str.strip()
#split into training and testing data (80-20 ratio)
train_df=balanced_df.sample(frac=0.8, random_state=42)
test_df=balanced_df.drop(train_df.index)
#format for fasttext: __label__en Hello!
train_df["fasttext_format"]= "__label__"+train_df["Language"]+" "+train_df["Text"]
test_df["fasttext_format"]= "__label__"+test_df["Language"]+" "+test_df["Text"]
#save as txt files
train_df["fasttext_format"].to_csv("train.txt", index=False, header=False)
test_df["fasttext_format"].to_csv("test.txt", index=False, header=False)
print("data prepared")


--2025-01-26 22:12:01--  https://downloads.tatoeba.org/exports/sentences.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 199708539 (190M) [application/octet-stream]
Saving to: ‘sentences.tar.bz2.4’


2025-01-26 22:12:10 (23.8 MB/s) - ‘sentences.tar.bz2.4’ saved [199708539/199708539]

sentences.csv


  balanced_df=filtered_df.groupby("Language", group_keys=False).apply(lambda x: x.sample(n=min(len(x), max_sentences), random_state=42)).reset_index(drop=True)


data prepared


In [33]:
print(balanced_df["Language"].value_counts())

Language
ara    20000
deu    20000
eng    20000
epo    20000
ita    20000
por    20000
rus    20000
spa    20000
Name: count, dtype: int64


##Training the Model

In [34]:
model=fasttext.train_supervised(
    input="train.txt",
    lr=0.05,
    epoch=70,
    wordNgrams=4,
    dim=100,
    bucket=100000
    )
#save the model
model.save_model("language_identifier.bin")
print("model trained and saved!")

model trained and saved!


##Testing the Model

In [38]:
for lang in eight_lang:
  lang_test_df= test_df[test_df["Language"]==lang]
  lang_test_file= f"test_{lang}.txt"
  lang_test_df["fasttext_format"].to_csv(lang_test_file, index=False, header=False)
  result = model.test(lang_test_file)
  print(f"Language: {lang}")
  print(f"  Number of samples: {result[0]}")
  print(f"  Precision: {result[1]:.2f}")
  print(f"  Recall: {result[2]:.2f}")

Language: eng
  Number of samples: 3939
  Precision: 0.99
  Recall: 0.99
Language: spa
  Number of samples: 4039
  Precision: 0.96
  Recall: 0.96
Language: ita
  Number of samples: 4036
  Precision: 0.98
  Recall: 0.98
Language: ara
  Number of samples: 4123
  Precision: 1.00
  Recall: 1.00
Language: epo
  Number of samples: 3940
  Precision: 0.99
  Recall: 0.99
Language: por
  Number of samples: 4014
  Precision: 0.97
  Recall: 0.97
Language: rus
  Number of samples: 3957
  Precision: 0.97
  Recall: 0.97
Language: deu
  Number of samples: 3952
  Precision: 0.99
  Recall: 0.99


In [39]:
result= model.test("test.txt")
print(f"Number of test samples: {result[0]}")
print(f"Precision: {result[1]:.2f}")
print(f"Recall: {result[2]:.2f}")

Number of test samples: 32000
Precision: 0.98
Recall: 0.98


In [43]:
test_sentences = [
    "Hello, how are you?",  # English
    "Hola, ¿cómo estás?",   # Spanish
    "السلام عليكم",          # Arabic
    "Olá, como vai você?",   # Portuguese
    "Привет, как дела?",     # Russian
    "Wie geht es dir?",       # German
    "Ciao, come stai?",       # Italian
    "Saluton, kiel vi fartas?" # Esperanto
]

for sentence in test_sentences:
    predictions = model.predict(sentence, k=1)  # Get top 3 predictions
    print(f"Sentence: {sentence}")
    print(f"Predictions: {predictions}")

Sentence: Hello, how are you?
Predictions: (('__label__eng',), array([0.99995065]))
Sentence: Hola, ¿cómo estás?
Predictions: (('__label__spa',), array([0.99940026]))
Sentence: السلام عليكم
Predictions: (('__label__ara',), array([0.99407828]))
Sentence: Olá, como vai você?
Predictions: (('__label__por',), array([0.99355209]))
Sentence: Привет, как дела?
Predictions: (('__label__rus',), array([0.99981493]))
Sentence: Wie geht es dir?
Predictions: (('__label__deu',), array([0.85577619]))
Sentence: Ciao, come stai?
Predictions: (('__label__ita',), array([0.96978873]))
Sentence: Saluton, kiel vi fartas?
Predictions: (('__label__epo',), array([1.00000942]))
