# Importing "Transformers" pipeline

In [1]:
import numpy as np
import pandas as pd
from transformers import pipeline

# Reading the texts

In [2]:
# Read the single-text samples, one sentence per line.
# The context manager closes the file even if reading or decoding fails.
with open("test_texts_1.txt", encoding="utf8") as text_file_1:
    test_texts_1 = text_file_1.read().splitlines()

test_texts_1

['I would rrrrrreally like to know the language of this text',
 'Ik zou blij zijn als ik wist wat deze taal is',
 'Хотел бы я знать, что это за язык!',
 'Jeg ville ønske, jeg vidste, hvilket sprog dette er!',
 'Keşke bunun hangi dil olduğunu bilseydim!',
 'Ich möchte die Sprache lernen, in der dieser Text geschrieben ist',
 "Je n'ai jamais su ce fait",
 'Nunca supe este hecho',
 'Sempre sonhei em visitar os Açores',
 'Jag har alltid drömt om att besöka vikingarna',
 'ನಾನು ಸಮುದ್ರವನ್ನು ಪ್ರೀತಿಸುತ್ತೇನೆ',
 'لقد أحببت دائما السفر',
 'Questa lingua è una delle più belle del mondo',
 'இது மிகவும் சுவாரஸ்யமாக இருக்கிறது',
 'यह बहुत मनोरंजक है',
 'Θα ήταν ενδιαφέρον να μάθουμε περισσότερα για αυτό',
 'ഇതാണ് സൗന്ദര്യം']

In [3]:
# Bind every sample sentence to its own name, following the line order of the
# input file (positions 0 through 16).
(
    test_text_en,  # English
    test_text_nl,  # Dutch
    test_text_ru,  # Russian
    test_text_dk,  # Danish
    test_text_tr,  # Turkish
    test_text_ge,  # German
    test_text_fr,  # French
    test_text_sp,  # Spanish
    test_text_pg,  # Portuguese
    test_text_sw,  # Swedish
    test_text_kd,  # Kannada
    test_text_ar,  # Arabic
    test_text_it,  # Italian
    test_text_tl,  # Tamil
    test_text_hd,  # Hindi
    test_text_gk,  # Greek
    test_text_ml,  # Malayalam
) = (test_texts_1[position] for position in range(17))

# Downloading the model

In [4]:
# Identifier of the language-recognition checkpoint handed to `pipeline`.
LANG_RECOGN_MODEL_ID = "spolivin/lang-recogn-model"
lr_pipeline = pipeline(model=LANG_RECOGN_MODEL_ID)

# Function for results representation

In [5]:
def detect_language(text: str) -> None:
    """Run the language-recognition pipeline on one text and print a short report.

    The first entry of the pipeline output is used; its "label" and "score"
    fields are reported as the detected language and its probability.

    Args:
        text: The text whose language should be detected.

    Returns:
        None. The report is written to standard output.
    """
    # Only the first prediction returned for the text is reported
    top_prediction = lr_pipeline(text)[0]
    label, score = top_prediction["label"], top_prediction["score"]

    # The trailing "\n" on the first line leaves a blank line after the text
    report_lines = (
        f"Text: {text}\n",
        f"Language detected: {label.upper()}",
        f"Probability of the text being written in {label} = {score:.2%}",
    )
    print(*report_lines, sep="\n")

# Testing the pipeline (one text at a time)

## English language

In [6]:
detect_language(test_text_en)

Text: I would rrrrrreally like to know the language of this text

Language detected: ENGLISH
Probability of the text being written in English = 97.95%


## Dutch language

In [7]:
detect_language(test_text_nl)

Text: Ik zou blij zijn als ik wist wat deze taal is

Language detected: DUTCH
Probability of the text being written in Dutch = 96.92%


## Russian language

In [8]:
detect_language(test_text_ru)

Text: Хотел бы я знать, что это за язык!

Language detected: RUSSIAN
Probability of the text being written in Russian = 96.31%


## Danish language

In [9]:
detect_language(test_text_dk)

Text: Jeg ville ønske, jeg vidste, hvilket sprog dette er!

Language detected: DANISH
Probability of the text being written in Danish = 95.21%


## Turkish language

In [10]:
detect_language(test_text_tr)

Text: Keşke bunun hangi dil olduğunu bilseydim!

Language detected: TURKISH
Probability of the text being written in Turkish = 95.39%


## German language

In [11]:
detect_language(test_text_ge)

Text: Ich möchte die Sprache lernen, in der dieser Text geschrieben ist

Language detected: GERMAN
Probability of the text being written in German = 95.25%


## French language

In [12]:
detect_language(test_text_fr)

Text: Je n'ai jamais su ce fait

Language detected: FRENCH
Probability of the text being written in French = 97.09%


## Spanish language

In [13]:
detect_language(test_text_sp)

Text: Nunca supe este hecho

Language detected: SPANISH
Probability of the text being written in Spanish = 95.61%


## Portuguese language

In [14]:
detect_language(test_text_pg)

Text: Sempre sonhei em visitar os Açores

Language detected: PORTUGEESE
Probability of the text being written in Portugeese = 97.56%


## Swedish language

In [15]:
detect_language(test_text_sw)

Text: Jag har alltid drömt om att besöka vikingarna

Language detected: SWEEDISH
Probability of the text being written in Sweedish = 95.17%


## Kannada language

In [16]:
detect_language(test_text_kd)

Text: ನಾನು ಸಮುದ್ರವನ್ನು ಪ್ರೀತಿಸುತ್ತೇನೆ

Language detected: KANNADA
Probability of the text being written in Kannada = 96.01%


## Arabic language

In [17]:
detect_language(test_text_ar)

Text: لقد أحببت دائما السفر

Language detected: ARABIC
Probability of the text being written in Arabic = 97.11%


## Italian language

In [18]:
detect_language(test_text_it)

Text: Questa lingua è una delle più belle del mondo

Language detected: ITALIAN
Probability of the text being written in Italian = 96.83%


## Tamil language

In [19]:
detect_language(test_text_tl)

Text: இது மிகவும் சுவாரஸ்யமாக இருக்கிறது

Language detected: TAMIL
Probability of the text being written in Tamil = 95.37%


## Hindi language

In [20]:
detect_language(test_text_hd)

Text: यह बहुत मनोरंजक है

Language detected: HINDI
Probability of the text being written in Hindi = 81.75%


## Greek language

In [21]:
detect_language(test_text_gk)

Text: Θα ήταν ενδιαφέρον να μάθουμε περισσότερα για αυτό

Language detected: GREEK
Probability of the text being written in Greek = 96.12%


## Malayalam language

In [22]:
detect_language(test_text_ml)

Text: ഇതാണ് സൗന്ദര്യം

Language detected: MALAYALAM
Probability of the text being written in Malayalam = 96.12%


# Testing the pipeline (multiple texts at once)

In [23]:
# Read the batch samples, one sentence per line.
# The context manager closes the file even if reading or decoding fails.
with open("test_texts_2.txt", encoding="utf8") as text_file_2:
    test_texts_2 = text_file_2.read().splitlines()

# Applying the pipeline on all test texts in a single call
multiple_texts_results = lr_pipeline(test_texts_2)

# Collecting the predictions into a DataFrame and attaching, in one step,
# the source text plus reader-friendly copies of the "label" and "score" columns
multiple_texts_df = pd.DataFrame(multiple_texts_results).assign(
    test_text=test_texts_2,
    language=lambda frame: frame["label"],
    probability=lambda frame: frame["score"].round(4),
)

# Displaying only the reader-friendly columns, in presentation order
multiple_texts_df[["test_text", "language", "probability"]]

Unnamed: 0,test_text,language,probability
0,Quite a conundrum you have here,English,0.9784
1,Откуда нам было знать,Russian,0.9651
2,"Eu gostaria de reservar um quarto, por favor",Portugeese,0.9776
3,Hooplijk schrift zij niets negatiefs in haar r...,Dutch,0.9694
4,Hvor er der en pengeautomat?,Danish,0.9523
5,"Καλά, ευχαριστώ, Χάρηκα",Greek,0.9617
6,Tünaydın!,Turkish,0.9517
7,Weisst was ich meine?,German,0.9145
8,Interesting! I will look it up!,English,0.9808
