# Language classifier - Data Analysis

In [73]:
# imports
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px


In [74]:
# Read the language-detection corpus (UTF-8 encoded CSV) into a DataFrame
dataset = pd.read_csv(
    "../model_code/data/datasets/LanguageDetection.csv",
    encoding="utf-8",
)
# Preview five randomly drawn rows
dataset.sample(n=5)

Unnamed: 0,Text,Language
5362,Otra forma de decirle a la persona que no se e...,Spanish
3491,"Jusqu'en 2014, il a été possible de commander ...",French
9785,Für mich ist dies das beste Restaurant der Stadt.,German
4038,comme quand nous nous sommes arrêtés où nous é...,French
8859,har du några planer för imorgon?,Sweedish


In [75]:
# Summarise columns, dtypes and non-null counts (quick check for missing values)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


## Understand language distribution
Plot language distribution over the data

In [76]:
# Count the examples available for each language (most frequent first).
# .to_dict() returns plain Python ints; dict(Series) would keep NumPy scalars,
# which NumPy >= 2 prints as np.int64(...) and makes the output hard to read.
language_count = dataset["Language"].value_counts().to_dict()
# print (plain string: there is nothing to interpolate, so no f-prefix)
print("Examples per language:\n", language_count)


Examples per language:
 {'English': 1385, 'French': 1014, 'Spanish': 819, 'Portugeese': 739, 'Italian': 698, 'Russian': 692, 'Sweedish': 676, 'Malayalam': 594, 'Dutch': 546, 'Arabic': 536, 'Turkish': 474, 'German': 470, 'Tamil': 469, 'Danish': 428, 'Kannada': 369, 'Greek': 365, 'Hindi': 63}


In [77]:
# Language names go on the x axis (the dict keys) ...
x = [*language_count]
# ... and the number of examples per language on the y axis (the dict values)
y = [*language_count.values()]
# Draw the distribution, then label both axes
fig = px.histogram(x=x, y=y)
fig.update_layout(xaxis_title="Languages", yaxis_title="Examples")

## Focus on Italian/non-italian problem

In [78]:
# Binary target: 1 for Italian rows, 0 for every other language.
# A vectorised comparison replaces the Python-level loop over the column.
dataset["target"] = (dataset["Language"] == "Italian").astype("int64")
# Class counts as a plain dict, used by the plot below
target_var = dataset["target"].value_counts().to_dict()

In [79]:
# plot
# X values: turn the 0/1 class ids into readable labels
x = ["not-italian" if example == 0 else "italian" for example in target_var]
# y values: number of examples in each class
y = list(target_var.values())
# plot distribution; the axis titles are set exactly once
px.histogram(x=x, y=y).update_layout(xaxis_title="Italian/Non-italian", yaxis_title="Examples")

## Discussion about the distribution
The dataset is unbalanced, which is common in this kind of task. We can assume that the distribution reflects the real world; that is, I don't want to perturb the dataset too much, because I want to preserve the "prior" knowledge encoded in the data distribution (the probability of Italian is also related to the number of Italian examples collected in a given period). This assumption holds if we trust the data collection process and the data itself; it does not hold if the distribution is not a real-world distribution (for example, if we started collecting data before Italian was supported).

In this case the data distribution is too unbalanced, so I will perform a mix of undersampling and oversampling techniques.

In [80]:
# Inspect ten random rows, now including the binary `target` column
dataset.sample(10)

Unnamed: 0,Text,Language,target
5675,υπάρχει πάρα πολύ στο πιάτο μου πνίγω στη δουλ...,Greek,0
7348,"Il secondo metodo, più specifico, consiste nel...",Italian,1
4579,kom op.,Dutch,0
5830,βιδώσω το έβγαλα αν θέλετε κάποιος να δουλέψει...,Greek,0
2243,நீங்கள் வீட்டிற்கு வந்தீர்கள் என்று எந்த கவலைய...,Tamil,0
3423,Les anthropologies symbolistes s’intéressent a...,French,0
10297,ನಾನು ಎಚ್ಚರವಾಯಿತು ಅವಳು ಎಲ್ಲಾ ಸ್ಮೈಲ್ಸ್ ಮತ್ತು ಮುಸ...,Kannada,0
2809,"[172] Em dezembro de 2008, a revista científic...",Portugeese,0
1423,ഈ മാറ്റങ്ങൾ എല്ലാംതന്നെ വിക്കിപീഡിയ സൂക്ഷിച്ച്...,Malayalam,0
1752,"അവർ നിങ്ങൾക്ക് നല്ലതുപോലെയാണ്, ഇത് നിങ്ങൾ ഇല്ല...",Malayalam,0


In [81]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# Step 1: randomly reduce class 0 (non-Italian) to 1500 rows.
rus = RandomUnderSampler(sampling_strategy={0: 1500}, random_state=12)
# Step 2: randomly oversample class 1 (Italian) up to 1100 rows.
ros = RandomOverSampler(sampling_strategy={1: 1100}, random_state=12)

# Undersampling runs first, oversampling second.
pipeline = make_pipeline(rus, ros)

# Features are every column except the label; the label is the binary target.
X_resampled, y_resampled = pipeline.fit_resample(
    dataset.drop(columns=["target"]),
    dataset["target"],
)

In [82]:
# Attach the resampled labels as the "target" column.
# .assign returns a new frame, so X_resampled itself is left untouched
# (a plain `balanced_dataset = X_resampled` would only alias it and the
# column assignment would silently modify the resampler output as well).
balanced_dataset = X_resampled.assign(target=y_resampled)
balanced_dataset

Unnamed: 0,Text,Language,target
0,i wasn't listening another phrase that i use a...,English,0
1,هذا لطف كبير منك.,Arabic,0
2,colour versus color)[132] or points of view.,English,0
3,"Toutefois, les études ne concluent pas sur le ...",French,0
4,ಆ ಕ್ಷಣದಲ್ಲಿ ನಾರ್ಸಿಸಸ್ ತನ್ನ ನಿದ್ರೆಯಲ್ಲಿ ನಗಲು ಪ್...,Kannada,0
...,...,...,...
2595,Ho dato di matto un'altra frase che i madrelin...,Italian,1
2596,purtroppo devo dire di no.,Italian,1
2597,puoi chiedere scusa.,Italian,1
2598,La comunità di Wikipedia è stata anche critica...,Italian,1


In [83]:
# Recompute the Italian / non-Italian target from the Language column
# (vectorised comparison instead of a Python-level loop).
balanced_dataset["target"] = (balanced_dataset["Language"] == "Italian").astype("int64")
# Class counts after resampling, as a plain dict for the plot below
target_var = balanced_dataset["target"].value_counts().to_dict()

In [84]:
# plot
# X values: turn the 0/1 class ids into readable labels
x = ["not-italian" if example == 0 else "italian" for example in target_var]
# y values: number of examples in each class
y = list(target_var.values())
# plot distribution; the axis titles are set exactly once
px.histogram(x=x, y=y).update_layout(xaxis_title="Italian/Non-italian", yaxis_title="Examples")

In [85]:
# Save the balanced dataset as CSV; index=False leaves the row index out of the file
balanced_dataset.to_csv("../model_code/data/datasets/balanced.csv", index=False)

## Number of different tokens

In [86]:
def create_vocabulary(dataset:pd.DataFrame, text_column:str="Text") -> dict:
    """Count how often each token occurs in a text column.

    Tokenization is deliberately simple: every sentence is lower-cased and
    split on whitespace, so punctuation stays attached to the word it touches.

    Args:
        dataset: frame that holds the sentences.
        text_column: name of the column containing the raw text.

    Returns:
        A dict mapping each token to its number of occurrences; keys appear
        in order of first occurrence.
    """
    counts = Counter()
    for sentence in dataset[text_column].to_numpy():
        # Missing text is loaded by pandas as NaN (a float). Skip anything that
        # is not a string instead of failing on `.lower()`.
        if not isinstance(sentence, str):
            continue
        counts.update(sentence.lower().split())
    # Return a plain dict so the return type stays what callers already get
    return dict(counts)

In [87]:
# token -> occurrence count over the balanced dataset
# (create_vocabulary returns counts, not indices)
vocab = create_vocabulary(dataset=balanced_dataset)

In [88]:
# Keep the K most frequent tokens, highest count first
K = 10
ranked = sorted(vocab.items(), key=lambda pair: pair[1], reverse=True)
top_k = dict(ranked[:K])
top_k

{'di': 1049,
 'e': 615,
 'in': 570,
 'la': 554,
 'a': 542,
 'de': 513,
 'che': 495,
 'un': 435,
 'è': 387,
 'il': 358}

In [89]:
# Display the balanced frame (Text, Language and target columns)
balanced_dataset

Unnamed: 0,Text,Language,target
0,i wasn't listening another phrase that i use a...,English,0
1,هذا لطف كبير منك.,Arabic,0
2,colour versus color)[132] or points of view.,English,0
3,"Toutefois, les études ne concluent pas sur le ...",French,0
4,ಆ ಕ್ಷಣದಲ್ಲಿ ನಾರ್ಸಿಸಸ್ ತನ್ನ ನಿದ್ರೆಯಲ್ಲಿ ನಗಲು ಪ್...,Kannada,0
...,...,...,...
2595,Ho dato di matto un'altra frase che i madrelin...,Italian,1
2596,purtroppo devo dire di no.,Italian,1
2597,puoi chiedere scusa.,Italian,1
2598,La comunità di Wikipedia è stata anche critica...,Italian,1
