In [1]:
import string
import pandas as pd
import numpy
import scipy
import fasttext
from sklearn.model_selection import train_test_split

# 1. Preprocessing Data
## 1.1 Removing punctuation 

In [2]:
df = pd.read_csv('sentences.csv')
df = df.drop(columns=['id'])

In [3]:
PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~？。！"
def format_text(text):
    text.lower()
    for pun in PUNCTUATION:
        text = text.replace(pun, "")
    return text

In [4]:
df['sentence'] = df['sentence'].apply(format_text)

## 1.2 Filter out sentences longer than 20 characters

In [5]:
df_filtered = df[df['sentence'].str.len() < 20] 

## 1.3 Filter out languages with less than 50000 rows

In [6]:
df_filtered2 = df_filtered.groupby('lan_code').filter(lambda x : len(x) > 50000)
df_filtered2.groupby('lan_code').count()

Unnamed: 0_level_0,sentence
lan_code,Unnamed: 1_level_1
ber,107783
cmn,68343
eng,149655
epo,79445
fra,59371
heb,79298
hun,75734
ita,128868
jpn,158953
kab,154835


## 1.4 Splitting into training and testing data

In [7]:
df_filtered2['final_data'] = "__label__"+df_filtered2['lan_code']+" "+df_filtered2['sentence']

train, test = train_test_split(df_filtered2, test_size=0.3)

train.to_csv('training_data.txt', encoding="utf-8",columns=['final_data'],index=False,header=False)
test.to_csv('testing_data.txt', encoding="utf-8",columns=['final_data'],index=False,header=False)

### 1.4.1 Training data

In [8]:
print(train.groupby('lan_code').count())

          sentence  final_data
lan_code                      
ber          75221       75221
cmn          47987       47987
eng         104669      104669
epo          55669       55669
fra          41663       41663
heb          55593       55593
hun          53219       53219
ita          90185       90185
jpn         111348      111348
kab         108080      108080
por          37481       37481
rus         119055      119055
spa          36652       36652
tur          68764       68764
ukr          35669       35669


### 1.4.2 Testing data

In [9]:
print(test.groupby('lan_code').count())

          sentence  final_data
lan_code                      
ber          32562       32562
cmn          20356       20356
eng          44986       44986
epo          23776       23776
fra          17708       17708
heb          23705       23705
hun          22515       22515
ita          38683       38683
jpn          47605       47605
kab          46755       46755
por          16077       16077
rus          51256       51256
spa          15745       15745
tur          29280       29280
ukr          15244       15244


# 2 Training Model

## 2.1 Training FastText model using training data

In [10]:
myModel = fasttext.train_supervised(input="training_data.txt")

## 2.1 Testing trained FastText model using testing data

In [11]:
myModel.test("testing_data.txt")

(446253, 0.8781431161247095, 0.8781431161247095)

In [12]:
myModel.predict("hola")

(('__label__spa',), array([0.999529]))