In [2]:
import string
import pandas as pd
import numpy
import scipy
import fasttext
from sklearn.model_selection import train_test_split

# 1. Preprocessing Data
## 1.1 Removing punctuation 

In [3]:
df = pd.read_csv('sentences.csv')
df = df.drop(columns=['id'])

In [4]:
PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~？。！"
def format_text(text):
    text.lower()
    for pun in PUNCTUATION:
        text = text.replace(pun, "")
    return text

In [5]:
df['sentence'] = df['sentence'].apply(format_text)

## 1.2 Filter out sentences longer than 20 characters

In [6]:
df_filtered = df[df['sentence'].str.len() < 20] 

## 1.3 Filter out languages with less than 50000 rows

In [7]:
df_filtered2 = df_filtered.groupby('lan_code').filter(lambda x : len(x) > 50000)
df_filtered2.groupby('lan_code').count()

Unnamed: 0_level_0,sentence
lan_code,Unnamed: 1_level_1
ber,107783
cmn,68343
eng,149655
epo,79445
fra,59371
heb,79298
hun,75734
ita,128868
jpn,158953
kab,154835


## 1.4 Splitting into training and testing data

In [11]:
df_filtered2['final_data'] = "__label__"+ df_filtered2['lan_code'] + " " + df_filtered2['sentence']

train, test = train_test_split(df_filtered2, test_size=0.3)

train.to_csv('training_data.txt', encoding="utf-8",columns=['final_data'],index=False,header=False)
test.to_csv('testing_data.txt', encoding="utf-8",columns=['final_data'],index=False,header=False)

In [19]:
train.sort_values(by=['lan_code'], inplace=True)
test.sort_values(by=['lan_code'], inplace=True)

In [30]:
for ind in train.index:
    fileName = "corpus/" + df['lan_code'][ind] + "/file.txt"
    f = open(fileName, "a", encoding="utf-8")
    f.write(df['sentence'][ind] + "\n")
    f.close()

### 1.4.1 Training data

In [13]:
print(train.groupby('lan_code').count())

          sentence  final_data
lan_code                      
ber          75463       75463
cmn          47865       47865
eng         105129      105129
epo          55646       55646
fra          41537       41537
heb          55243       55243
hun          53032       53032
ita          90296       90296
jpn         111318      111318
kab         108169      108169
por          37479       37479
rus         119235      119235
spa          36467       36467
tur          68774       68774
ukr          35602       35602


### 1.4.2 Testing data

In [14]:
print(test.groupby('lan_code').count())

          sentence  final_data
lan_code                      
ber          32320       32320
cmn          20478       20478
eng          44526       44526
epo          23799       23799
fra          17834       17834
heb          24055       24055
hun          22702       22702
ita          38572       38572
jpn          47635       47635
kab          46666       46666
por          16079       16079
rus          51076       51076
spa          15930       15930
tur          29270       29270
ukr          15311       15311


# 2 Training Model

## 2.1 Training FastText model using training data

In [15]:
myModel = fasttext.train_supervised(input="training_data.txt")

## 2.1 Testing trained FastText model using testing data

In [16]:
myModel.test("testing_data.txt")

(446253, 0.8783604816102076, 0.8783604816102076)

In [17]:
myModel.predict("hola")

(('__label__spa',), array([0.79914647]))