In [None]:
import string
import pandas as pd
import numpy
import scipy
import fasttext
from sklearn.model_selection import train_test_split

# 1. Preprocessing Data
## 1.1 Removing punctuation 

In [None]:
# Load the sentence corpus and discard the 'id' column in a single expression.
df = pd.read_csv('sentences.csv').drop(columns=['id'])

In [None]:
# ASCII punctuation plus the full-width "？", "。" and "！" marks.
# Building the ASCII part from string.punctuation yields the same characters as
# the hand-written literal without relying on the invalid "\]" escape sequence.
PUNCTUATION = string.punctuation + "？。！"

# Translation table that deletes every PUNCTUATION character in a single pass.
_PUNCTUATION_TABLE = str.maketrans("", "", PUNCTUATION)


def format_text(text):
    """Lower-case ``text`` and remove every character listed in ``PUNCTUATION``.

    Args:
        text (str): The sentence to normalise.

    Returns:
        str: ``text`` in lower case with all punctuation characters removed.
    """
    # str.lower() returns a new string, so its result has to be kept;
    # calling it as a bare statement leaves the text unchanged.
    return text.lower().translate(_PUNCTUATION_TABLE)

In [None]:
# Normalise every sentence element-wise with format_text.
df['sentence'] = df['sentence'].map(format_text)

## 1.2 Filter out sentences of 20 characters or more

In [None]:
# Boolean mask: True where the cleaned sentence has fewer than 20 characters.
is_short_sentence = df['sentence'].str.len() < 20
df_filtered = df[is_short_sentence]

## 1.3 Filter out languages with 50000 rows or fewer

In [None]:
def has_enough_rows(language_rows):
    """Return True when a language group contains more than 50000 rows."""
    return len(language_rows) > 50000


# Keep only the rows belonging to languages that pass the size threshold.
rows_by_language = df_filtered.groupby('lan_code')
df_filtered2 = rows_by_language.filter(has_enough_rows)

# Show the per-language counts of the surviving rows.
df_filtered2.groupby('lan_code').count()

## 1.4 Splitting into training and testing data

In [None]:
# Fixed seed so the train/test split, and every metric computed from it, is
# reproducible across Restart Kernel -> Run All.
RANDOM_SEED = 42

# Build one "__label__<lan_code> <sentence>" string per row, the labelled-line
# layout that is later passed to fasttext.train_supervised.
# .assign() creates a new frame instead of writing a column into a frame that
# was derived by filtering, which avoids SettingWithCopyWarning and keeps this
# cell idempotent when re-run.
df_filtered2 = df_filtered2.assign(
    final_data="__label__" + df_filtered2['lan_code'] + " " + df_filtered2['sentence']
)

# Hold out 30% of the rows for evaluation.
train, test = train_test_split(df_filtered2, test_size=0.3, random_state=RANDOM_SEED)

# Write only the labelled text column: one example per line, no index, no header.
train.to_csv('training_data.txt', encoding="utf-8", columns=['final_data'], index=False, header=False)
test.to_csv('testing_data.txt', encoding="utf-8", columns=['final_data'], index=False, header=False)

### 1.4.1 Training data

In [None]:
# Per-language non-null counts for the training split.
train_counts = train.groupby('lan_code').count()
print(train_counts)

### 1.4.2 Testing data

In [None]:
# Per-language non-null counts for the testing split.
test_counts = test.groupby('lan_code').count()
print(test_counts)

# 2 Training Model

## 2.1 Training FastText model using training data

In [None]:
# Train a supervised fastText classifier on the exported training file.
# No hyperparameters are passed, so the library defaults apply.
training_file = "training_data.txt"
myModel = fasttext.train_supervised(input=training_file)

## 2.2 Testing trained FastText model using testing data

In [None]:
# Evaluate the trained model on the held-out file.
# NOTE(review): the fastText docs describe the result as
# (number of samples, precision, recall) — confirm for the installed version.
evaluation = myModel.test("testing_data.txt")
evaluation

In [None]:
# Smoke test: ask the model to classify a single short word.
sample_sentence = "hola"
myModel.predict(sample_sentence)