In [None]:
!pip install tensorflow_text

In [None]:
import os
from glob import glob

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

In [None]:
# Fetch and unpack the XFORMAL archive, then sort the files of each data
# directory into 'formal' and 'informal' subdirectories. Everything below the
# first `if` is skipped when the extracted 'XFORMAL' directory already exists.
dir_path = 'XFORMAL'
file_path = dir_path + '.zip'
if not os.path.isdir(dir_path):
    if not os.path.isfile(file_path):
        # Download only when the zip file is not already on disk.
        !wget -O XFORMAL.zip "https://docs.google.com/uc?export=download&id=1cF8AXSQ1OZhgIbaIBWc3n8xgj2_prY8C"
    !7z x XFORMAL.zip

    for i in glob('XFORMAL/gyafc_translated/*/*/*/'):
        # Snapshot the directory contents before the new subdirectories are created,
        # so the subdirectories themselves are not part of the list being moved.
        list_files = os.listdir(i)
        # The subdirectories get temporary '1_' names first and are renamed at the end
        # (presumably so they cannot clash with existing entries named 'formal' /
        # 'informal' -- verify against the archive layout).
        informal_path = os.path.join(i, '1_informal')
        os.mkdir(informal_path)
        formal_path = os.path.join(i, '1_formal')
        os.mkdir(formal_path)
        for j in list_files:
            # Entries whose name contains 'informal' go to the informal directory;
            # every other entry is treated as formal.
            if 'informal' in j:
                os.rename(
                    os.path.join(i, j),
                    os.path.join(informal_path, j),
                )
            else:
                os.rename(
                    os.path.join(i, j),
                    os.path.join(formal_path, j),
                )
        # Give the two subdirectories their final names.
        os.rename(informal_path, os.path.join(i, 'informal'))
        os.rename(formal_path, os.path.join(i, 'formal'))

In [None]:
def get_label(file_path):
    """Return the name of the directory that directly contains `file_path`.

    The path is split on `os.path.sep` with `tf.strings.split` and the
    second-to-last component is returned.
    """
    path_components = tf.strings.split(file_path, os.path.sep)
    # Index from the end rather than unpacking into a tuple, so that this also
    # works inside a TensorFlow graph.
    label = path_components[-2]
    return label

In [None]:
# Class names; `labeler` compares a file's directory name against this list and
# takes the argmax, so the position of a name here is its integer label.
class_names = ['formal', 'informal']

In [None]:
def labeler(example, example_path):
    """Attach a label derived from the file path to one example.

    Args:
        example: The example itself; returned unchanged.
        example_path: Path of the file the example came from; its containing
            directory name (see `get_label`) is compared with `class_names`.

    Returns:
        A tuple `(example, label)` where `label` is `tf.argmax` over the
        comparison of the directory name with `class_names`.
    """
    class_matches = get_label(example_path) == class_names
    label_index = tf.argmax(class_matches)
    return example, label_index

In [None]:
# Glob template for the data files. The two `{}` placeholders are filled with
# `.format(...)`: first a language directory (or a glob for several of them),
# then a split name such as 'train', 'test' or 'tune'.
data_path = 'XFORMAL/gyafc_translated/{}/*/{}/*/*'

In [None]:
# Number of examples per batch produced by `create`.
BATCH_SIZE = 512
# Shuffle buffer size used by `create` when it is called with train=True.
BUFFER_SIZE = 200000

In [None]:
def create(path_dataset, train=False):
    """Build a batched dataset of (text line, label) pairs from a glob pattern.

    Every file matching `path_dataset` is read line by line; each line is
    paired with a label derived from the file's path by `labeler`. The
    per-file datasets are concatenated in sorted file-name order.

    Args:
        path_dataset: Glob pattern selecting the text files to read.
        train: When true, the concatenated examples are shuffled with a
            buffer of `BUFFER_SIZE` before batching.

    Returns:
        The concatenated dataset, batched with `BATCH_SIZE`.

    Raises:
        FileNotFoundError: If no file matches `path_dataset`.
    """
    # glob() does not guarantee any ordering; sort so that the order of the
    # examples does not depend on the file system.
    file_names = sorted(glob(path_dataset))
    if not file_names:
        # Fail with the offending pattern instead of an IndexError further down.
        raise FileNotFoundError(
            'No data files match pattern: {!r}'.format(path_dataset))

    labeled_data_sets = []
    for file_name in file_names:
        lines_dataset = tf.data.TextLineDataset(file_name)
        labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, file_name))
        labeled_data_sets.append(labeled_dataset)

    all_labeled_data = labeled_data_sets[0]
    for labeled_dataset in labeled_data_sets[1:]:
        all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
    if train:
        all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE)
    all_labeled_data = all_labeled_data.batch(BATCH_SIZE)
    return all_labeled_data

In [None]:
def create_model():
    """Build and compile the binary text classifier.

    The network takes scalar strings, encodes them with the multilingual
    Universal Sentence Encoder from TF Hub (loaded with `trainable=True`),
    and passes the result through Dense(512, relu) -> Dropout(0.5) ->
    Dense(256, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

    Returns:
        The model, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    layers = [
        # Each input element is a single, unbatched string (shape []).
        tf.keras.layers.Input(shape=[], dtype=tf.string),
        hub.KerasLayer(
            "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3",
            trainable=True,
        ),
        tf.keras.layers.Dense(512, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Language codes; each one is substituted into `data_path` as the language directory.
langs = ['en', 'fr', 'it', 'pt', 'ru']

In [None]:
import numpy as np
from sklearn.metrics import classification_report

In [None]:
# Per-language experiment: fit a fresh model on each language's 'train' files
# (languages are visited in reverse `langs` order) and print a classification
# report for that language's 'tune' files.
for lang in reversed(langs):
    print(lang)

    # 'train' is shuffled; 'test' serves as the validation set during fitting.
    train_dataset = create(data_path.format(lang, 'train'), train=True)
    valid_dataset = create(data_path.format(lang, 'test'), train=False)

    model = create_model()

    # Stop once validation accuracy has not improved for 2 epochs and
    # restore the best weights seen.
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        min_delta=0,
        patience=2,
        restore_best_weights=True,
    )
    model.fit(
        train_dataset,
        epochs=1024,
        validation_data=valid_dataset,
        callbacks=[early_stopping_callback],
    )

    # Evaluate on the 'tune' files: collect the true labels batch by batch,
    # then predict on the same (unshuffled) dataset.
    tune_dataset = create(data_path.format(lang, 'tune'))
    y_test = np.concatenate([labels for _, labels in tune_dataset], axis=0)
    y_pred = model.predict(tune_dataset)

    # Round the sigmoid outputs to hard 0/1 predictions.
    test_predict_around = np.around(y_pred)

    result = classification_report(y_test, test_predict_around, digits=6)
    print(result)

ru
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
              precision    recall  f1-score   support

           0   0.761061  0.931262  0.837603     27263
           1   0.895773  0.668937  0.765913     24077

    accuracy                       0.808239     51340
   macro avg   0.828417  0.800100  0.801758     51340
weighted avg   0.824237  0.808239  0.803982     51340

pt
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
              precision    recall  f1-score   support

           0   0.721094  0.939038  0.815760     27263
           1   0.895056  0.588736  0.710277     24077

    accuracy                       0.774757     51340
   macro avg   0.808075  0.763887  0.763019     51340
weighted avg   0.802677  0.774757  0.766292     51340

it
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
              precision    recall  f1-score   support

           0   0.735999  0.925027  0.819757     27263
           1   0.880293  0.624289  0.730511     24077

    accuracy                  

In [None]:
langs = ['en', 'fr', 'it', 'pt', 'ru']
# Glob pattern meaning "every language directory except the given one".
# `[!...]` is a character class, not a string negation: '[!fr]*' would reject
# every name starting with 'f' OR 'r' and so would also drop 'ru'. The `.1`
# precision keeps only the first letter of the formatted code ('fr' -> '[!f]*'),
# which excludes exactly one language as long as first letters are unique.
lang_format = '[!{:.1}]*'
assert len({code[0] for code in langs}) == len(langs), \
    'lang_format relies on every language code starting with a different letter'

In [None]:
# Evaluate one already-trained model on the 'tune' files of every language.
# NOTE: `model` is not created in this cell -- it is whatever model the kernel
# currently holds from a previously executed cell.
for lang in langs:
    print(lang)

    # Collect the true labels batch by batch, then predict on the same dataset.
    tune_dataset = create(data_path.format(lang, 'tune'), train=False)
    y_test = np.concatenate([labels for _, labels in tune_dataset], axis=0)
    y_pred = model.predict(tune_dataset)

    # Round the model outputs to hard 0/1 predictions.
    test_predict_around = np.around(y_pred)

    result = classification_report(y_test, test_predict_around, digits=6)
    print(result)

en
              precision    recall  f1-score   support

           0   0.838003  0.950042  0.890512     27263
           1   0.933340  0.792042  0.856905     24077

    accuracy                       0.875945     51340
   macro avg   0.885671  0.871042  0.873709     51340
weighted avg   0.882713  0.875945  0.874752     51340

fr
              precision    recall  f1-score   support

           0   0.780935  0.901185  0.836762     27263
           1   0.864480  0.713752  0.781918     24077

    accuracy                       0.813284     51340
   macro avg   0.822708  0.807468  0.809340     51340
weighted avg   0.820115  0.813284  0.811042     51340

it
              precision    recall  f1-score   support

           0   0.757978  0.876499  0.812941     27263
           1   0.830070  0.683100  0.749447     24077

    accuracy                       0.785801     51340
   macro avg   0.794024  0.779800  0.781194     51340
weighted avg   0.791787  0.785801  0.783164     51340

pt
       

In [None]:
# For each language: fit a fresh model on the 'train' files selected by the
# `lang_format` glob built from that language, validate on the language's own
# 'test' files, and print a classification report for its 'tune' files.
for lang in langs:
    print(lang)

    # The training pattern comes from `lang_format`; validation uses `lang` itself.
    train_pattern = data_path.format(lang_format.format(lang), 'train')
    train_dataset = create(train_pattern, train=True)
    valid_dataset = create(data_path.format(lang, 'test'), train=False)

    model = create_model()

    # Stop once validation accuracy has not improved for 2 epochs and
    # restore the best weights seen.
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        min_delta=0,
        patience=2,
        restore_best_weights=True,
    )
    model.fit(
        train_dataset,
        epochs=1024,
        validation_data=valid_dataset,
        callbacks=[early_stopping_callback],
    )

    # Evaluate on the 'tune' files: collect the true labels batch by batch,
    # then predict on the same (unshuffled) dataset.
    tune_dataset = create(data_path.format(lang, 'tune'))
    y_test = np.concatenate([labels for _, labels in tune_dataset], axis=0)
    y_pred = model.predict(tune_dataset)

    # Round the sigmoid outputs to hard 0/1 predictions.
    test_predict_around = np.around(y_pred)

    result = classification_report(y_test, test_predict_around, digits=6)
    print(result)

en
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
              precision    recall  f1-score   support

           0   0.863069  0.737960  0.795626     27263
           1   0.745121  0.867425  0.801635     24077

    accuracy                       0.798675     51340
   macro avg   0.804095  0.802693  0.798631     51340
weighted avg   0.807755  0.798675  0.798444     51340

fr
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
              precision    recall  f1-score   support

           0   0.805126  0.624509  0.703408     27263
           1   0.660948  0.828841  0.735434     24077

    accuracy                       0.720335     51340
   macro avg   0.733037  0.726675  0.719421     51340
weighted avg   0.737511  0.720335  0.718427     51340

it
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
              precision    recall  f1-score   support

           0   0.736894  0.885229  0.804279     27263
           1   0.831675  0.642107  0.724699     24077

    accuracy                  