In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
import numpy as np
import seaborn as sns
import math
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Names of the 24 error-type tags, in the order used for the label vectors.
# Presumably ERRANT-style grammatical error types - TODO confirm.
# NOTE(review): the label sums printed further down in this notebook have 25
# entries, one more than this list - confirm what the trailing slot encodes.
classes = [
    'ADJ', 'ADJ:FORM', 'ADV', 'CONJ', 'CONTR', 'DET',
    'MORPH', 'NOUN', 'NOUN:INFL', 'NOUN:NUM', 'NOUN:POSS', 'ORTH',
    'OTHER', 'PART', 'PREP', 'PRON', 'PUNCT', 'SPELL',
    'VERB', 'VERB:FORM', 'VERB:INFL', 'VERB:SVA', 'VERB:TENSE', 'WO',
]

In [4]:
# Load the tagged training frame; the resampling helpers below read its 'label' column.
df = pd.read_parquet('../data/lang-8/train_tagged.parquet')

In [5]:
def get_irlbi(labels):
    """Return the imbalance ratio of every label in ``labels``.

    ``labels`` is summed along axis 0 to get one count per label. Each label's
    ratio is the largest of those counts divided by the label's own count, so
    the most frequent label scores 1 and rarer labels score higher.

    Args:
        labels: array whose sum over axis 0 yields the per-label counts, e.g.
            a 2-D (rows x labels) 0/1 matrix.

    Returns:
        Array with one ratio per label.
    """
    per_label_counts = labels.sum(axis=0)
    return per_label_counts.max() / per_label_counts

In [6]:
def get_mean_ir(labels):
    """Return the mean of the per-label imbalance ratios given by ``get_irlbi``.

    Args:
        labels: label matrix, passed unchanged to ``get_irlbi``.

    Returns:
        The ``.mean()`` of the ratios returned by ``get_irlbi(labels)``.
    """
    return get_irlbi(labels).mean()

In [7]:
def mlp_ros(dataset, percent=10, random_state=None):
    """Randomly oversample rows that carry under-represented labels.

    A label is treated as a minority label when its imbalance ratio
    (``get_irlbi``) is above the mean ratio (``get_mean_ir``). For each such
    label, rows having that label set to 1 are cloned at random (with
    replacement) until the label reaches ``round(max_count / mean_ir)``
    positives, subject to an overall budget of ``percent`` % of the rows.
    Cloned rows keep all of their labels, so the counts of other labels can
    grow as well.

    Args:
        dataset: DataFrame with a 'label' column; each entry is expected to be
            an equal-length 0/1 vector.
        percent: maximum number of cloned rows, as a percentage of
            ``len(dataset)``.
        random_state: optional seed for reproducible sampling. ``None`` keeps
            using numpy's global generator (so ``np.random.seed`` still applies).

    Returns:
        A new DataFrame holding the original rows followed by the clones,
        with a fresh 0..n-1 index. ``dataset`` itself is not modified.
    """
    if len(dataset) == 0:
        # Nothing to measure or clone.
        return dataset.copy()

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    clone_budget = math.ceil(percent / 100 * len(dataset))
    labels = np.stack(dataset['label'].tolist())
    irlbis = get_irlbi(labels)
    mean_ir = get_mean_ir(labels)

    # One "bag" of row positions per minority label.
    bags = {
        label: np.flatnonzero(labels[:, label] == 1)
        for label in range(labels.shape[1])
        if irlbis[label] > mean_ir
    }

    label_counts = labels.sum(axis=0)
    target_count = np.round(label_counts.max() / mean_ir)
    to_add = (target_count - label_counts[list(bags)]).astype(np.float64)

    excess = to_add.sum() - clone_budget
    if excess > 0:
        # Spread the reduction evenly over the bags. Visiting the smallest
        # requests first lets whatever a small bag cannot absorb carry over to
        # the larger ones, so the total lands exactly on the budget.
        order = np.argsort(to_add, kind='stable')
        for seen, pos in enumerate(order):
            share = math.floor(excess / (len(order) - seen))
            cut = min(share, to_add[pos])
            to_add[pos] -= cut
            # The excess must shrink by what was actually cut, otherwise later
            # bags are reduced by the full original excess.
            excess -= cut

    to_add = to_add.astype(np.int64)
    picks = [rng.choice(bags[label], count) for count, label in zip(to_add, bags)]
    # No minority label at all means nothing to clone.
    samples = np.concatenate(picks) if picks else np.empty(0, dtype=np.intp)

    # Row-wise concat keeps the column dtypes and does not depend on where the
    # 'label' column sits in the frame.
    oversampled = pd.concat([dataset, dataset.iloc[samples]], ignore_index=True)
    new_irlbis = get_irlbi(np.concatenate([labels, labels[samples]]))

    print(f'Added {len(oversampled) - len(dataset)} samples')
    print(f'Original irlbis: {irlbis}, mean: {mean_ir}')
    print(f'New IRLBIs: {new_irlbis}, mean: {new_irlbis.mean()}')

    return oversampled

In [8]:
def mlp_rus(dataset, percent=10, random_state=None):
    """Randomly undersample rows that carry over-represented labels.

    A label is treated as a majority label when its imbalance ratio
    (``get_irlbi``) is below the mean ratio (``get_mean_ir``). For each such
    label, rows having that label set to 1 are picked at random (without
    replacement) for removal until the label drops to
    ``round(max_count / mean_ir)`` positives, subject to an overall budget of
    ``percent`` % of the rows. A row picked through several labels is removed
    only once, so fewer rows than the budget may be removed on multi-label data.

    Args:
        dataset: DataFrame with a 'label' column; each entry is expected to be
            an equal-length 0/1 vector.
        percent: maximum number of removed rows, as a percentage of
            ``len(dataset)``.
        random_state: optional seed for reproducible sampling. ``None`` keeps
            using numpy's global generator (so ``np.random.seed`` still applies).

    Returns:
        A new DataFrame with the selected rows removed and a fresh 0..n-1
        index. ``dataset`` itself is not modified.
    """
    if len(dataset) == 0:
        # Nothing to measure or remove.
        return dataset.copy()

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    removal_budget = math.ceil(percent / 100 * len(dataset))
    labels = np.stack(dataset['label'].tolist())
    irlbis = get_irlbi(labels)
    mean_ir = get_mean_ir(labels)

    # One "bag" of row positions per majority label.
    bags = {
        label: np.flatnonzero(labels[:, label] == 1)
        for label in range(labels.shape[1])
        if irlbis[label] < mean_ir
    }

    label_counts = labels.sum(axis=0)
    target_count = np.round(label_counts.max() / mean_ir)
    to_remove_per_label = (label_counts[list(bags)] - target_count).astype(np.float64)

    excess = to_remove_per_label.sum() - removal_budget
    if excess > 0:
        # Spread the reduction evenly over the bags. Visiting the smallest
        # requests first lets whatever a small bag cannot absorb carry over to
        # the larger ones, so the total lands exactly on the budget.
        order = np.argsort(to_remove_per_label, kind='stable')
        for seen, pos in enumerate(order):
            share = math.floor(excess / (len(order) - seen))
            cut = min(share, to_remove_per_label[pos])
            to_remove_per_label[pos] -= cut
            # The excess must shrink by what was actually cut, otherwise later
            # bags are reduced by the full original excess.
            excess -= cut

    to_remove_per_label = to_remove_per_label.astype(np.int64)
    # replace=False: drawing the same row twice would silently remove fewer
    # rows than requested. min() guards against asking for more than a bag holds.
    picks = [
        rng.choice(bags[label], min(count, len(bags[label])), replace=False)
        for count, label in zip(to_remove_per_label, bags)
    ]
    # No majority label at all means nothing to remove.
    samples = np.unique(np.concatenate(picks)) if picks else np.empty(0, dtype=np.intp)

    keep_mask = np.ones(len(dataset), dtype=bool)
    keep_mask[samples] = False

    # Boolean selection keeps the column dtypes and does not depend on where
    # the 'label' column sits in the frame.
    undersampled = dataset.loc[keep_mask].reset_index(drop=True)
    new_irlbis = get_irlbi(labels[keep_mask])

    print(f'Removed {len(dataset) - len(undersampled)} samples')
    print(f'Original irlbis: {irlbis}, mean: {mean_ir}')
    print(f'New IRLBIs: {new_irlbis}, mean: {new_irlbis.mean()}')

    return undersampled

In [9]:
# Oversample the minority labels of df, cloning at most 10% of its rows.
train_oversampled10 = mlp_ros(df, 10)

Added 24869 samples
Original irlbis: [ 11.5718232  167.79971388   5.22295943  16.45049088  17.99785177
   1.54951384   8.62346065   3.80250276 163.13212796   4.59842396
  45.92482381   4.68240883   1.          27.96328525   2.28207872
   5.92010095   3.34809106   3.55311866   2.78388417   7.02137085
 118.          11.12036027   2.99914341  14.42351205], mean: 27.157126931951865
New IRLBIs: [11.5614725  28.0750115   5.12620788 16.27830031 18.01875508  1.54298685
  8.58437401  3.79497069 28.0653249   4.54157671 27.64250113  4.69736285
  1.         27.06010202  2.27406834  5.94479768  3.33316943  3.53679145
  2.7831978   7.05201711 28.0491954  10.87759651  3.00645575 14.52720562], mean: 11.140560063588651


In [10]:
# Persist the oversampled frame in the same directory the training data was read from.
train_oversampled10.to_parquet('../data/lang-8/train_tagged_oversampled10.parquet')

In [9]:
# Element-wise sum of the label vectors: positives per label after oversampling.
train_oversampled10.label.sum()

array([ 18975,   8159,  42959,  13543,  12232, 142446,  25596,  58025,
         8147,  48443,   8307,  47024, 220208,   8429,  96842,  37151,
        66288,  62232,  78922,  31227,   8151,  20215,  73254,  15149,
       485910])

In [11]:
# Per-label change in positive counts between the oversampled frame and df.
train_oversampled10.label.sum() - df.label.sum()

array([ 810, 6888, 2509,  715,  476, 6324, 1177, 2516, 6853, 2564, 3699,
       1782, 9046,  839, 4394, 1516, 3177, 2912, 3237, 1177, 6373, 1202,
       2869,  506,    0])

In [6]:
# Rebinds df to the saved oversampled frame.
# NOTE(review): this reads from ../data/lang-8-en-1.0/, while the cell that
# wrote train_tagged_oversampled10.parquet used ../data/lang-8/ - confirm the
# directory difference is intended.
df = pd.read_parquet('../data/lang-8-en-1.0/train_tagged_oversampled10.parquet')

In [7]:
# Load the tagged evaluation frame.
df2 = pd.read_parquet('../data/lang-8-en-1.0/eval_tagged.parquet')

In [8]:
# Display the reloaded frame to inspect its index and columns.
df

Unnamed: 0,text,label
0,"So , I have to get up at 7 : 00 to get to work...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,Sorry for the boring entry . . .,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,I 'm not good at English and that is my fatal ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,You did much for me but what I did for you is ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Maybe you do n't know it .,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
1026713,So when it occurred people were unable to esca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ..."
1026714,I heard from my parents that the pollen count ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1026715,I told him my name and birth date .,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1026716,"It is not very easy to understand , because it...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
# Replace the evaluation frame's index with a fresh 0..n-1 one, discarding the old index.
df2 = df2.reset_index(drop=True)

In [15]:
# Overwrite the evaluation parquet file (same path it was read from) with the re-indexed frame.
df2.to_parquet('../data/lang-8-en-1.0/eval_tagged.parquet')

In [16]:
# Replace the training frame's index with a fresh 0..n-1 one, discarding the old index.
df = df.reset_index(drop=True)

In [17]:
# Overwrite the oversampled training parquet file (same path it was read from) with the re-indexed frame.
df.to_parquet('../data/lang-8-en-1.0/train_tagged_oversampled10.parquet')