In [2]:
# Import libraries
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
import random
from random import shuffle
import multiprocessing

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import wordnet 
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.models import Word2Vec,KeyedVectors

sns.set_style("whitegrid")

import warnings
warnings.filterwarnings("ignore")

## Load Data Train

In [3]:
data_train = pd.read_excel("Data/data_train.xlsx")

In [59]:
data_train.head()

Unnamed: 0,index,Kalimat,Kalimat_prep,label
0,986,Esia yang terlupakan \nGak kayak Smartfren sih...,esia yang terlupakan gak kayak smartfren sih y...,1
1,2670,"Kalau menurut saya, oknum KPAI yg cari panggun...",kalau menurut saya oknum kpai yang cari panggu...,1
2,1169,KUHP .. Kasih Uang Habis Perkara,kuhp kasih uang habis perkara,1
3,880,Kok boleh ngerekam? Bukannya xxi ada camera in...,kok boleh ngerekam bukannya xxi ada camera inf...,1
4,2701,Djarum kudus lgsg menghentikan bingung juga kp...,djarum kudus langsung menghentikan bingung jug...,1


In [4]:
data_train['label'].value_counts()

1    1939
3     211
2      75
Name: label, dtype: int64

## Create Word2Vec Model

In [20]:
tokenized_corpus = [word_tokenize(sentence) for sentence in data_train['Kalimat_prep']]

In [25]:
# Passing `sentences=` to the constructor would already build the vocabulary
# and train the model, so the explicit train() call below would train it a
# second time. Create the model empty and train it exactly once instead.
model = Word2Vec(vector_size=100, window=5, sg=1, min_count=1)
model.build_vocab(tokenized_corpus)

# Training the Word2Vec model
model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=100)

(3793241, 4424200)

In [44]:
model.save("Model/word2vec/w2v_train.model")

In [381]:
model.wv.most_similar("benci",topn=20)

[('dikorupsi', 0.6762087941169739),
 ('berkarya', 0.6713862419128418),
 ('usemua', 0.6655163168907166),
 ('kurangi', 0.6282694339752197),
 ('medsosnya', 0.6166490316390991),
 ('setujuh', 0.6150714159011841),
 ('harumnya', 0.6131832003593445),
 ('terlanjur', 0.6129735708236694),
 ('pelatih', 0.6114039421081543),
 ('menggabungkan', 0.6039972305297852),
 ('pindahkan', 0.602449357509613),
 ('berjuamg', 0.602048397064209),
 ('mengesampingkan', 0.599072277545929),
 ('merek', 0.5954675674438477),
 ('membimbing', 0.5949916839599609),
 ('bermanfaatnya', 0.5945578217506409),
 ('mewujudkan', 0.5935664176940918),
 ('berbahaya', 0.593377947807312),
 ('block', 0.5928225517272949),
 ('makany', 0.5904486179351807)]

## Easy Data Augmentation (EDA)
https://github.com/jasonwei20/eda_nlp/blob/master/code/eda.py

https://arxiv.org/pdf/1901.11196.pdf

Karena teksnya berbahasa Indonesia, pencarian sinonim dengan WordNet diganti dengan kata-kata terdekat dari model Word2Vec yang dilatih di atas.

In [196]:
def get_only_chars(line):
    """Reduce a sentence to lowercase a-z letters separated by single spaces.

    Straight and curly apostrophes are removed outright. Hyphens, tabs,
    newlines and every other character outside a-z are turned into a space.
    Runs of spaces are collapsed to one and a leading space is dropped; a
    trailing space, if any, is kept.

    Args:
        line: Raw sentence text.

    Returns:
        The cleaned sentence. Input that contains no letters (including the
        empty string) yields "".
    """
    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    # Anything that is not a lowercase ASCII letter or a space becomes a space.
    clean_line = re.sub('[^a-z ]', ' ', line)

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    # startswith() is safe on an empty string, unlike indexing clean_line[0].
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

In [85]:
# Stop words list: tokens that synonym replacement never picks as a
# replacement candidate. The trailing empty string keeps blank tokens out too.
stop_words = [
    "aku", "saya", "kamu", "anda", "kami", "kita", "kalian", "hai",
    "yang", "di", "ke", "dari", "untuk", "sehingga", "karena", "oleh", "itu",
    "maka", "makanya", "iya", "tidak", "gak", "dan", "tapi", "tetapi", "akan",
    "",
]


In [141]:
model = Word2Vec.load("Model/word2vec/w2v_train.model")

In [225]:
random.seed(10)

In [362]:
########################################################################
# Synonym replacement
# Replace n words in the sentence with nearest neighbours from the Word2Vec model
########################################################################
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words with a related word.

    Candidate words are the distinct entries of ``words`` that are not in
    ``stop_words``, visited in shuffled order. For each candidate that has at
    least one result from ``get_synonyms``, one result is chosen at random
    and substituted for every occurrence of the candidate.

    Args:
        words: List of tokens; it is not modified.
        n: Number of distinct words to replace before stopping.

    Returns:
        A new token list. The tokens are joined and re-split on spaces, so an
        empty input yields [''].
    """
    new_words = words.copy()
    # dict.fromkeys de-duplicates while keeping first-occurrence order. A set
    # iterates strings in an order that depends on per-process hash
    # randomisation, which would make the shuffle below differ between kernel
    # sessions even after random.seed().
    candidates = list(dict.fromkeys(word for word in words if word not in stop_words))
    random.shuffle(candidates)
    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n: #only replace up to n words
            break

    # Join and re-split so that a replacement containing spaces ends up as
    # separate tokens.
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')

    return new_words

def get_synonyms(word, topn=20):
    """Return the words most similar to ``word`` according to the Word2Vec model.

    Args:
        word: Token to look up in ``model.wv``.
        topn: How many neighbours to request (default 20).

    Returns:
        The neighbour words, in the order ``most_similar`` returns them, or an
        empty list when the lookup raises KeyError (word not in the vocabulary).
    """
    try:
        neighbours = model.wv.most_similar(word, topn=topn)
    except KeyError:
        # Only a missing key means "no synonyms"; any other error should
        # surface instead of silently producing an empty result.
        return []
    return [entry[0] for entry in neighbours]

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each word independently with probability ``p``.

    Args:
        words: List of tokens; it is not modified.
        p: Deletion probability per word.

    Returns:
        ``words`` itself when it has zero or one element; otherwise a new list
        with the surviving words in their original order. If every word was
        deleted, a list holding one randomly picked original word is returned.
    """
    # Nothing to delete from an empty or single-word sentence. The empty case
    # must stop here: the fallback below would call random.randint(0, -1).
    if len(words) <= 1:
        return words

    #randomly delete words with probability p
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)

    #if you end up deleting all words, just return a random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    The input list itself is left untouched.
    """
    swapped = words.copy()
    for _attempt in range(n):
        swapped = swap_word(swapped)
    return swapped

def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of ``new_words`` in place.

    A first index is drawn, then up to four draws are made for a second index
    that differs from it. If none of them differs, the list is returned
    unchanged.

    Args:
        new_words: List of tokens; modified in place.

    Returns:
        The same list object that was passed in.
    """
    # An empty list has no valid index; random.randint(0, -1) would raise.
    if not new_words:
        return new_words
    last_idx = len(new_words) - 1
    random_idx_1 = random.randint(0, last_idx)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last_idx)
        counter += 1
        # Give up after four failed draws (always the case for a single word).
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``add_word`` on that copy.

    The input list itself is left untouched.
    """
    extended = words.copy()
    for _attempt in range(n):
        add_word(extended)
    return extended

def add_word(new_words):
    """Insert a word related to a random existing word at a random position.

    Words are drawn at random from ``new_words`` until ``get_synonyms``
    returns a non-empty list. The first entry of that list is inserted in
    place at a random index. The search ends once ten draws have been made;
    in that case nothing is inserted, whatever the tenth draw returned.

    Args:
        new_words: List of tokens; modified in place.

    Returns:
        None.
    """
    # Nothing to draw from; random.randint(0, -1) would raise on an empty list.
    if not new_words:
        return
    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    # Always the first (top-ranked) entry, not a random one.
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Generate augmented variants of ``sentence`` with the four EDA operations.

    The sentence is cleaned with ``get_only_chars`` and split on spaces. Every
    technique whose rate is positive contributes the same number of candidate
    sentences; the pooled candidates are cleaned again, shuffled and trimmed.

    Args:
        sentence: Input sentence.
        alpha_sr: Fraction of the word count used as the number of synonym
            replacements (at least 1); 0 disables the technique.
        alpha_ri: Fraction of the word count used as the number of insertions
            (at least 1); 0 disables the technique.
        alpha_rs: Fraction of the word count used as the number of swaps
            (at least 1); 0 disables the technique.
        p_rd: Per-word deletion probability; 0 disables random deletion.
        num_aug: When >= 1, the maximum number of sentences returned. When
            below 1, each candidate is kept with probability
            ``num_aug / len(candidates)``.

    Returns:
        A list of augmented sentences; the original sentence is not included.
        The list is empty when the cleaned sentence has no words, when every
        technique is disabled, or when no candidate was generated.
    """
    sentence = get_only_chars(sentence)
    # `!=` instead of `is not`: identity against a string literal is not a
    # reliable equality test.
    words = [word for word in sentence.split(' ') if word != '']
    num_words = len(words)

    # Count the techniques that will actually run (same `> 0` test as below).
    num_tech = sum(1 for rate in (alpha_sr, alpha_ri, alpha_rs, p_rd) if rate > 0)
    if num_words == 0 or num_tech == 0:
        # Nothing to augment; num_tech == 0 would also divide by zero below.
        return []

    # num_aug / num_tech rounded up, so the pool holds at least num_aug items.
    num_new_per_technique = int(num_aug/num_tech)+1 if num_aug%num_tech!=0 else int(num_aug/num_tech)

    augmented_sentences = []

    #sr
    if (alpha_sr > 0):
        n_sr = max(1, int(alpha_sr*num_words))
        for _ in range(num_new_per_technique):
            a_words = synonym_replacement(words, n_sr)
            augmented_sentences.append(' '.join(a_words))

    #ri
    if (alpha_ri > 0):
        n_ri = max(1, int(alpha_ri*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_insertion(words, n_ri)
            augmented_sentences.append(' '.join(a_words))

    #rs
    if (alpha_rs > 0):
        n_rs = max(1, int(alpha_rs*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_swap(words, n_rs)
            augmented_sentences.append(' '.join(a_words))

    #rd
    if (p_rd > 0):
        for _ in range(num_new_per_technique):
            a_words = random_deletion(words, p_rd)
            augmented_sentences.append(' '.join(a_words))

    augmented_sentences = [get_only_chars(candidate) for candidate in augmented_sentences]
    shuffle(augmented_sentences)

    # No candidates (e.g. num_aug == 0): avoid dividing by len() == 0 below.
    if not augmented_sentences:
        return []

    #trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    return augmented_sentences

In [363]:
def generate_eda(sentence_ls, alpha_sr=0.05, alpha_ri=0.05, alpha_rs=0.05, p_rd=0.05, num_aug=8):
    """Run ``eda`` on every sentence and pool the results into one flat list.

    Args:
        sentence_ls: Iterable of input sentences; progress is shown with tqdm.
        alpha_sr, alpha_ri, alpha_rs, p_rd, num_aug: Passed through to ``eda``
            unchanged for every sentence.

    Returns:
        A single list with the augmented sentences of all inputs, in input order.
    """
    new_sentence_ls = []
    for sent in tqdm(sentence_ls):
        new_sentences = eda(sent,alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=p_rd,num_aug=num_aug)
        # extend() appends in place; `lst = lst + new` copied the whole list
        # on every iteration.
        new_sentence_ls.extend(new_sentences)
    return new_sentence_ls

## Oversampling EDA

In [228]:
data_train_ovr = data_train.copy()
data_train_ovr = data_train_ovr.drop(["index","Kalimat"],axis=1)
data_train_ovr['label'].value_counts()

1    1939
3     211
2      75
Name: label, dtype: int64

In [229]:
# Per-label row counts, computed once and reused below.
label_counts = data_train_ovr['label'].value_counts()
major_class = label_counts[1]
# For each minority label: majority count divided by its own count, rounded up.
num_aug_dict = {
    minority_label: int(np.ceil(major_class / label_counts[minority_label]))
    for minority_label in (2, 3)
}
num_aug_dict

{2: 26, 3: 10}

In [230]:
# Oversample labels 2 and 3 with all four EDA techniques until each label has
# as many rows as the majority class.
for i in (2, 3):
    sent_ls = data_train_ovr.loc[data_train_ovr['label'] == i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls, num_aug=num_aug_dict[i])
    # Rows still missing for label i to reach the majority-class count.
    max_new_sent = major_class - data_train_ovr['label'].value_counts()[i]
    extra_rows = pd.DataFrame({
        'Kalimat_prep': new_sent_ls[:max_new_sent],
        'label': np.full(max_new_sent, i, dtype=int),
    })
    data_train_ovr = pd.concat([data_train_ovr, extra_rows])

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 91.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 211/211 [00:00<00:00, 219.28it/s]


In [231]:
data_train_ovr['label'].value_counts()

1    1939
3    1939
2    1939
Name: label, dtype: int64

In [232]:
data_train_ovr.to_excel("Data/data_train_oversampling.xlsx")

### SR only

In [242]:
random.seed(10)

In [243]:
data_train_sr = data_train.copy()
data_train_sr = data_train_sr.drop(["index","Kalimat"],axis=1)
data_train_sr['label'].value_counts()

1    1939
3     211
2      75
Name: label, dtype: int64

In [244]:
# Oversample labels 2 and 3 with synonym replacement only until each label has
# as many rows as the majority class.
for i in (2, 3):
    sent_ls = data_train_sr.loc[data_train_sr['label'] == i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls, alpha_sr=0.05, alpha_ri=0, alpha_rs=0, p_rd=0,
                               num_aug=num_aug_dict[i])
    # Rows still missing for label i to reach the majority-class count.
    max_new_sent = major_class - data_train_sr['label'].value_counts()[i]
    extra_rows = pd.DataFrame({
        'Kalimat_prep': new_sent_ls[:max_new_sent],
        'label': np.full(max_new_sent, i, dtype=int),
    })
    data_train_sr = pd.concat([data_train_sr, extra_rows])

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:01<00:00, 40.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 211/211 [00:02<00:00, 92.49it/s]


In [245]:
data_train_sr['label'].value_counts()

1    1939
3    1939
2    1939
Name: label, dtype: int64

In [247]:
data_train_sr.to_excel("Data/data_train_edasr.xlsx")

### RI Only

In [267]:
random.seed(10)

In [268]:
data_train_ri = data_train.copy()
data_train_ri = data_train_ri.drop(["index","Kalimat"],axis=1)
data_train_ri['label'].value_counts()

1    1939
3     211
2      75
Name: label, dtype: int64

In [269]:
# Oversample labels 2 and 3 with random insertion only until each label has
# as many rows as the majority class.
for i in (2, 3):
    sent_ls = data_train_ri.loc[data_train_ri['label'] == i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls, alpha_sr=0, alpha_ri=0.05, alpha_rs=0, p_rd=0,
                               num_aug=num_aug_dict[i])
    # Rows still missing for label i to reach the majority-class count.
    max_new_sent = major_class - data_train_ri['label'].value_counts()[i]
    extra_rows = pd.DataFrame({
        'Kalimat_prep': new_sent_ls[:max_new_sent],
        'label': np.full(max_new_sent, i, dtype=int),
    })
    data_train_ri = pd.concat([data_train_ri, extra_rows])

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:01<00:00, 41.66it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 211/211 [00:01<00:00, 107.73it/s]


In [270]:
data_train_ri['label'].value_counts()

1    1939
3    1939
2    1939
Name: label, dtype: int64

In [271]:
data_train_ri.to_excel("Data/data_train_edari.xlsx")

### RS only

In [254]:
random.seed(10)

In [255]:
data_train_rs = data_train.copy()
data_train_rs = data_train_rs.drop(["index","Kalimat"],axis=1)
data_train_rs['label'].value_counts()

1    1939
3     211
2      75
Name: label, dtype: int64

In [256]:
# Oversample labels 2 and 3 with random swap only until each label has
# as many rows as the majority class.
for i in (2, 3):
    sent_ls = data_train_rs.loc[data_train_rs['label'] == i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls, alpha_sr=0, alpha_ri=0, alpha_rs=0.05, p_rd=0,
                               num_aug=num_aug_dict[i])
    # Rows still missing for label i to reach the majority-class count.
    max_new_sent = major_class - data_train_rs['label'].value_counts()[i]
    extra_rows = pd.DataFrame({
        'Kalimat_prep': new_sent_ls[:max_new_sent],
        'label': np.full(max_new_sent, i, dtype=int),
    })
    data_train_rs = pd.concat([data_train_rs, extra_rows])

100%|████████████████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 1302.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 211/211 [00:00<00:00, 2178.20it/s]


In [257]:
data_train_rs['label'].value_counts()

1    1939
3    1939
2    1939
Name: label, dtype: int64

In [258]:
data_train_rs.to_excel("Data/data_train_edars.xlsx")

### RD Only

In [272]:
random.seed(10)

In [273]:
data_train_rd = data_train.copy()
data_train_rd = data_train_rd.drop(["index","Kalimat"],axis=1)
data_train_rd['label'].value_counts()

1    1939
3     211
2      75
Name: label, dtype: int64

In [274]:
# Oversample labels 2 and 3 with random deletion only until each label has
# as many rows as the majority class.
for i in (2, 3):
    sent_ls = data_train_rd.loc[data_train_rd['label'] == i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls, alpha_sr=0, alpha_ri=0, alpha_rs=0, p_rd=0.05,
                               num_aug=num_aug_dict[i])
    # Rows still missing for label i to reach the majority-class count.
    max_new_sent = major_class - data_train_rd['label'].value_counts()[i]
    extra_rows = pd.DataFrame({
        'Kalimat_prep': new_sent_ls[:max_new_sent],
        'label': np.full(max_new_sent, i, dtype=int),
    })
    data_train_rd = pd.concat([data_train_rd, extra_rows])

100%|████████████████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 1093.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 211/211 [00:00<00:00, 2304.11it/s]


In [275]:
data_train_rd['label'].value_counts()

1    1939
3    1939
2    1939
Name: label, dtype: int64

In [277]:
data_train_rd.to_excel("Data/data_train_edard.xlsx")

### RI and RD

In [278]:
random.seed(10)

In [279]:
data_train_rird = data_train.copy()
data_train_rird = data_train_rird.drop(["index","Kalimat"],axis=1)
data_train_rird['label'].value_counts()

1    1939
3     211
2      75
Name: label, dtype: int64

In [280]:
# Oversample labels 2 and 3 with random insertion plus random deletion until
# each label has as many rows as the majority class.
for i in (2, 3):
    sent_ls = data_train_rird.loc[data_train_rird['label'] == i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls, alpha_sr=0, alpha_ri=0.05, alpha_rs=0, p_rd=0.05,
                               num_aug=num_aug_dict[i])
    # Rows still missing for label i to reach the majority-class count.
    max_new_sent = major_class - data_train_rird['label'].value_counts()[i]
    extra_rows = pd.DataFrame({
        'Kalimat_prep': new_sent_ls[:max_new_sent],
        'label': np.full(max_new_sent, i, dtype=int),
    })
    data_train_rird = pd.concat([data_train_rird, extra_rows])

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 86.23it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 211/211 [00:00<00:00, 227.51it/s]


In [281]:
data_train_rird.to_excel("Data/data_train_edarird.xlsx")

## Augmentation

In [237]:
random.seed(10)

In [238]:
data_train_ovr = pd.read_excel("Data/data_train_oversampling.xlsx")

In [239]:
data_train_aug = data_train_ovr.copy()
data_train_aug = data_train_aug[data_train_aug['Kalimat_prep']!="'"]
data_train_aug['label'].value_counts()

3    1939
2    1939
1    1938
Name: label, dtype: int64

In [240]:
# Augment every label: request eight EDA variants per sentence and append up
# to eight times the label's current row count.
for i in (1, 2, 3):
    sent_ls = data_train_aug.loc[data_train_aug['label'] == i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls, num_aug=8)
    max_new_sent = data_train_aug['label'].value_counts()[i] * 8
    extra_rows = pd.DataFrame({
        'Kalimat_prep': new_sent_ls[:max_new_sent],
        'label': np.full(max_new_sent, i, dtype=int),
    })
    data_train_aug = pd.concat([data_train_aug, extra_rows])

100%|█████████████████████████████████████████████████████████████████████████████| 1938/1938 [00:06<00:00, 316.63it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1939/1939 [00:05<00:00, 338.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1939/1939 [00:06<00:00, 312.20it/s]


In [241]:
data_train_aug.to_excel("Data/data_train_augment.xlsx")

## EDA Example

In [357]:
data_train_eg = data_train.copy()
data_train_eg = data_train_eg.drop(["index"],axis=1)
data_train_eg = data_train_eg[data_train_eg['label'].isin([2,3])]
data_train_eg['label'].value_counts()

3    211
2     75
Name: label, dtype: int64

In [358]:
data_train_eg = data_train_eg.sample(5,random_state=60).reset_index(drop=True)
data_train_eg

Unnamed: 0,Kalimat,Kalimat_prep,label
0,kedunguan kpai dalam menegakkan uu... sampah!!!,kedunguan kpai dalam menegakkan uu sampah,3
1,Ga usah dikomenin kalo yang ini. Jenis-jenis m...,ga usah dikomenin kalo yang ini jenis jenis ma...,3
2,kpai .... pancen goblok,kpai pancen goblok,3
3,"intinya cmn 1: demi ngeksis, cari perhatian. u...",intinya cuman demi ngeksis cari perhatian udah...,2
4,perusahaan rugi dibeli.. oooonnn koq dipiara y...,perusahaan rugi dibeli oon koq dipiara ya demi...,3


In [401]:
model.wv.most_similar("kedunguan",topn=20)

[('menegakkan', 0.8644261956214905),
 ('selesai', 0.7761455178260803),
 ('pekan', 0.7281691431999207),
 ('targetkan', 0.7059224247932434),
 ('membuati', 0.6981822848320007),
 ('elemen', 0.6782402396202087),
 ('kecakapannya', 0.668877124786377),
 ('diundangkan', 0.6674259901046753),
 ('afiliasi', 0.6637691259384155),
 ('perubahan', 0.6615577936172485),
 ('disahkan', 0.6608225703239441),
 ('tameng', 0.6543505191802979),
 ('resmi', 0.6526904106140137),
 ('tiga', 0.6505138278007507),
 ('kakean', 0.6488603949546814),
 ('dievaluasi', 0.6433383822441101),
 ('rujukan', 0.6426711678504944),
 ('pintarlah', 0.6423753499984741),
 ('terjal', 0.6406939029693604),
 ('rok', 0.6405861973762512)]

In [400]:
model.wv.most_similar("sampah",topn=20)

[('masarakat', 0.6190102100372314),
 ('mulutmu', 0.6125452518463135),
 ('gentlr', 0.608872652053833),
 ('nyindir', 0.5933667421340942),
 ('kedunguan', 0.5758902430534363),
 ('plin', 0.5740864872932434),
 ('dibuang', 0.5739468932151794),
 ('mukanya', 0.5628902316093445),
 ('lndonesia', 0.5627104043960571),
 ('bubarkankpi', 0.5578151345252991),
 ('betmanfaat', 0.5497796535491943),
 ('busuk', 0.5355801582336426),
 ('ngabalin', 0.5260036587715149),
 ('bau', 0.525847852230072),
 ('berisi', 0.518186092376709),
 ('gosip', 0.50596022605896),
 ('nirprestasi', 0.48198845982551575),
 ('seru', 0.4745867848396301),
 ('uy', 0.4695127010345459),
 ('rahim', 0.4652632772922516)]

In [383]:
data_train_eg['Kalimat_prep'].values[0]

'kedunguan kpai dalam menegakkan uu sampah'

In [368]:
for i in data_train_eg['Kalimat_prep'].values:
    random.seed(10)
    print(generate_eda([i],p_rd=0.2,num_aug=4))

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 496.07it/s]


['kedunguan dalam uu sampah', 'kedunguan kpai dalam kedunguan menegakkan uu sampah', 'kedunguan kpai dalam menegakkan bertntangan sampah', 'kedunguan kpai sampah menegakkan uu dalam']


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 496.43it/s]


['ga usah dikomenin kalo yang ini jenis jenis mahluk yang kebodohannya minta kebodohannya dibikin viral', 'ga usah dikomenin kalo yang ini jenis jahat jenis mahluk yang sengaja minta kebodohannya dibikin viral', 'ga ini dikomenin kalo yang usah jenis jenis mahluk yang sengaja minta kebodohannya dibikin viral', 'ga usah dikomenin yang ini jenis jenis mahluk yang sengaja minta dibikin viral']


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 500.10it/s]


['pancen kpai goblok', 'kpai pancen goblok', 'gantiin pancen goblok', 'kpai pancen ketuane goblok']


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 500.45it/s]


['intinya cuman demi ngeksis cari perhatian udah itu aja ane kalau tiketnya film yang nge hype paling banter foto nonton doang sambil ngetag cewe ane udah itu aja gak sampai ngerekam segala kampungan menurut ane', 'intinya demi ngeksis cari perhatian ane kalau nonton film yang nge hype banter foto doang sambil cewe ane udah itu aja sampai ngerekam segala menurut', 'intinya cuman demi ngeksis cari perhatian udah itu aja ane kalau nonton film yang nge hype paling banter foto tiketnya doang nikmati ngetag cewe ane udah itu aja gak sampai ngerekam segala kampungan menurut ane', 'intinya cuman demi ngeksis cari perhatian udah itu aja ane kalau nonton film yang nge hype paling banter foto tiketnya doang sambil ngetag cewe ane udah itu aja gak fenomenal sampai ngerekam segala kampungan menurut ane']


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 199.31it/s]

['perusahaan dibeli oon koq dipiara ya demi kemenangan jabatan doang yang kaga ngerti mah bener yang mah ini cuman sandiwara sinetron buat bikin pendukungnya akhirnya asset negara kembali negara yang rugi guoblokk bukan macam untung', 'perusahaan rugi dibeli oon koq dipiara ya demi kemenangan berkuasa doang yang kaga ngerti mah ngerasanya bener yang ngerti mah tau ini cuman sandiwara alias sinetron buat bikin pendukungnya terbuai akhirnya asset negara kembali asset negara yang rugi guoblokk menyuruh macam freeport yang untung', 'perusahaan rugi dibeli oon koq dipiara ya demi kemenangan jabatan doang yang kaga ngerti mah pendukungnya ngerasanya bener yang ngerti mah tau ini cuman sandiwara alias sinetron buat bikin pendukungnya terbuai akhirnya asset negara kembali asset negara yang rugi leher guoblokk bukan macam freeport yang untung', 'rugi rugi alias oon koq dipiara ya demi kemenangan jabatan doang yang kaga ngerti mah ngerasanya bener yang ngerti mah tau ini cuman sandiwara dibeli s




In [313]:
# Augment the example sentences label by label, four EDA variants per sentence.
# The per-label frames are collected first and concatenated once, instead of
# growing the result from an empty DataFrame inside the loop.
eg_frames = []
for i in [2,3]:
    sent_ls = data_train_eg.loc[data_train_eg['label']==i, 'Kalimat_prep'].values.tolist()
    new_sent_ls = generate_eda(sent_ls,num_aug=4)
    # len(sent_ls) is the number of rows with label i; unlike
    # value_counts()[i] it is simply 0 when the sample has no such row.
    max_new_sent = len(sent_ls)*4
    eg_frames.append(pd.DataFrame({'Kalimat_prep':new_sent_ls[:max_new_sent],
                                   'label':np.ones(max_new_sent).astype(int)*i}))
data_train_eg_aug = pd.concat(eg_frames)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 709.73it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 333.70it/s]


In [314]:
data_train_eg_aug

Unnamed: 0,Kalimat_prep,label
0,yang gan rupiah kontol,2
1,yang kontol rupiah paripurna,2
2,keenakan yang kontol rupiah gan,2
3,yang kontol rupiah gan,2
4,capek disikat ini orang cuma mau begoin rakyat...,2
5,capek deh ini orang cuma mau begoin excel raky...,2
6,capek deh ini orang cuma mau aja,2
7,capek deh cuma orang ini mau begoin rakyat aja,2
8,kalo mau apa murah tunggu ntar jagoan lu mimpi...,2
9,kalo mau apa murah tunggu ntar jagoan lu mimpi...,2
