In [1]:
!pip install requests nlpaug

Collecting nlpaug
  Downloading nlpaug-1.0.1-py3-none-any.whl (376 kB)
[K     |████████████████████████████████| 376 kB 402 kB/s 
Installing collected packages: nlpaug
Successfully installed nlpaug-1.0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import numpy as np
# Load the labelled utterances; later cells read this frame through `df`.
df = pd.read_csv('../input/bmw-intent-training/bmw_training_set.csv')

# Utterances per intent, computed once and reused for both masks below.
intent_counts = df['Intent'].value_counts()

# Name the boolean masks 'Intent' explicitly. pandas >= 2.0 names the result of
# value_counts() 'count', so relying on the inherited name makes the
# a['Intent'] / b['Intent'] lookups below raise KeyError.
a = (intent_counts < 35).rename('Intent').to_frame()   # classes with fewer than 35 utterances
b = (intent_counts >= 35).rename('Intent').to_frame()  # classes with at least 35 utterances
a = a[a['Intent']]
b = b[b['Intent']]

# Intent labels that need augmentation.
cat2inc = a.index.values
print(len(cat2inc))


139


In [3]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
import random



In [4]:
import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw
import re

def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

# Corpus: every utterance in the training frame
train_x = df['Utterance'].values

# Split each utterance into tokens with the regex tokenizer
train_x_tokens = list(map(_tokenizer, train_x))

# Fit the TF-IDF statistics on the tokenized corpus and save them under '.'
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save('.')

# Build a TF-IDF augmenter from the model saved above
aug = naw.TfIdfAug(tokenizer=_tokenizer, model_path='.')

In [5]:
def _as_text_list(result):
    """Normalise an nlpaug ``augment`` return value to a list of non-empty strings."""
    if result is None:
        return []
    if isinstance(result, str):
        # Some nlpaug releases return a bare string when a single result is produced.
        result = [result]
    return [text for text in result if text]


def augment_data(cat, max_samples=5, n_variants=3):
    """Create extra utterances for one under-represented intent class.

    Up to ``max_samples`` utterances of class ``cat`` are drawn at random
    (without replacement) from the module-level ``df``. Each drawn utterance
    is passed through seven augmenters, in this order: WordNet synonyms,
    XLNet sentence continuation, BERT insert, BERT substitute, TF-IDF insert,
    RoBERTa substitute and GPT-2 sentence continuation.

    Args:
        cat: Value of the ``Intent`` column identifying the class to augment.
        max_samples: Maximum number of original utterances to augment.
        n_variants: Number of variants requested from each augmenter. The
            TF-IDF augmenter is always asked for a single variant.

    Returns:
        A DataFrame with columns ``Utterance`` and ``Intent``: all original
        utterances of the class first, followed by the generated ones. Every
        row carries ``cat`` as its intent.

    Notes:
        The two sentence-level augmenters (XLNet, GPT-2) are best effort: a
        failure while building or running them emits a warning and is
        skipped. Failures of the other augmenters propagate to the caller.
    """
    import warnings  # local import: only needed for the best-effort warnings

    originals = df.loc[df['Intent'] == cat, 'Utterance'].values
    sampled = np.random.choice(originals, min(len(originals), max_samples), replace=False)
    generated = []

    if len(sampled) > 0:
        def optional_augmenter(factory, label):
            # Build a best-effort augmenter; None means "skip this step".
            try:
                return factory()
            except Exception as exc:
                warnings.warn("Skipping %s augmenter: %r" % (label, exc))
                return None

        # The augmenters do not depend on the utterance, so they are built once
        # per call instead of once per sampled utterance.
        # Each entry: (label, augmenter, variants requested, best effort?)
        pipeline = [
            # Original:  The quick brown fox jumps over the lazy dog .
            # Augmented: The speedy brown fox jumps complete the lazy dog .
            ('wordnet synonym', naw.SynonymAug(aug_src='wordnet'), n_variants, False),
            # Original:  The quick brown fox jumps over the lazy dog .
            # Augmented: The quick brown fox jumps over the lazy dog . A terrible ,
            #            messy split second presents itself to the heart - ...
            ('xlnet sentence',
             optional_augmenter(
                 lambda: nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased'),
                 'xlnet sentence'),
             n_variants, True),
            # Original:  The quick brown fox jumps over the lazy dog
            # Augmented: little quick brown fox jumps over the lazy dog
            ('bert insert',
             naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert"),
             n_variants, False),
            # Original:  The quick brown fox jumps over the lazy dog
            # Augmented: even the quick brown fox usually jumps over the lazy dog
            ('bert substitute',
             naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute"),
             n_variants, False),
            # Original:  The quick brown fox jumps over the lazy dog
            # Augmented: The quick brown fox Baked over the polygraphy dog
            # NOTE(review): the TF-IDF model in this notebook was trained with the
            # custom `_tokenizer`, but no tokenizer is passed here - confirm intended.
            ('tfidf insert', naw.TfIdfAug(model_path='.', action="insert"), 1, False),
            # Original:  The quick brown fox jumps over the lazy dog .
            # Augmented: The quick brown fox jumps Into the bull dog .
            ('roberta substitute',
             naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute"),
             n_variants, False),
            # Original:  The quick brown fox jumps over the lazy dog .
            # Augmented: The quick brown fox jumps over the lazy dog . They start shooting wildly.
            ('gpt2 sentence',
             optional_augmenter(
                 lambda: nas.ContextualWordEmbsForSentenceAug(model_path='gpt2'),
                 'gpt2 sentence'),
             n_variants, True),
        ]

        for text in sampled:
            for label, augmenter, n, best_effort in pipeline:
                if augmenter is None:
                    continue
                try:
                    result = augmenter.augment(text, n=n)
                except Exception as exc:
                    if not best_effort:
                        raise
                    warnings.warn("%s augmenter failed on %r: %r" % (label, text, exc))
                    continue
                # Keep whatever came back: an augmenter may return fewer than
                # `n` variants, or a bare string, so fixed indexing is unsafe.
                generated.extend(_as_text_list(result))

    aug_df = pd.DataFrame()
    aug_df['Utterance'] = pd.Series(list(originals) + generated)
    aug_df['Intent'] = cat
    return aug_df

In [6]:
# Classes that already have a good number of samples: they are not augmented,
# only merged into the output unchanged.
inconv_data = df[df['Intent']=='inconvenience']
bmw_book_data = df[df['Intent']=='bmw_book']
bmw_feat_data = df[df['Intent']=='bmwfeature']
bmw_spec_data = df[df['Intent']=='bmw_specification']
bmw_nextcarlaunch = df[df['Intent']=='nextcarlaunch']

# Driver: augment every under-represented class. The frames are collected in a
# list and concatenated once; calling pd.concat inside the loop re-copies the
# whole accumulated frame on every iteration. Row order is unchanged: first
# augmented class, the five untouched classes, then the remaining augmented classes.
frames = [
    augment_data(cat2inc[0]),
    inconv_data,
    bmw_book_data,
    bmw_feat_data,
    bmw_spec_data,
    bmw_nextcarlaunch,
]
frames.extend(augment_data(cat) for cat in cat2inc[1:])
aug = pd.concat(frames)
del frames

# Number of distinct intents in the merged data set
print(len(set(aug['Intent'].values)))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…


144


In [7]:
# Write the merged (original + augmented) data set to disk without the index column
aug.to_csv(path_or_buf='data_file.csv', index=False)