In [None]:
!pip install nlpaug

In [None]:
import pandas as pd
import numpy as py
import random
import math
import time
import re

import seaborn as sns 
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.model.word_stats as nmw
import nlpaug.flow as nafc
from nlpaug.util import Action

In [None]:
# Load the training CSV; its 'title' column is the text that the cells below
# measure, tokenize and augment.
df = pd.read_csv('../input/shopee-product-matching/train.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Count plot of title lengths (in characters): one bar per distinct length.
sentence_length = [len(title) for title in df['title']]
# Pass the lengths explicitly as `x=`. Recent seaborn releases accept only
# `data` positionally, so a bare positional list is no longer read as the x
# variable and the per-length bars would be lost.
sns.countplot(x=sentence_length)
plt.xticks([])  # hide the x-axis tick labels
plt.show()

# Distribution plot of the same title lengths.
# NOTE: seaborn has deprecated `distplot` in favour of `histplot`/`displot`;
# it is kept here so the figure looks the same on the seaborn version in use.
sns.distplot(sentence_length)
plt.yticks([])  # hide the y-axis tick labels
plt.show()

In [None]:
def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    """Return all non-overlapping matches of ``token_pattern`` in ``text``.

    With the default pattern a token is a run of two or more word characters
    delimited by word boundaries, so single-character words are dropped.

    Args:
        text: String to split into tokens.
        token_pattern: Regular expression (string or compiled pattern) that
            describes one token.

    Returns:
        List of matched tokens, in order of appearance.
    """
    return re.findall(token_pattern, text)

# Split every title into word tokens with the tokenizer defined above.
train_x_tokens = list(map(_tokenizer, df['title']))

# Fit nlpaug's TF-IDF statistics on the tokenized titles and save them to the
# current directory, the same path TfIdfAug is pointed at below.
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save('.')

print('loading aug...')
# Word-level augmenter driven by the TF-IDF statistics saved above.
aug_TF_IDF = naw.TfIdfAug(tokenizer=_tokenizer, model_path='.')
# Word-level augmenter backed by pre-trained fastText word vectors.
aug_word_vec = naw.WordEmbsAug(
    model_path='../input/fasttext-wikinews/wiki-news-300d-1M.vec',
    model_type='fasttext',
)
# Sentence-level augmenter backed by the XLNet language model.
aug_contextual = nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased')
print('done')

In [None]:
def _first_augmented(augmenter, text):
    """Run ``augmenter.augment(text)`` and return a single string.

    Depending on the installed nlpaug release, ``augment`` returns either a
    plain string or a list of strings. The list form is unwrapped so that the
    'title' column always ends up holding ``str`` values.
    """
    result = augmenter.augment(text)
    if isinstance(result, (list, tuple)):
        # An empty result means nothing was generated: keep the input text.
        return result[0] if result else text
    return result


def textAugmer(train, mode=1):
    """Augment the 'title' column of ``train`` in place.

    Uses the module-level augmenters ``aug_contextual``, ``aug_word_vec`` and
    ``aug_TF_IDF``, which must be loaded before this function is called.

    Args:
        train: DataFrame with a string 'title' column. It is modified in
            place, so pass a copy to keep the original titles.
        mode: Augmentation recipe.
            1 -- titles shorter than 30 characters go through
                 ``aug_contextual``; all other titles through ``aug_word_vec``.
            2 -- titles shorter than 30 characters go through
                 ``aug_contextual`` and then ``aug_TF_IDF``; all other titles
                 through ``aug_TF_IDF`` only.
            Any other value leaves ``train`` untouched.

    Returns:
        None.
    """
    if mode not in (1, 2):
        return

    total = len(train)
    # Address cells by position so the function works for any index (not only
    # 0..n-1) and writes to ``train`` itself. Chained ``train['title'][i] = x``
    # may assign to a temporary and is a no-op under pandas copy-on-write.
    title_col = train.columns.get_loc('title')

    for pos in range(total):
        original = train.iat[pos, title_col]
        is_short = len(original) < 30

        if mode == 1:
            chosen = aug_contextual if is_short else aug_word_vec
            augmented = _first_augmented(chosen, original)
        else:
            augmented = original
            if is_short:
                augmented = _first_augmented(aug_contextual, augmented)
            augmented = _first_augmented(aug_TF_IDF, augmented)

        train.iat[pos, title_col] = augmented

        # Parenthesised on purpose: ``i+1 % 500`` parses as ``i + (1 % 500)``,
        # which is never 0, so the progress report was never printed.
        if (pos + 1) % 500 == 0:
            print('-'*30)
            print(pos + 1, '/', total, 'proccesed')
            print("Original:")
            print(original)
            print("Augmented Text:")
            print(augmented)

# Augment a copy so that `df` keeps the original titles for comparison.
df_text_aug_2 = df.copy()
# mode=2: titles under 30 characters get contextual + TF-IDF augmentation,
# all other titles get TF-IDF augmentation only.
textAugmer(df_text_aug_2, mode=2)

In [None]:
# Original frame, for comparison: augmentation was run on the copy
# `df_text_aug_2`, not on `df`.
df.head()

In [None]:
# First rows of the copy that was passed through textAugmer.
df_text_aug_2.head()

In [None]:
# (rows, columns) of the augmented frame.
df_text_aug_2.shape

In [None]:
# Count plot of augmented title lengths: one bar per distinct length.
sentence_length = [len(title) for title in df_text_aug_2['title']]
# Pass the lengths explicitly as `x=`. Recent seaborn releases accept only
# `data` positionally, so a bare positional list is no longer read as the x
# variable and the per-length bars would be lost.
sns.countplot(x=sentence_length)
plt.xticks([])  # hide the x-axis tick labels
plt.show()

# Distribution plot of the same augmented title lengths.
# NOTE: seaborn has deprecated `distplot` in favour of `histplot`/`displot`;
# it is kept here so the figure looks the same on the seaborn version in use.
sns.distplot(sentence_length)
plt.yticks([])  # hide the y-axis tick labels
plt.show()

In [None]:
# Write the augmented frame to CSV in the working directory, without the
# DataFrame index column.
df_text_aug_2.to_csv('df_text_aug_2.csv', index = False)