In [3]:
import os, re, random
from glob import glob
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from itertools import combinations 
from nltk.corpus import stopwords as stopwords_nltk 
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import re
from keybert import KeyBERT
from sklearn.utils import shuffle

In [4]:
# Input CSVs of the MDSD benchmark, split by file-name pattern into labeled and
# unlabeled sets. glob() returns matches in arbitrary (filesystem-dependent)
# order, so the lists are sorted to make the processing order deterministic.
mdsd_labeled_filepaths = sorted(glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_labeled_*.csv'))
mdsd_unlabeled_filepaths = sorted(glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_unlabeled_*.csv'))
# Directory that receives every generated JSON / TXT artifact of this notebook.
save_dir = '/media/dmlab/My Passport/DATA/cross-domain/data'

* 4,000 unlabeled reviews per domain
* 2,000 labeled reviews per domain

In [12]:
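# NOTE: prepare_data() below reads the module-level objects defined by the following
# commented-out lines (lemmatizer, MULTIPLE_SPACES, removal_list, stopwords, kw_model).
# On a fresh kernel they must be uncommented and executed once before prepare_data() is called.
# lemmatizer = WordNetLemmatizer()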
# MULTIPLE_SPACES = re.compile(' +', re.UNICODE)
# removal_list = "|,‘, ’, ◇, ‘, ”,  ’, ·, \“, ·, △, ➤, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, ?, !,【,】, …, ◆,%"
# stopwords = stopwords_nltk.words('english')
# kw_model = KeyBERT()

def prepare_data(filepaths, sample_num=4000, drop_duplicates=True, process_random_masked=True):
    """Load per-domain review CSVs, extract noun keywords and build masked texts.

    This function relies on names that are NOT defined inside it and must already
    exist at module level: ``removal_list``, ``MULTIPLE_SPACES``, ``lemmatizer``,
    ``stopwords`` and ``kw_model`` (their definitions appear commented out just
    above this function), and on ``tqdm.pandas()`` having been called so that
    ``progress_apply`` is available.

    Args:
        filepaths: Iterable of CSV paths. Every file must have a ``text`` column.
            The domain is taken from the file name: the part of the base name
            before the first underscore.
        sample_num: Number of rows randomly sampled from each file, or ``None``
            to keep every row.
        drop_duplicates: If true, rows with a duplicated ``text`` are removed
            per file (the last occurrence is kept) before sampling.
        process_random_masked: If true, also add a ``random_masked_text`` column
            in which as many randomly chosen words as there are ``[UNK]``
            occurrences in ``masked_text`` are replaced by ``[UNK]``.

    Returns:
        A shuffled ``pandas.DataFrame`` of all files with the previous row index
        kept as an ``index`` column and the added columns ``domain``,
        ``keywords``, ``masked_text`` and, optionally, ``random_masked_text``.

    Raises:
        ValueError: If ``filepaths`` is empty.
    """
    def concat_dataframes(filepaths, sample_num=None):
        """Read every CSV, tag it with its domain and return one shuffled frame."""
        dfs = []
        for filepath in filepaths:
            df = pd.read_csv(filepath)
            domain = os.path.basename(filepath).split('_')[0]
            df['domain'] = domain
            original_len = len(df)
            if drop_duplicates:
                # Remove rows whose text is duplicated (keep the last occurrence).
                df = df.drop_duplicates(['text'], keep='last')
                print('[{}] Droped {} rows having duplicated text'.format(domain, original_len-len(df)))
            if sample_num is not None:
                # Randomly keep only `sample_num` rows of this file.
                df = df.sample(n=sample_num)
            dfs.append(df)
        if not dfs:
            # pd.concat([]) would raise a generic "No objects to concatenate";
            # an empty glob result (e.g. data drive not mounted) deserves a clear message.
            raise ValueError('prepare_data received no input files (filepaths is empty)')
        concat_df = shuffle(pd.concat(dfs))   # Shuffle rows across domains
        # Reset the index; the old per-file index is kept as an `index` column.
        return concat_df.reset_index()

    def get_preprocessed_tokens(text):
        """Return the lemmatized, lower-cased nouns of `text` without stopwords."""
        # Replace every character listed in removal_list with a space.
        text = text.translate(str.maketrans(removal_list, ' '*len(removal_list)))
        text = re.sub(MULTIPLE_SPACES, ' ', text)   # Collapse runs of spaces
        words = word_tokenize(text.lower())   # Lower-case, then tokenize
        nouns = [token for token, tag in pos_tag(words) if tag in ['NN', 'NNS', 'NNP', 'NNPS']]   # Keep nouns only
        nouns = [lemmatizer.lemmatize(token) for token in nouns]   # Lemmatization (e.g., movies -> movie)
        nouns = [token for token in nouns if token not in stopwords]   # Remove stopwords
        nouns = [token for token in nouns if len(token)>1]   # Remove tokens of length 1 or less
        return nouns

    def extract_noun_keywords(text):
        """Return the KeyBERT keywords of `text` that are also preprocessed nouns."""
        candidates = kw_model.extract_keywords(text)
        if not candidates:
            return []
        # Tokenizing / POS-tagging is expensive: do it once per document rather
        # than once per candidate keyword.
        nouns = set(get_preprocessed_tokens(text))
        return [word for (word, score) in candidates if word in nouns]

    def mask_keywords(doc, keywords):
        """Replace every whitespace-separated word containing a keyword by '[UNK]'."""
        lowered_keywords = [keyword.lower() for keyword in keywords]
        words = doc.split()
        for i in range(len(words)):
            lowered_word = words[i].lower()
            # Case-insensitive substring match, so attached punctuation is masked too.
            if any(keyword in lowered_word for keyword in lowered_keywords):
                words[i] = '[UNK]'
        return ' '.join(words)

    def random_masked_text(num_of_unks, doc):
        """Replace `num_of_unks` randomly chosen words of `doc` by '[UNK]'.

        At most as many words as are maskable (not already '[UNK]') are replaced,
        so an empty document or a document that already contains '[UNK]' tokens
        can neither raise nor loop forever.
        """
        words = doc.split()
        maskable = [i for i, word in enumerate(words) if word != '[UNK]']
        for i in random.sample(maskable, min(num_of_unks, len(maskable))):
            words[i] = '[UNK]'
        return ' '.join(words)

    concat_df = concat_dataframes(filepaths, sample_num=sample_num)

    # Keyword extraction using KeyBERT
    concat_df['keywords'] = concat_df['text'].progress_apply(extract_noun_keywords)

    # Keyword masking
    concat_df['masked_text'] = concat_df.progress_apply(lambda x: mask_keywords(x['text'], x['keywords']), axis=1)

    # Random word masking:
    # mask as many randomly chosen words as there are [UNK] tokens in masked_text.
    if process_random_masked:
        concat_df['random_masked_text'] = concat_df.progress_apply(lambda x: \
            random_masked_text(x['masked_text'].count('[UNK]'), x['text']), axis=1)
    return concat_df

# unlabeled_df = prepare_data(mdsd_unlabeled_filepaths)
# filepath = os.path.join(save_dir, 'MDSD_unlabeled.json')
# unlabeled_df.to_json(filepath)
# print('Created {}'.format(filepath))

# Labeled reviews: keep every row (no sampling, no de-duplication) and skip the
# random-masking baseline column.
labeled_options = dict(sample_num=None, drop_duplicates=False, process_random_masked=False)
labeled_df = prepare_data(mdsd_labeled_filepaths, **labeled_options)

# Persist the prepared frame as JSON in save_dir.
filepath = os.path.join(save_dir, 'MDSD_labeled.json')
labeled_df.to_json(filepath)
print('Created {}'.format(filepath))

100%|██████████| 8000/8000 [12:31<00:00, 10.64it/s]
100%|██████████| 8000/8000 [00:00<00:00, 9415.88it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_labeled.json


In [13]:
# Reload the prepared unlabeled reviews from disk. No active code in this notebook
# writes MDSD_unlabeled.json; it is produced by the commented-out
# prepare_data(mdsd_unlabeled_filepaths) lines of the previous cell.
unlabeled_json_path = os.path.join(save_dir, 'MDSD_unlabeled.json')
unlabeled_df = pd.read_json(unlabeled_json_path)
unlabeled_df

Unnamed: 0,index,text,domain,keywords,masked_text,random_masked_text
0,3152,Have had this TV for about a week and am very ...,electronics,"[lcd, glare, tv]",Have had this [UNK] for about a week and am ve...,Have had this TV for about a week and am very ...
1,11330,I have been a big fan of these printers. I own...,electronics,"[labelwriters, labelwriter]",I have been a big fan of these printers. I own...,I have been a big fan of these printers. I own...
2,55284,I was first introduced to the Salsa Crazy seri...,dvd,"[salsa, dance, routine, costume]",I was first introduced to the [UNK] Crazy seri...,I was first introduced to the Salsa Crazy seri...
3,39147,"Right off the bat, i've been a fan since 1979,...",dvd,"[dvd, ct, trick, band]","Right off the bat, i've been a fan since 1979,...","[UNK] [UNK] the bat, i've been a fan [UNK] [UN..."
4,112780,A welcome Return for Michael Myers; one of the...,dvd,"[divimax, myers, dvd]",A welcome Return for Michael [UNK] one of the ...,A welcome Return for Michael Myers; one of the...
...,...,...,...,...,...,...
15995,8293,The Memorex 10OZ 152A AIR DUSTER is a lifesave...,electronics,"[keyboard, duster, dust]",The Memorex 10OZ 152A AIR [UNK] is a lifesaver...,The Memorex 10OZ 152A AIR DUSTER is a lifesave...
15996,15340,This would of been a great sd card had it actu...,electronics,"[mda, card]",This would of been a great sd [UNK] had it act...,This would of been a great sd card had it actu...
15997,90158,This box totally rox my sox! The extra 100 bux...,dvd,"[box, sox, bux, disx, collecter]",This [UNK] totally rox my [UNK] The extra 100 ...,[UNK] box [UNK] rox my sox! [UNK] extra 100 bu...
15998,110836,Who knew that Gordon from Sesame Street was a ...,dvd,"[pimp, gordon, movie, street]",Who knew that [UNK] from Sesame [UNK] was a [U...,Who knew that Gordon [UNK] [UNK] Street was a ...


In [14]:
# Reload the labeled reviews from the JSON file in save_dir; this replaces any
# labeled_df that is still in memory.
labeled_json_path = os.path.join(save_dir, 'MDSD_labeled.json')
labeled_df = pd.read_json(labeled_json_path)
labeled_df

Unnamed: 0,index,text,label,domain,keywords,masked_text
0,494,"I mean that statement in two ways. First, in m...",positive,dvd,"[goodfellas, hd, dvd]","I mean that statement in two ways. First, in m..."
1,455,I bought the cooler for my Dell Inspiron which...,positive,electronics,"[dell, cooler, inspiron, fan, pavilion]",I bought the [UNK] for my [UNK] [UNK] which te...
2,745,I'm so happy with these speakers. For the pric...,positive,electronics,"[bass, sound]",I'm so happy with these speakers. For the pric...
3,740,"I found the film, ""Malice"" to be a very intrig...",positive,dvd,"[murder, malice, killer]","I found the film, [UNK] to be a very intriguin..."
4,259,This pan is a good example of Calphalon qualit...,positive,kitchen,"[pan, calphalon, oven, stove, roast]",This [UNK] is a good example of [UNK] quality ...
...,...,...,...,...,...,...
7995,1128,It takes indeed some real power to get so many...,negative,books,"[book, paradigm, innovation]",It takes indeed some real power to get so many...
7996,268,The Penguin Guide is still the best in the bus...,positive,books,"[penguin, guide, music, comprehensiveness]",The [UNK] [UNK] is still the best in the busin...
7997,783,I got this pan because I've been coveting All-...,positive,kitchen,"[pan, cookware, stove, clad]",I got this [UNK] because I've been coveting [U...
7998,1048,"The scale does not come with a battery, even t...",negative,kitchen,"[battery, scale, cell, load]",The [UNK] does not come with a [UNK] even thou...


* Post-training용 데이터셋
    - 도메인 2개 (파일명에 알파벳 순으로 기재)
    - Post-training options
        1. Raw (baseline)
        2. keyword
        3. random word (baseline): keyword 개수 만큼 랜덤하게 단어를 골라서 [UNK]로 처리
    
> When generating the post-training data, each sentence in the target domain gets duplicated 10 times with different masks ~~and sentences pair~~.

In [85]:
def create_txt_for_post_training(docs, save_filepath, num_of_duplicates=10):
    """Write `docs` to a plain-text post-training file, repeating the corpus.

    The whole sequence of documents is written `num_of_duplicates` times in a
    row. Each document is followed by a blank line, and the file ends with the
    marker ``[EOD]`` (no trailing newline).

    Args:
        docs: Re-iterable collection of documents; each one is rendered with
            ``str.format``.
        save_filepath: Path of the text file to create (overwritten if present).
        num_of_duplicates: How many times the corpus is repeated in the file.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (review texts may contain non-ASCII characters).
    with open(save_filepath, 'w', encoding='utf-8') as output_file:
        for _ in range(num_of_duplicates): # each sentence in the target domain gets duplicated 10 times
            output_file.writelines('{}\n\n'.format(doc) for doc in docs)
        output_file.write('[EOD]')
    print(f'Created {save_filepath}')
    
# Column of unlabeled_df that supplies the documents for each post-training mode.
text_column_by_mode = {
    'raw': 'text',
    'keyword': 'masked_text',
    'random': 'random_masked_text',
}

# One output file per (mode, unordered domain pair); the two domain names are
# sorted so they always appear alphabetically in the file name.
domains = unlabeled_df.domain.unique()
for mode, text_column in text_column_by_mode.items():
    for domain1, domain2 in combinations(domains, 2):
        in_pair = unlabeled_df['domain'].isin([domain1, domain2])
        df = unlabeled_df[in_pair]
        docs = df[text_column].values

        pair_tag = '&'.join(sorted([domain1, domain2]))
        save_filepath = os.path.join(save_dir, 'MDSD_{}_{}_for_post.txt'.format(pair_tag, mode))
        create_txt_for_post_training(docs, save_filepath)

Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&electronics_raw_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics&kitchen_raw_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&electronics_raw_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&kitchen_raw_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&dvd_raw_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&kitchen_raw_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&electronics_keyword_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics&kitchen_keyword_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&electronics_keyword_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&kitchen_keyword_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-doma