In [43]:
import os, re, random
from glob import glob
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from itertools import combinations 
import re, copy
from sklearn.utils import shuffle
import numpy as np
from nltk import sent_tokenize, word_tokenize

In [4]:
# Per-domain MDSD CSV exports. glob() returns matches in filesystem order, so the
# lists are sorted to keep the processing order stable from run to run.
mdsd_labeled_filepaths = sorted(glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_labeled_*.csv'))
mdsd_unlabeled_filepaths = sorted(glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_unlabeled_*.csv'))

# Directory that receives every generated dataset file.
save_dir = '/media/dmlab/My Passport/DATA/cross-domain/data'
# exist_ok=True replaces the separate exists() check, which could race with another
# process creating the directory between the check and the makedirs() call.
os.makedirs(save_dir, exist_ok=True)

* 4,000 unlabeled reviews per domain
* 2,000 labeled reviews per domain

In [17]:
def concat_dataframes(filepaths, drop_duplicates=True, sample_num=4000, random_state=None):
    """Load per-domain CSV files and merge them into one shuffled DataFrame.

    Each file is read with ``pd.read_csv`` and tagged with a ``domain`` column
    taken from the part of its basename that precedes the first underscore.

    Args:
        filepaths: Iterable of CSV paths. Each CSV must contain a ``text`` column
            when ``drop_duplicates`` is enabled.
        drop_duplicates: If True, rows with a repeated ``text`` value are removed
            (the last occurrence is kept) and the number of removed rows is printed.
        sample_num: Number of rows randomly drawn from each file, or None to keep
            every row.
        random_state: None (use NumPy's global random state), an int seed, or a
            ``np.random.RandomState``. A single generator is shared by the
            per-file sampling and the final shuffle so a seed makes the whole
            result reproducible.

    Returns:
        The concatenated, shuffled DataFrame. The pre-shuffle row labels are kept
        in an ``index`` column and the frame gets a fresh 0..n-1 index.

    Raises:
        ValueError: If a file has fewer rows than ``sample_num`` (after optional
            de-duplication), or if ``filepaths`` is empty (raised by ``pd.concat``).
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    dfs = []
    for filepath in filepaths:
        df = pd.read_csv(filepath)
        domain = os.path.basename(filepath).split('_')[0]
        df['domain'] = domain
        original_len = len(df)
        if drop_duplicates:
            # Remove rows whose text is duplicated (non-mutating form).
            df = df.drop_duplicates(['text'], keep='last')
            print('[{}] Droped {} rows having duplicated text'.format(domain, original_len-len(df)))
        if sample_num is not None:
            if sample_num > len(df):
                # Same exception type pandas would raise, but naming the domain.
                raise ValueError(
                    '[{}] cannot sample {} rows from only {} available rows'.format(
                        domain, sample_num, len(df)))
            # Randomly keep only `sample_num` rows of this domain.
            df = df.sample(n=sample_num, random_state=rng)
        dfs.append(df)

    concat_df = pd.concat(dfs)
    # Shuffle all rows; frac=1 returns every row in random order.
    concat_df = concat_df.sample(frac=1, random_state=rng)
    # Fresh positional index; the old labels move into an 'index' column.
    concat_df = concat_df.reset_index()
    return concat_df

# Unlabeled pool: built with concat_dataframes' defaults, then saved as JSON.
unlabeled_df = concat_dataframes(mdsd_unlabeled_filepaths)
filepath = os.path.join(save_dir, 'MDSD_unlabeled.json')
unlabeled_df.to_json(filepath)
print(f'Created {filepath}')

# Labeled pool: every row is kept (no de-duplication, no sub-sampling).
labeled_df = concat_dataframes(
    mdsd_labeled_filepaths,
    drop_duplicates=False,
    sample_num=None,
)
filepath = os.path.join(save_dir, 'MDSD_labeled.json')
labeled_df.to_json(filepath)
print(f'Created {filepath}')

[books] Droped 477734 rows having duplicated text
[dvd] Droped 39540 rows having duplicated text
[electronics] Droped 2213 rows having duplicated text
[kitchen] Droped 1218 rows having duplicated text
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_unlabeled.json
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_labeled.json


In [6]:
# Reload the labeled reviews from the JSON file saved under save_dir and display them.
labeled_df = pd.read_json(
    os.path.join(save_dir, 'MDSD_labeled.json')
)
labeled_df

Unnamed: 0,index,text,label,domain
0,366,Elaine Pagels is a wonderful writer. Her expla...,positive,books
1,1024,I baked six loaves in this machine. I have bee...,negative,kitchen
2,548,I BOUGHT THIS LIKE A IPOD BUT ITS BETTER I HAV...,positive,electronics
3,675,"These are called ""Fruit Bowl"" but they are the...",positive,kitchen
4,1098,Corkscrew has a nice heavy duty feel and does ...,negative,kitchen
...,...,...,...,...
7995,156,"This soap dish is beautiful, practical and stu...",positive,kitchen
7996,462,WE ARE HALFWAY THROUGH THE FRIENDS SERIES...5 ...,positive,dvd
7997,314,I was a little hesitant to spend over $10 for ...,positive,kitchen
7998,635,"Excellent item, very powerful and stylish. Wou...",positive,kitchen


In [5]:
# Reload the unlabeled reviews from the JSON file saved under save_dir and display them.
unlabeled_df = pd.read_json(
    os.path.join(save_dir, 'MDSD_unlabeled.json')
)
unlabeled_df

Unnamed: 0,index,text,domain
0,12429,the ease and timelyness of the product was ver...,electronics
1,813196,I'm a man about to turn 50 with 4 yr. olds twi...,books
2,527488,There is tons to do in Washington DC and despi...,books
3,701823,Good grounding book in strategy,books
4,14177,super excellent cd player with remote control/...,electronics
...,...,...,...
15995,597945,I work in the healthcare field and have seen q...,books
15996,9786,I have a miniature poodle puppy who eats every...,kitchen
15997,606221,I wasnt a big fan of this book. it didnt keep ...,books
15998,30590,This movie has one the best performances by Ja...,dvd


# Post-training용 데이터셋
#### for MLM
* 도메인 2개 (파일명에 알파벳 순으로 기재)

In [9]:
def mask_keywords(doc, keywords):
    """Replace every token of `doc` that contains a keyword with '[UNK]'.

    `doc` is split on whitespace; a token is masked when any keyword occurs in it
    as a case-insensitive substring. Tokens are re-joined with single spaces, so
    the original whitespace is normalized.

    Args:
        doc: Text to mask.
        keywords: Iterable of keyword strings (a one-shot iterator is fine).
            Empty keywords are ignored, because an empty string is a substring
            of every token and would mask the whole document.

    Returns:
        The masked text as a single space-joined string.
    """
    # Materialize and lowercase once: this keeps a generator from being exhausted
    # after the first token and avoids re-lowercasing inside the loop.
    lowered_keywords = [keyword.lower() for keyword in keywords if keyword]

    words = doc.split()
    for i, word in enumerate(words):
        lowered_word = word.lower()
        if any(keyword in lowered_word for keyword in lowered_keywords):
            words[i] = '[UNK]'
    return ' '.join(words)

def create_txt_for_post_training(docs, save_filepath, num_of_duplicates=10):
    """Write `docs` to a text file, repeating the whole collection several times.

    Every document is written followed by a blank line. The full pass over `docs`
    is repeated `num_of_duplicates` times and the file ends with a single '[EOD]'
    marker.

    Args:
        docs: Re-iterable collection of documents (it is traversed once per
            repetition); each item is formatted with ``str.format``.
        save_filepath: Destination path; an existing file is overwritten.
        num_of_duplicates: How many times the whole collection is written.
    """
    # Explicit UTF-8 so non-ASCII review text does not depend on the platform's
    # default encoding.
    with open(save_filepath, 'w', encoding='utf-8') as output_file:
        for _ in range(num_of_duplicates):
            for doc in docs:
                output_file.write('{}\n\n'.format(doc))
        output_file.write('[EOD]')
    print(f'Created {save_filepath}')
    
mode = 'MLM'

# One post-training corpus per unordered pair of domains; the two domain names
# appear in the file name in alphabetical order, joined by '&'.
domains = unlabeled_df.domain.unique()
for domain1, domain2 in combinations(domains, 2):
    moniker = '&'.join(sorted([domain1, domain2]))
    in_pair = unlabeled_df['domain'].isin([domain1, domain2])
    df = unlabeled_df[in_pair].copy()
    docs = df['text'].values

    save_filepath = os.path.join(save_dir, f'MDSD_{moniker}_{mode}_for_post.txt')
    create_txt_for_post_training(docs, save_filepath)

Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&electronics_MLM_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&electronics_MLM_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics&kitchen_MLM_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&dvd_MLM_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&kitchen_MLM_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&kitchen_MLM_for_post.txt


#### for DDT (Domain Distinguish Task)
* 소스 도메인, 타겟 도메인 구분되어야 함
    - TargetDomain: 1,000 pairs 
    - MixDomain: 1,000 pairs

> 50% of time sentence A and sentence B are all randomly sampled from target domain reviews, we label it TargetDomain. And 50% of time sentence A and sentence B come from target domain and another domain, whose label is MixDomain. We do not fix the collocation, in another word, we only ensure that the two sentences come from different domains but the order is random. 
```
Input = [CLS] The mouse is smooth and great [SEP] The screen is plain [SEP]
Label = TargetDomain
Input = [CLS] This book is boring [SEP] The system of the laptop is stable [SEP]
Label = MixDomain
```

In [68]:
def create_sentence_pair_df(label, df1, df2, num_pairs):
    """Build `num_pairs` labelled sentence pairs for the domain-distinguish task.

    For each pair, one sentence is drawn from a random document of `df1` and one
    from a random document of `df2`; the two are placed in random order inside a
    '[CLS] ... [SEP] ... [SEP]' string.

    Args:
        label: Label stored with every generated pair.
        df1: DataFrame with a 'text' column; source of one sentence per pair.
        df2: DataFrame with a 'text' column; source of the other sentence.
        num_pairs: Number of pairs to generate.

    Returns:
        DataFrame with the columns 'label' and 'sentence_pair'.

    Raises:
        ValueError: If a frame has no document containing a sentence of more
            than 10 tokens (this includes an empty frame).
    """
    def pick_one_sentence(df):
        texts = df['text']
        # Visit the documents in random order without replacement. The first
        # eligible document is still a uniform draw among eligible documents,
        # but the search ends after one pass instead of looping forever when
        # nothing qualifies.
        for row_pos in np.random.permutation(len(texts)):
            doc = texts.iloc[row_pos]
            if not isinstance(doc, str):
                # Missing / non-text entries cannot be tokenized.
                continue

            # Only sentences with 11 or more tokens are candidates.
            sentences = [sent for sent in sent_tokenize(doc) if len(word_tokenize(sent)) > 10]
            if sentences:
                return sentences[np.random.randint(0, len(sentences))]

        raise ValueError('No document contains a sentence with more than 10 tokens')

    records = []
    for _ in tqdm(range(num_pairs)):
        sent1 = pick_one_sentence(df1)
        sent2 = pick_one_sentence(df2)
        # Randomize which of the two sentences comes first.
        records.append((label, '[CLS] {} [SEP] {} [SEP]'.format(*shuffle([sent1, sent2]))))

    return pd.DataFrame(records, columns=['label', 'sentence_pair'])

mode = 'DDT'

# One file per ordered (source, target) pair of distinct domains.
domains = unlabeled_df.domain.unique()
for source_domain in domains:
    for target_domain in domains:
        if target_domain == source_domain:
            continue

        source_df = unlabeled_df.loc[unlabeled_df['domain'] == source_domain]
        target_df = unlabeled_df.loc[unlabeled_df['domain'] == target_domain]

        # 1000 cross-domain pairs, then 1000 pairs drawn from the target domain only.
        # Note that target_df is rebound from the review frame to the pair frame.
        mix_df = create_sentence_pair_df('MixDomain', source_df, target_df, num_pairs=1000)
        target_df = create_sentence_pair_df('TargetDomain', target_df, target_df, num_pairs=1000)

        ddt_df = pd.concat([mix_df, target_df])
        ddt_df = shuffle(ddt_df).reset_index()

        save_filepath = os.path.join(
            save_dir,
            f'MDSD_{source_domain}->{target_domain}_{mode}_for_post.json',
        )
        ddt_df.to_json(save_filepath)
        print(f'Created {save_filepath}')

100%|██████████| 1000/1000 [00:02<00:00, 346.04it/s]
100%|██████████| 1000/1000 [00:03<00:00, 312.94it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics->books_DDT_for_post.json


100%|██████████| 1000/1000 [00:03<00:00, 320.47it/s]
100%|██████████| 1000/1000 [00:03<00:00, 265.48it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics->dvd_DDT_for_post.json


100%|██████████| 1000/1000 [00:02<00:00, 393.84it/s]
100%|██████████| 1000/1000 [00:02<00:00, 413.45it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics->kitchen_DDT_for_post.json


100%|██████████| 1000/1000 [00:02<00:00, 352.45it/s]
100%|██████████| 1000/1000 [00:02<00:00, 391.09it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books->electronics_DDT_for_post.json


100%|██████████| 1000/1000 [00:03<00:00, 292.70it/s]
100%|██████████| 1000/1000 [00:03<00:00, 273.26it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books->dvd_DDT_for_post.json


100%|██████████| 1000/1000 [00:02<00:00, 345.56it/s]
100%|██████████| 1000/1000 [00:02<00:00, 431.20it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books->kitchen_DDT_for_post.json


100%|██████████| 1000/1000 [00:03<00:00, 309.61it/s]
100%|██████████| 1000/1000 [00:02<00:00, 381.93it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd->electronics_DDT_for_post.json


100%|██████████| 1000/1000 [00:03<00:00, 289.65it/s]
100%|██████████| 1000/1000 [00:03<00:00, 315.15it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd->books_DDT_for_post.json


100%|██████████| 1000/1000 [00:03<00:00, 326.88it/s]
100%|██████████| 1000/1000 [00:02<00:00, 421.85it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd->kitchen_DDT_for_post.json


100%|██████████| 1000/1000 [00:02<00:00, 392.70it/s]
100%|██████████| 1000/1000 [00:02<00:00, 378.39it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_kitchen->electronics_DDT_for_post.json


100%|██████████| 1000/1000 [00:02<00:00, 361.68it/s]
100%|██████████| 1000/1000 [00:03<00:00, 307.11it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_kitchen->books_DDT_for_post.json


100%|██████████| 1000/1000 [00:03<00:00, 327.33it/s]
100%|██████████| 1000/1000 [00:03<00:00, 267.62it/s]

Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_kitchen->dvd_DDT_for_post.json



