In [1]:
import os, re, random
from glob import glob
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from itertools import combinations 
import re, copy
from sklearn.utils import shuffle
import numpy as np
from nltk import sent_tokenize, word_tokenize

In [9]:
# Input files of the MDSD benchmark, split by whether the file name marks them as labeled or unlabeled.
mdsd_labeled_filepaths = glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_labeled_*.csv')
mdsd_unlabeled_filepaths = glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_unlabeled_*.csv')
# One target_labeled_<domain>.csv per "<domainA>&<domainB>" directory of the domain-classifier output.
domain_cls_filepaths = glob('/media/dmlab/My Passport/DATA/cross-domain/domain-cls/*&*/target_labeled_*.csv')
# Every file produced by this notebook is written below this directory.
save_dir = '/media/dmlab/My Passport/DATA/cross-domain/data'
# exist_ok=True creates the directory only when it is missing, without a separate existence check.
os.makedirs(save_dir, exist_ok=True)

* 4,000 unlabeled reviews per domain
* 2,000 labeled reviews per domain

In [3]:
def concat_dataframes(filepaths, drop_duplicates=True, sample_num=4000, random_state=None):
    """Load per-domain CSV files, tag rows with their domain, and merge them into one shuffled frame.

    The domain of a file is the part of its base name before the first underscore.

    Args:
        filepaths: Iterable of CSV paths. Each file needs a ``text`` column when
            ``drop_duplicates`` is true.
        drop_duplicates: If true, rows with a repeated ``text`` are dropped
            (the last occurrence is kept) and the number removed is printed.
        sample_num: Number of rows drawn at random from each file, or ``None`` to
            keep every row. A file with fewer rows than ``sample_num``
            contributes all of its rows instead of raising.
        random_state: Optional seed forwarded to every pandas ``sample`` call
            (per-file sampling and the final shuffle). Pass an int to make the
            result reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns:
        A DataFrame holding the rows of all files in random order, with a
        ``domain`` column added, a fresh 0..n-1 index, and the previous row
        labels preserved in an ``index`` column.

    Raises:
        ValueError: If ``filepaths`` yields no files.
    """
    dfs = []
    for filepath in filepaths:
        df = pd.read_csv(filepath)
        domain = os.path.basename(filepath).split('_')[0]
        df['domain'] = domain
        original_len = len(df)
        if drop_duplicates:
            # Remove rows whose text is duplicated, keeping the last occurrence.
            df = df.drop_duplicates(['text'], keep='last')
            print('[{}] Droped {} rows having duplicated text'.format(domain, original_len-len(df)))
        if sample_num is not None:
            if len(df) < sample_num:
                # Sampling without replacement cannot return more rows than exist.
                print('[{}] Only {} rows available, fewer than the requested {}'.format(domain, len(df), sample_num))
            # Randomly keep at most sample_num rows of this domain.
            df = df.sample(n=min(sample_num, len(df)), random_state=random_state)
        dfs.append(df)
    if not dfs:
        raise ValueError('No input files were given; check the file path pattern')
    concat_df = pd.concat(dfs)
    # Shuffle all rows so the domains are interleaved.
    concat_df = concat_df.sample(frac=1, random_state=random_state)
    # Renumber rows; the old labels move into an 'index' column.
    concat_df = concat_df.reset_index()
    return concat_df

# Unlabeled pool: function defaults apply (duplicate texts dropped, 4000 rows sampled per domain).
unlabeled_df = concat_dataframes(filepaths=mdsd_unlabeled_filepaths)
filepath = os.path.join(save_dir, 'MDSD_unlabeled.json')
unlabeled_df.to_json(path_or_buf=filepath)
print('Created {}'.format(filepath))

# Labeled pool: every row of every file is kept (no de-duplication, no sampling).
labeled_df = concat_dataframes(
    filepaths=mdsd_labeled_filepaths,
    drop_duplicates=False,
    sample_num=None,
)
filepath = os.path.join(save_dir, 'MDSD_labeled.json')
labeled_df.to_json(path_or_buf=filepath)
print('Created {}'.format(filepath))

[books] Droped 477734 rows having duplicated text
[dvd] Droped 39540 rows having duplicated text
[electronics] Droped 2213 rows having duplicated text
[kitchen] Droped 1218 rows having duplicated text
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_unlabeled.json
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_labeled.json


In [4]:
# Read the labeled split back from its JSON file in save_dir and display it.
labeled_json_path = os.path.join(save_dir, 'MDSD_labeled.json')
labeled_df = pd.read_json(labeled_json_path)
labeled_df

Unnamed: 0,index,text,label,domain
0,574,This Bluetooth remote device works as advertiz...,positive,electronics
1,1822,I almost threw this coffee maker across my liv...,negative,kitchen
2,1669,This edition of the film sucks and sucks again...,negative,dvd
3,1147,The flight into New York had been a long one; ...,negative,dvd
4,1074,I can't believe I spent good money on this vac...,negative,kitchen
...,...,...,...,...
7995,107,This is an awesome tablet! It has great pressu...,positive,electronics
7996,1649,"I bought this cable to connect 50"" Plasma TV t...",negative,electronics
7997,1400,My timer didn't work either. It arrived broken...,negative,kitchen
7998,912,Everything works great. The only marginal issu...,positive,electronics


In [5]:
# Read the unlabeled split back from its JSON file in save_dir and display it.
unlabeled_json_path = os.path.join(save_dir, 'MDSD_unlabeled.json')
unlabeled_df = pd.read_json(unlabeled_json_path)
unlabeled_df

Unnamed: 0,index,text,domain
0,97230,"After reading the review below, I have got to ...",dvd
1,8352,After step 8 of adding your songs to the Virgi...,electronics
2,16895,I transfered video directly to the DVD with 15...,electronics
3,638090,Noble physics laureate Abdus Salam calls this ...,books
4,5980,I have purchased several wine openers over the...,kitchen
...,...,...,...
15995,11102,Firstly this is my first purchase through Amaz...,electronics
15996,386447,The last time I bought this was the Second Edi...,books
15997,91833,I bought this for my Best Friend who is a Full...,dvd
15998,8122,I really wanted to love this little player and...,electronics


# Post-training용 데이터셋

In [6]:
def create_txt_for_post_training(docs, save_filepath, num_of_duplicates=10):
    """Write documents to a plain-text corpus file, repeating the whole collection.

    Every document is written followed by a blank line. The full pass over
    ``docs`` is repeated ``num_of_duplicates`` times, and the file is closed
    with a literal ``[EOD]`` marker (no trailing newline).

    Args:
        docs: Iterable of documents; each one is rendered with ``str.format``.
            One-shot iterables such as generators are supported.
        save_filepath: Destination path; an existing file is overwritten.
        num_of_duplicates: How many times the whole collection is written.
    """
    # Materialise once so that a generator is not exhausted after the first pass.
    docs = list(docs)
    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(save_filepath, 'w', encoding='utf-8') as output_file:
        for _ in range(num_of_duplicates):
            for doc in docs:
                output_file.write('{}\n\n'.format(doc))
        output_file.write('[EOD]')
    print(f'Created {save_filepath}')

#### Baseline: Source+Target MLM
* 도메인 2개 (파일명에 알파벳 순으로 기재)

In [7]:
mode = 'ST'

# One corpus per unordered pair of domains; the file name lists the pair alphabetically.
domains = unlabeled_df['domain'].unique()
for domain1, domain2 in combinations(domains, 2):
    in_pair = unlabeled_df['domain'].isin([domain1, domain2])
    docs = unlabeled_df.loc[in_pair, 'text'].values

    pair_tag = '&'.join(sorted([domain1, domain2]))
    save_filepath = os.path.join(save_dir, 'MDSD_{}_{}_for_post.txt'.format(pair_tag, mode))
    create_txt_for_post_training(docs, save_filepath)

Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&electronics_ST_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&dvd_ST_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&kitchen_ST_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&electronics_ST_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics&kitchen_ST_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&kitchen_ST_for_post.txt


#### Baseline: Target MLM
* 도메인 1개

In [8]:
mode = 'T'

# One corpus per single domain, built from that domain's unlabeled texts only.
domains = unlabeled_df['domain'].unique()
for domain1 in domains:
    in_domain = unlabeled_df['domain'].isin([domain1])
    docs = unlabeled_df.loc[in_domain, 'text'].values

    corpus_name = 'MDSD_{}_{}_for_post.txt'.format(domain1, mode)
    save_filepath = os.path.join(save_dir, corpus_name)
    create_txt_for_post_training(docs, save_filepath)

Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd_T_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics_T_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books_T_for_post.txt
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_kitchen_T_for_post.txt


#### Proposed: Similar Source + Target MLM
* 타겟 labeled와 유사한 소스 unlabeled 텍스트를 수집 
* 소스와 타겟 구분

In [26]:
mode = 'SimST'

for filepath in domain_cls_filepaths:
    # File name is target_labeled_<target>.csv: the target domain is the last "_" field of the stem.
    target_domain = os.path.splitext(os.path.basename(filepath))[0].split('_')[-1]
    # The parent directory is named "<domainA>&<domainB>"; the source is the part that is not the target.
    # Splitting on '&' avoids corrupting the source name when the target name occurs inside it.
    pair_dirname = os.path.basename(os.path.dirname(filepath))
    source_domain = ''.join(name for name in pair_dirname.split('&') if name != target_domain)
    df = pd.read_csv(filepath)
    # Drop missing values so that no literal "nan" document is written to the corpus.
    # NOTE(review): only the 'most-similar_text' column is written here; confirm whether
    # target-domain texts are meant to be added elsewhere for this mode.
    docs = df['most-similar_text'].dropna().unique()
    print('Source={}, Target={}, Number of similar texts={}'.format(source_domain, target_domain, len(docs)))

    save_filepath = os.path.join(save_dir, 'MDSD_source={}-target={}_{}_for_post.txt'.format(source_domain, target_domain, mode))
    create_txt_for_post_training(docs, save_filepath)

Source=dvd, Target=books, Number of similar texts=884
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_source=dvd-target=books_SimST_for_post.txt
Source=books, Target=dvd, Number of similar texts=816
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_source=books-target=dvd_SimST_for_post.txt
Source=electronics, Target=books, Number of similar texts=744
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_source=electronics-target=books_SimST_for_post.txt
Source=books, Target=electronics, Number of similar texts=684
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_source=books-target=electronics_SimST_for_post.txt
Source=kitchen, Target=books, Number of similar texts=742
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_source=kitchen-target=books_SimST_for_post.txt
Source=books, Target=kitchen, Number of similar texts=654
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_source=books-target=kitchen_SimST_for_post.txt
Sour