In [1]:
import os, re, random
from glob import glob
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from itertools import combinations 
from nltk.corpus import stopwords as stopwords_nltk 
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import re
from sklearn.utils import shuffle

In [5]:
# Root of the data tree. Defaults to the original external-drive location; set the
# MDSD_DATA_ROOT environment variable to run the notebook on another machine.
data_root = os.environ.get('MDSD_DATA_ROOT', '/media/dmlab/My Passport/DATA')
mdsd_dir = os.path.join(data_root, 'BenchmarkDataset', 'MDSD')

# glob() yields paths in arbitrary, filesystem-dependent order; sorting keeps the
# per-domain processing order (and therefore the logs) stable between runs.
mdsd_labeled_filepaths = sorted(glob(os.path.join(mdsd_dir, '*_labeled_*.csv')))
mdsd_unlabeled_filepaths = sorted(glob(os.path.join(mdsd_dir, '*_unlabeled_*.csv')))

# Every artefact produced by this notebook is written below save_dir.
save_dir = os.path.join(data_root, 'cross-domain', 'data')
os.makedirs(save_dir, exist_ok=True)   # no separate existence check needed

* 4,000 unlabeled reviews per domain
* 2,000 labeled reviews per domain

In [17]:
# Module-level text-preprocessing resources.
# NOTE(review): none of these four names is referenced in the visible part of the
# notebook; they are kept because other cells may rely on them.
lemmatizer = WordNetLemmatizer()
# Matches runs of one or more ASCII spaces.
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)
# Comma-separated list of symbols. The backslash before “ is now written as an
# explicit "\\": the former "\“" was an invalid escape sequence (a SyntaxWarning on
# recent Python versions) that evaluated to the same two characters.
removal_list = "|,‘, ’, ◇, ‘, ”,  ’, ·, \\“, ·, △, ➤, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, ?, !,【,】, …, ◆,%"
# English stop-word list from NLTK.
stopwords = stopwords_nltk.words('english')

def concat_dataframes(filepaths, drop_duplicates=True, sample_num=4000, random_state=None):
    """
    Read several per-domain CSV files and merge them into one shuffled DataFrame.

    The domain of each file is the part of its base name before the first '_'
    and is stored in a new 'domain' column.

    filepaths: iterable of CSV paths; each file must contain a 'text' column
               when drop_duplicates is True.
    drop_duplicates: drop rows whose 'text' is duplicated (the last occurrence is kept).
    sample_num: number of rows to draw at random from each file; None keeps every row.
                If a file has fewer rows than sample_num, all of its rows are kept
                (previously this raised a ValueError inside DataFrame.sample).
    random_state: optional seed forwarded to the per-file sampling and the final
                  shuffle so the result can be reproduced; None keeps it random.

    Returns the concatenated, shuffled frame. The pre-shuffle row labels are kept
    in an 'index' column, exactly as before.
    Raises ValueError when `filepaths` is empty.
    """
    filepaths = list(filepaths)
    if not filepaths:
        # pd.concat([]) fails with an opaque message; the usual cause is a glob
        # pattern that matched nothing.
        raise ValueError('No input files to concatenate; check the data directory / glob pattern')

    dfs = []
    for filepath in filepaths:
        df = pd.read_csv(filepath)
        domain = os.path.basename(filepath).split('_')[0]
        df['domain'] = domain
        original_len = len(df)
        if drop_duplicates:
            # Remove rows with duplicated text
            df = df.drop_duplicates(['text'], keep='last')
            print('[{}] Droped {} rows having duplicated text'.format(domain, original_len-len(df)))
        if sample_num is not None:
            if len(df) < sample_num:
                print('[{}] Only {} rows available; keeping all of them instead of sampling {}'.format(
                    domain, len(df), sample_num))
            else:
                # Randomly keep only sample_num rows
                df = df.sample(n=sample_num, random_state=random_state)
        dfs.append(df)

    concat_df = pd.concat(dfs)
    # Shuffle all rows (sample(frac=1) is a full random permutation)
    concat_df = concat_df.sample(frac=1, random_state=random_state)
    # Reset index; the old row labels become the 'index' column
    concat_df = concat_df.reset_index()
    return concat_df

def _save_as_json(frame, filename):
    """Write `frame` to `save_dir/filename` as JSON, report the path and return it."""
    target = os.path.join(save_dir, filename)
    frame.to_json(target)
    print('Created {}'.format(target))
    return target

# Unlabeled pool: de-duplicated and sampled with concat_dataframes' defaults.
unlabeled_df = concat_dataframes(mdsd_unlabeled_filepaths)
filepath = _save_as_json(unlabeled_df, 'MDSD_unlabeled.json')

# Labeled pool: every row is kept (no de-duplication, no sampling).
labeled_df = concat_dataframes(mdsd_labeled_filepaths, drop_duplicates=False, sample_num=None)
filepath = _save_as_json(labeled_df, 'MDSD_labeled.json')

[books] Droped 477734 rows having duplicated text
[dvd] Droped 39540 rows having duplicated text
[electronics] Droped 2213 rows having duplicated text
[kitchen] Droped 1218 rows having duplicated text
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_unlabeled.json
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_labeled.json


In [18]:
# Reload the unlabeled corpus from the JSON file written earlier and display it.
unlabeled_json_path = os.path.join(save_dir, 'MDSD_unlabeled.json')
unlabeled_df = pd.read_json(unlabeled_json_path)
unlabeled_df

Unnamed: 0,index,text,domain
0,12429,the ease and timelyness of the product was ver...,electronics
1,813196,I'm a man about to turn 50 with 4 yr. olds twi...,books
2,527488,There is tons to do in Washington DC and despi...,books
3,701823,Good grounding book in strategy,books
4,14177,super excellent cd player with remote control/...,electronics
...,...,...,...
15995,597945,I work in the healthcare field and have seen q...,books
15996,9786,I have a miniature poodle puppy who eats every...,kitchen
15997,606221,I wasnt a big fan of this book. it didnt keep ...,books
15998,30590,This movie has one the best performances by Ja...,dvd


In [19]:
# Reload the labeled corpus from the JSON file written earlier and display it.
labeled_json_path = os.path.join(save_dir, 'MDSD_labeled.json')
labeled_df = pd.read_json(labeled_json_path)
labeled_df

Unnamed: 0,index,text,label,domain
0,366,Elaine Pagels is a wonderful writer. Her expla...,positive,books
1,1024,I baked six loaves in this machine. I have bee...,negative,kitchen
2,548,I BOUGHT THIS LIKE A IPOD BUT ITS BETTER I HAV...,positive,electronics
3,675,"These are called ""Fruit Bowl"" but they are the...",positive,kitchen
4,1098,Corkscrew has a nice heavy duty feel and does ...,negative,kitchen
...,...,...,...,...
7995,156,"This soap dish is beautiful, practical and stu...",positive,kitchen
7996,462,WE ARE HALFWAY THROUGH THE FRIENDS SERIES...5 ...,positive,dvd
7997,314,I was a little hesitant to spend over $10 for ...,positive,kitchen
7998,635,"Excellent item, very powerful and stylish. Wou...",positive,kitchen


* Logistic Regression으로 Source/Target 분류하는 문제 풀게 한 후, Coefficient가 높은 단어를 domain-specific feature로 취급
    - (Source-Target) 쌍 구분 유의
    - https://github.com/tapilab/emnlp-2020-spurious/blob/5ae0b718f7bbf6216731453e4e59a72f38d90bb5/Step1_get_matched_sentences.ipynb 참고

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from collections import Counter
import numpy as np
import copy

In [135]:
class Dataset:
    """Plain container bundling a vectorized corpus with the objects it was built from."""

    def __init__(self, X, y, vec, df, moniker):
        """
        X: feature matrix;
        y: labels;
        vec: fitted vectorizer (a CountVectorizer in this notebook); must expose
             get_feature_names_out() or the legacy get_feature_names()
        df: dataframe
        moniker: reference name to the dataset

        The vocabulary is read from `vec` and stored as a numpy array in `self.feats`.
        """
        self.X = X
        self.y = y
        self.vec = vec
        self.df = df
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
        # prefer the replacement and fall back for older installations.
        get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
        self.feats = np.array(get_names())
        self.moniker = moniker # name of the dataset
        
def simple_vectorize(df):
    """
    Vectorize text into binary bag-of-words features.

    df: dataframe with a `text` column (documents) and a `label` column.

    A CountVectorizer is fitted with min_df=5, max_df=.8 and binary=True.

    Returns (X, y, vec, feats):
        X: document-term matrix produced by the vectorizer
        y: `df.label.values`
        vec: the fitted CountVectorizer
        feats: numpy array with the vocabulary terms
    """
    vec = CountVectorizer(min_df=5, binary=True, max_df=.8)
    X = vec.fit_transform(df.text)
    y = df.label.values
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back for older installations.
    get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
    feats = np.array(get_names())

    return X, y, vec, feats

def print_coef(clf, feats, n=100):
    """
    Collect, for every class of a fitted linear classifier, the n features with
    the largest (signed) coefficient.

    clf: fitted classifier exposing `classes_` and `coef_`
    feats: numpy array of feature names aligned with the columns of `coef_`
    n: number of features to keep per class

    Despite the name nothing is printed: a DataFrame with the columns
    'label', 'word' and 'coef' is returned, n rows per class.
    """
    is_binary = len(clf.classes_) == 2
    if is_binary:
        # A binary model stores one coefficient row; its negation is used for
        # the first class and the row itself for the second.
        per_class_weights = [-1 * clf.coef_[0], clf.coef_[0]]
    else:
        per_class_weights = clf.coef_

    rows = []
    for cls, weights in zip(clf.classes_, per_class_weights):
        best = weights.argsort()[::-1][:n]   # indices of the n largest weights
        for word, weight in zip(feats[best], weights[best]):
            rows.append((cls, word, weight))
    return pd.DataFrame(rows, columns=['label', 'word', 'coef'])
        
def get_top_terms(dataset, top_n):
    """
    Fit a logistic-regression classifier on the dataset and return its
    top-n terms per class.

    dataset: object exposing `X` (feature matrix), `y` (labels) and `feats`
             (feature names aligned with the columns of X)
    top_n: number of highest-coefficient terms to keep for each class

    Returns the DataFrame built by print_coef (columns 'label', 'word', 'coef').
    """
    # 'auto' is not an accepted class_weight value in current scikit-learn;
    # 'balanced' is the documented option for weights inversely proportional to
    # class frequencies. NOTE(review): with equally sized classes the weights are
    # all 1, so results should match an unweighted fit - confirm on your data.
    clf = LogisticRegression(class_weight='balanced', C=1, solver='lbfgs', max_iter=1000)
    clf.fit(dataset.X, dataset.y)

    coef_df = print_coef(clf, dataset.feats, n=top_n)
    return coef_df

In [145]:
# For every unordered pair of domains, train a domain classifier on the two
# domains' unlabeled reviews and save its highest-coefficient words.
domain_pairs = list(combinations(unlabeled_df.domain.unique(), 2))
for domain1, domain2 in domain_pairs:
    moniker = '&'.join(sorted([domain1, domain2]))

    # Keep only the two domains; the domain name serves as the class label.
    in_pair = unlabeled_df['domain'].isin([domain1, domain2])
    df = unlabeled_df[in_pair].copy()
    df = df[['text', 'domain']].rename(columns={'domain': 'label'})

    X, y, vec, feats = simple_vectorize(df)
    ds = Dataset(X, y, vec, df, moniker) # construct dataset object

    keyword_df = get_top_terms(ds, top_n=100)
    filepath = os.path.join(save_dir, 'MDSD_{}_keywords.csv'.format(moniker))
    keyword_df.to_csv(filepath, index=False)
    print('Created {}'.format(filepath))

Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&electronics_keywords.csv
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&electronics_keywords.csv
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics&kitchen_keywords.csv
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&dvd_keywords.csv
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&kitchen_keywords.csv
Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&kitchen_keywords.csv


* Post-training용 데이터셋
    - 도메인 2개 (파일명에 알파벳 순으로 기재)
    - Proposed Post-training option
        - ds-keyword: 소스/타겟의 domain-specific features 전체를 [UNK]로 처리

In [152]:
def mask_keywords(doc, keywords, whole_word=False):
    """
    Replace every whitespace-separated word of `doc` that matches a keyword
    with '[UNK]' and return the words re-joined with single spaces.

    doc: input text
    keywords: iterable of keyword strings; matching is case-insensitive.
              Entries that are not strings (e.g. NaN produced by a CSV round
              trip) and empty strings are ignored - the former used to crash,
              the latter used to mask every word.
    whole_word: False (default, original behaviour) masks a word when any
                keyword occurs anywhere inside it, so 'read' also masks
                'already'. True masks a word only when one of its
                alphanumeric tokens equals a keyword.
    """
    # Lower-case the keywords once instead of once per word.
    needles = {kw.lower() for kw in keywords if isinstance(kw, str) and kw}

    masked = []
    for word in doc.split():
        lowered = word.lower()
        if whole_word:
            # Compare punctuation-stripped tokens, e.g. 'book,' -> ['book']
            hit = any(token in needles for token in re.findall(r'\w+', lowered))
        else:
            hit = any(needle in lowered for needle in needles)
        masked.append('[UNK]' if hit else word)
    return ' '.join(masked)

def create_txt_for_post_training(docs, save_filepath, num_of_duplicates=10):
    """
    Write the post-training corpus to a text file.

    The whole document collection is written `num_of_duplicates` times, each
    document followed by an empty line; a single '[EOD]' marker ends the file.

    docs: iterable of document strings
    save_filepath: path of the text file to create (overwritten if it exists)
    num_of_duplicates: how many times the collection is repeated
    """
    # Materialize once so one-shot iterables (generators) survive every pass.
    docs = list(docs)
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent every character found in the reviews.
    with open(save_filepath, 'w', encoding='utf-8') as output_file:
        for _ in range(num_of_duplicates): # the full collection is repeated num_of_duplicates times
            for doc in docs:
                output_file.write('{}\n\n'.format(doc))
        output_file.write('[EOD]')
    print(f'Created {save_filepath}')
    
# Post-training option: 'ds-keyword' masks the domain-specific keywords of the
# domain pair with [UNK].
mode = 'ds-keyword'

domains = unlabeled_df.domain.unique()
for (domain1, domain2) in combinations(domains, 2):
    # Domain names in alphabetical order, shared by the input and output file names
    moniker = '&'.join(sorted([domain1, domain2]))
    df = unlabeled_df[unlabeled_df['domain'].isin([domain1, domain2])].copy()

    # keep_default_na=False: otherwise pandas parses keywords such as 'null',
    # 'nan' or 'na' as float NaN, which breaks the string handling in
    # mask_keywords. dtype keeps purely numeric keywords as strings as well.
    keyword_df = pd.read_csv(
        os.path.join(save_dir, 'MDSD_{}_keywords.csv'.format(moniker)),
        keep_default_na=False,
        dtype={'word': str},
    )
    keywords = keyword_df.word.values   # looked up once, not once per document
    df['masked_text'] = df['text'].progress_apply(lambda x: mask_keywords(x, keywords))
    docs = df['masked_text'].values

    save_filepath = os.path.join(save_dir, 'MDSD_{}_{}_for_post.txt'.format(moniker, mode))
    create_txt_for_post_training(docs, save_filepath)

100%|██████████| 8000/8000 [00:38<00:00, 210.26it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&electronics_ds-keyword_for_post.txt


100%|██████████| 8000/8000 [00:41<00:00, 193.16it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&electronics_ds-keyword_for_post.txt


100%|██████████| 8000/8000 [00:29<00:00, 272.64it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_electronics&kitchen_ds-keyword_for_post.txt


100%|██████████| 8000/8000 [00:46<00:00, 171.72it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&dvd_ds-keyword_for_post.txt


100%|██████████| 8000/8000 [00:35<00:00, 224.70it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_books&kitchen_ds-keyword_for_post.txt


100%|██████████| 8000/8000 [00:38<00:00, 206.05it/s]


Created /media/dmlab/My Passport/DATA/cross-domain/data/MDSD_dvd&kitchen_ds-keyword_for_post.txt
