In [112]:
import os
from glob import glob
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from itertools import combinations 
from nltk.corpus import stopwords as stopwords_nltk 
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import re
from keybert import KeyBERT

In [24]:
filepaths = glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_labeled_2000.csv')
save_dir = '/media/dmlab/My Passport/DATA/cross-domain'
train_val_dir = os.path.join(save_dir, 'train&val')
if not os.path.exists(train_val_dir): os.makedirs(train_val_dir)
kfold_train_val_dir = os.path.join(save_dir, 'kfold_train&val')
if not os.path.exists(kfold_train_val_dir): os.makedirs(kfold_train_val_dir)

전처리

In [3]:
lemmatizer = WordNetLemmatizer()
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)
removal_list = "|,‘, ’, ◇, ‘, ”,  ’, ·, \“, ·, △, ➤, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, ?, !,【,】, …, ◆,%"
stopwords = stopwords_nltk.words('english')
def get_preprocessed_tokens(text):
    text = text.translate(str.maketrans(removal_list, ' '*len(removal_list)))   # 특수문자 제거
    text = re.sub(MULTIPLE_SPACES, ' ', text)   # 무의미한 공백 제거
    words = word_tokenize(text.lower())   # 소문자로 변경 후 tokenization
    nouns = [token for token, tag in pos_tag(words) if tag in ['NN', 'NNS', 'NNP', 'NNPS']]   # 명사 추출
    nouns = [lemmatizer.lemmatize(token) for token in nouns]   # lemmatization (e.g., movies -> movie)
    nouns = [token for token in nouns if token not in stopwords]   # 불용어 제거
    nouns = [token for token in nouns if len(token)>1]   # 길이가 1 이하인 단어 제거
    return nouns
kw_model = KeyBERT()

  return torch._C._cuda_getDeviceCount() > 0


In [15]:
dfs = []
for filepath in filepaths:
    df = pd.read_csv(filepath)
    domain = os.path.basename(filepath).split('_')[0]
    df['domain'] = domain

    original_len = len(df)
    df.drop_duplicates(['text'], keep='last', inplace=True)   # 중복된 제목 제거
    print('[{}] Droped {} rows having duplicated text'.format(domain, original_len-len(df)))
    df['nouns'] = df.progress_apply(lambda x: get_preprocessed_tokens(x['text']), axis=1)   # 전처리 함수 실행
    dfs.append(df)
raw_df = pd.concat(dfs)
raw_df

[books] Droped 18 rows having duplicated text


  0%|          | 0/1982 [00:00<?, ?it/s]

[dvd] Droped 33 rows having duplicated text


  0%|          | 0/1967 [00:00<?, ?it/s]

[electronics] Droped 39 rows having duplicated text


  0%|          | 0/1961 [00:00<?, ?it/s]

[kitchen] Droped 23 rows having duplicated text


  0%|          | 0/1977 [00:00<?, ?it/s]

Unnamed: 0,text,label,domain,nouns
0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden..."
1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal..."
2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ..."
3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l..."
4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,..."
...,...,...,...,...
1995,I purchased this toy for a friend's dog a whil...,negative,kitchen,"[toy, friend, dog, dog, quack, quack, toy, add..."
1996,I received the first topper and it was not sat...,negative,kitchen,"[topper, etc, pad, box]"
1997,Some how my previous review text got a little ...,negative,kitchen,"[review, text, thing, knife, cut, manufacture,..."
1998,Ditto the other's observations... The thermost...,negative,kitchen,"[observation, thermostat, temperature, unit, r..."


Keyword 추출 using KeyBERT

In [23]:
raw_df['keywords'] = raw_df.progress_apply(lambda x: [word for (word, score) in kw_model.extract_keywords(x['text']) if word in x['nouns']], axis=1)
raw_df

  0%|          | 0/7887 [00:00<?, ?it/s]

Unnamed: 0,text,label,domain,nouns,keywords
0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden...","[bridget, book, woman, chick, brillant]"
1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal...","[einstein, philosophy, wheatley]"
2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ...","[piano, broadway, sonata, avenue]"
3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l...","[francis, book]"
4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,...","[millionaire, philosophy]"
...,...,...,...,...,...
1995,I purchased this toy for a friend's dog a whil...,negative,kitchen,"[toy, friend, dog, dog, quack, quack, toy, add...","[toy, dog, stuffing]"
1996,I received the first topper and it was not sat...,negative,kitchen,"[topper, etc, pad, box]","[topper, box]"
1997,Some how my previous review text got a little ...,negative,kitchen,"[review, text, thing, knife, cut, manufacture,...","[quality, cut, manufacture]"
1998,Ditto the other's observations... The thermost...,negative,kitchen,"[observation, thermostat, temperature, unit, r...","[thermostat, temperature]"


maskted_text: 모든 keyword masking

In [24]:
def mask_keywords(doc, keywords):
    for to_be_masked in keywords:
        doc = doc.replace(to_be_masked, '[UNK]')
        doc = doc.replace(to_be_masked[0].upper()+to_be_masked[1:], '[UNK]')
        doc = doc.replace(to_be_masked.upper(), '[UNK]')
    return doc

raw_df['masked_text'] = raw_df.progress_apply(lambda x: mask_keywords(x['text'], x['keywords']), axis=1)
raw_df

  0%|          | 0/7887 [00:00<?, ?it/s]

Unnamed: 0,text,label,domain,nouns,keywords,masked_text
0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden...","[bridget, book, woman, chick, brillant]","[UNK] Jones, modern day [UNK], [UNK] and doesn..."
1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal...","[einstein, philosophy, wheatley]",I am ordering copies for all 23 middle school ...
2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ...","[piano, broadway, sonata, avenue]","As a casual [UNK] player and a [UNK] fanatic, ..."
3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l...","[francis, book]",This is one of the best biographies I have eve...
4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,...","[millionaire, philosophy]","I read this book many, many years ago on a ver..."
...,...,...,...,...,...,...
1995,I purchased this toy for a friend's dog a whil...,negative,kitchen,"[toy, friend, dog, dog, quack, quack, toy, add...","[toy, dog, stuffing]",I purchased this [UNK] for a friend's [UNK] a ...
1996,I received the first topper and it was not sat...,negative,kitchen,"[topper, etc, pad, box]","[topper, box]",I received the first [UNK] and it was not sati...
1997,Some how my previous review text got a little ...,negative,kitchen,"[review, text, thing, knife, cut, manufacture,...","[quality, cut, manufacture]",Some how my previous review text got a little ...
1998,Ditto the other's observations... The thermost...,negative,kitchen,"[observation, thermostat, temperature, unit, r...","[thermostat, temperature]",Ditto the other's observations... The [UNK] se...


domain-specific feature: KeyBERT로 추출한 keywords에 한해, source or target 등장비율이 0.7이상인 단어

In [76]:
def get_freq_of_one_domain(raw_df, domain, keyword):
    _df = raw_df[raw_df['domain']==domain]
    _df = _df[_df['keywords'].apply(lambda x: keyword in x)]
    return len(_df)

keywords_set = set([k for sub in raw_df.keywords for k in sub])
keywords_df = pd.DataFrame(keywords_set, columns=['keyword'])
for domain in raw_df.domain.unique():
    keywords_df[domain] = keywords_df['keyword'].progress_apply(lambda x: get_freq_of_one_domain(raw_df, domain, x))

keywords_df.to_csv(os.path.join(save_dir, 'MDSD_keywords.csv'), index=False)
print('Created {}'.format(os.path.join(save_dir, 'MDSD_keywords.csv')))

keywords_df.sort_values(by=['electronics'], ascending=False).head()

Created /media/dmlab/My Passport/DATA/cross-domain/MDSD_keywords.csv


Unnamed: 0,keyword,books,dvd,electronics,kitchen
2192,printer,0,0,94,0
5417,ipod,0,0,93,2
6471,card,0,0,86,1
4803,tv,1,14,85,3
2287,dvd,2,256,84,1


In [128]:
def is_domain_specific_keyword(keyword, keywords_df, domain1, domain2, threshold=0.7):    
    k_record = keywords_df[keywords_df['keyword']==keyword].iloc[0]
    ratio_for_one_domain = k_record[domain1] / (k_record[domain1]+k_record[domain2])
    return ratio_for_one_domain > threshold or ratio_for_one_domain < (1-threshold)

for (domain1, domain2) in list(combinations(raw_df.domain.unique(), 2)):
    raw_df['masked_text_{}&{}'.format(*sorted([domain1, domain2]))] = raw_df.progress_apply\
        (lambda x: mask_keywords(x['text'], [k for k in x['keywords'] \
        if is_domain_specific_keyword(k, keywords_df, domain1, domain2)]) \
         if x['domain'] in (domain1, domain2) else None, axis=1)

raw_df.head()

100%|██████████| 7887/7887 [00:10<00:00, 763.99it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
100%|██████████| 7887/7887 [00:10<00:00, 778.31it/s] 
100%|██████████| 7887/7887 [00:09<00:00, 803.16it/s]  
100%|██████████| 7887/7887 [00:10<00:00, 773.83it/s]  
100%|██████████| 7887/7887 [00:09<00:00, 796.56it/s]  
100%|██████████| 7887/7887 [00:09<00:00, 800.97it/s]  


Unnamed: 0,index,text,label,domain,nouns,keywords,masked_text,masked_text_books&dvd,masked_text_books&electronics,masked_text_books&kitchen,masked_text_dvd&electronics,masked_text_dvd&kitchen,masked_text_electronics&kitchen
0,0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden...","[bridget, book, woman, chick, brillant]","[UNK] Jones, modern day [UNK], [UNK] and doesn...","[UNK] Jones, modern day woman, [UNK] and doesn...","[UNK] Jones, modern day [UNK], [UNK] and doesn...","[UNK] Jones, modern day [UNK], [UNK] and doesn...",,,
1,1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal...","[einstein, philosophy, wheatley]",I am ordering copies for all 23 middle school ...,I am ordering copies for all 23 middle school ...,I am ordering copies for all 23 middle school ...,I am ordering copies for all 23 middle school ...,,,
2,2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ...","[piano, broadway, sonata, avenue]","As a casual [UNK] player and a [UNK] fanatic, ...",As a casual [UNK] player and a Broadway fanati...,"As a casual piano player and a [UNK] fanatic, ...","As a casual [UNK] player and a [UNK] fanatic, ...",,,
3,3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l...","[francis, book]",This is one of the best biographies I have eve...,This is one of the best biographies I have eve...,This is one of the best biographies I have eve...,This is one of the best biographies I have eve...,,,
4,4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,...","[millionaire, philosophy]","I read this book many, many years ago on a ver...","I read this book many, many years ago on a ver...","I read this book many, many years ago on a ver...","I read this book many, many years ago on a ver...",,,


In [130]:
pd.read_json(os.path.join(save_dir, 'MDSD_masked.json'))

Unnamed: 0,index,text,label,domain,nouns,keywords,masked_text,masked_text_books&dvd,masked_text_books&electronics,masked_text_books&kitchen,masked_text_dvd&electronics,masked_text_dvd&kitchen,masked_text_electronics&kitchen
0,0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden...","[bridget, book, woman, chick, brillant]","[UNK] Jones, modern day [UNK], [UNK] and doesn...","[UNK] Jones, modern day woman, [UNK] and doesn...","[UNK] Jones, modern day [UNK], [UNK] and doesn...","[UNK] Jones, modern day [UNK], [UNK] and doesn...",,,
1,1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal...","[einstein, philosophy, wheatley]",I am ordering copies for all 23 middle school ...,I am ordering copies for all 23 middle school ...,I am ordering copies for all 23 middle school ...,I am ordering copies for all 23 middle school ...,,,
2,2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ...","[piano, broadway, sonata, avenue]","As a casual [UNK] player and a [UNK] fanatic, ...",As a casual [UNK] player and a Broadway fanati...,"As a casual piano player and a [UNK] fanatic, ...","As a casual [UNK] player and a [UNK] fanatic, ...",,,
3,3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l...","[francis, book]",This is one of the best biographies I have eve...,This is one of the best biographies I have eve...,This is one of the best biographies I have eve...,This is one of the best biographies I have eve...,,,
4,4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,...","[millionaire, philosophy]","I read this book many, many years ago on a ver...","I read this book many, many years ago on a ver...","I read this book many, many years ago on a ver...","I read this book many, many years ago on a ver...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7882,7882,I purchased this toy for a friend's dog a whil...,negative,kitchen,"[toy, friend, dog, dog, quack, quack, toy, add...","[toy, dog, stuffing]",I purchased this [UNK] for a friend's [UNK] a ...,,,I purchased this [UNK] for a friend's [UNK] a ...,,I purchased this [UNK] for a friend's [UNK] a ...,I purchased this [UNK] for a friend's [UNK] a ...
7883,7883,I received the first topper and it was not sat...,negative,kitchen,"[topper, etc, pad, box]","[topper, box]",I received the first [UNK] and it was not sati...,,,I received the first [UNK] and it was not sati...,,I received the first [UNK] and it was not sati...,I received the first [UNK] and it was not sati...
7884,7884,Some how my previous review text got a little ...,negative,kitchen,"[review, text, thing, knife, cut, manufacture,...","[quality, cut, manufacture]",Some how my previous review text got a little ...,,,Some how my previous review text got a little ...,,Some how my previous review text got a little ...,Some how my previous review text got a little ...
7885,7885,Ditto the other's observations... The thermost...,negative,kitchen,"[observation, thermostat, temperature, unit, r...","[thermostat, temperature]",Ditto the other's observations... The [UNK] se...,,,Ditto the other's observations... The [UNK] se...,,Ditto the other's observations... The [UNK] se...,Ditto the other's observations... The [UNK] se...


In [129]:
# raw_df.reset_index(inplace=True)
raw_df.to_json(os.path.join(save_dir, 'MDSD_masked.json'))
print('Created {}'.format(os.path.join(save_dir, 'MDSD_masked.json')))

Created /media/dmlab/My Passport/DATA/cross-domain/MDSD_masked.json


training set, validation set 생성

In [19]:
for domain in raw_df.domain.unique():
    one_df = raw_df[raw_df['domain']==domain]
    train_df, val_df = train_test_split(one_df, test_size=.2, shuffle=True, stratify=one_df['label'].values)
    filepath = os.path.join(train_val_dir, '{}_train.json'.format(domain))
    train_df.to_json(filepath)
    print('Created {}'.format(filepath))
    filepath = os.path.join(train_val_dir, '{}_val.json'.format(domain))
    val_df.to_json(filepath)
    print('Created {}'.format(filepath))

Created /media/dmlab/My Passport/DATA/cross-domain/train&val/books_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/books_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/dvd_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/dvd_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/electronics_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/electronics_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/kitchen_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/kitchen_val.json


5-fold set 생성

In [26]:
for domain in raw_df.domain.unique():
    one_df = raw_df[raw_df['domain']==domain]
    
    kf = StratifiedKFold(n_splits=5)
    for i, (train_val_indices, test_indices) in enumerate(kf.split(one_df, one_df['label'].values)):
        train_val_df = one_df.iloc[train_val_indices]
        train_df, val_df = train_test_split(train_val_df, test_size=.2, shuffle=True, stratify=train_val_df['label'].values)
        test_df = one_df.iloc[test_indices]
    
        filepath = os.path.join(kfold_train_val_dir, '{}_k={}_train.json'.format(domain, i))
        train_df.to_json(filepath)
        print('Created {}'.format(filepath))
        filepath = os.path.join(kfold_train_val_dir, '{}_k={}_val.json'.format(domain, i))
        val_df.to_json(filepath)
        print('Created {}'.format(filepath))
        filepath = os.path.join(kfold_train_val_dir, '{}_k={}_test.json'.format(domain, i))
        test_df.to_json(filepath)
        print('Created {}'.format(filepath))

Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=0_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=0_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=0_test.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=1_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=1_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=1_test.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=2_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=2_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=2_test.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=3_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/kfold_train&val/books_k=3_val.json
Created /media/dmlab/My Passport/DATA/cross