In [1]:
import os
from glob import glob
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
from nltk.corpus import stopwords as stopwords_nltk 
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import re
from keybert import KeyBERT

In [18]:
# One labeled CSV per domain. glob() returns paths in arbitrary order, so sort them:
# the order of this list decides the row order of the concatenated frame built below.
filepaths = sorted(glob('/media/dmlab/My Passport/DATA/BenchmarkDataset/MDSD/*_labeled_2000.csv'))
save_dir = '/media/dmlab/My Passport/DATA/cross-domain'
train_val_dir = os.path.join(save_dir, 'train&val')
# exist_ok=True replaces the check-then-create pattern (no race, safe to re-run the cell).
os.makedirs(train_val_dir, exist_ok=True)

In [3]:
lemmatizer = WordNetLemmatizer()
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)
# Used as a *set of characters*: get_preprocessed_tokens() maps every single character of this
# string (including the commas and spaces that separate the symbols) to a blank.
# The backslash in front of “ is spelled `\\` so the string keeps exactly the same value
# (a literal backslash followed by “) without relying on the invalid escape sequence `\“`.
removal_list = "|,‘, ’, ◇, ‘, ”,  ’, ·, \\“, ·, △, ➤, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, ?, !,【,】, …, ◆,%"
stopwords = stopwords_nltk.words('english')
# Built once instead of on every call: maps each character of removal_list to a space.
_REMOVAL_TABLE = str.maketrans(removal_list, ' ' * len(removal_list))
# Set lookups are O(1); the original `stopwords` list is left untouched.
_STOPWORD_SET = frozenset(stopwords)
_NOUN_TAGS = frozenset(['NN', 'NNS', 'NNP', 'NNPS'])


def get_preprocessed_tokens(text):
    """Return the lemmatized nouns of ``text`` as a list of lowercase strings.

    Steps:
      1. replace every character listed in ``removal_list`` with a space,
      2. collapse runs of spaces into a single space,
      3. lowercase and tokenize the text,
      4. keep only tokens POS-tagged as nouns (NN, NNS, NNP, NNPS),
      5. lemmatize them (e.g., movies -> movie),
      6. drop stopwords and tokens of length 1 or less.

    Args:
        text: the raw document string.

    Returns:
        list of noun lemmas in document order (duplicates are kept).
    """
    text = text.translate(_REMOVAL_TABLE)   # strip special characters
    text = MULTIPLE_SPACES.sub(' ', text)   # remove meaningless repeated spaces
    # NOTE(review): the text is lowercased before POS tagging, which may affect
    # proper-noun (NNP/NNPS) detection -- confirm this is intended.
    tagged = pos_tag(word_tokenize(text.lower()))

    nouns = []
    for token, tag in tagged:
        if tag not in _NOUN_TAGS:
            continue
        lemma = lemmatizer.lemmatize(token)
        # the stopword and length filters are applied to the lemma, not the surface form
        if lemma not in _STOPWORD_SET and len(lemma) > 1:
            nouns.append(lemma)
    return nouns
# KeyBERT instance created with its default arguments; its extract_keywords()
# method is called on every review text further down in the notebook.
kw_model = KeyBERT()

  return torch._C._cuda_getDeviceCount() > 0


In [15]:
def _load_domain_frame(csv_path):
    """Read one domain CSV, tag it with its domain, drop duplicated texts and add a 'nouns' column."""
    # The domain name is the part of the file name before the first underscore.
    domain_name = os.path.basename(csv_path).split('_')[0]
    frame = pd.read_csv(csv_path)
    frame['domain'] = domain_name

    rows_before = len(frame)
    # remove duplicated texts, keeping the last occurrence
    frame = frame.drop_duplicates(subset=['text'], keep='last')
    rows_dropped = rows_before - len(frame)
    print('[{}] Droped {} rows having duplicated text'.format(domain_name, rows_dropped))

    # run the preprocessing function on every row
    noun_lists = frame.progress_apply(lambda row: get_preprocessed_tokens(row['text']), axis=1)
    return frame.assign(nouns=noun_lists)


dfs = []
for filepath in filepaths:
    dfs.append(_load_domain_frame(filepath))
raw_df = pd.concat(dfs)
raw_df

[books] Droped 18 rows having duplicated text


  0%|          | 0/1982 [00:00<?, ?it/s]

[dvd] Droped 33 rows having duplicated text


  0%|          | 0/1967 [00:00<?, ?it/s]

[electronics] Droped 39 rows having duplicated text


  0%|          | 0/1961 [00:00<?, ?it/s]

[kitchen] Droped 23 rows having duplicated text


  0%|          | 0/1977 [00:00<?, ?it/s]

Unnamed: 0,text,label,domain,nouns
0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden..."
1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal..."
2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ..."
3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l..."
4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,..."
...,...,...,...,...
1995,I purchased this toy for a friend's dog a whil...,negative,kitchen,"[toy, friend, dog, dog, quack, quack, toy, add..."
1996,I received the first topper and it was not sat...,negative,kitchen,"[topper, etc, pad, box]"
1997,Some how my previous review text got a little ...,negative,kitchen,"[review, text, thing, knife, cut, manufacture,..."
1998,Ditto the other's observations... The thermost...,negative,kitchen,"[observation, thermostat, temperature, unit, r..."


In [23]:
def _noun_keywords(row):
    """Return the KeyBERT keywords of row['text'] that also appear in row['nouns']."""
    extracted = kw_model.extract_keywords(row['text'])   # (word, score) pairs
    return [word for word, _score in extracted if word in row['nouns']]


raw_df['keywords'] = raw_df.progress_apply(_noun_keywords, axis=1)
raw_df

  0%|          | 0/7887 [00:00<?, ?it/s]

Unnamed: 0,text,label,domain,nouns,keywords
0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden...","[bridget, book, woman, chick, brillant]"
1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal...","[einstein, philosophy, wheatley]"
2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ...","[piano, broadway, sonata, avenue]"
3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l...","[francis, book]"
4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,...","[millionaire, philosophy]"
...,...,...,...,...,...
1995,I purchased this toy for a friend's dog a whil...,negative,kitchen,"[toy, friend, dog, dog, quack, quack, toy, add...","[toy, dog, stuffing]"
1996,I received the first topper and it was not sat...,negative,kitchen,"[topper, etc, pad, box]","[topper, box]"
1997,Some how my previous review text got a little ...,negative,kitchen,"[review, text, thing, knife, cut, manufacture,...","[quality, cut, manufacture]"
1998,Ditto the other's observations... The thermost...,negative,kitchen,"[observation, thermostat, temperature, unit, r...","[thermostat, temperature]"


In [24]:
def mask_keywords(doc, keywords, mask_token='[UNK]'):
    """Replace every whole-word occurrence of the given keywords with a mask token.

    Matching is case-insensitive, so 'book', 'Book', 'BOOK' and any other casing
    are all masked. Only whole words are replaced: the keyword 'cut' does not
    touch 'executed', and it does not touch inflected forms such as 'cuts' unless
    they are listed as keywords themselves.

    All keywords are substituted in a single pass, so text inserted as a mask
    is never scanned again (a keyword such as 'unk' cannot corrupt '[UNK]').

    Args:
        doc: the text to mask.
        keywords: iterable of keyword strings; empty strings are ignored.
        mask_token: replacement text, '[UNK]' by default.

    Returns:
        The masked text; ``doc`` unchanged when there is nothing to mask.
    """
    # Longest first, so a multi-word keyword wins over a shorter keyword that is its prefix.
    terms = sorted({kw for kw in keywords if kw}, key=lambda kw: (-len(kw), kw))
    if not terms:
        return doc
    # Lookarounds instead of \b so keywords that start or end with a non-word character still work.
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in terms) + r')(?!\w)',
        re.IGNORECASE,
    )
    # A function replacement keeps mask_token literal (no backslash/group expansion).
    return pattern.sub(lambda _match: mask_token, doc)

def _mask_row(row):
    """Mask the keywords of one review: row['text'] with row['keywords'] replaced."""
    return mask_keywords(row['text'], row['keywords'])


raw_df['masked_text'] = raw_df.progress_apply(_mask_row, axis=1)
raw_df

  0%|          | 0/7887 [00:00<?, ?it/s]

Unnamed: 0,text,label,domain,nouns,keywords,masked_text
0,"Bridget Jones, modern day woman, brillant and ...",positive,books,"[bridget, jones, day, woman, brillant, acciden...","[bridget, book, woman, chick, brillant]","[UNK] Jones, modern day [UNK], [UNK] and doesn..."
1,I am ordering copies for all 23 middle school ...,positive,books,"[copy, school, principal, assistant, principal...","[einstein, philosophy, wheatley]",I am ordering copies for all 23 middle school ...
2,As a casual piano player and a Broadway fanati...,positive,books,"[piano, player, broadway, song, avenue, book, ...","[piano, broadway, sonata, avenue]","As a casual [UNK] player and a [UNK] fanatic, ..."
3,This is one of the best biographies I have eve...,positive,books,"[biography, author, lot, time, effort, work, l...","[francis, book]",This is one of the best biographies I have eve...
4,"I read this book many, many years ago on a ver...",positive,books,"[book, year, flight, philosophy, money, month,...","[millionaire, philosophy]","I read this book many, many years ago on a ver..."
...,...,...,...,...,...,...
1995,I purchased this toy for a friend's dog a whil...,negative,kitchen,"[toy, friend, dog, dog, quack, quack, toy, add...","[toy, dog, stuffing]",I purchased this [UNK] for a friend's [UNK] a ...
1996,I received the first topper and it was not sat...,negative,kitchen,"[topper, etc, pad, box]","[topper, box]",I received the first [UNK] and it was not sati...
1997,Some how my previous review text got a little ...,negative,kitchen,"[review, text, thing, knife, cut, manufacture,...","[quality, cut, manufacture]",Some how my previous review text got a little ...
1998,Ditto the other's observations... The thermost...,negative,kitchen,"[observation, thermostat, temperature, unit, r...","[thermostat, temperature]",Ditto the other's observations... The [UNK] se...


In [32]:
# pd.concat above kept each domain's own row labels, so the index repeats across domains.
# Move those labels into an 'index' column to get a unique index before serializing.
# Guarded so that re-running this cell does not keep adding columns ('level_0', then an error).
if 'index' not in raw_df.columns:
    raw_df = raw_df.reset_index()
masked_path = os.path.join(save_dir, 'MDSD_masked.json')
raw_df.to_json(masked_path)
print('Created {}'.format(masked_path))

Created /media/dmlab/My Passport/DATA/cross-domain/MDSD_masked.json


training set, validation set 생성

In [19]:
SPLIT_SEED = 42   # fixed seed so the train/validation split is identical on every run

for domain in raw_df['domain'].unique():
    one_df = raw_df[raw_df['domain'] == domain]
    # 80/20 split per domain, stratified on the 'label' column.
    train_df, val_df = train_test_split(
        one_df,
        test_size=.2,
        shuffle=True,
        stratify=one_df['label'].values,
        random_state=SPLIT_SEED,
    )
    # Writes <domain>_train.json and <domain>_val.json into train_val_dir.
    for split_name, split_df in (('train', train_df), ('val', val_df)):
        filepath = os.path.join(train_val_dir, '{}_{}.json'.format(domain, split_name))
        split_df.to_json(filepath)
        print('Created {}'.format(filepath))

Created /media/dmlab/My Passport/DATA/cross-domain/train&val/books_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/books_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/dvd_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/dvd_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/electronics_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/electronics_val.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/kitchen_train.json
Created /media/dmlab/My Passport/DATA/cross-domain/train&val/kitchen_val.json
