## CustomDataset

In [60]:
import pandas as pd
import glob2
import torch
from transformers import AutoTokenizer, AutoConfig
import config
from plato.configuration_plato import PlatoConfig

from tqdm import tqdm

from torch.utils.data import Dataset

class DialogueFeatures():
    def __init__(self, input_ids, input_mask, segment_ids, role_ids, turn_ids=None, position_ids=None):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.role_ids = role_ids
        self.turn_ids = turn_ids
        self.position_ids = position_ids

        self.batch_size = len(self.input_ids)

class DialogueDataset(Dataset):
    def __init__(self, file_path):
        """
        data: 각 대화 샘플을 포함하는 리스트.
        예시)
        각 대화 샘플은 positive 대화 2개와 negative 대화 2개를 포함하는 리스트이다.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.load_data(file_path)

    def load_data(self, file_path):
        df = pd.read_csv(file_path)[:10]  # 확인용으로 10개만 추출
        self.len_data = len(df)
        self.positive_pairs = []
        self.negative_pairs = []
        self.labels = []
        
        for _, row in df.iterrows():
            pair = row['pairs'].split('||')
            positive_pair = pair[0].split('|')  # Positive pair 분리
            # print(positive_pair)
            negative_pair = pair[1].split('|')[:len(positive_pair)]  # Negative pair 분리 (positive pair와 개수 동일하게)
            label = [1] * len(positive_pair) + [0] * len(negative_pair)
            
            self.positive_pairs.append(positive_pair)
            self.negative_pairs.append(negative_pair)
            self.labels.append(label)
        
        self.positive_features = self.get_dialfeature(self.positive_pairs)
        self.negative_features = self.get_dialfeature(self.negative_pairs)
        # print(len(self.positive_features))
        # print(self.positive_features)
        # print(self.positive_pairs)
        
            
            
    def get_dialfeature(self, pairs):  # pos, neg 각각 실행
        """
        pairs 예시
        
        [
            ['new contact add#name', "5''1'#eye color"],
            ["No, that is it, thanks so much!#You're welcome.", "5''1'#eye color"]
        ]
        """                

        features = []
        for data_index, pair in enumerate(pairs):
            bert_features_pair = []  # 대화 내 pair 여러개이므로 구분해주기 위해 list of list
                     
            for t, turn in enumerate(pair):
                
                use_response = False
  
                # 하나의 pair에 대한 role
                role_list = [1, 0]

                sample_input_ids = []
                sample_segment_ids = []
                sample_role_ids = []
                sample_input_mask = []
                sample_turn_ids = []
                sample_position_ids = [] 
                sample_tokens = [] 

                text_tokens = []
                text_turn_ids = []
                text_role_ids = []
                text_segment_ids = []
                    
                # config.turn_sep_token='#'
                text_list = turn.split('#')

                # token: token [eou] token [eou] [bos] token [eos]
                # role:   0     0     1     1     0     0      0
                # turn:   2     2     1     1     0     0      0
                # pos:    0     1     0     1     0     1      2
                bou, eou, bos, eos = "[unused0]", "[unused1]", "[unused0]", "[unused1]"

                # use [CLS] as the latent variable of PLATO
                # text_list[0] = self.args.start_token + ' ' + text_list[0]

                if use_response == True:   # specify the context and response
                    context, response = text_list[:-1], text_list[-1]
                    word_list = self.tokenizer.tokenize(response)
                    uttr_len = len(word_list)

                    start_token, end_token = bou, eou

                    role_id, turn_id = role_list[-1], 0

                    response_tokens = [start_token] + word_list + [end_token]
                    response_role_ids = [role_id] * (1 + uttr_len + 1)
                    response_turn_ids = [turn_id] * (1 + uttr_len + 1)
                    response_segment_ids = [0] * (1 + uttr_len + 1)                   # not use

                else:
                    context = text_list
                    response_tokens, response_role_ids, response_turn_ids, response_segment_ids = [], [], [], []

                    # limit the context length
                    # context = context[-self.args.max_context_length:]
                    # context = context[-16:]  # 가장 최근 512개 턴만 유지(특정 길이 데이터 제한, hyperparameter 변경 필요)

                    '''
                    use_response == False일 경우, 한 대화(샘플)에서 분리된 턴이 하나씩 들어감
                    
                    '''
                    # 한 turn씩 반복
                    for i, text in enumerate(context):
                        # print(text)
                        word_list = self.tokenizer.tokenize(text)
                        uttr_len = len(word_list)

                        end_token = eou

                        role_id, turn_id = role_list[i], len(context) - i

                        text_tokens.extend(word_list + [end_token])
                        text_role_ids.extend([role_id] * (uttr_len + 1))
                        text_turn_ids.extend([turn_id] * (uttr_len + 1))
                        text_segment_ids.extend([0] * (uttr_len + 1))

                    
                    text_tokens.extend(response_tokens)
                    text_role_ids.extend(response_role_ids)
                    text_turn_ids.extend(response_turn_ids)
                    text_segment_ids.extend(response_segment_ids)

                    if len(text_tokens) > 16:
                        text_tokens = text_tokens[:16]
                        text_turn_ids = text_turn_ids[:16]
                        text_role_ids = text_role_ids[:16]
                        text_segment_ids = text_segment_ids[:16]

                    #  max_context_length=15
                    assert (max(text_turn_ids) <= 15)

                    # 制作text_position_id序列  -> Make text_position_id sequence
                    text_position_ids = []
                    text_position_id = 0
                    for i, turn_id in enumerate(text_turn_ids):
                        # print(i, turn_id)
                        if i != 0 and turn_id < text_turn_ids[i - 1]:   # PLATO
                            text_position_id = 0
                        # print(text_position_id)
                        text_position_ids.append(text_position_id)
                        text_position_id += 1
                    

                    # max_turn_id = max(text_turn_ids)
                    # text_turn_ids = [max_turn_id - t for t in text_turn_ids]

                    text_input_ids = self.tokenizer.convert_tokens_to_ids(text_tokens)
                    text_input_mask = [1] * len(text_input_ids)
                    

                    # Zero-pad up to the sequence length.
                    while len(text_input_ids) < 16:
                        text_input_ids.append(0)
                        text_turn_ids.append(0)
                        text_role_ids.append(0)
                        text_segment_ids.append(0)
                        text_position_ids.append(0)
                        text_input_mask.append(0)

                    # max_context_lengt=512
                    assert len(text_input_ids) == 16
                    assert len(text_turn_ids) == 16
                    assert len(text_role_ids) == 16
                    assert len(text_segment_ids) ==16
                    assert len(text_position_ids) == 16
                    assert len(text_input_mask) == 16
                    
                    sample_input_ids.append(text_input_ids)
                    sample_turn_ids.append(text_turn_ids)
                    sample_role_ids.append(text_role_ids)
                    sample_segment_ids.append(text_segment_ids)
                    sample_position_ids.append(text_position_ids)
                    sample_input_mask.append(text_input_mask)
                    sample_tokens.append(text_tokens)

                bert_feature = DialogueFeatures(input_ids=sample_input_ids,
                                            input_mask=sample_input_mask,
                                            segment_ids=sample_segment_ids,
                                            role_ids=sample_role_ids,
                                            turn_ids=sample_turn_ids,
                                            position_ids=sample_position_ids)
                
                bert_features_pair.append(bert_feature)
                # print(bert_features_pair)
            features.append(bert_features_pair)
        return features
            

    def __getitem__(self, idx):
        """
        주어진 인덱스에 해당하는 대화 샘플을 반환한다.
        각 대화 샘플은 positive 대화 2개와 negative 대화 2개를 포함한다.
        """
        # 선택된 대화 샘플을 반환
        positive_features = self.positive_features[idx]        
        negative_features = self.negative_features[idx]
        
        import itertools

        # positive_features는 DialogueFeatures 객체의 리스트
        all_pos_input_ids = list(itertools.chain.from_iterable(feature.input_ids for feature in positive_features))
        all_pos_input_mask = list(itertools.chain.from_iterable(feature.input_mask for feature in positive_features))
        all_pos_segment_ids = list(itertools.chain.from_iterable(feature.segment_ids for feature in positive_features))
        all_pos_role_ids = list(itertools.chain.from_iterable(feature.role_ids for feature in positive_features))
        all_pos_turn_ids = list(itertools.chain.from_iterable(feature.turn_ids for feature in positive_features))
        all_pos_position_ids = list(itertools.chain.from_iterable(feature.position_ids for feature in positive_features))
        
        all_neg_input_ids = list(itertools.chain.from_iterable(feature.input_ids for feature in negative_features))
        all_neg_input_mask = list(itertools.chain.from_iterable(feature.input_mask for feature in negative_features))
        all_neg_segment_ids = list(itertools.chain.from_iterable(feature.segment_ids for feature in negative_features))
        all_neg_role_ids = list(itertools.chain.from_iterable(feature.role_ids for feature in negative_features))
        all_neg_turn_ids = list(itertools.chain.from_iterable(feature.turn_ids for feature in negative_features))
        all_neg_position_ids = list(itertools.chain.from_iterable(feature.position_ids for feature in negative_features))
        

        
        # 하나의 대화 샘플에 대한 정보 반환
        inputs = {'positive':{
                        'input_ids':torch.LongTensor(all_pos_input_ids),
                        # 'input_mask':torch.LongTensor(all_pos_input_mask),
                        # 'segment_ids':torch.LongTensor(all_pos_segment_ids),
                        'role_ids':torch.LongTensor(all_pos_role_ids),
                        'turn_ids':torch.LongTensor(all_pos_turn_ids),
                        # 'position_ids':torch.LongTensor(all_pos_position_ids)
                    },
                  'negative':{
                        'input_ids':torch.LongTensor(all_neg_input_ids),
                        # 'input_mask':torch.LongTensor(all_neg_input_mask),
                        # 'segment_ids':torch.LongTensor(all_neg_segment_ids),
                        'role_ids':torch.LongTensor(all_neg_role_ids),
                        'turn_ids':torch.LongTensor(all_neg_turn_ids),
                        # 'position_ids':torch.LongTensor(all_neg_position_ids)
                    }
                }
        # print("==========inputs===========")
        # print(inputs)
        
        return inputs
    
    def __len__(self):
        # 데이터셋의 길이는 전체 대화 샘플의 수
        return self.len_data



dataset_path = glob2.glob("../dial2vec/datasets/*_train*.csv")

# CustomDataset 인스턴스 생성
dataset = DialogueDataset(dataset_path[3])  # sgd


from torch.nn.utils.rnn import pad_sequence

def custom_collate_fn(batch):
    # batch는 리스트 형태이며, 각 원소는 __getitem__에서 반환된 결과입니다.
    # 각 특성별로 패딩을 적용합니다.
    pos_input_ids = pad_sequence([torch.tensor(sample['positive']['input_ids']) for sample in batch], batch_first=True, padding_value=0)
    pos_role_ids = pad_sequence([torch.tensor(sample['positive']['role_ids']) for sample in batch], batch_first=True, padding_value=0)
    pos_turn_ids = pad_sequence([torch.tensor(sample['positive']['turn_ids']) for sample in batch], batch_first=True, padding_value=0)
    
    neg_input_ids = pad_sequence([torch.tensor(sample['negative']['input_ids']) for sample in batch], batch_first=True, padding_value=0)
    neg_role_ids = pad_sequence([torch.tensor(sample['negative']['role_ids']) for sample in batch], batch_first=True, padding_value=0)
    neg_turn_ids = pad_sequence([torch.tensor(sample['negative']['turn_ids']) for sample in batch], batch_first=True, padding_value=0)

    # 모든 특성에 대한 패딩 적용 후 최종 배치 데이터 구성
    batched_data = {
        'positive': {
            'input_ids': pos_input_ids,
            'role_ids': pos_role_ids,
            'turn_ids': pos_turn_ids
        },
        'negative': {
            'input_ids': neg_input_ids,
            'role_ids': neg_role_ids,
            'turn_ids': neg_turn_ids
        }
    }

    return batched_data

from torch.utils.data import DataLoader

batch_size = 4
dialogue_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

for batch_idx, batch_data in enumerate(dialogue_dataloader):
    # batch_data에서 'positive'와 'negative' 데이터 추출
    positive_inputs = batch_data['positive']
    negative_inputs = batch_data['negative']

        # 단순히 배치 데이터의 형태를 확인하기 위한 출력
    print(f"Batch {batch_idx + 1}")
    print(f"Positive Input IDs shape: {positive_inputs['input_ids']}")
    print(f"Negative Input IDs shape: {negative_inputs['input_ids']}")
    # 추가적으로 필요한 처리를 여기에 구현할 수 있습니다.
    
    # 데모를 위한 break; 실제 사용시에는 제거해야 합니다.
    if batch_idx == 1:  # 첫 번째 배치만 처리하고 멈추기
        break


Batch 1
Positive Input IDs shape: tensor([[[ 2748,  1012,  2045,  2003,  2023,  2299,  2055,  4531,  2293,  1012,
           2009,  2003,  1037,  2200,  3522,  2299],
         [ 4067,  2017,  2005,  4531,  2054,  2008,  2299,  2001,  2170,  1012,
              2,  2017,  1005,  2128,  6160,   999]],

        [[ 4283,     2,  2053,  3291,     2,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0],
         [ 2151,  2334,  5030,  2686,  1999,  6396,  1029,     2,  2045,  1005,
           1055,  1037,  2193,  1997,  2307,  7516]],

        [[ 4283, 28516,     2,  2017,  1005,  2128,  2087,  6160,     2,     0,
              0,     0,     0,     0,     0,     0],
         [ 2821,  1010,  1045,  2228,  1045,  2001,  3810,  1996,  3308,  2126,
           1012,  1045,  1005,  2222,  3046,  2153]],

        [[ 1045,  2215,  2070,  2592,     2,  2054,  2785,  1997,  2592,  1029,
              2,     0,     0,     0,     0,     0],
         [ 2054,  2079,  2017

In [58]:
import warnings
# 경고창 숨기기
warnings.filterwarnings('ignore') 
# 경고창 다시 나타내기
# warnings.filterwarnings('default')

# DataLoader

In [13]:
from torch.utils.data import DataLoader

In [None]:
def get_pytorch_kobert_model(ctx="cpu", cachedir=".cache"):
    def get_kobert_model(model_path, vocab_file, ctx="cpu"):
        bertmodel = BertModel.from_pretrained(model_path, return_dict=False)
        device = torch.device(ctx)
        bertmodel.to(device)
        bertmodel.eval()
        vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(
            vocab_file, padding_token="[PAD]"
        )
        return bertmodel, vocab_b_obj

    pytorch_kobert = {
        "url": "s3://skt-lsl-nlp-model/KoBERT/models/kobert_v1.zip",
        "chksum": "411b242919",  # 411b2429199bc04558576acdcac6d498
    }

    # download model
    model_info = pytorch_kobert
    model_path, is_cached = download(
        model_info["url"], model_info["chksum"], cachedir=cachedir
    )
    cachedir_full = os.path.expanduser(cachedir)
    zipf = ZipFile(os.path.expanduser(model_path))
    zipf.extractall(path=cachedir_full)
    model_path = os.path.join(os.path.expanduser(cachedir), "kobert_from_pretrained")
    # download vocab
    vocab_path = get_tokenizer()
    return get_kobert_model(model_path, vocab_path, ctx)


def get_loader(args, metric):
    bert_model, vocab = get_pytorch_kobert_model()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    
    self.tokenizer_config = PlatoConfig.from_json_file("plato/config.json")

    path_to_train_data = args.path_to_data + '/' + args.task + '/' + args.train_data
    path_to_valid_data = args.path_to_data + '/' + args.task + '/' + args.valid_data
    path_to_test_data = args.path_to_data + '/' + args.task + '/' + args.test_data

    if args.train == 'True' and args.test == 'False':
        train_iter = ModelDataLoader(path_to_train_data, args, metric, tokenizer, vocab, type='train')
        valid_iter = ModelDataLoader(path_to_valid_data, args, metric, tokenizer, vocab, type='valid')

        train_iter.load_data('train')
        valid_iter.load_data('valid')

        loader = {'train': DataLoader(dataset=train_iter,
                                      batch_size=args.batch_size,
                                      shuffle=True),
                  'valid': DataLoader(dataset=valid_iter,
                                      batch_size=args.batch_size,
                                      shuffle=True)}

    elif args.train == 'False' and args.test == 'True':
        test_iter = ModelDataLoader(path_to_test_data, args, metric, tokenizer, vocab, type='test')
        test_iter.load_data('test')

        loader = {'test': DataLoader(dataset=test_iter,
                                     batch_size=args.batch_size,
                                     shuffle=True)}

    else:
        loader = None

    return bert_model, loader, tokenizer