In [2]:
import pandas as pd
from tqdm import tqdm
import random


### 원본 데이터에서 시작

In [3]:
datasetname = 'mwoz' # metalwoz, mwoz, selfdialog, sgd
column_names = ['turn', 'dialogue', 'label']
path = '/home/jihyeon41/research_dial_embedding/dial2vec_git/dial2vec/datasets/'

train_df = pd.read_csv(f'{path}{datasetname}/train.tsv', sep = '\t', header=None, names=column_names)


In [4]:
train_df['dialogue'][0].split('#')

["I need lodgings, and I'd like to stay in an expensive guesthouse.",
 "Our database doesn't list an expensive guesthouse, would you like me to try something else?",
 'I would like a 4 star hotel, in an expensive guesthouse please. Thanks!',
 "I don't have anything in that area that meets those criteria. Can I try searching something else?",
 'What about a moderate priced one instead?',
 'I have 11 results. Do you prefer to stay in a particular area of town?',
 "I don't have an area preference.",
 'I located avalon guesthouse located north, is a 4 and moderately priced. Would you like me to book a room for how many guest, days and starting on what day?',
 'Do you know what area that is in and whether they have internet?',
 'They are wifi capable and in the north area, would you like more information or a booking?',
 'Can I please have the address along with post code?',
 'Sure, the address is 62 Gilbert Road and the postcode is cb43pd.',
 'Thanks, that takes care of everything for me. 

In [5]:
# train_df = train_df.head(5)
train_df['dial_split'] = train_df['dialogue'].str.split('|').tolist()
train_df['dial_split'][0]

["I need lodgings, and I'd like to stay in an expensive guesthouse.#Our database doesn't list an expensive guesthouse, would you like me to try something else?#I would like a 4 star hotel, in an expensive guesthouse please. Thanks!#I don't have anything in that area that meets those criteria. Can I try searching something else?#What about a moderate priced one instead?#I have 11 results. Do you prefer to stay in a particular area of town?#I don't have an area preference.#I located avalon guesthouse located north, is a 4 and moderately priced. Would you like me to book a room for how many guest, days and starting on what day?#Do you know what area that is in and whether they have internet?#They are wifi capable and in the north area, would you like more information or a booking?#Can I please have the address along with post code?#Sure, the address is 62 Gilbert Road and the postcode is cb43pd.#Thanks, that takes care of everything for me. Have a good one!#Have a great time on your trip.

In [6]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,dial_split
0,10101010101010,"I need lodgings, and I'd like to stay in an ex...",0,"[I need lodgings, and I'd like to stay in an e..."
1,1010101010101010,I'm looking for a medium priced Chinese restau...,0,[I'm looking for a medium priced Chinese resta...
2,101010101010,I would like to eat at an expensive european f...,0,[I would like to eat at an expensive european ...


### 질문, 대답 리스트 만들기

In [7]:
import re
import warnings
warnings.filterwarnings("ignore")

# Whitespace that directly follows sentence-final punctuation (., !, ?), while
# skipping abbreviations such as "U.S.A." or "Mr.". The whitespace is captured
# so that it can be put back when the boundary is not turned into a [sep].
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)(\s)')


def add_sep_to_turn(turn):
    """Insert "[sep]" between the sentences of a single turn.

    A sentence boundary is whitespace preceded by '.', '!' or '?' (abbreviations
    excluded, see _SENTENCE_BOUNDARY). The boundary becomes "[sep]" only when
    the following sentence starts with an uppercase character; otherwise the
    original whitespace is kept, so the two fragments are not fused together
    (e.g. "results. 5 are cheap" stays as it is instead of "results.5 are cheap").
    Blank trailing fragments are dropped.

    Args:
        turn: text of one speaker turn (no '#' expected).

    Returns:
        The turn with "[sep]" markers inserted at detected sentence boundaries.
    """
    # With one capturing group, split() yields [sentence, gap, sentence, gap, ...].
    parts = _SENTENCE_BOUNDARY.split(turn)
    sentences, gaps = parts[0::2], parts[1::2]

    pieces = []
    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            continue

        pieces.append(sentence)
        if i == len(sentences) - 1:
            break

        following = sentences[i + 1]
        if following and following[0].isupper():
            # Next sentence starts with a capital letter: mark the boundary.
            pieces.append("[sep]")
        elif following.strip():
            # Not treated as a new sentence: restore the whitespace that split() consumed.
            pieces.append(gaps[i])

    return "".join(pieces)

In [8]:
# 대화를 턴 단위로 구분하여 문장 구분자 [sep] 추가
def add_sep_to_dialogue(dialogue):
    """Apply add_sep_to_turn to every '#'-delimited turn of a dialogue.

    Args:
        dialogue: one dialogue whose turns are separated by '#'.

    Returns:
        The dialogue with the same '#' turn structure, each turn processed by
        add_sep_to_turn.
    """
    return "#".join(add_sep_to_turn(turn) for turn in dialogue.split('#'))

In [9]:
# Add [sep] sentence markers to every dialogue stored in each row's dial_split list.
# The column is rebuilt and assigned in one step: writing through
# train_df['dial_split'][i] is chained indexing, which pandas may apply to a
# temporary copy (and never applies to the frame under Copy-on-Write).
train_df['dial_split'] = pd.Series(
    [
        [add_sep_to_dialogue(dialogue) for dialogue in dial_split]
        for dial_split in tqdm(train_df['dial_split'], total=len(train_df['dial_split']))
    ],
    index=train_df.index,
    dtype=object,
)

display(train_df['dial_split'][0])

100%|██████████| 16698/16698 [00:12<00:00, 1354.16it/s]


["I need lodgings, and I'd like to stay in an expensive guesthouse.#Our database doesn't list an expensive guesthouse, would you like me to try something else?#I would like a 4 star hotel, in an expensive guesthouse please.[sep]Thanks!#I don't have anything in that area that meets those criteria.[sep]Can I try searching something else?#What about a moderate priced one instead?#I have 11 results.[sep]Do you prefer to stay in a particular area of town?#I don't have an area preference.#I located avalon guesthouse located north, is a 4 and moderately priced.[sep]Would you like me to book a room for how many guest, days and starting on what day?#Do you know what area that is in and whether they have internet?#They are wifi capable and in the north area, would you like more information or a booking?#Can I please have the address along with post code?#Sure, the address is 62 Gilbert Road and the postcode is cb43pd.#Thanks, that takes care of everything for me.[sep]Have a good one!#Have a grea

In [10]:
# Build the pools of question-like sentences (q_list) and all other sentences (a_list).
q_list = []
a_list = []
question_keywords = re.compile(r'^(who|what|why|when|where|how|is|are|does|did|do|can|could|will|would|shall|should|might|must|may|won\'t|can\'t|isn\'t|aren\'t|was|were|has|haven\'t|had|shall|whom|whose|which)\b',re.IGNORECASE)
intent_keywords = re.compile(r'\b(like|need|help|want|book|pick|think)\b', re.IGNORECASE)

for dial_split in tqdm(train_df['dial_split'], total=len(train_df['dial_split'])):
    # Only the first entry of dial_split contributes sentences.
    for turn in dial_split[0].split('#'):
        # A turn without [sep] splits into a one-element list, so a single
        # code path covers both multi-sentence and single-sentence turns.
        for sentence in turn.split('[sep]'):
            stripped = sentence.strip()
            is_question = (
                '?' in sentence
                or question_keywords.match(stripped)
                or intent_keywords.search(stripped)
            )
            target = q_list if is_question else a_list
            target.append(sentence)

  0%|          | 0/16698 [00:00<?, ?it/s]

100%|██████████| 16698/16698 [00:00<00:00, 26540.48it/s]


In [11]:
# Deduplicate the sentence pools. Sorting fixes the order: iteration order of a
# set of strings can differ between interpreter runs (string hash randomization),
# which would make later random.choice() draws irreproducible even with a fixed seed.
q_list = sorted(set(q_list))
a_list = sorted(set(a_list))

In [12]:
print("q_list:", len(q_list))
# display(q_list)

print("a_list:", len(a_list))
# display(a_list)

q_list: 60159
a_list: 65896


### 턴 단위의 대화를 문장 단위로 구분, turn도 문장 단위로 변경하되 동일한 턴은 같은 숫자 부여

In [13]:
# 기존 negative samples를 활용하지 않을 경우
train_df['dialogue'] = train_df['dialogue'].str.split('|').str[0]
train_df.drop(columns=['dial_split'], inplace=True)

In [14]:
# Add [sep] sentence markers to each dialogue and store the result in 'sep_dial'.
# Assigned as a whole column instead of train_df['sep_dial'][i] = ..., which is
# chained indexing and may not write to the frame.
train_df['sep_dial'] = [
    add_sep_to_dialogue(dial)
    for dial in tqdm(train_df['dialogue'], total=len(train_df['dialogue']))
]

100%|██████████| 16698/16698 [00:03<00:00, 5210.07it/s]


In [15]:
# [sep] 구분자 개수 세는 함수
def count_sep_per_turn(text):
    """Count the sentences in each '#'-delimited turn of `text`.

    A turn containing k "[sep]" markers counts as k + 1 sentences.

    Returns:
        A list with one sentence count per turn, in turn order.
    """
    sentence_counts = []
    for turn in text.split('#'):
        # Splitting on the marker yields one piece per sentence.
        sentence_counts.append(len(turn.split('[sep]')))
    return sentence_counts

In [16]:
train_df['sep_counts_per_turn'] = train_df['sep_dial'].apply(count_sep_per_turn)

In [17]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,sep_dial,sep_counts_per_turn
0,10101010101010,"I need lodgings, and I'd like to stay in an ex...",0,"I need lodgings, and I'd like to stay in an ex...","[1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1]"
1,1010101010101010,I'm looking for a medium priced Chinese restau...,0,I'm looking for a medium priced Chinese restau...,"[1, 2, 1, 2, 3, 3, 2, 1, 2, 1, 1, 2, 1, 1, 2, 3]"
2,101010101010,I would like to eat at an expensive european f...,0,I would like to eat at an expensive european f...,"[1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2]"


In [18]:
# 문장 단위로 turn 수정

# 1과 0이 번갈아가며 이루어진 리스트 생성 함수
def generate_turn_list(sep_counts_per_turn):
    """Expand per-turn sentence counts into a per-sentence list of 1/0 flags.

    Turns at even positions (0, 2, ...) emit 1 and turns at odd positions emit
    0, each repeated once per sentence of that turn.

    Args:
        sep_counts_per_turn: sentence count for each turn, in order.

    Returns:
        A flat list of 1/0 flags, one per sentence.
    """
    turn_list = []
    for position, count in enumerate(sep_counts_per_turn):
        flag = 1 - position % 2
        turn_list += [flag] * count
    return turn_list

train_df['turn'] = train_df['sep_counts_per_turn'].apply(generate_turn_list)

In [19]:
def convert_to_binary(turn_list):
    """Concatenate the items of `turn_list` into one string, e.g. [1, 0, 0] -> '100'."""
    return ''.join(str(flag) for flag in turn_list)

train_df['turn'] = train_df['turn'].apply(convert_to_binary)

In [20]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,sep_dial,sep_counts_per_turn
0,1011001001001010110,"I need lodgings, and I'd like to stay in an ex...",0,"I need lodgings, and I'd like to stay in an ex...","[1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1]"
1,1001001110001101101001011000,I'm looking for a medium priced Chinese restau...,0,I'm looking for a medium priced Chinese restau...,"[1, 2, 1, 2, 3, 3, 2, 1, 2, 1, 1, 2, 1, 1, 2, 3]"
2,1011010010100100,I would like to eat at an expensive european f...,0,I would like to eat at an expensive european f...,"[1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2]"


In [21]:
# Replace every [sep] marker with '#', so that '#' now separates sentences in 'dialogue'.
# regex=False makes the replacement literal regardless of the pandas version:
# with regex=True, '[sep]' is a character class matching the single letters s, e and p.
train_df['dialogue'] = train_df['sep_dial'].str.replace('[sep]', '#', regex=False)
train_df.drop(columns=['sep_dial', 'sep_counts_per_turn'], inplace=True)

In [22]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label
0,1011001001001010110,"I need lodgings, and I'd like to stay in an ex...",0
1,1001001110001101101001011000,I'm looking for a medium priced Chinese restau...,0
2,1011010010100100,I would like to eat at an expensive european f...,0


### qa_turn 생성
- 질문 문장(1), 질문이 아닌 문장(0)

In [23]:
# 포함 여부로 qa_turn 생성
# 질문 판단을 위한 조건 추가 시, 이 부분에서 코드 수정
question_keywords = re.compile(r'^(who|what|why|when|where|how|is|are|does|did|do|can|could|will|would|shall|should|might|must|may|won\'t|can\'t|isn\'t|aren\'t|was|were|has|haven\'t|had|shall|whom|whose|which)\b',re.IGNORECASE)
intent_keywords = re.compile(r'\b(like|need|help|want|book|pick|think)\b', re.IGNORECASE)

def get_qa_turn(dialogue):
    """Label each '#'-delimited sentence as question-like (1) or not (0).

    A sentence counts as question-like when it contains '?', starts with one of
    the question_keywords, or contains one of the intent_keywords.

    Returns:
        A list of 1/0 flags, one per sentence.
    """
    def looks_like_question(sentence):
        text = sentence.strip()
        return bool(
            '?' in sentence
            or question_keywords.match(text)
            or intent_keywords.search(text)
        )

    return [1 if looks_like_question(sentence) else 0 for sentence in dialogue.split('#')]

train_df['qa_turn'] = train_df['dialogue'].apply(get_qa_turn)
train_df['qa_turn'] = train_df['qa_turn'].apply(convert_to_binary)

In [24]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,qa_turn
0,1011001001001010110,"I need lodgings, and I'd like to stay in an ex...",0,1110011010011110000
1,1001001110001101101001011000,I'm looking for a medium priced Chinese restau...,0,0011010110000111110111101000
2,1011010010100100,I would like to eat at an expensive european f...,0,1101010101100100


### qan_turn 생성
- 대답 문장(0), 질문 문장(1), 질문도 대답도 아닌 문장(2)

In [25]:
def generate_qan_turn(dialogue):
    """Label each '#'-delimited sentence as question (1), answer (0) or neither (2).

    A sentence is a question when it contains '?', starts with one of the
    question_keywords, or contains one of the intent_keywords. A non-question
    sentence is an answer (0) only when the sentence right before it was a
    question; every other non-question sentence is labelled 2.

    Returns:
        A list of labels (1, 0 or 2), one per sentence.
    """
    labels = []
    after_question = False

    for sentence in dialogue.split('#'):
        text = sentence.strip()
        is_question = bool(
            '?' in sentence
            or question_keywords.match(text)
            or intent_keywords.search(text)
        )
        if is_question:
            label = 1
        elif after_question:
            label = 0
        else:
            label = 2
        labels.append(label)
        # Only a question makes the next non-question sentence an answer.
        after_question = is_question

    return labels

In [26]:
train_df['qan_turn'] = train_df['dialogue'].apply(generate_qan_turn)
train_df['qan_turn'] = train_df['qan_turn'].apply(convert_to_binary)

In [27]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,qa_turn,qan_turn
0,1011001001001010110,"I need lodgings, and I'd like to stay in an ex...",0,1110011010011110000,1110211010211110222
1,1001001110001101101001011000,I'm looking for a medium priced Chinese restau...,0,0011010110000111110111101000,2211010110222111110111101022
2,1011010010100100,I would like to eat at an expensive european f...,0,1101010101100100,1101010101102102


### negative sample 생성: 질문 문장 대체 4개, 질문이 아닌 문장 대체 4개

In [28]:
def generate_que_negatives(dialogue, q_list):
    """Build 4 negative dialogues in which every question-like sentence is replaced.

    Each question-like sentence (contains '?', starts with a question keyword,
    or contains an intent keyword) is swapped for a random draw from `q_list`;
    all other sentences are kept as they are.

    Args:
        dialogue: sentences separated by '#'.
        q_list: pool of replacement sentences.

    Returns:
        The 4 generated dialogues joined with '|'.
    """
    sentences = dialogue.split('#')
    # The classification does not depend on the random draws, so compute it once.
    replace_flags = []
    for sentence in sentences:
        text = sentence.strip()
        replace_flags.append(
            '?' in sentence
            or question_keywords.match(text)
            or intent_keywords.search(text)
        )

    neg_samples = []
    for _ in range(4):
        neg_sample = [
            random.choice(q_list) if replace else sentence
            for sentence, replace in zip(sentences, replace_flags)
        ]
        neg_samples.append('#'.join(neg_sample))
    return '|'.join(neg_samples)

In [29]:
def generate_notque_negatives(dialogue, a_list):
    """Build 4 negative dialogues in which every non-question sentence is replaced.

    A sentence that has no '?', does not start with a question keyword and
    contains no intent keyword is swapped for a random draw from `a_list`;
    question-like sentences are kept as they are.

    Args:
        dialogue: sentences separated by '#'.
        a_list: pool of replacement sentences.

    Returns:
        The 4 generated dialogues joined with '|'.
    """
    sentences = dialogue.split('#')
    # The classification does not depend on the random draws, so compute it once.
    keep_flags = []
    for sentence in sentences:
        text = sentence.strip()
        keep_flags.append(
            '?' in sentence
            or question_keywords.match(text)
            or intent_keywords.search(text)
        )

    neg_samples = []
    for _ in range(4):
        neg_sample = [
            sentence if keep else random.choice(a_list)
            for sentence, keep in zip(sentences, keep_flags)
        ]
        neg_samples.append('#'.join(neg_sample))
    return '|'.join(neg_samples)

In [30]:
train_df['qa_que_negs'] = train_df['dialogue'].apply(lambda x: generate_que_negatives(x, q_list))
train_df['qa_quenot_negs'] = train_df['dialogue'].apply(lambda x: generate_notque_negatives(x, a_list))


In [31]:
train_df['qa_dialogue'] = train_df['dialogue'] + '|' + train_df['qa_que_negs']
train_df['qa_dialogue'] = train_df['qa_dialogue'] + '|' + train_df['qa_quenot_negs']

train_df.drop(columns=['qa_que_negs', 'qa_quenot_negs'], inplace=True)

In [32]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,qa_turn,qan_turn,qa_dialogue
0,1011001001001010110,"I need lodgings, and I'd like to stay in an ex...",0,1110011010011110000,1110211010211110222,"I need lodgings, and I'd like to stay in an ex..."
1,1001001110001101101001011000,I'm looking for a medium priced Chinese restau...,0,0011010110000111110111101000,2211010110222111110111101022,I'm looking for a medium priced Chinese restau...
2,1011010010100100,I would like to eat at an expensive european f...,0,1101010101100100,1101010101102102,I would like to eat at an expensive european f...


In [33]:
len(train_df['qa_dialogue'][0].split('|'))

9

### negative sample 생성: 질문 문장 대체 4개 + 대답 문장 대체 4개

In [34]:
# 대답 문장(0)과 질문도 대답도 아닌 문장(2)를 구분하여, 대답 문장만 다른 문장으로 대체
def generate_ans_negatives(dialogue, a_list):
    """Build 4 negative dialogues in which only answer sentences are replaced.

    Sentences are classified with the same rules as generate_qan_turn: a
    question contains '?', starts with a question keyword, or contains an
    intent keyword; an answer is a non-question sentence that directly follows
    a question. Answers are swapped for a random draw from `a_list`; questions
    and "neither" sentences are kept.

    Args:
        dialogue: sentences separated by '#'.
        a_list: pool of replacement sentences.

    Returns:
        The 4 generated dialogues joined with '|'.
    """
    sentences = dialogue.split('#')
    neg_samples = []
    for _ in range(4):
        # Reset for every sample: if the flag survived from the previous pass
        # (dialogue ending in a question), the first non-question sentence of
        # this pass would be replaced although generate_qan_turn labels it 2.
        previous_has_question = False
        neg_sample = []
        for sentence in sentences:
            text = sentence.strip()
            if '?' in sentence or question_keywords.match(text) or intent_keywords.search(text):
                # Question (1): keep it and remember it for the next sentence.
                previous_has_question = True
                neg_sample.append(sentence)
            elif previous_has_question:
                # Answer (0): non-question right after a question -> replace.
                previous_has_question = False
                neg_sample.append(random.choice(a_list))
            else:
                # Neither question nor answer (2): keep.
                neg_sample.append(sentence)

        neg_samples.append('#'.join(neg_sample))
    return '|'.join(neg_samples)

In [35]:
train_df['qa_que_negs'] = train_df['dialogue'].apply(lambda x: generate_que_negatives(x, q_list))
train_df['qa_ans_negs'] = train_df['dialogue'].apply(lambda x: generate_ans_negatives(x, a_list))

In [36]:
train_df['qan_dialogue'] = train_df['dialogue'] + '|' + train_df['qa_que_negs']
train_df['qan_dialogue'] = train_df['qan_dialogue'] + '|' + train_df['qa_ans_negs']

train_df.drop(columns=['qa_que_negs', 'qa_ans_negs'], inplace=True)

In [37]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,qa_turn,qan_turn,qa_dialogue,qan_dialogue
0,1011001001001010110,"I need lodgings, and I'd like to stay in an ex...",0,1110011010011110000,1110211010211110222,"I need lodgings, and I'd like to stay in an ex...","I need lodgings, and I'd like to stay in an ex..."
1,1001001110001101101001011000,I'm looking for a medium priced Chinese restau...,0,0011010110000111110111101000,2211010110222111110111101022,I'm looking for a medium priced Chinese restau...,I'm looking for a medium priced Chinese restau...
2,1011010010100100,I would like to eat at an expensive european f...,0,1101010101100100,1101010101102102,I would like to eat at an expensive european f...,I would like to eat at an expensive european f...


In [38]:
len(train_df['qan_dialogue'][0].split('|'))

9

### trainset 정리

칼럼명
- turn: turn을 반영(대화자1의 turn에 2개의 문장이 있을 경우 11로 할당)
- dialogue: 원본 대화 1개
- label: .
- qa_turn: 질문 문장이면 1, 질문 문장이 아니면 0을 할당
- qan_turn: 질문 문장이면 1, 대답 문장이면 0, 질문도 대답도 아닌 문장이면 2를 할당
- qa_dialogue: qa_turn을 기반으로, 원본 대화 1개와 질문 문장을 대체한 4개 대화와 질문이 아닌 문장을 대체한 4개 대화로 구성
- qan_dialogue: qan_turn을 기반으로, 원본 대화 1개와 질문 문장을 대체한 4개 대화와 대답 문장을 대체한 4개 대화로 구성


In [39]:
train_df.head(3)

Unnamed: 0,turn,dialogue,label,qa_turn,qan_turn,qa_dialogue,qan_dialogue
0,1011001001001010110,"I need lodgings, and I'd like to stay in an ex...",0,1110011010011110000,1110211010211110222,"I need lodgings, and I'd like to stay in an ex...","I need lodgings, and I'd like to stay in an ex..."
1,1001001110001101101001011000,I'm looking for a medium priced Chinese restau...,0,0011010110000111110111101000,2211010110222111110111101022,I'm looking for a medium priced Chinese restau...,I'm looking for a medium priced Chinese restau...
2,1011010010100100,I would like to eat at an expensive european f...,0,1101010101100100,1101010101102102,I would like to eat at an expensive european f...,I would like to eat at an expensive european f...


### QA 결합: 불균형 해소

In [40]:
# QA 결합
from tqdm import tqdm

def comb_qa(df, dialogue_col='qa_dialogue'):
    """Merge consecutive sentences that share the same qa_turn flag.

    For each row, runs of identical characters in `qa_turn` are collapsed to a
    single character, and the sentences belonging to one run are joined with a
    space. The same merge boundaries (taken from the row's `qa_turn`) are
    applied to every '|'-separated dialogue in `dialogue_col`.

    Args:
        df: frame with a string column 'qa_turn' and the column `dialogue_col`.
        dialogue_col: column holding '|'-separated dialogues whose sentences
            are separated by '#'. Defaults to 'qa_dialogue'.

    Returns:
        (new_samples, new_turns): the merged '|'-joined dialogues and the
        collapsed qa_turn strings, one entry per row.
    """
    new_samples = []
    new_turns = []
    # Wrap the whole zip so the progress bar reaches 100%.
    rows = zip(df['qa_turn'], df[dialogue_col])
    for qa_turn, sample in tqdm(rows, total=len(df)):
        # Boundaries depend only on qa_turn, so compute them once per row
        # instead of once per dialogue.
        starts_new_turn = [
            i == 0 or qa_turn[i] != qa_turn[i - 1] for i in range(len(qa_turn))
        ]
        new_qa_turn = ''.join(
            flag for flag, is_start in zip(qa_turn, starts_new_turn) if is_start
        )

        merged_dials = []
        for dial in sample.split("|"):
            sentences = dial.split("#")
            merged = sentences[0]
            for i in range(1, len(qa_turn)):
                joiner = "#" if starts_new_turn[i] else " "
                merged += joiner + sentences[i]
            merged_dials.append(merged)

        new_samples.append("|".join(merged_dials))
        new_turns.append(new_qa_turn)

    return new_samples, new_turns

train_new_samples, train_new_turns = comb_qa(train_df)
train_df['comb_qa_dialogue'] = train_new_samples
train_df['comb_qa_turn'] = train_new_turns

  0%|          | 0/16698 [00:00<?, ?it/s]

100%|█████████▉| 16697/16698 [00:01<00:00, 12638.25it/s]


In [41]:
comb_qa_train_df = pd.DataFrame({
    'comb_qa_turn': train_new_turns,
    'dialogue': train_new_samples,
    'label': train_df['label'].to_list()
})
comb_qa_train_df

Unnamed: 0,comb_qa_turn,dialogue,label
0,10101010,"I need lodgings, and I'd like to stay in an ex...",0
1,0101010101010,I'm looking for a medium priced Chinese restau...,0
2,101010101010,I would like to eat at an expensive european f...,0
3,101010101010101010,"I feel like visiting a museum, find me one ple...",0
4,10101010101010,I'd like to find a college to visit in the cen...,0
...,...,...,...
16693,10101010,I want to find an expensive restaurant#There a...,0
16694,010101010,I'm looking for a museum to visit. I have 23 t...,0
16695,101010101010101010,Can you help me out with finding a restaurant ...,0
16696,1010101010101010,"Hi, I would like a restaurant inthe centre of ...",0


In [42]:
# 전체 샘플 9개 확인
for i in comb_qa_train_df['dialogue']:
    if len(i.split("|")) != 9:
        print(i)

#### 16개 이상 turn 개수 분석
- sgd: 2197개 중 1241개로 여전히 많음(56%)
- mwoz: 16698개 중 4998개 30%
- selfdialog: 38662개 중 3792개 10%
- metalwoz: 60614개 중 270개 0.4%

In [43]:
comb_qa_train_df['num_turns'] = comb_qa_train_df['comb_qa_turn'].apply(len)
comb_qa_train_df[comb_qa_train_df['num_turns'] >= 16]  # ['dialogue'].iloc[0]

Unnamed: 0,comb_qa_turn,dialogue,label,num_turns
3,101010101010101010,"I feel like visiting a museum, find me one ple...",0,18
5,0101010101010101,I'm looking for a restaurant called Saffron Br...,0,16
10,01010101010101010,"Hello, I am looking for a guesthouse in the we...",0,17
16,10101010101010101010,What trains will leave from cambrige and arriv...,0,20
17,1010101010101010,I need a restaurant in the east.#I have severa...,0,16
...,...,...,...,...
16685,101010101010101010101010,Where is this saffron brasserie located?#it's ...,0,24
16686,101010101010101010,Can you please recommend a hotel? I don't need...,0,18
16691,101010101010101010101010,Can you help me find a nice 4 star hotel?#I wo...,0,24
16695,101010101010101010,Can you help me out with finding a restaurant ...,0,18


In [44]:
import os

# dataset 폴더 생성
# case1_folder_path = "0511_qa_st"
# if not os.path.exists(case1_folder_path):
#     os.makedirs(case1_folder_path)
#     print(f"Folder '{case1_folder_path}' created successfully.")
# else:
#     print(f"Folder '{case1_folder_path}' already exists.")
    
# case2_folder_path = "0510_qan_st"
# if not os.path.exists(case2_folder_path):
#     os.makedirs(case2_folder_path)
#     print(f"Folder '{case2_folder_path}' created successfully.")
# else:
#     print(f"Folder '{case2_folder_path}' already exists.")

# case3_folder_path = f"0511_{datasetname}_comb_qa"
# if not os.path.exists(case3_folder_path):
#     os.makedirs(case3_folder_path)
#     print(f"Folder '{case3_folder_path}' created successfully.")
# else:
#     print(f"Folder '{case3_folder_path}' already exists.")

case5_folder_path = f"0527_{datasetname}_comb_qa"
if not os.path.exists(case5_folder_path):
    os.makedirs(case5_folder_path)
    print(f"Folder '{case5_folder_path}' created successfully.")
else:
    print(f"Folder '{case5_folder_path}' already exists.")

Folder '0527_mwoz_comb_qa' created successfully.


In [45]:
# case1: 
# train_case1 = train_df[['turn', 'qa_dialogue', 'label', 'qa_turn']]
# train_case1.to_csv(f"{case1_folder_path}/train.tsv", sep="\t", index=False, header=False)

# case2: 
# train_case2 = train_df[['turn', 'qan_dialogue', 'label', 'qan_turn']]
# train_case2.to_csv(f"{case2_folder_path}/train.tsv", sep="\t", index=False, header=False)

# case3: qa turn 결합
train_case5 = comb_qa_train_df[['comb_qa_turn', 'dialogue', 'label']]
train_case5.to_csv(f"{case5_folder_path}/train.tsv", sep="\t", index=False, header=False)

### dev, test 생성

In [46]:
datasetname = 'mwoz' # metalwoz, mwoz, selfdialog, sgd
column_names = ['turn', 'dialogue', 'label']
path = '/home/jihyeon41/research_dial_embedding/dial2vec_git/dial2vec/datasets/'

dev_df = pd.read_csv(f'{path}/{datasetname}/clustering_dev.tsv', sep = '\t', header=None, names=column_names)
test_df = pd.read_csv(f'{path}/{datasetname}/clustering_test.tsv', sep = '\t', header=None, names=column_names)

In [47]:
dev_df.head(2)

Unnamed: 0,turn,dialogue,label
0,10101010,I need a 19:15 taxi to take me to Avalon.#I ca...,1
1,10101010,Good day. Might you be able to assist me with ...,1


In [48]:
# dev, test의 dialogue 역시 문장 단위로 #구분자로 구분되도록 변경
# turn도 문장단위로 변경하되, 동일한 턴은 같은 숫자 부여
def sep_sent(df):
    """Re-segment a dev/test frame from turn level to sentence level.

    Keeps only the first '|'-separated dialogue of each row, splits its turns
    into sentences (add_sep_to_dialogue), rewrites 'turn' as one flag per
    sentence (sentences of the same turn share a flag) and rewrites 'dialogue'
    with '#' between sentences.

    Args:
        df: frame with 'turn' and 'dialogue' columns. It is modified in place.

    Returns:
        The same frame, for convenience.
    """
    df['dialogue'] = df['dialogue'].str.split('|').str[0]

    # Built as a standalone Series instead of df['sep_dial'][i] = ..., which is
    # chained indexing and may not write to the frame.
    sep_dial = pd.Series(
        [add_sep_to_dialogue(dial) for dial in tqdm(df['dialogue'], total=len(df['dialogue']))],
        index=df.index,
        dtype=object,
    )

    sep_counts_per_turn = sep_dial.apply(count_sep_per_turn)
    df['turn'] = sep_counts_per_turn.apply(generate_turn_list).apply(convert_to_binary)

    # regex=False: replace the literal marker; as a regex, '[sep]' would be a
    # character class matching the letters s, e and p.
    df['dialogue'] = sep_dial.str.replace('[sep]', '#', regex=False)
    return df

dev_df = sep_sent(dev_df)
test_df = sep_sent(test_df)

  0%|          | 0/1077 [00:00<?, ?it/s]

100%|██████████| 1077/1077 [00:00<00:00, 5853.28it/s]
100%|██████████| 1084/1084 [00:00<00:00, 5967.82it/s]


In [49]:
dev_df.head(3)

Unnamed: 0,turn,dialogue,label
0,10010001001100,I need a 19:15 taxi to take me to Avalon.#I ca...,1
1,11001010100,Good day.#Might you be able to assist me with ...,1
2,1010001001001110001100,I am looking for a restaurant.#Is there a pric...,2


In [50]:
dev_df['qa_turn'] = dev_df['dialogue'].apply(get_qa_turn)
dev_df['qa_turn'] = dev_df['qa_turn'].apply(convert_to_binary)

test_df['qa_turn'] = test_df['dialogue'].apply(get_qa_turn)
test_df['qa_turn'] = test_df['qa_turn'].apply(convert_to_binary)

In [51]:
dev_df['qan_turn'] = dev_df['dialogue'].apply(generate_qan_turn)
dev_df['qan_turn'] = dev_df['qan_turn'].apply(convert_to_binary)

test_df['qan_turn'] = test_df['dialogue'].apply(generate_qan_turn)
test_df['qan_turn'] = test_df['qan_turn'].apply(convert_to_binary)

In [52]:
# dialogue 9개 반복
dev_df['dialogue'] = dev_df['dialogue'].apply(lambda x: '|'.join([x] * 9))
test_df['dialogue'] = test_df['dialogue'].apply(lambda x: '|'.join([x] * 9))

In [53]:
test_df.head(3)

Unnamed: 0,turn,dialogue,label,qa_turn,qan_turn
0,1010011011110,I am looking for a train to Ely.#What day are ...,1,110001000000,2110221022222
1,1000100010010010100,"Heya, can you find me an expensive restaurant ...",2,1001100100110101000,1021102102110101022
2,101010010,"Hey, I'm up south can you give me a place to e...",2,110010000,110210222


In [54]:
len(test_df['dialogue'][0].split('|'))
len(dev_df['dialogue'][0].split('|'))

9

In [55]:
# QA 결합
from tqdm import tqdm

def comb_qa(df, dialogue_col='dialogue'):
    """Merge consecutive sentences that share the same qa_turn flag.

    This redefinition replaces the earlier comb_qa in the notebook and reads
    the 'dialogue' column by default. For each row, runs of identical
    characters in `qa_turn` are collapsed to a single character, and the
    sentences belonging to one run are joined with a space. The same merge
    boundaries are applied to every '|'-separated dialogue in `dialogue_col`.

    Args:
        df: frame with a string column 'qa_turn' and the column `dialogue_col`.
        dialogue_col: column holding '|'-separated dialogues whose sentences
            are separated by '#'. Defaults to 'dialogue'.

    Returns:
        (new_samples, new_turns): the merged '|'-joined dialogues and the
        collapsed qa_turn strings, one entry per row.
    """
    new_samples = []
    new_turns = []
    # Wrap the whole zip so the progress bar reaches 100%.
    rows = zip(df['qa_turn'], df[dialogue_col])
    for qa_turn, sample in tqdm(rows, total=len(df)):
        # Boundaries depend only on qa_turn, so compute them once per row
        # instead of once per dialogue.
        starts_new_turn = [
            i == 0 or qa_turn[i] != qa_turn[i - 1] for i in range(len(qa_turn))
        ]
        new_qa_turn = ''.join(
            flag for flag, is_start in zip(qa_turn, starts_new_turn) if is_start
        )

        merged_dials = []
        for dial in sample.split("|"):
            sentences = dial.split("#")
            merged = sentences[0]
            for i in range(1, len(qa_turn)):
                joiner = "#" if starts_new_turn[i] else " "
                merged += joiner + sentences[i]
            merged_dials.append(merged)

        new_samples.append("|".join(merged_dials))
        new_turns.append(new_qa_turn)

    return new_samples, new_turns

dev_new_samples, dev_new_turns = comb_qa(dev_df)
dev_df['comb_qa_dialogue'] = dev_new_samples
dev_df['comb_qa_turn'] = dev_new_turns

test_new_samples, test_new_turns = comb_qa(test_df)
test_df['comb_qa_dialogue'] = test_new_samples
test_df['comb_qa_turn'] = test_new_turns

100%|█████████▉| 1076/1077 [00:00<00:00, 24145.69it/s]


100%|█████████▉| 1083/1084 [00:00<00:00, 24512.34it/s]


In [56]:
import os

# dataset 폴더 생성
# case1_folder_path = "0511_qa_st"
# if not os.path.exists(case1_folder_path):
#     os.makedirs(case1_folder_path)
#     print(f"Folder '{case1_folder_path}' created successfully.")
# else:
#     print(f"Folder '{case1_folder_path}' already exists.")
    
# case2_folder_path = "0510_qan_st"
# if not os.path.exists(case2_folder_path):
#     os.makedirs(case2_folder_path)
#     print(f"Folder '{case2_folder_path}' created successfully.")
# else:
#     print(f"Folder '{case2_folder_path}' already exists.")

# case3_folder_path = f"0511_{datasetname}_comb_qa"
# if not os.path.exists(case3_folder_path):
#     os.makedirs(case3_folder_path)
#     print(f"Folder '{case3_folder_path}' created successfully.")
# else:
#     print(f"Folder '{case3_folder_path}' already exists.")
    
case5_folder_path = f"0527_{datasetname}_comb_qa"
if not os.path.exists(case5_folder_path):
    os.makedirs(case5_folder_path)
    print(f"Folder '{case5_folder_path}' created successfully.")
else:
    print(f"Folder '{case5_folder_path}' already exists.")

Folder '0527_mwoz_comb_qa' already exists.


In [57]:
# 데이터 저장
# case1: 0510_qa_st
# dev_case1 = dev_df[['turn', 'dialogue', 'label', 'qa_turn']]
# dev_case1.to_csv(f"{case1_folder_path}/clustering_dev.tsv", sep="\t", index=False, header=False)

# test_case1 = test_df[['turn', 'dialogue', 'label', 'qa_turn']]
# test_case1.to_csv(f"{case1_folder_path}/clustering_test.tsv", sep="\t", index=False, header=False)

# # case2: 0510_qan_st
# dev_case2 = dev_df[['turn', 'dialogue', 'label', 'qan_turn']]
# dev_case2.to_csv(f"{case2_folder_path}/clustering_dev.tsv", sep="\t", index=False, header=False)

# test_case2 = test_df[['turn', 'dialogue', 'label', 'qan_turn']]
# test_case2.to_csv(f"{case2_folder_path}/clustering_test.tsv", sep="\t", index=False, header=False)

# case3: 0511_comb_qa
# dev_case3 = dev_df[['comb_qa_turn', 'comb_qa_dialogue', 'label']]
# dev_case3.to_csv(f"{case3_folder_path}/clustering_dev.tsv", sep="\t", index=False, header=False)

# test_case3 = test_df[['comb_qa_turn', 'comb_qa_dialogue', 'label']]
# test_case3.to_csv(f"{case3_folder_path}/clustering_test.tsv", sep="\t", index=False, header=False)

# case5: 0520_sgd_comb_qa, 0520_mwoz_comb_qa
dev_case5 = dev_df[['comb_qa_turn', 'comb_qa_dialogue', 'label']]
dev_case5.to_csv(f"{case5_folder_path}/clustering_dev.tsv", sep="\t", index=False, header=False)

test_case5 = test_df[['comb_qa_turn', 'comb_qa_dialogue', 'label']]
test_case5.to_csv(f"{case5_folder_path}/clustering_test.tsv", sep="\t", index=False, header=False)

In [58]:
test_case5

Unnamed: 0,comb_qa_turn,comb_qa_dialogue,label
0,01010,I am looking for a train to Ely.#What day are ...,1
1,101010101010,"Heya, can you find me an expensive restaurant ...",2
2,1010,"Hey, I'm up south can you give me a place to e...",2
3,010101010,I am looking for a 4 star place to stay.#What ...,3
4,101010,I would like to eat in the Center of town at a...,2
...,...,...,...
1079,101,I'm looking for the information on a restauran...,2
1080,010101010101010,Hi there. I'll be coming into the centre of to...,3
1081,010101010,I'm looking for a restaurant in the centre of ...,2
1082,10,I need a taxi to go to saigon city and arrive ...,5


---

### 질문 턴의 위치를 바꾼 positive 생성

In [59]:
datasetname = '0527_mwoz_comb_qa' # metalwoz, mwoz, selfdialog, sgd
column_names = ['qa_turn', 'dialogue', 'label']
path = '/home/jihyeon41/research_dial_embedding/dial2vec_git/dial2vec/datasets/'

# qa_turn is a string of 0/1 flags that shuffle_questions indexes character by
# character. Force str: when pandas infers an integer type for this all-digit
# column, leading zeros are lost ('0101' -> 101) and qa_turns[i] no longer works.
train_df = pd.read_csv(
    f'{path}{datasetname}/train.tsv',
    sep = '\t',
    header=None,
    names=column_names,
    dtype={'qa_turn': str},
)

In [60]:
len(train_df['dialogue'][0].split('|'))

9

In [61]:
def shuffle_questions(df):
    """Insert a question-shuffled copy of the anchor dialogue after the anchor.

    The anchor is the first '|'-separated dialogue of df['dialogue']. Turns
    whose flag in df['qa_turn'] is '1' are shuffled among themselves with
    random.shuffle; all other turns keep their position. The shuffled dialogue
    is inserted as the second '|'-separated entry.

    Args:
        df: a single row (anything indexable by 'dialogue' and 'qa_turn').

    Returns:
        The updated '|'-joined dialogue string, one entry longer than the input.
    """
    dialogues = df['dialogue'].split('|')
    turns = dialogues[0].split('#')
    qa_turns = df['qa_turn']

    # One flag per anchor turn: True where the turn is marked as a question.
    is_question = [qa_turns[i] == '1' for i in range(len(turns))]

    questions = [turn for turn, flag in zip(turns, is_question) if flag]
    answers = [turn for turn, flag in zip(turns, is_question) if not flag]

    random.shuffle(questions)

    # Refill the original question / non-question slots in order.
    question_iter = iter(questions)
    answer_iter = iter(answers)
    shuffled_turns = [
        next(question_iter) if flag else next(answer_iter) for flag in is_question
    ]

    dialogues.insert(1, '#'.join(shuffled_turns))
    return '|'.join(dialogues)

In [62]:
train_df['dialogue'] = train_df.apply(shuffle_questions, axis=1)

In [63]:
len(train_df['dialogue'][0].split('|'))

10

In [65]:
import os

case4_folder_path = f"0527_mwoz_anc_q_shuffle" # 폴더명 변경 필요
if not os.path.exists(case4_folder_path):
    os.makedirs(case4_folder_path)
    print(f"Folder '{case4_folder_path}' created successfully.")
else:
    print(f"Folder '{case4_folder_path}' already exists.")

Folder '0527_mwoz_anc_q_shuffle' created successfully.


In [66]:
train_df.to_csv(f"{case4_folder_path}/train.tsv", sep="\t", index=False, header=False)

In [67]:
# dev, test도 데이터 10개로 다시 수정

# Read qa_turn as str so that the 0/1 flag strings keep their leading zeros
# instead of being parsed as integers.
dev_df = pd.read_csv(f'{path}{datasetname}/clustering_dev.tsv', sep = '\t', header=None, names=column_names, dtype={'qa_turn': str})
test_df = pd.read_csv(f'{path}{datasetname}/clustering_test.tsv', sep = '\t', header=None, names=column_names, dtype={'qa_turn': str})

In [68]:
len(dev_df['dialogue'][0].split('|'))

9

In [69]:
def repeat_dialogue(df):
    """Append a copy of the first '|'-separated dialogue to the end of the string.

    Args:
        df: a '|'-joined dialogue string (despite the name, not a DataFrame).

    Returns:
        The input followed by '|' and its first dialogue.
    """
    first_dialogue = df.split('|')[0]
    return f"{df}|{first_dialogue}"

In [70]:
dev_df['dialogue'] = dev_df['dialogue'].apply(repeat_dialogue)
test_df['dialogue'] = test_df['dialogue'].apply(repeat_dialogue)

In [71]:
len(dev_df['dialogue'][0].split('|'))

10

In [72]:
dev_df.to_csv(f"{case4_folder_path}/clustering_dev.tsv", sep="\t", index=False, header=False)
test_df.to_csv(f"{case4_folder_path}/clustering_test.tsv", sep="\t", index=False, header=False)

----