In [1]:
import os
import re
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
with open(r"..\data\EMNLP_dataset\train\dialogues_train.txt", "r", encoding="UTF-8") as f:
    raw_dialog_lines = f.readlines()

In [3]:
dialogs = [line.strip().split("__eou__") for line in raw_dialog_lines]
dialogs_cleaned = [[utt.strip() for utt in dialog if utt.strip()] for dialog in dialogs]

In [4]:
max_len = max(len(d) for d in dialogs_cleaned)
dialogs_padded = [d + [""]*(max_len - len(d)) for d in dialogs_cleaned]
columns = [f"utterance_{i+1}" for i in range(max_len)]
df_dialogs = pd.DataFrame(dialogs_padded, columns=columns)
df_dialogs.head()

Unnamed: 0,utterance_1,utterance_2,utterance_3,utterance_4,utterance_5,utterance_6,utterance_7,utterance_8,utterance_9,utterance_10,...,utterance_26,utterance_27,utterance_28,utterance_29,utterance_30,utterance_31,utterance_32,utterance_33,utterance_34,utterance_35
0,"Say , Jim , how about going for a few beers af...",You know that is tempting but is really not go...,What do you mean ? It will help us to relax .,Do you really think so ? I don't . It will jus...,I guess you are right.But what shall we do ? I...,I suggest a walk over to the gym where we can ...,That's a good idea . I hear Mary and Sally oft...,"Sounds great to me ! If they are willing , we ...",Good.Let ' s go now .,All right .,...,,,,,,,,,,
1,Can you do push-ups ?,Of course I can . It's a piece of cake ! Belie...,Really ? I think that's impossible !,You mean 30 push-ups ?,Yeah !,"It's easy . If you do exercise everyday , you ...",,,,,...,,,,,,,,,,
2,Can you study with the radio on ?,"No , I listen to background music .",What is the difference ?,The radio has too many comerials .,"That's true , but then you have to buy a recor...",,,,,,...,,,,,,,,,,
3,Are you all right ?,I will be all right soon . I was terrified whe...,Don't worry.He is an acrobat 。,I see .,,,,,,,...,,,,,,,,,,
4,"Hey John , nice skates . Are they new ?","Yeah , I just got them . I started playing ice...",What position do you play ?,I ’ m a defender . It ’ s a lot of fun . You d...,"Yeah , you ’ re a pretty big guy . I play goal...","Oh , yeah ? Which team ?",The Rockets .,Really ? I think we play you guys next week . ...,"All right , see you later .",,...,,,,,,,,,,


In [5]:
rows = []
for i, dialog in enumerate(dialogs_cleaned):
    for utt in dialog:
        rows.append({"dialog_id": i, "utterance": utt})

flat_dialogs = pd.DataFrame(rows)

In [6]:
flat_dialogs.head()

Unnamed: 0,dialog_id,utterance
0,0,"Say , Jim , how about going for a few beers af..."
1,0,You know that is tempting but is really not go...
2,0,What do you mean ? It will help us to relax .
3,0,Do you really think so ? I don't . It will jus...
4,0,I guess you are right.But what shall we do ? I...


In [7]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [8]:
flat_dialogs["utterance_clean"] = flat_dialogs["utterance"].apply(preprocess_text)

In [9]:
flat_dialogs.head()

Unnamed: 0,dialog_id,utterance,utterance_clean
0,0,"Say , Jim , how about going for a few beers af...",say jim how about going for a few beers after ...
1,0,You know that is tempting but is really not go...,you know that is tempting but is really not go...
2,0,What do you mean ? It will help us to relax .,what do you mean it will help us to relax
3,0,Do you really think so ? I don't . It will jus...,do you really think so i dont it will just mak...
4,0,I guess you are right.But what shall we do ? I...,i guess you are rightbut what shall we do i do...
