In [1]:
import sys
sys.path.insert(0, '..')

import json
import re
import torch
import pandas as pd
import nltk
import numpy as np
import pickle
from pathlib import Path
from model.utils import remove_punct, sent_tokenize, stemming, lemmatize, preprocess_text

In [2]:
# Directory layout, relative to the notebook's working directory:
# <home_dir>/<data_dir>/<data_name>/<split file>.
home_dir, data_dir, data_name = (Path(part) for part in ('..', 'dataset', 'SDAC'))

# File names of the three corpus splits.
train_data_name, valid_data_name, test_data_name = (
    'sw_train.txt',
    'sw_val.txt',
    'sw_test.txt',
)

In [3]:
# All three splits share one format: no header row, '|' as the field
# separator, and three columns named speaker / utterance / tag.
read_options = dict(header=None, sep='|', names=['speaker','utterance','tag'])
split_dir = home_dir / data_dir / data_name

train_data, valid_data, test_data = (
    pd.read_csv(split_dir / file_name, **read_options)
    for file_name in (train_data_name, valid_data_name, test_data_name)
)

In [4]:
# Preview the training split (columns: speaker, utterance, tag).
train_data

Unnamed: 0,speaker,utterance,tag
0,A,Okay.,"fo_o_fw_""_by_bc"
1,A,"So, What kind of experience do you, do you hav...",qw
2,B,"I guess, I think, uh, I wonder if that worked.",qy^d
3,A,Does it say something?,qy
4,B,I think it usually does.,sd
...,...,...,...
192381,B,it is.,sd
192382,B,It really is.,sd
192383,B,"Yeah,",ny
192384,B,it really is.,sd


In [5]:
# Preview the validation split (columns: speaker, utterance, tag).
valid_data

Unnamed: 0,speaker,utterance,tag
0,B,"Um, all right.","fo_o_fw_""_by_bc"
1,A,"I've, uh,",%
2,A,"as far as I'm concerned, I find that the young...",sv
3,A,and I think that comes about from their being ...,sv
4,B,Uh-huh.,aa
...,...,...,...
3267,B,"and I know they've, there's a lot of refinerie...",sd
3268,B,"and that, that's some pretty potent stuff they...",sv
3269,B,"I, but I don't know how, uh, you know,",sd
3270,B,there's a difference in what you can smell and...,sv


In [6]:
# Preview the test split (columns: speaker, utterance, tag).
test_data

Unnamed: 0,speaker,utterance,tag
0,A,"Okay, uh,","fo_o_fw_""_by_bc"
1,A,could you tell me what you think contributes m...,qw
2,B,"Well, it's hard to say.",^h
3,B,"I mean, while it's certainly the case that thi...",sv
4,B,What do you think?,qo
...,...,...,...
4073,B,I appreciate it.,fc
4074,A,Okay.,fc
4075,B,Catch you later.,fc
4076,A,Bye-bye.,fc


In [7]:
# Stack the utterance column of every split (train, valid, test - in that
# order) into one Series and renumber the rows from 0.
utterance_columns = [split['utterance'] for split in (train_data, valid_data, test_data)]
data = pd.concat(utterance_columns, axis=0, ignore_index=True)
data

0                                                     Okay.
1         So, What kind of experience do you, do you hav...
2            I guess, I think, uh, I wonder if that worked.
3                                    Does it say something?
4                                  I think it usually does.
                                ...                        
199731                                     I appreciate it.
199732                                                Okay.
199733                                     Catch you later.
199734                                             Bye-bye.
199735                                             Bye-bye.
Name: utterance, Length: 199736, dtype: object

    자연어 전처리 파이프라인 구동을 위해 다운로드 필요
    import nltk
    nltk.download('punkt')
    nltk.download('wordnet')

In [8]:
# Processing steps handed to preprocess_text for every utterance.
# NOTE(review): presumably applied in list order - confirm in model.utils.
preprocess_pipeline = [sent_tokenize, stemming]

# `data` is overwritten with the processed values, so re-running this cell
# feeds already-processed entries back into preprocess_text.
data = data.apply(
    lambda utterance: preprocess_text(
        utterance, processing_function_list=preprocess_pipeline
    )
)
data

0                                                 [okay, .]
1         [so, ,, what, kind, of, experi, do, you, ,, do...
2         [i, guess, ,, i, think, ,, uh, ,, i, wonder, i...
3                                 [doe, it, say, someth, ?]
4                             [i, think, it, usual, doe, .]
                                ...                        
199731                                  [i, appreci, it, .]
199732                                            [okay, .]
199733                               [catch, you, later, .]
199734                                          [bye-by, .]
199735                                          [bye-by, .]
Name: utterance, Length: 199736, dtype: object

In [10]:
# Some utterances contain nothing but punctuation after preprocessing.
#
# Their positions are derived from `data` itself instead of being hard-coded:
# fixed row numbers silently go stale (or raise a KeyError) as soon as the
# corpus files or the preprocessing pipeline change.
# An entry is selected when none of its tokens contains an alphanumeric
# character; an entry with no tokens at all is selected as well.
zero_sent_idx = [
    idx
    for idx, sent in data.items()
    if not any(ch.isalnum() for token in sent for ch in token)
]

data[zero_sent_idx]

15546     [.]
17228     [.]
17851     [.]
19150     [.]
21495     [.]
26890     [.]
27815     [.]
50593     [?]
120070    [.]
122050    [.]
122228    [.]
134316    [.]
134342    [.]
Name: utterance, dtype: object

In [11]:
# Vocabulary lookup: index 0 is reserved for the padding token, and every
# other token receives the next free index the first time it is encountered.
word2idx = {'<PAD>': 0}
for sent in data:
    for word in sent:
        # setdefault only inserts when the token is new, so known tokens
        # keep the index they were first given.
        word2idx.setdefault(word, len(word2idx))

In [12]:
# Stack the tag column of every split (train, valid, test - in that order)
# into one Series and renumber the rows from 0.
tag_columns = [split['tag'] for split in (train_data, valid_data, test_data)]
tags = pd.concat(tag_columns, axis=0, ignore_index=True)
tags

0         fo_o_fw_"_by_bc
1                      qw
2                    qy^d
3                      qy
4                      sd
               ...       
199731                 fc
199732                 fc
199733                 fc
199734                 fc
199735                 fc
Name: tag, Length: 199736, dtype: object

In [13]:
# Give every distinct tag an integer id, numbered in the order in which the
# tags first appear in `tags`.
labels = tags.unique().tolist()
label2idx = dict(zip(labels, range(len(labels))))
label2idx

{'fo_o_fw_"_by_bc': 0,
 'qw': 1,
 'qy^d': 2,
 'qy': 3,
 'sd': 4,
 'ad': 5,
 'h': 6,
 'aa': 7,
 'b': 8,
 'sv': 9,
 'bk': 10,
 'nn': 11,
 'na': 12,
 'bh': 13,
 'ny': 14,
 '%': 15,
 'ba': 16,
 'bf': 17,
 'b^m': 18,
 'qh': 19,
 'no': 20,
 't1': 21,
 'qo': 22,
 '^h': 23,
 'qrr': 24,
 'oo_co_cc': 25,
 '^q': 26,
 '^2': 27,
 'br': 28,
 'aap_am': 29,
 'bd': 30,
 '^g': 31,
 'fc': 32,
 'ft': 33,
 'ar': 34,
 't3': 35,
 'ng': 36,
 'qw^d': 37,
 'fp': 38,
 'fa': 39,
 'arp_nd': 40}

In [14]:
# Summary sizes stored alongside the vocabularies: number of vocabulary
# entries (including '<PAD>'), number of tags, and row count of each split.
config = dict(
    n_words=len(word2idx),
    n_tags=len(label2idx),
    n_train=len(train_data),
    n_valid=len(valid_data),
    n_test=len(test_data),
)

In [15]:
# Write the two lookup tables and the size summary as JSON files into the
# corpus directory, one file per object.
output_dir = home_dir / data_dir / data_name
for file_name, payload in (
    ("word2idx.json", word2idx),
    ("label2idx.json", label2idx),
    ("config.json", config),
):
    with open(output_dir / file_name, "w") as f:
        json.dump(payload, f)

In [20]:
# embedding_dim = 100
# pretrained_embeddings = f"glove.6b.{embedding_dim}d.txt"
# embeddings = {}
# with open(data_dir / pretrained_embeddings, encoding="utf8") as file:
#         for line in file:
#             values = line.rstrip().rsplit(' ')
#             word = values[0]
#             vector = np.asarray(values[1:], dtype='float32')
#             embeddings[word] = vector

# embedding_matrix = np.zeros((len(word2idx), embedding_dim))
# for word, idx in word2idx.items():
#     if word in embeddings.keys():
#         word_embedding = embeddings[word]
#         embedding_matrix[idx] = word_embedding

# with open(data_dir / "embeddings.pkl", "wb") as f:
#     pickle.dump(embedding_matrix, f)

# # https://team-platform.tistory.com/38