In [130]:
import json
import pickle
from pathlib import Path

import nltk
import numpy as np
import pandas as pd

from utils import sent2words

In [3]:
# Directory holding the dataset files, and the file name of each split.
data_dir = Path('dataset')
train_data_name, valid_data_name, test_data_name = (
    'sw_train.txt',
    'sw_val.txt',
    'sw_test.txt',
)

In [18]:
def _read_split(file_name):
    """Read one '|'-separated split file from ``data_dir`` into a DataFrame.

    ``header=None`` makes pandas treat the first line as data; the three
    columns are named explicitly: speaker, utterance, tag.
    """
    return pd.read_csv(
        data_dir / file_name,
        header=None,
        sep='|',
        names=['speaker', 'utterance', 'tag'],
    )


# Same parsing options for every split, so the three frames share one schema.
train_data = _read_split(train_data_name)
valid_data = _read_split(valid_data_name)
test_data = _read_split(test_data_name)

In [65]:
# Display the training split (columns: speaker, utterance, tag).
train_data

Unnamed: 0,speaker,utterance,tag
0,A,Okay.,"fo_o_fw_""_by_bc"
1,A,"So, What kind of experience do you, do you hav...",qw
2,B,"I guess, I think, uh, I wonder if that worked.",qy^d
3,A,Does it say something?,qy
4,B,I think it usually does.,sd
...,...,...,...
192381,B,it is.,sd
192382,B,It really is.,sd
192383,B,"Yeah,",ny
192384,B,it really is.,sd


In [78]:
# Display the validation split (columns: speaker, utterance, tag).
valid_data

Unnamed: 0,speaker,utterance,tag
0,B,"Um, all right.","fo_o_fw_""_by_bc"
1,A,"I've, uh,",%
2,A,"as far as I'm concerned, I find that the young...",sv
3,A,and I think that comes about from their being ...,sv
4,B,Uh-huh.,aa
...,...,...,...
3267,B,"and I know they've, there's a lot of refinerie...",sd
3268,B,"and that, that's some pretty potent stuff they...",sv
3269,B,"I, but I don't know how, uh, you know,",sd
3270,B,there's a difference in what you can smell and...,sv


In [79]:
# Display the test split (columns: speaker, utterance, tag).
test_data

Unnamed: 0,speaker,utterance,tag
0,A,"Okay, uh,","fo_o_fw_""_by_bc"
1,A,could you tell me what you think contributes m...,qw
2,B,"Well, it's hard to say.",^h
3,B,"I mean, while it's certainly the case that thi...",sv
4,B,What do you think?,qo
...,...,...,...
4073,B,I appreciate it.,fc
4074,A,Okay.,fc
4075,B,Catch you later.,fc
4076,A,Bye-bye.,fc


In [92]:
# Stack the utterance column of all three splits (train, then valid, then
# test) into a single Series with a fresh 0..N-1 index.
utterances = pd.concat(
    [split['utterance'] for split in (train_data, valid_data, test_data)],
    axis=0,
    ignore_index=True,
)
utterances

0                                                     Okay.
1         So, What kind of experience do you, do you hav...
2            I guess, I think, uh, I wonder if that worked.
3                                    Does it say something?
4                                  I think it usually does.
                                ...                        
199731                                     I appreciate it.
199732                                                Okay.
199733                                     Catch you later.
199734                                             Bye-bye.
199735                                             Bye-bye.
Name: utterance, Length: 199736, dtype: object

In [132]:
# Build the token -> index vocabulary over every utterance in all splits.
# Index 0 is reserved for the '<PAD>' entry; every other token receives the
# next free index the first time it is seen, so indices follow first
# appearance order.
word2idx = {'<PAD>': 0}
for sent in utterances:
    for w in sent2words(sent):
        # len(word2idx) is evaluated before insertion, and setdefault leaves
        # tokens that are already present untouched.
        word2idx.setdefault(w, len(word2idx))

# Inverse lookup (index -> token). It must be defined here because the next
# cell displays it; leaving it commented out raises NameError on a fresh
# Restart & Run All.
idx2word = {i: w for w, i in word2idx.items()}

In [138]:
# Display the index -> token mapping; requires `idx2word` to already exist
# in the kernel namespace.
idx2word

{0: '<PAD>',
 1: 'okay',
 2: '.',
 3: 'so',
 4: ',',
 5: 'what',
 6: 'kind',
 7: 'of',
 8: 'experience',
 9: 'do',
 10: 'you',
 11: 'have',
 12: 'then',
 13: 'with',
 14: 'child',
 15: 'care',
 16: '?',
 17: 'i',
 18: 'guess',
 19: 'think',
 20: 'uh',
 21: 'wonder',
 22: 'if',
 23: 'that',
 24: 'worked',
 25: 'does',
 26: 'it',
 27: 'say',
 28: 'something',
 29: 'usually',
 30: 'might',
 31: 'try',
 32: "n't",
 33: 'know',
 34: 'hold',
 35: 'down',
 36: 'a',
 37: 'little',
 38: 'longer',
 39: 'and',
 40: 'see',
 41: 'well',
 42: 'make',
 43: 'recording',
 44: 'or',
 45: 'remember',
 46: 'seemed',
 47: 'like',
 48: 'did',
 49: 'but',
 50: 'not',
 51: 'we',
 52: 'can',
 53: 'start',
 54: 'no',
 55: 'any',
 56: 'kids',
 57: 'my',
 58: 'sister',
 59: 'has',
 60: 'she',
 61: 'just',
 62: 'had',
 63: 'baby',
 64: 'he',
 65: "'s",
 66: 'about',
 67: 'five',
 68: 'months',
 69: 'old',
 70: 'was',
 71: 'worrying',
 72: 'going',
 73: 'back',
 74: 'to',
 75: 'work',
 76: 'him',
 77: 'the',
 78: '

In [133]:
# Stack the tag column of all three splits (train, then valid, then test)
# into a single Series with a fresh 0..N-1 index.
tags = pd.concat(
    [split['tag'] for split in (train_data, valid_data, test_data)],
    axis=0,
    ignore_index=True,
)
tags

0         fo_o_fw_"_by_bc
1                      qw
2                    qy^d
3                      qy
4                      sd
               ...       
199731                 fc
199732                 fc
199733                 fc
199734                 fc
199735                 fc
Name: tag, Length: 199736, dtype: object

In [139]:
# Distinct tags, in the order pandas' unique() returns them, each mapped to
# its position in that list.
labels = tags.unique().tolist()
label2idx = dict(zip(labels, range(len(labels))))

In [140]:
# Display the tag -> integer id mapping built from the unique tags.
label2idx

{'fo_o_fw_"_by_bc': 0,
 'qw': 1,
 'qy^d': 2,
 'qy': 3,
 'sd': 4,
 'ad': 5,
 'h': 6,
 'aa': 7,
 'b': 8,
 'sv': 9,
 'bk': 10,
 'nn': 11,
 'na': 12,
 'bh': 13,
 'ny': 14,
 '%': 15,
 'ba': 16,
 'bf': 17,
 'b^m': 18,
 'qh': 19,
 'no': 20,
 't1': 21,
 'qo': 22,
 '^h': 23,
 'qrr': 24,
 'oo_co_cc': 25,
 '^q': 26,
 '^2': 27,
 'br': 28,
 'aap_am': 29,
 'bd': 30,
 '^g': 31,
 'fc': 32,
 'ft': 33,
 'ar': 34,
 't3': 35,
 'ng': 36,
 'qw^d': 37,
 'fp': 38,
 'fa': 39,
 'arp_nd': 40}

In [136]:
# Dataset summary that is written to config.json.
# n_words is the vocabulary size minus one (word2idx also holds the reserved
# '<PAD>' entry at index 0).
# NOTE(review): the embedding matrix built later in this notebook has
# len(word2idx) rows, i.e. n_words + 1 -- confirm consumers of n_words
# account for the extra row.
config = dict(
    n_words=len(word2idx) - 1,
    n_tags=len(label2idx),
    n_train=len(train_data),
    n_valid=len(valid_data),
    n_test=len(test_data),
)

In [141]:
# Write the vocabulary, the label map and the dataset summary as JSON files
# inside data_dir. Only the forward mappings are saved; idx2word and
# idx2label are not written.
for file_name, payload in (
    ("word2idx.json", word2idx),
    ("label2idx.json", label2idx),
    ("config.json", config),
):
    with open(data_dir / file_name, "w") as f:
        json.dump(payload, f)

In [123]:
# Load pretrained GloVe vectors into a {word: float32 vector} dictionary.
# numpy (np) is imported with the other dependencies in the first cell.
embedding_dim = 100
# NOTE(review): the official GloVe archive names this file "glove.6B.100d.txt"
# (capital B); on a case-sensitive filesystem the lowercase name only works
# if the file was renamed -- confirm.
pretrained_embeddings = f"glove.6b.{embedding_dim}d.txt"


def _parse_glove_line(line):
    """Split one space-separated text line into ``(word, float32 vector)``.

    The first field is the token; all remaining fields are parsed as the
    vector components.
    """
    word, *components = line.rstrip().split(' ')
    return word, np.asarray(components, dtype='float32')


embeddings = {}
with open(data_dir / pretrained_embeddings, encoding="utf8") as file:
    for line in file:
        # Skip blank lines so an empty token with a zero-length vector is
        # never stored.
        if not line.strip():
            continue
        word, vector = _parse_glove_line(line)
        embeddings[word] = vector

In [124]:
# Zero-initialised matrix with one row per word2idx entry and embedding_dim
# columns; the row index is the word's index in word2idx.
embedding_matrix = np.zeros((len(word2idx), embedding_dim))

In [128]:
# Copy the pretrained vector into the row of every vocabulary word that has
# one; words missing from `embeddings` keep their all-zero row.
for word, idx in word2idx.items():
    word_embedding = embeddings.get(word)
    if word_embedding is None:
        continue
    embedding_matrix[idx] = word_embedding

In [131]:
# Serialise the filled embedding matrix to embeddings.pkl inside data_dir.
embeddings_path = data_dir / "embeddings.pkl"
with embeddings_path.open("wb") as f:
    pickle.dump(embedding_matrix, f)