In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so changes to the helper modules do not require a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path,
# presumably so that the `helpers` package imported in the next cell resolves.
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path again.
project_root = Path().absolute().parent.as_posix()
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import pandas as pd
from helpers.data_prep import *
from helpers.utils import *
import yaml
import numpy as np
from matplotlib import pylab as plt

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.572 seconds.
DEBUG:jieba:Loading model cost 0.572 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [4]:
import logging

# Take the root logger (getLogger without a name) and set its threshold to
# DEBUG so records of every severity pass the logger-level check.
LOG = logging.getLogger(name=None)
LOG.setLevel(level=logging.DEBUG)

In [5]:
# The `data` directory sits beside the notebook's working directory, i.e.
# one level up from the current directory.
data_path = Path.cwd().parent / 'data'
print(data_path)

/mnt/raid/Classes/houchang/nlp2020/QA-Summary/data


In [96]:
# Input CSVs for the two splits.
train_file = data_path / 'AutoMaster_TrainSet.csv'
test_file = data_path / 'AutoMaster_TestSet.csv'

# Cleaning resources handed to the loaders.
stopwords_file = data_path / 'stopwords.yml'
replacements_file = data_path / 'replacements.yml'

# Artifacts written further down by the counter and the vocabulary.
freq_file = data_path / 'word_freq.csv'
vocab_file = data_path / 'vocab.yml'

# Load the data and split it into Question, Dialogue, and Report (Report only when present)

In [8]:
# Loader for the training CSV, configured with the stop-word and replacement
# files. Per the log output below, load() parses Question, Dialogue and
# Report, then removes stopwords and cleans the sentences.
train_loader = QALoader(
    stopwords_file=stopwords_file,
    replacements_file=replacements_file,
)
train_loader.load(train_file)

INFO:root:Parsing Question...
INFO:root:Parsing Dialogue...
INFO:root:Parsing Report...
INFO:root:Removing stopwords...
INFO:root:Cleaning Sentences...


<__main__.QALoader at 0x7f85bbf77a90>

In [9]:
# Loader for the test CSV; df_type='test' distinguishes it from the training
# loader. The log output below shows only Question and Dialogue being parsed
# (no Report step) before stopword removal and sentence cleaning.
test_loader = QALoader(
    df_type='test',
    stopwords_file=stopwords_file,
    replacements_file=replacements_file,
)
test_loader.load(test_file)

INFO:root:Parsing Question...
INFO:root:Parsing Dialogue...
INFO:root:Removing stopwords...
INFO:root:Cleaning Sentences...


<__main__.QALoader at 0x7f85928664d0>

In [10]:
# Feed both the train and test loaders into one counter.
# NOTE(review): presumably this accumulates word frequencies, since the
# result is saved to word_freq.csv in the next cell — confirm in helpers.
wc = QACounter()
wc.add_loader(train_loader).add_loader(test_loader)

<__main__.QACounter at 0x7f85a55ab590>

In [11]:
# Persist the counter's table to freq_file (word_freq.csv in the data directory).
wc.save_df(freq_file)

In [86]:
# Build the vocabulary from the frequency file written by wc.save_df().
vocab = QAVocab()
vocab.load_freq_file(freq_file)

<__main__.QAVocab at 0x7f84c77f0f50>

In [87]:
# Write the vocabulary to vocab_file (vocab.yml in the data directory).
vocab.save_vocab(vocab_file)

<__main__.QAVocab at 0x7f84c77f0f50>

In [88]:
# Length statistics for every column of both loaders, one describe() column
# per (split, field) pair, labelled e.g. `Train_Question`.
# NOTE(review): len() counts the elements of each entry — tokens if entries
# are token lists, characters if they are strings; confirm against QALoader.
summaries = [
    pd.Series([len(entry) for entry in getattr(loader, attr)])
    .describe(percentiles=[.1, .25, .5, .75, .9])
    .rename(f'{name}_{attr}')
    for name, loader in zip(('Train', 'Test'), (train_loader, test_loader))
    for attr in loader.cols
]
pd.concat(summaries, axis=1)

Unnamed: 0,Train_Question,Train_Dialogue,Train_Report,Test_Question,Test_Dialogue
count,82871.0,82871.0,82871.0,20000.0,20000.0
mean,28.013563,118.864271,20.089223,29.19325,120.014
std,28.30039,141.578053,14.676059,26.701881,152.217361
min,0.0,0.0,0.0,1.0,0.0
10%,8.0,15.0,6.0,8.0,15.0
25%,12.0,34.0,10.0,12.0,33.0
50%,20.0,76.0,17.0,22.0,72.0
75%,36.0,151.0,26.0,37.0,150.0
90%,57.0,267.0,38.0,59.0,275.0
max,2975.0,2868.0,571.0,693.0,3682.0


In [89]:
# Sanity check on the training set: no sample should have an empty Question
# and an empty Dialogue at the same time. Prints the first offending index
# and stops; prints nothing when every sample has at least one of the two.
for idx in range(len(train_loader.Question)):
    question_len = len(train_loader.Question[idx])
    dialogue_len = len(train_loader.Dialogue[idx])
    if question_len + dialogue_len == 0:
        print(idx)
        break

In [102]:
# Target lengths passed as keyword arguments to QAProcessor for the
# Question (len_q), Dialogue (len_d) and Report (len_r) fields.
# For comparison, the 90th-percentile lengths in the summary table above are
# roughly 57-59 (Question), 267-275 (Dialogue) and 38 (Report).
lengths = dict(len_q=60, len_d=240, len_r=60)

In [103]:
# Run the same pipeline on both splits with the shared vocabulary and target
# lengths: load the parsed data, mask out-of-vocabulary items, then
# standardize the lengths (the log output below shows these two stages).
train_proc = (
    QAProcessor(vocab=vocab, **lengths)
    .load_data(train_loader)
    .mask_oov()
    .standardize_length()
)
test_proc = (
    QAProcessor(vocab=vocab, **lengths)
    .load_data(test_loader)
    .mask_oov()
    .standardize_length()
)

INFO:root:Masking OOV...
INFO:root:Standardizing Length...
INFO:root:Masking OOV...
INFO:root:Standardizing Length...


In [104]:
# Output directories for the two variants of the processed sentences.
# exist_ok=True makes the call a no-op when the directory already exists,
# while still raising if the path exists but is not a directory.
unmasked_sentences_path = data_path / 'sentences_unmasked'
unmasked_sentences_path.mkdir(exist_ok=True)

masked_sentences_path = data_path / 'sentences_masked'
masked_sentences_path.mkdir(exist_ok=True)

In [105]:
# Save both processed splits, before sentence masking, into the
# `sentences_unmasked` directory under split-specific prefixes.
train_proc.save(save_path=unmasked_sentences_path, prefix='train_unmasked')
test_proc.save(save_path=unmasked_sentences_path, prefix='test_unmasked')

In [106]:
# Apply sentence masking to each processor and save the result into the
# `sentences_masked` directory.
# NOTE(review): if mask_sentence() mutates the processor in place, re-running
# the cell that saves under the *_unmasked prefixes after this one would
# write masked data there — confirm in QAProcessor.
train_proc.mask_sentence().save(save_path=masked_sentences_path, prefix='train_masked')
test_proc.mask_sentence().save(save_path=masked_sentences_path, prefix='test_masked')

INFO:root:Masking sentences...
INFO:root:Masking sentences...
