In [1]:
import os
import re
import pickle
import collections
import subprocess

import numpy as np

In [2]:
# Fixed sequence length handed to fix_seqlen(): shorter id lists are padded,
# longer ones are truncated to this many ids.
SEQ_LEN = 30

# Data loading

In [3]:
def multi_sub(text, remove_list):
    """Delete every match of each pattern in ``remove_list`` from ``text``.

    The patterns are regular expressions and are applied one after another,
    in list order, each on the result of the previous removal.

    Args:
        text: the string to clean.
        remove_list: iterable of regex patterns to strip out.

    Returns:
        The string left after all removals.
    """
    cleaned = text
    for pattern in remove_list:
        cleaned = re.compile(pattern).sub("", cleaned)
    return cleaned

In [4]:
# Load the negative reviews, one per line. Each line has every "'s" split off
# with a leading space and its trailing " \n" removed.
neg_data = []
with open("rt-polaritydata/rt-polarity.neg", "r", encoding="latin-1") as f:
    neg_data.extend(
        re.sub(" \n", "", re.sub("'s", " 's", raw_line))
        for raw_line in f
    )

In [5]:
# Load the positive reviews, one per line. Each line has every "'s" split off
# with a leading space and its trailing " \n" removed.
pos_data = []
with open("rt-polaritydata/rt-polarity.pos", "r", encoding="latin-1") as f:
    pos_data.extend(
        re.sub(" \n", "", re.sub("'s", " 's", raw_line))
        for raw_line in f
    )

In [6]:
# Sanity check: number of reviews loaded per polarity (negative, then positive).
for reviews in (neg_data, pos_data):
    print(len(reviews))

5331
5331


In [7]:
# train data, valid data, test data
# Each split takes the same slice from both polarities, negatives first:
# everything but the last 500 lines, then the next 250, then the final 250.
raw_data = [
    neg_data[:-500] + pos_data[:-500],
    neg_data[-500:-250] + pos_data[-500:-250],
    neg_data[-250:] + pos_data[-250:],
]

# Labels follow the concatenation order above: the first half of every split
# is labelled 0 (negative) and the second half 1 (positive).
label_data = []
for split_lines in raw_data:
    half = len(split_lines) // 2
    label_data.append([0] * half + [1] * half)

# Preprocessing

In [8]:
def segmentation(inputlist):
    """Split every line on single spaces and print length statistics.

    Args:
        inputlist: iterable of strings, one sentence per entry.

    Returns:
        A list with one token list per input line. The last token of each
        line is dropped.

    Side effects:
        Prints the maximum and the average token count. Both are measured
        before the last token is dropped. For empty input the average is
        reported as 0 instead of dividing by zero.
    """
    outputlist = []
    maxlen = 0
    sumlen = 0

    for line in inputlist:
        tokens = line.split(' ')
        # NOTE(review): the final token of every line is discarded here while
        # the statistics below still count it -- confirm this is intended.
        outputlist.append(tokens[:-1])

        sumlen += len(tokens)
        maxlen = max(maxlen, len(tokens))

    # The original called f.close() here on a file handle leaked from an
    # earlier notebook cell; the function opens no file, so that call is gone.

    avelen = sumlen / len(outputlist) if outputlist else 0.0
    print("MaxLen: %d" % maxlen)
    print("AveLen: %f" % avelen)

    return outputlist

In [9]:
def build_dictionary(segmentedlist, num_vocab, init_vocabdict={"<PAD>": 0, "<UNK>": 1, "<GO>": 2, "<EOS>": 3}):
    """Build a token -> id mapping from the most frequent tokens.

    Args:
        segmentedlist: iterable of token lists.
        num_vocab: target size of the returned mapping, reserved entries
            included.
        init_vocabdict: reserved entries the mapping starts from. It is
            copied and never modified.

    Returns:
        A new dict containing the reserved entries plus up to
        ``num_vocab - len(init_vocabdict)`` of the most frequent tokens,
        numbered consecutively from ``len(init_vocabdict)`` in descending
        frequency order.
    """
    counter = collections.Counter()
    for wordlist in segmentedlist:
        counter.update(wordlist)
    # NOTE(review): this zeroes the count of the literal token 'EOS' (not
    # '<EOS>'). It can only reach the vocabulary when there are fewer distinct
    # tokens than free slots. Kept as in the original -- confirm before removing.
    counter['EOS'] = 0

    # Work on a copy: assigning the argument directly mutated the shared
    # default dict, so words from one call leaked into every later call.
    vocab_dict = dict(init_vocabdict)
    n = len(vocab_dict)

    free_slots = max(num_vocab - n, 0)
    for i, (vocab, _count) in enumerate(counter.most_common(free_slots)):
        vocab_dict[vocab] = i + n

    return vocab_dict

In [10]:
def words2ids(segmented_list, dictionary):
    """Translate token lists into id lists using ``dictionary``.

    Args:
        segmented_list: list of token lists.
        dictionary: token -> id mapping; tokens missing from it are mapped
            to ``dictionary['<UNK>']``.

    Returns:
        A list of id lists with the same shape as ``segmented_list``.
    """
    return [
        [
            dictionary[token] if token in dictionary else dictionary['<UNK>']
            for token in sentence
        ]
        for sentence in segmented_list
    ]

In [11]:
def unk_rate(ids_list, unk):
    """Print how often the id ``unk`` occurs across all id lists.

    Args:
        ids_list: iterable of id lists.
        unk: the id to count.

    Side effects:
        Prints the total number of ids, the number equal to ``unk`` and
        their ratio. Returns nothing.
    """
    full_size = 0
    unk_count = 0
    for ids in ids_list:
        full_size += len(ids)
        unk_count += ids.count(unk)

    # With no ids at all the plain division raised ZeroDivisionError; report
    # a rate of 0 in that case instead.
    rate = unk_count / full_size if full_size else 0.0

    print("full size: %d" % full_size)
    print("unk_count: %d" % unk_count)
    print("unk_rate: %f" % rate)

In [12]:
def fix_seqlen(ids_list, dictionary, seqlen):
    """Pad or truncate every id list to exactly ``seqlen`` entries.

    Rows shorter than ``seqlen`` get ``dictionary["<EOS>"]`` right after
    their last id and ``dictionary["<PAD>"]`` in the remaining positions.
    Rows with ``seqlen`` or more ids are cut to the first ``seqlen`` ids and
    receive no EOS marker.

    Args:
        ids_list: list of id lists.
        dictionary: mapping that provides the "<PAD>" and "<EOS>" ids.
        seqlen: length of every returned row.

    Returns:
        A new list of rows, one per input list.

    Side effects:
        Prints the running row count after every 10000th row.
    """
    # Start from all-padding rows and overwrite the leading positions.
    fixed_list = [[dictionary["<PAD>"] for _ in range(seqlen)] for _ in ids_list]

    for row_no, (row, ids) in enumerate(zip(fixed_list, ids_list)):
        for pos, token_id in enumerate(ids):
            if pos == seqlen:
                break
            row[pos] = token_id

        if len(ids) < seqlen:
            row[len(ids)] = dictionary["<EOS>"]

        if row_no % 10000 == 9999:
            print("%d" % (row_no + 1))

    return fixed_list

In [13]:
# Tokenise the train, validation and test splits in that order. Each call to
# segmentation() also prints the max/average length of its split.
segmented_list = [segmentation(raw_data[split_idx]) for split_idx in range(3)]

MaxLen: 63
AveLen: 21.369075
MaxLen: 50
AveLen: 21.550000
MaxLen: 50
AveLen: 21.436000


In [14]:
# Number of sentences in each split (train, validation, test), one per line.
print(*(len(segmented_list[split_idx]) for split_idx in range(3)), sep="\n")

9662
500
500


In [15]:
# The vocabulary is counted over all three splits together and capped at
# 10000 entries (reserved tokens included).
full_text = [*segmented_list[0], *segmented_list[1], *segmented_list[2]]
dictionary = build_dictionary(full_text, 10000)

In [16]:
# Vocabulary size, counting the reserved entries build_dictionary() starts from.
len(dictionary)

10000

In [17]:
# Convert each split from tokens to ids and echo the number of sentences in
# the split (the count is taken from the token lists, not the id lists).
ids_mc = []
for split_idx in range(3):
    split_tokens = segmented_list[split_idx]
    ids_mc.append(words2ids(split_tokens, dictionary))
    print(len(split_tokens))

9662
500
500


In [18]:
# Share of ids, over all three splits, that fell back to the <UNK> entry.
unk_rate(
    [*ids_mc[0], *ids_mc[1], *ids_mc[2]],
    dictionary['<UNK>'],
)

full size: 217299
unk_count: 10989
unk_rate: 0.050571


## Fix length

In [19]:
# Replace every split with its fixed-length version (SEQ_LEN ids per row).
for split_idx in range(3):
    variable_length_ids = ids_mc[split_idx]
    ids_mc[split_idx] = fix_seqlen(variable_length_ids, dictionary, SEQ_LEN)

In [20]:
# Output paths for the id sequences, in the same order as the splits in
# ids_mc: train, validation, test.
filelist = ["dataset/train.pkl",
            "dataset/val.pkl",
            "dataset/test.pkl"]

In [21]:
# Write each split's id sequences to its pickle file.
for i in range(3):
    # Create the target directory if needed, so a run on a fresh checkout
    # does not fail with FileNotFoundError when "dataset/" is missing.
    os.makedirs(os.path.dirname(filelist[i]) or ".", exist_ok=True)
    with open(filelist[i], "wb") as output:
        pickle.dump(ids_mc[i], output, protocol=2)

In [22]:
# Output paths for the label lists, in the same order as label_data: train,
# validation, test.
# NOTE(review): this rebinds `filelist`, which previously held the data paths.
# Re-running the earlier id-dump cell after this one would write the id
# sequences into the label files.
filelist = ["dataset/train_label.pkl",
            "dataset/val_label.pkl",
            "dataset/test_label.pkl"]

In [23]:
# Write each split's labels to the path at the same position in filelist.
for split_idx in range(3):
    label_path = filelist[split_idx]
    with open(label_path, "wb") as output:
        pickle.dump(label_data[split_idx], output, protocol=2)

In [24]:
# Store the token -> id mapping alongside the encoded splits.
dictfile = "dataset/dictionary.pkl"
with open(dictfile, "wb") as output:
    output.write(pickle.dumps(dictionary, protocol=2))