In [1]:
import pickle
import numpy as np
from collections import Counter
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from utils import MAX_LENGTH, LABEL_DICT

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the raw corpus and the fitted `ws` object used later to encode text.
# Each file is opened in a `with` block so the handle is closed right after
# loading instead of lingering until garbage collection.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('data.pkl', 'rb') as fp:
    data = pickle.load(fp)
with open('ws.pkl', 'rb') as fp:
    ws = pickle.load(fp)

In [3]:
# Collect the texts and integer label ids of the usable samples.
# Every record in `data` unpacks into four fields; only the first (text) and
# the third (label) are used here.
x_text, y_label = [], []
for sentence, _, emotion, _ in data:
    # Keep a sample only if it fits in MAX_LENGTH and its label is known.
    if len(sentence) <= MAX_LENGTH and emotion in LABEL_DICT:
        x_text.append(sentence)
        y_label.append(LABEL_DICT[emotion])

# NOTE(review): the printed label reads 'x_test', but the numbers are the
# sizes of x_text and y_label (the train/test split happens in a later cell).
print('len(x_test)', len(x_text), len(y_label))

# Encode every kept text with `ws`, showing a progress bar.
# presumably ws.transform yields a fixed-length sequence per text — confirm
# against the class of `ws`.
encoded_rows = []
for sentence in tqdm(x_text):
    encoded_rows.append(ws.transform(sentence, max_len=MAX_LENGTH))
x_vec = np.array(encoded_rows)

# One-hot encode the label ids, one column per entry of LABEL_DICT.
y_vec = to_categorical(y_label, len(LABEL_DICT))

  0%|          | 1997/880570 [00:00<00:44, 19960.50it/s]

len(x_test) 880570 880570


100%|██████████| 880570/880570 [00:44<00:00, 19888.22it/s]


In [4]:
# Hold out 40% of the samples as the test set; the fixed random_state makes
# the shuffle, and therefore the split, reproducible across runs.
split_kwargs = dict(test_size=0.4, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x_vec, y_vec, **split_kwargs)

In [5]:
def _report_class_balance(labels, class_names):
    """Print a per-class balance table for ``labels`` and return class weights.

    One tab-separated row is printed per class id in
    ``range(len(class_names))``: class name, share of all samples in percent,
    weight, and raw sample count.

    Args:
        labels: Sequence of integer class ids, one per sample.
        class_names: Mapping from class id to its display name.

    Returns:
        dict mapping class id to ``majority_count / class_count``, so the
        most frequent class gets weight 1.0 and rarer classes get
        proportionally larger weights.

    Raises:
        ValueError: If a class has no samples in ``labels`` (its weight would
            require dividing by zero), or if ``labels`` is empty.
    """
    counts = Counter(labels)
    majority = max(counts.values())
    total = len(labels)
    weights = {}
    for class_id in range(len(class_names)):
        # Counter returns 0 for ids that never occur in `labels`.
        n_samples = counts[class_id]
        if n_samples == 0:
            raise ValueError(
                'class {!r} has no samples; cannot compute its weight'.format(
                    class_names[class_id]
                )
            )
        weights[class_id] = majority / n_samples
        print('{}\t\t{:.2f}%\t\t{:.2f}\t\t{}'.format(
            class_names[class_id],
            n_samples / total * 100,
            weights[class_id],
            n_samples
        ))
    return weights


# Invert LABEL_DICT so class ids map back to their names.
inverse_ld = {v: k for k, v in LABEL_DICT.items()}
print(inverse_ld)

# Recover integer class ids from the one-hot rows.
label_train = y_train.argmax(-1)
label_test = y_test.argmax(-1)

# Weights derived from the training split are the ones saved for training.
train_class_weight = _report_class_balance(label_train, inverse_ld)
print('-' * 20)
# The test split is reported for comparison only. The result is assigned so
# the notebook does not echo the returned dict as cell output.
test_class_weight = _report_class_balance(label_test, inverse_ld)

{0: 'happy', 1: 'sad', 2: 'angry', 3: 'disgust', 4: 'fear', 5: 'suprise'}
happy		68.49%		1.00		361868
sad		9.90%		6.92		52316
angry		7.68%		8.92		40569
disgust		6.74%		10.17		35584
fear		4.12%		16.63		21758
suprise		3.08%		22.27		16247
--------------------
happy		68.44%		1.00		241082
sad		9.93%		6.89		34985
angry		7.67%		8.92		27033
disgust		6.80%		10.06		23964
fear		4.20%		16.29		14796
suprise		2.94%		23.25		10368


In [6]:
# Persist both splits together with the training class weights as a single
# pickled tuple, in this exact order.
split_payload = (x_train, x_test, y_train, y_test, train_class_weight)
with open('train_data.pkl', 'wb') as out_file:
    pickle.dump(split_payload, out_file)