In [5]:
import pickle
import numpy as np
from collections import Counter
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from utils import MAX_LENGTH, LABEL_DICT

In [2]:
# Load the raw records and the fitted text encoder from disk.
# `data` is iterated further down as 4-item records (text, _, label, _);
# `ws` is used through ws.transform(text, max_len=...).
# The `with` blocks close each file handle as soon as unpickling finishes,
# instead of leaving it open until garbage collection.
# WARNING: pickle.load can execute arbitrary code stored in the file --
# only run this cell against pickle files from a trusted source.
with open('data.pkl', 'rb') as data_file:
    data = pickle.load(data_file)
with open('ws.pkl', 'rb') as ws_file:
    ws = pickle.load(ws_file)

In [3]:
# Keep only the records whose text is at most MAX_LENGTH long and whose
# label is a key of LABEL_DICT; every other record is dropped. Each kept
# record is reduced to (text, integer class id).
kept_pairs = [
    (text, LABEL_DICT[label])
    for text, _, label, _ in data
    if len(text) <= MAX_LENGTH and label in LABEL_DICT
]
x_text = [text for text, _ in kept_pairs]
y_label = [class_id for _, class_id in kept_pairs]

# NOTE(review): the printed label reads 'len(x_test)', but the two numbers
# are the sizes of x_text and y_label (the filtered, not-yet-split data).
print('len(x_test)', len(x_text), len(y_label))

# Encode every kept text with ws.transform (tqdm shows progress), then
# stack the per-sample results into one array.
encoded_samples = [
    ws.transform(sample, max_len=MAX_LENGTH)
    for sample in tqdm(x_text)
]
x_vec = np.array(encoded_samples)

# One-hot encode the class ids with one column per LABEL_DICT entry.
y_vec = to_categorical(y_label, len(LABEL_DICT))

  0%|          | 1716/988697 [00:00<00:57, 17150.24it/s]

len(x_test) 988697 988697


100%|██████████| 988697/988697 [00:51<00:00, 19134.62it/s]


In [4]:
# Split the encoded samples and their one-hot labels: 40% is held out as
# the test set, and random_state=0 fixes the shuffle so the split is
# reproducible across runs.
split_parts = train_test_split(x_vec, y_vec, test_size=0.4, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [7]:
# Reverse lookup (class index -> label name) so report rows are readable.
inverse_ld = {index: name for name, index in LABEL_DICT.items()}
print(inverse_ld)

# Collapse the one-hot rows back to integer class ids.
label_train = y_train.argmax(-1)
label_test = y_test.argmax(-1)

# Columns: label name, share of the split (%), majority/count ratio, count.
ROW_TEMPLATE = '{}\t\t{:.2f}%\t\t{:.2f}\t\t{}'


def _report_split(labels, n_samples):
    """Print one row per class and return {class index: majority / count}.

    `labels` holds the integer class id of every sample in the split and
    `n_samples` is the split size used for the percentage column. The
    ratio is the largest class count divided by this class's count, so
    the most frequent class gets 1.0 and rarer classes get larger values.
    """
    counts = Counter(labels)
    largest = max(counts.values())
    ratios = {}
    for class_id in range(len(LABEL_DICT)):
        ratios[class_id] = largest / counts[class_id]
        print(ROW_TEMPLATE.format(
            inverse_ld[class_id],
            counts[class_id] / n_samples * 100,
            ratios[class_id],
            counts[class_id],
        ))
    return ratios


# The training-split ratios are kept as the per-class weights.
train_class_weight = _report_split(label_train, len(x_train))
print('-' * 20)
# The test split is reported for comparison only; the result is assigned
# so the cell does not display it as an output value.
test_class_ratio = _report_split(label_test, len(x_test))

{0: 'happy', 1: 'sad', 2: 'angry', 3: 'disgust', 4: 'fear', 5: 'suprise', 6: 'neutral'}
happy		58.57%		1.00		347467
sad		8.55%		6.85		50699
angry		7.34%		7.98		43547
disgust		5.10%		11.49		30244
fear		3.50%		16.72		20787
suprise		2.56%		22.85		15205
neutral		14.37%		4.07		85269
--------------------
happy		58.47%		1.00		231249
sad		8.63%		6.77		34143
angry		7.32%		7.98		28964
disgust		5.16%		11.34		20398
fear		3.51%		16.67		13875
suprise		2.56%		22.84		10125
neutral		14.34%		4.08		56725


In [8]:
# Persist everything in a single pickle, as one tuple in the order
# (x_train, x_test, y_train, y_test, train_class_weight).
train_bundle = (x_train, x_test, y_train, y_test, train_class_weight)
with open('train_data.pkl', 'wb') as out_file:
    pickle.dump(train_bundle, out_file)