In [28]:
import pandas as pd

from collections import Counter

from tqdm import tqdm_notebook as tqdm

import numpy as np

import torch
torch.__version__

import sklearn.model_selection as ms

import torchtext.vocab as vb

In [7]:
# Single seed shared by every random number generator the notebook touches.
RANDOM_SEED = 42

# Seed NumPy, the PyTorch CPU generator and all CUDA generators in one pass.
for _seed_rng in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(RANDOM_SEED)

In [11]:
# Show the properties of the current CUDA device. The lookup is guarded so that
# a CPU-only kernel can still run the notebook top to bottom instead of raising
# here; in that case nothing is displayed.
device_properties = (
    torch.cuda.get_device_properties("cuda") if torch.cuda.is_available() else None
)
device_properties

_CudaDeviceProperties(name='Tesla K20Xm', major=3, minor=5, total_memory=5699MB, multi_processor_count=14)

In [12]:
# Load the examples stored under the "df" key of the HDF5 file.
dataset = pd.read_hdf(
    "../data/small_data.hdf",
    key="df",
)

In [13]:
# Preview the first four rows to eyeball the `x` and `y` columns.
dataset.head(4)

Unnamed: 0,x,y
0,"[[START], Hello]","[💜, <NOE>]"
1,"[[START], So, yesterday, I, got, my, self, a, ...","[<NOE>, <NOE>, <NOE>, <NOE>, <NOE>, <NOE>, <NO..."
2,"[[START], Lord, bustta, I, greet, you, 🏿, 🏿]","[<NOE>, <NOE>, <NOE>, <NOE>, <NOE>, 🙌, 🙌, <NOE>]"
3,"[[START], ADELIN, MADE, ME, YOUR, FRIEND]","[<NOE>, <NOE>, <NOE>, <NOE>, <NOE>, 😔]"


In [14]:
# Report how many rows (examples) were loaded into `dataset`.
print(f"Totally {dataset.shape[0]} examples in dataset")

Totally 419582 examples in dataset


# Complete vocabulary

In [15]:
# Gather every distinct element that appears in the first column (tokens) and
# in the second column (emoji labels) across the whole, unsplit dataset.
token_vocab, emoji_vocab = set(), set()

for row in tqdm(dataset.values):
    token_vocab.update(row[0])
    emoji_vocab.update(row[1])

HBox(children=(IntProgress(value=0, max=419582), HTML(value='')))




In [16]:
# Report the sizes of the unfiltered vocabularies collected over the whole dataset.
print(f"Totally {len(emoji_vocab)} emojis and {len(token_vocab)} tokens")

Totally 1161 emojis and 115405 tokens


# Splitting into train, dev and test

In [17]:
# Shuffle all rows and renumber the index from zero. The shuffle is seeded
# explicitly so the resulting order does not depend on how much of NumPy's
# global random state earlier cells happened to consume.
dataset = dataset.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)

In [32]:
# Fraction of rows held out at each split step.
TEST_SIZE = 0.2

# Vocabulary limits: minimum frequency for an entry to be kept, and the maximum
# number of entries retained for the token and emoji vocabularies.
MIN_FREQ = 3
MAX_SIZE_TOKENS = 100_000
MAX_SIZE_EMOJIS = 1_000

# Name of the pretrained word vectors attached to the token vocabulary.
VECTORS = 'glove.840B.300d'

In [24]:
# First split: hold out TEST_SIZE of all rows as the test set.
train_and_dev, test = ms.train_test_split(
    dataset,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

In [25]:
# Second split: hold out TEST_SIZE of the remaining rows as the dev set.
train, dev = ms.train_test_split(
    train_and_dev,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

In [26]:
# Sanity-check the row counts of the three splits.
print(f"Totally {train.shape[0]} train examples, {dev.shape[0]} dev examples and {test.shape[0]} test examples")

Totally 268532 train examples, 67133 dev examples and 83917 test examples


# Reduced Vocabulary

In [27]:
# Frequency tables for tokens (first column) and emoji labels (second column),
# counted over the training split only.
tokens, emojis = Counter(), Counter()

for row in tqdm(train.values):
    for counts, sequence in ((tokens, row[0]), (emojis, row[1])):
        counts.update(sequence)

HBox(children=(IntProgress(value=0, max=268532), HTML(value='')))




In [33]:
# Token vocabulary built from the training-split counts: capped at
# MAX_SIZE_TOKENS entries, dropping tokens seen fewer than MIN_FREQ times, with
# the pretrained VECTORS attached.
# NOTE(review): no '<unk>' special is registered, so how out-of-vocabulary
# tokens are looked up depends on the installed torchtext version — confirm
# that downstream code handles them.
tokens_vocab = vb.Vocab(
    tokens,
    max_size=MAX_SIZE_TOKENS,
    min_freq=MIN_FREQ,
    specials=['[START]'],
    specials_first=True,
    vectors=VECTORS,
)

# Emoji (label) vocabulary with the '<NOE>' label registered as the only
# special entry.
emojis_vocab = vb.Vocab(
    emojis,
    max_size=MAX_SIZE_EMOJIS,
    min_freq=MIN_FREQ,
    specials=['<NOE>'],
    specials_first=True,
)

# The vocabulary was originally bound to the misspelled name `emojis_vocav`;
# keep that name as an alias so any code still referring to it keeps working.
emojis_vocav = emojis_vocab

.vector_cache/glove.840B.300d.zip: 2.18GB [20:55, 1.73MB/s]                                
100%|█████████▉| 2195742/2196017 [07:02<00:00, 5389.23it/s]