In [None]:
import json
import os
import random
from collections import Counter, defaultdict
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
import sklearn
import torch
from gensim.models import Word2Vec, KeyedVectors

%load_ext Cython

# Library code

In [None]:
%%cython
import re
from multiprocessing import Pool

import numpy as np
cimport numpy as np


cdef class StringReplacer:
    """Apply a table of literal substring replacements to a string.

    ``rule`` maps each substring to its replacement. Rules are applied one
    after another in the dict's iteration order, so a later rule can act on
    text produced by an earlier one.
    """
    # Attributes are declared with `cdef`: `cpdef` is only meaningful for
    # functions (Cython 0.29 warns about cpdef variables, Cython 3 rejects them).
    cdef public dict rule
    cdef list keys
    cdef list values
    cdef int n_rules

    def __init__(self, dict rule):
        self.rule = rule
        # Keys and values are cached as parallel lists for indexed access.
        self.keys = list(rule.keys())
        self.values = list(rule.values())
        self.n_rules = len(rule)

    def __call__(self, str x):
        """Return ``x`` with every rule applied in order."""
        cdef int i
        for i in range(self.n_rules):
            # Membership test first so that strings without the key are not copied.
            if self.keys[i] in x:
                x = x.replace(self.keys[i], self.values[i])
        return x

    def __getstate__(self):
        # Explicit pickling state (all four attributes, in declaration order).
        return (self.rule, self.keys, self.values, self.n_rules)

    def __setstate__(self, state):
        self.rule, self.keys, self.values, self.n_rules = state
        
        
cdef class RegExpReplacer:
    # Regex-driven replacer: `rule` maps pattern strings to replacement strings.
    # All patterns are OR-ed into one compiled alternation that locates the
    # spans to rewrite; each located span is then rewritten by `replace` below.
    cdef dict rule
    cdef list keys
    cdef list values
    cdef regexp
    cdef int n_rules

    def __init__(self, dict rule):
        self.rule = rule
        self.keys = list(rule.keys())
        self.values = list(rule.values())
        # Single capturing group containing every pattern, in dict order.
        self.regexp = re.compile('(%s)' % '|'.join(self.keys))
        self.n_rules = len(rule)

    # NOTE(review): this property has the same name as the `cdef dict rule`
    # attribute it returns — confirm this still compiles on the Cython version in use.
    @property
    def rule(self):
        return self.rule

    def __call__(self, str x):
        # Callback for regexp.sub: receives one match, returns its replacement.
        def replace(match):
            x = match.group(0)
            # Fast path: the matched text is itself a literal key of `rule`.
            if x in self.rule:
                return self.rule[x]
            else:
                # Otherwise run every pattern over the matched text, in order.
                for i in range(self.n_rules):
                    x = re.sub(self.keys[i], self.values[i], x)
                return x
        return self.regexp.sub(replace, x)
    

cdef class ApplyNdArray:
    """Map a Python callable over the first axis of an array-like.

    Results are written into a freshly allocated NumPy array of ``dtype``.
    When ``dims`` is given, each result occupies a slot of that trailing shape.
    With ``processes != 1`` the input is split into chunks handled by a
    ``multiprocessing.Pool``.
    """
    cdef func
    cdef dtype
    cdef dims
    cdef int processes

    def __init__(self, func, processes=1, dtype=object, dims=None):
        self.func = func
        self.processes = processes
        self.dtype = dtype
        self.dims = dims

    def __call__(self, arr):
        # Any process count other than 1 takes the pool-based path.
        if self.processes != 1:
            return self.apply_parallel(arr)
        return self.apply(arr)

    cpdef apply(self, arr):
        """Apply ``func`` to every element of ``arr`` in this process."""
        cdef int idx
        cdef int n_items = len(arr)
        if self.dims is None:
            out_shape = n_items
        else:
            out_shape = (n_items, *self.dims)
        cdef out = np.empty(out_shape, dtype=self.dtype)
        for idx in range(n_items):
            out[idx] = self.func(arr[idx])
        return out

    cpdef apply_parallel(self, arr):
        """Split ``arr`` into ``processes`` chunks, apply in a pool, and re-join."""
        cdef list chunks = np.array_split(arr, self.processes)
        with Pool(processes=self.processes) as pool:
            pieces = pool.map(self.apply, chunks)
        return np.concatenate(pieces, axis=0)


In [None]:
def load_qiqc(n_rows=None):
    """Read the train and test CSV files from ``$DATADIR``.

    Args:
        n_rows: Optional cap on the number of rows read from each file.

    Returns:
        ``(train_df, submit_df)``. ``train_df.target`` is cast to float32 and a
        ``weights`` column is added holding ``1 / (number of rows with the same
        label)``.
    """
    data_dir = os.environ["DATADIR"]
    train_df = pd.read_csv(f'{data_dir}/train.csv', nrows=n_rows)
    submit_df = pd.read_csv(f'{data_dir}/test.csv', nrows=n_rows)

    # Class sizes are taken before the dtype cast; keys 0/1 also match 0.0/1.0.
    class_counts = {
        label: (train_df.target == label).sum() for label in (0, 1)
    }
    train_df['target'] = train_df.target.astype('f')
    train_df['weights'] = train_df.target.apply(
        lambda label: 1 / class_counts[label])

    return train_df, submit_df


def build_datasets(train_df, submit_df, holdout, seed):
    """Wrap the data frames in ``QIQCDataset`` objects.

    Args:
        train_df: Labelled frame (must have a ``target`` column when
            ``holdout`` is true).
        submit_df: Frame wrapped unchanged as the submission dataset.
        holdout: If true, hold out a stratified 10% of ``train_df`` as the
            test dataset; otherwise train on everything and return an empty
            test dataset with the same columns.
        seed: Random state for the stratified split.

    Returns:
        ``(train_dataset, test_dataset, submit_dataset)``.
    """
    submit_dataset = QIQCDataset(submit_df)
    if holdout:
        # Import the submodule explicitly: a bare `import sklearn` does not
        # guarantee that `sklearn.model_selection` is loaded.
        from sklearn.model_selection import StratifiedShuffleSplit

        # Train : Test split for holdout training
        splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=0.1, random_state=seed)
        train_indices, test_indices = next(
            splitter.split(train_df, train_df.target))
        # Sort in place so each subset keeps the original row order.
        train_indices.sort()
        test_indices.sort()
        train_dataset = QIQCDataset(
            train_df.iloc[train_indices].reset_index(drop=True))
        test_dataset = QIQCDataset(
            train_df.iloc[test_indices].reset_index(drop=True))
    else:
        train_dataset = QIQCDataset(train_df)
        test_dataset = QIQCDataset(train_df.head(0))

    return train_dataset, test_dataset, submit_dataset


class QIQCDataset(object):
    """Thin wrapper around a DataFrame that also holds the tensors built from it."""

    def __init__(self, df):
        self.df = df

    @property
    def tokens(self):
        """Values of the ``tokens`` column as a NumPy array."""
        return self.df.tokens.values

    @tokens.setter
    def tokens(self, tokens):
        self.df['tokens'] = tokens

    @property
    def positives(self):
        """Rows whose ``target`` equals 1 (returned as a DataFrame)."""
        return self.df[self.df.target == 1]

    @property
    def negatives(self):
        """Rows whose ``target`` equals 0 (returned as a DataFrame)."""
        return self.df[self.df.target == 0]

    def build(self, device):
        """Materialise ``X``, ``X2`` and, when labelled, ``t`` and ``W`` on ``device``.

        ``self.tids`` is not set by this class and must be assigned on the
        instance beforehand. ``self._X2`` is optional; when absent, a single
        all-zero float column is used.
        """
        self._X = self.tids
        self.X = torch.Tensor(self._X).type(torch.long).to(device)
        if 'target' in self.df:
            # Index the underlying ndarray: `series[:, None]` was deprecated in
            # pandas 1.x and removed in pandas 2.0.
            self._t = self.df.target.values[:, None]
            self._W = self.df.weights
            self.t = torch.Tensor(self._t).type(torch.float).to(device)
            # Build from the ndarray so the conversion is positional and does
            # not depend on the Series index labels.
            self.W = torch.Tensor(self._W.values).type(torch.float).to(device)
        if not hasattr(self, '_X2'):
            self._X2 = np.zeros((self._X.shape[0], 1), 'f')
        self.X2 = torch.Tensor(self._X2).type(torch.float).to(device)

    def build_labeled_dataset(self, indices):
        """Return a ``TensorDataset`` of ``(X, X2, t, W)`` restricted to ``indices``."""
        return torch.utils.data.TensorDataset(
            self.X[indices], self.X2[indices],
            self.t[indices], self.W[indices])
    
## Pretrained vectors

def load_pretrained_vectors(names, token2id, test=False):
    """Load several pretrained embeddings in parallel, one process per name.

    Args:
        names: List of embedding names accepted by ``load_pretrained_vector``.
        token2id: Token-to-row mapping forwarded to each loader.
        test: Forwarded to each loader.

    Returns:
        Dict mapping each name to its loaded vectors (empty for no names).
    """
    assert isinstance(names, list)
    if not names:
        # Pool(processes=0) raises ValueError; nothing to load anyway.
        return {}
    f = partial(load_pretrained_vector, token2id=token2id, test=test)
    with Pool(processes=len(names)) as pool:
        vectors = pool.map(f, names)
    return dict(zip(names, vectors))


def load_pretrained_vector(name, token2id, test=False):
    """Load one pretrained embedding by short name.

    ``name`` must be one of ``gnews``, ``wnews``, ``paragram`` or ``glove``
    (anything else raises ``KeyError``). ``token2id`` and ``test`` are passed
    positionally to the selected class's ``load``.
    """
    loaders = {
        'gnews': GNewsPretrainedVector,
        'wnews': WNewsPretrainedVector,
        'paragram': ParagramPretrainedVector,
        'glove': GlovePretrainedVector,
    }
    vector_cls = loaders[name]
    return vector_cls.load(token2id, test)


class BasePretrainedVector(object):
    """Base loader for text-format 300-dimensional embedding files.

    Subclasses supply ``path`` (relative to ``$DATADIR``).
    """

    @classmethod
    def load(cls, token2id, test=False, limit=None):
        """Build a ``KeyedVectors`` with one row per entry of ``token2id``.

        Args:
            token2id: Mapping from token to row index.
            test: If true, skip the file and use seeded random vectors with
                row 0 and the second half of the rows zeroed.
            limit: If given, stop reading once the line index exceeds it.

        Returns:
            ``KeyedVectors`` whose rows follow ``token2id`` order. Tokens with
            no accepted line in the file keep an all-zero vector.
        """
        n_tokens = len(token2id)
        embed_shape = (n_tokens, 300)
        freqs = np.zeros((n_tokens), dtype='f')

        if test:
            np.random.seed(0)
            vectors = np.random.normal(0, 1, embed_shape)
            vectors[0] = 0
            vectors[n_tokens // 2:] = 0
        else:
            vectors = np.zeros(embed_shape, dtype='f')
            path = f'{os.environ["DATADIR"]}/{cls.path}'
            # Context manager so the file handle is closed on every exit path.
            with open(path, encoding="utf8", errors='ignore') as lines:
                for i, o in enumerate(lines):
                    # Checked first so reading stops at the limit rather than
                    # scanning on until the next in-vocabulary token.
                    if limit is not None and i > limit:
                        break
                    token, *vector = o.split(' ')
                    token = str.lower(token)
                    # Skip out-of-vocabulary tokens and lines of 100 characters or fewer.
                    if token not in token2id or len(o) <= 100:
                        continue
                    # Tokens are lower-cased, so several file entries may hit
                    # the same row; accumulate and average them below.
                    freqs[token2id[token]] += 1
                    vectors[token2id[token]] += np.array(vector, 'f')

        vectors[freqs != 0] /= freqs[freqs != 0][:, None]
        vec = KeyedVectors(300)
        vec.add(list(token2id.keys()), vectors, replace=True)

        return vec


class GNewsPretrainedVector(object):
    """Placeholder for the GoogleNews word2vec binary; ``load`` always raises."""

    name = 'GoogleNews-vectors-negative300'
    path = f'embeddings/{name}/{name}.bin'

    @classmethod
    def load(cls, tokens, limit=None):
        # Loading is disabled: the raise below makes the remaining statements
        # unreachable; they are kept as the intended implementation.
        # NOTE(review): load_pretrained_vector calls `.load(token2id, test)`
        # positionally, so `test` would bind to `limit` here — reconcile the
        # signature before enabling this loader.
        raise NotImplementedError
        path = f'{os.environ["DATADIR"]}/{cls.path}'
        return KeyedVectors.load_word2vec_format(
            path, binary=True, limit=limit)


class WNewsPretrainedVector(BasePretrainedVector):
    """Loader for the ``wiki-news-300d-1M`` ``.vec`` file via the inherited ``load``."""

    name = 'wiki-news-300d-1M'
    path = f'embeddings/{name}/{name}.vec'


class ParagramPretrainedVector(BasePretrainedVector):
    """Loader for the ``paragram_300_sl999`` ``.txt`` file via the inherited ``load``."""

    name = 'paragram_300_sl999'
    path = f'embeddings/{name}/{name}.txt'


class GlovePretrainedVector(BasePretrainedVector):
    """Loader for the ``glove.840B.300d`` ``.txt`` file via the inherited ``load``."""

    name = 'glove.840B.300d'
    path = f'embeddings/{name}/{name}.txt'

    
class WordVocab(object):
    """Vocabulary built from document frequencies.

    Each document contributes at most one count per distinct token. Counts
    are tracked globally and per named group. ``build()`` must be called
    before ``len()`` or any of ``word_freq`` / ``token2id`` / ``lfq`` / ``hfq``
    is used.
    """

    def __init__(self, mincount=1):
        self.mincount = mincount
        self.counter = Counter()
        self.n_documents = 0
        self._counters = {}
        self._n_documents = defaultdict(int)

    def __len__(self):
        return len(self.token2id)

    def add_documents(self, documents, name):
        """Count ``documents`` (iterables of tokens) under group ``name``.

        Any counter previously stored for ``name`` is replaced.
        """
        group_counter = Counter()
        self._counters[name] = group_counter
        for document in documents:
            # One entry per distinct token, so repeats inside a document count once.
            distinct = dict.fromkeys(document, 1)
            group_counter.update(distinct)
            self.counter.update(distinct)
            self.n_documents += 1
            self._n_documents[name] += 1

    def build(self):
        """Freeze the counts into ``word_freq``, ``token2id``, ``lfq`` and ``hfq``.

        Tokens are ordered by descending frequency; ``'<PAD>'`` is placed
        first with frequency 0 and id 0.
        """
        ranked = dict(self.counter.most_common())

        self.word_freq = {'<PAD>': 0}
        self.word_freq.update(ranked)

        self.token2id = {'<PAD>': 0}
        for position, word in enumerate(ranked, start=1):
            self.token2id[word] = position

        frequencies = np.array(list(self.word_freq.values()))
        # Boolean masks aligned with the ordering of word_freq.
        self.lfq = frequencies < self.mincount
        self.hfq = ~self.lfq
        
        
class PunctSpacer(StringReplacer):
    """Replacer that surrounds punctuation and symbol characters with spaces.

    With ``edge_only`` set, only occurrences directly preceded or followed by
    a space are padded; otherwise every occurrence is.
    """

    def __init__(self, edge_only=False):
        puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', '█', '½', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '¾', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]  # NOQA
        if not edge_only:
            rule = {p: f' {p} ' for p in puncts}
        else:
            # All space-prefixed forms first, then all space-suffixed forms.
            rule = {f' {p}': f' {p} ' for p in puncts}
            rule.update({f'{p} ': f' {p} ' for p in puncts})
        super().__init__(rule)
        
        
class NumberReplacer(RegExpReplacer):
    """Replacer that masks runs of two or more digits with ``#`` placeholders.

    Runs of 2, 3 and 4 digits become the same number of ``#``; runs of 5 or
    more become ``#####``. With ``with_underscore`` the placeholder is wrapped
    as `` __…__ `` (including the surrounding spaces).
    """

    def __init__(self, with_underscore=False):
        if with_underscore:
            prefix, suffix = ' __', '__ '
        else:
            prefix, suffix = '', ''
        # Longest pattern first: the alternation built from these keys tries
        # them in this order.
        rule = {
            '[0-9]{5,}': f'{prefix}#####{suffix}',
            '[0-9]{4}': f'{prefix}####{suffix}',
            '[0-9]{3}': f'{prefix}###{suffix}',
            '[0-9]{2}': f'{prefix}##{suffix}',
        }
        super().__init__(rule)


def set_seed(seed=0):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)
    torch.backends.cudnn.deterministic = True


class Pipeline(object):
    """Compose callables: the output of each stage feeds the next."""

    def __init__(self, *modules):
        # Stages are kept as a tuple, in the order given.
        self.modules = modules

    def __call__(self, x):
        """Run ``x`` through every stage in order and return the final value."""
        result = x
        for stage in self.modules:
            result = stage(result)
        return result

# Setup & preprocessing

In [None]:
%%time
# Data location read by load_qiqc and the embedding loaders via os.environ["DATADIR"].
os.environ['DATADIR'] = '/kaggle/input'
set_seed(0)
train_df, submit_df = load_qiqc()
# holdout=False: all labelled rows go to train_dataset; test_dataset wraps an
# empty frame (train_df.head(0)) with the same columns.
datasets = build_datasets(train_df, submit_df, holdout=False, seed=0)
train_dataset, test_dataset, submit_dataset = datasets

In [None]:
%%time
# Tokenizer stages: lowercase -> pad punctuation with spaces -> mask runs of
# 2+ digits with '#' placeholders -> split on whitespace.
tokenize = Pipeline(
    str.lower,
    PunctSpacer(),
    NumberReplacer(with_underscore=True),
    str.split
)
# Tokenize each dataset's question_text with two worker processes and store
# the result in that dataset's `tokens` column.
apply_tokenize = ApplyNdArray(tokenize, processes=2, dtype=object)
train_dataset.tokens, test_dataset.tokens, submit_dataset.tokens = \
    [apply_tokenize(d.df.question_text.values) for d in datasets]
# Token lists of all three datasets concatenated; used later as the Word2Vec corpus.
tokens = np.concatenate([d.tokens for d in datasets])

In [None]:
%%time
# Document-frequency vocabulary over train, test and submit tokens. With
# mincount=1 only '<PAD>' (frequency 0) falls below the threshold.
vocab = WordVocab(mincount=1)
# `positives` / `negatives` are DataFrames, so `.tokens` here is the column.
vocab.add_documents(train_dataset.positives.tokens, 'train-pos')
vocab.add_documents(train_dataset.negatives.tokens, 'train-neg')
vocab.add_documents(test_dataset.positives.tokens, 'test-pos')
vocab.add_documents(test_dataset.negatives.tokens, 'test-neg')
vocab.add_documents(submit_dataset.df.tokens, 'submit')
vocab.build()

In [None]:
%%time
# GloVe rows aligned with vocab.token2id; tokens not found in the file keep an all-zero row.
glove = load_pretrained_vector('glove', vocab.token2id)
word_vectors = {'glove': glove}
# Boolean masks over the vocabulary: all-zero row means no GloVe vector was loaded.
unk = (glove.vectors == 0).all(axis=1)
known = ~unk

In [None]:
# Word2Vec settings shared by the scratch and fine-tuning runs below.
# NOTE(review): `iter` and `size` are the pre-4.0 gensim argument names —
# confirm against the installed gensim version.
params = dict(
    min_count=1,
    workers=1,
    iter=5,
    size=300,
)

# Build models & training

## Word2Vec scratch

In [None]:
%%time
# Train Word2Vec from its default initialisation on the Quora tokens.
model = Word2Vec(**params)
# Vocabulary comes from the precomputed document frequencies (includes '<PAD>': 0).
model.build_vocab_from_freq(vocab.word_freq)
model.train(tokens, total_examples=len(tokens), epochs=model.epochs)
word_vectors['scratch'] = model.wv

## Word2Vec fine-tuning (word vector & context vector)

In [None]:
%%time
# Same model, but initialised from GloVe before training.
model = Word2Vec(**params)
model.build_vocab_from_freq(vocab.word_freq)
# For each word in the model's own index order, its row in glove.vectors
# (which follows vocab.token2id).
idxmap = np.array(
    [vocab.token2id[w] for w in model.wv.index2entity])
# Copy GloVe rows into both the word vectors and syn1neg (the "context
# vector" of the section title); tokens unknown to GloVe start at zero.
model.wv.vectors[:] = glove.vectors[idxmap]
model.trainables.syn1neg[:] = glove.vectors[idxmap]
model.train(tokens, total_examples=len(tokens), epochs=model.epochs)
word_vectors['finetune'] = model.wv

# Evaluations

## **High** frequency words in Quora & **known** words in Glove

- Glove: ○
- Scratch: ○
- Finetune: ○

In [None]:
# Print the word's document frequency in the Quora vocabulary, then tabulate
# its `most_similar` neighbours under each embedding (one column per embedding).
word = 'obama'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding.
word = 'lgbt'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding.
word = 'cosx'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

## **High** frequency words in Quora & **unknown** words in Glove

- Glove: -
- Scratch: ○
- Finetune: ○

In [None]:
# Per the section heading this word is missing from GloVe, so its glove row is
# expected to be all zeros and only the scratch / finetune columns are informative.
word = 'brexit'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (word listed as unknown to GloVe).
word = 'coinbase'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (word listed as unknown to GloVe).
word = 'tensorflow'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (word listed as unknown to GloVe).
word = 'cos2x'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (word listed as unknown to GloVe).
word = 'kubernetes'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (word listed as unknown to GloVe).
word = 'gdpr'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

## **Low** frequency words in Quora & **known** words in Glove

- Glove: ○
- Scratch: ☓
- Finetune: ○

In [None]:
# Rare in Quora but present in GloVe (per the section heading): print the
# document frequency, then compare neighbours across embeddings.
word = '0bama'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (misspelling probe).
word = 'germnay'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (misspelling probe).
word = 'gogole'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding.
word = 'javadoc'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding.
word = 'cython'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding.
word = 'compresses'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

## **Low** frequency words in Quora & **unknown** words in Glove
- Glove: ☓
- Scratch: ☓
- Finetune: ☓

In [None]:
# Rare in Quora and missing from GloVe (per the section heading): the glove
# row is expected to be all zeros, and the trained models saw few examples.
word = 'xgboost'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (word listed as unknown to GloVe).
word = '2sinxcosx'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (misspelling probe, unknown to GloVe).
word = 'germeny'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (word listed as unknown to GloVe).
word = 'bigender'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (run-together words, unknown to GloVe).
word = 'youcanttellyourstoryfromthe'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (random-looking token, unknown to GloVe).
word = '5gfwdhf4rz'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Document frequency, then nearest neighbours per embedding (single non-Latin character, unknown to GloVe).
word = 'ॡ'
print(vocab.word_freq[word])
pd.DataFrame({name: kv.most_similar(word) for name, kv in word_vectors.items()})

In [None]:
# Dump (word, document frequency) pairs as tab-separated files: the full
# vocabulary, the tokens without a GloVe vector, and the tokens with one.
# The table is built once and sliced with the `unk` / `known` masks, which are
# row-aligned with vocab.word_freq (both follow vocab.token2id order).
word_freq_table = np.array(list(vocab.word_freq.items()))
pd.DataFrame(word_freq_table).to_csv('all.csv', index=False, sep='\t')
pd.DataFrame(word_freq_table[unk]).to_csv('unk.csv', index=False, sep='\t')
pd.DataFrame(word_freq_table[known]).to_csv('known.csv', index=False, sep='\t')