In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_here in os.walk('/kaggle/input'):
    for name in files_here:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Best-effort switch to TensorFlow 2.x on Colab; deliberately a no-op elsewhere.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Any failure of the magic is ignored on purpose so the notebook keeps running.
  pass

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pylab as plt

## Fetching the Data 

This is a bit annoying. But to download from kaggle we need to upload the kaggle API key here. Then we need to move the file to the correct folder after which we need to change the permissions. The error messages will not provide super helpful information so I've added the correct code here. 

You can also upload the dataset from kaggle manually or you can download all of this locally. The kaggle dataset can be found [here](https://www.kaggle.com/therohk/million-headlines).

Then again, this code works;

The idea is to build something like this; 

```
word_i -> together <- word_j 
```

We will merely predict if these two words share context. In my book they will share context if they are in the same title. Later we will try to construct document embeddings as well.

## Pairs of Words

Let's now take these headlines and grab pairs of words out of them: every pair of words that appears together in the same headline.

In [None]:
import pandas as pd
import itertools as it 
from collections import Counter 
from functools import reduce 

# Number of headlines (taken from the top of the file) used to build the embeddings.
n_documents = 20000

# Only the first `n_documents` rows of the `headline_text` column are used, so ask
# pandas to parse just that instead of loading the whole CSV and slicing afterwards.
headlines = pd.read_csv(
    '/kaggle/input/million-headlines/abcnews-date-text.csv',
    usecols=['headline_text'],
    nrows=n_documents,
)['headline_text']

In [None]:
# Show the loaded headlines as the cell output to eyeball the raw text.
headlines

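The next code block looks strange, but it is much faster to split the counters up: adding two `Counter` objects builds a brand-new `Counter`, so folding every headline into one ever-growing counter would copy that big counter over and over. Summing small bunches first keeps the intermediate counters small.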

In [None]:
import tqdm

# One Counter of ordered word pairs per bunch of `bunchsize` headlines.
combs = []
bunchsize = 500
# Step through the headlines that were actually loaded, so a trailing partial bunch
# is kept and no empty bunch is ever produced (the old round(n_documents/bunchsize)
# could do either when the sizes did not divide evenly).
for start in tqdm.tqdm(range(0, len(headlines), bunchsize)):
  bunch = headlines[start:start + bunchsize]
  # combinations() keeps the words in headline order, so ('a', 'b') and ('b', 'a') are different keys.
  pair_counts = (Counter(it.combinations(title.split(" "), 2)) for title in bunch)
  # The Counter() initial value keeps reduce from raising on an empty bunch.
  combs.append(reduce(lambda x, y: x + y, pair_counts, Counter()))

In [None]:
# How often does the ordered pair ('act', 'fire') occur within the first bunch?
first_bunch_counts = combs[0]
first_bunch_counts[('act', 'fire')]

In [None]:
# Same ordered pair, looked up in the second bunch.
second_bunch_counts = combs[1]
second_bunch_counts[('act', 'fire')]

In [None]:
# Merge the per-bunch counters into one Counter over all ordered word pairs.
# The Counter() initial value makes this return an empty Counter (instead of
# raising TypeError) when `combs` is empty.
big_word_count = reduce(lambda x, y: x + y, combs, Counter())

In [None]:
# Total count of the ordered pair ('abalone', 'penalties') across all bunches.
probe_pair = ('abalone', 'penalties')
big_word_count[probe_pair]

In [None]:
# For every word, count the distinct ordered pairs it opens and the ones it closes.
# The two tallies are built and added in this order on purpose: the key order of the
# sum fixes the integer id each word gets later on.
opens_pair = Counter(pair[0] for pair in big_word_count)
closes_pair = Counter(pair[1] for pair in big_word_count)
word_count = opens_pair + closes_pair

# Vocabulary (each word once, in word_count order) and its size.
uniq_words = Counter(word_count.keys()).keys()
num_words = len(uniq_words)

In [None]:
# Peek at the first word pair stored in the merged counter.
all_pairs = list(big_word_count)
all_pairs[0]

In [None]:
# Word -> integer id lookup; ids follow the iteration order of `uniq_words`.
m = {word: index for index, word in enumerate(uniq_words)}

def gen_rand_tok(n):
  """Sample random word-id pairs that were never observed together.

  Draws `n` random (first word, second word) pairs from the vocabulary in
  `word_count` and drops every pair that is a key of `big_word_count`, so the
  result can contain fewer than `n` rows.

  Args:
    n: number of candidate pairs to draw.

  Returns:
    Integer array of shape (k, 2) with k <= n, holding the ids from `m` of the
    surviving pairs. The array is always two-dimensional, even when k == 0.
  """
  # Build the vocabulary list once instead of once per draw.
  vocab = list(word_count.keys())
  t1 = np.random.choice(vocab, size=n, replace=True)
  t2 = np.random.choice(vocab, size=n, replace=True)
  rows = [[m[w1], m[w2]] for w1, w2 in zip(t1, t2) if (w1, w2) not in big_word_count]
  # Without the reshape an empty result would have shape (0,), which cannot be
  # concatenated with (k, 2) arrays or sliced by column.
  return np.array(rows, dtype=int).reshape(-1, 2)

This is where we generate the training labels. 

In [None]:
# Positive examples: every observed ordered pair, labelled with its count.
positive_integers = np.array([[m[first], m[second]] for first, second in big_word_count])
positive_labels = list(big_word_count.values())

# Negative examples: random pairs that were never observed, labelled 0.
negative_integers = gen_rand_tok(n=len(positive_integers))
negative_labels = np.zeros(len(negative_integers))

integers_in = np.concatenate([positive_integers, negative_integers])
labels_in = np.concatenate([positive_labels, negative_labels])

In [None]:
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input, Dot
from tensorflow.keras.models import Sequential, Model

# Size of each word vector.
dim_words = 5

# Stand-alone sub-model (word id -> flat vector) so the embeddings can be read out later.
embedding = Embedding(num_words, dim_words, input_length=1)
model_emb = Sequential([embedding, Flatten()])

# Two word-id inputs that share the same embedding sub-model.
word_one = Input(shape=(1,))
word_two = Input(shape=(1,))
vector_one = model_emb(word_one)
vector_two = model_emb(word_two)

# Despite the name this is the dot product of the two word vectors, fed to a single output unit.
cross_prod = Dot(axes=1)([vector_one, vector_two])
out = Dense(1, activation="relu")(cross_prod)

glovelike = Model(inputs=[word_one, word_two], outputs=out)

In [None]:
from tensorflow.keras.optimizers import Adam

In [None]:
# Training stages as (learning rate, epochs), run back to back with a shrinking step size.
stages = [(0.02, 5), (0.01, 10), (0.002, 10)]
for lr, n_epochs in stages:
  print(f"stepsize={lr}")
  optimizer = Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, amsgrad=False)
  glovelike.compile(optimizer=optimizer, loss='mse', metrics=['accuracy'])
  # Column 0 feeds the first word input, column 1 the second.
  first_ids, second_ids = integers_in[:, 0], integers_in[:, 1]
  glovelike.fit(x=[first_ids, second_ids], y=labels_in, epochs=n_epochs, verbose=1)

In [None]:
# for i in range(100, 200):
#   print(headlines[i])

In [None]:
# Words whose learned vectors are inspected below; each must be in the vocabulary `m`
# (a missing word raises KeyError).
words = ['violence', 'arrested', 'murder', 'police', 'jury', 'minister', 'health', 'finance', 'banks', 'wildlife', 'doctor',]
# Pass an explicit (n_words, 1) integer array: it matches the Input(shape=(1,)) the
# sub-model was wired to, whereas a bare Python list of ints can be interpreted by
# Keras as a list of separate inputs.
word_ids = np.array([m[w] for w in words]).reshape(-1, 1)
emb = model_emb.predict(word_ids)

In [None]:
fig, ax = plt.subplots()
# Fully transparent markers: they only make the axes span the points, the labels are the content.
ax.scatter(emb[:, 0], emb[:, 1], alpha=0)
# Write each word at the position given by its first two embedding dimensions.
for idx, word in enumerate(words):
  ax.text(emb[idx, 0], emb[idx, 1], word)

## Cosine Distances 

Let's compare some of these distances now.

In [None]:
from scipy.spatial import distance
import pandas as pd

In [None]:
# Pairwise cosine distances between the selected word vectors, labelled on both axes.
cosine_matrix = distance.cdist(emb, emb, 'cosine')
df = pd.DataFrame(cosine_matrix, index=words, columns=words)

# Colour thresholds: the 20% and 80% column quantiles, each averaged over all columns.
thresholds = df.quantile([0.2, 0.8]).mean(axis=1).values
q1, q2 = thresholds

def color(val):
    """Return the CSS background rule for one distance value.

    Values below the module-level `q1` are green, values above the module-level
    `q2` are red, and everything in between (both thresholds included) is yellow.
    """
    if val < q1:
        shade = 'green'
    elif val > q2:
        shade = 'red'
    else:
        shade = 'yellow'
    return 'background-color: %s' % shade

# Colour every cell with `color`. Styler.applymap was deprecated in favour of
# Styler.map in pandas 2.1, so use the new name when this pandas has it and fall
# back to the old one otherwise.
styler = df.style
colour_cells = getattr(styler, "map", None) or styler.applymap
colour_cells(color)

## Euclidean Distances 

Let's also have a peek at another type of distance.

In [None]:
# Pairwise Euclidean distances between the selected word vectors, labelled on both axes.
df = pd.DataFrame(distance.cdist(emb, emb, 'euclidean'), index=words, columns=words)
# `color` reads the module-level q1/q2, so they are recomputed for this metric.
q1, q2 = df.quantile([0.2, 0.8]).mean(axis=1).values

# Styler.applymap was deprecated in favour of Styler.map in pandas 2.1, so use the
# new name when this pandas has it and fall back to the old one otherwise.
styler = df.style
colour_cells = getattr(styler, "map", None) or styler.applymap
colour_cells(color)

Why is `arrested` so different from `murder`? It feels like they should be similar. 

Let's find out.

In [None]:
# Substring matches: a headline counts if the text occurs anywhere in it.
# na=False treats a missing headline as "no match".
has_arrested = headlines.str.contains('arrested', na=False)
has_murder = headlines.str.contains('murder', na=False)

# Plain integer counts (the n_* names used to hold `.shape` 1-tuples).
n_arrested = int(has_arrested.sum())
n_murder = int(has_murder.sum())
n_both = int((has_arrested & has_murder).sum())

n_arrested, n_murder, n_both

And what about `money`, `cash`, `zealand` and `domestic`?

In [None]:
# Substring matches: a headline counts if the text occurs anywhere in it.
# na=False treats a missing headline as "no match".
has_money = headlines.str.contains('money', na=False)
has_zealand = headlines.str.contains('zealand', na=False)

# Named after the words actually searched for, so the arrested/murder results
# are no longer overwritten by unrelated values.
n_money = int(has_money.sum())
n_zealand = int(has_zealand.sum())
n_both = int((has_money & has_zealand).sum())

n_money, n_zealand, n_both

Note how there is no overlap! 

## Conclusion 

Notice the tricky business here with word embeddings. They depend on a lot of things; 

- the size of the embedding $k$ 
- the choice of algorithm 
- the choice of the data going in 

We need to be careful that we don't cherry pick our results. The output of the model may not generalise the language well if we pick a bad language task, a bad dataset or if the hyperparameters of the model are wrong. 

The hope though is that if you train on a large corpus you do get something that might be general. It's not always the case though that a word embedding that is general will be appropriate for the virtual assistant setting.

## In Real Life

You won't need to train these yourself. In fact, it's ill-advised. Best to use the ones trained by others. 
