### Word2vec - SkipGram architecture 

The Skip-gram architecture tries to predict the context words (the surrounding words) given a target word.

![title](img/skipgram.png)

----------------------------------------------------------------------------------------------------------------------

### Word2vec - Data Sampling 

![title](img/sampling.png)

In [None]:
def sample_data(sequence, window_size):
    """
    Build skip-gram style (target, context) pairs with a sliding window.

    Every position in ``sequence`` is treated in turn as the target; each
    other element at most ``window_size`` positions to its left or right
    is paired with it as a context.

    e.g. for the sequence [1,2,3,4] and a window of 1 the result is
    [(1,2), (2,1), (2,3), (3,2), (3,4), (4,3)].

    Returns a single list of ``(target, context)`` tuples, ordered by target
    position and then by context position.
    """

    total = len(sequence)
    pairs = []
    for center, target in enumerate(sequence):
        # Clamp the window to the bounds of the sequence.
        lo = max(0, center - window_size)
        hi = min(total, center + window_size + 1)
        pairs.extend(
            (target, sequence[ctx]) for ctx in range(lo, hi) if ctx != center
        )
    return pairs

In [None]:
# Toy sentence split into tokens; list every (target, context) pair that
# falls within two positions of each other.
sequence = "the quick brown fox jumps over the lazy dog".split()
sample_data(sequence, window_size=2)

----------------------------------------------------------------------------------------------------------------------

### Map n-dimensional vectors into a vector space and take cosine similarity

![title](img/vector_space.png)

In [None]:
import pandas as pd
import numpy as np
import fasttext
import glob
import re
import cv2
import matplotlib.pyplot as plt

### Reading session data

In [None]:
# Load the session data from the Parquet file and preview the first rows.
# The cells below use its 'event_time_stamp', 'session_id' and 'product_id' columns.
data = pd.read_parquet("data/data.parquet")
data.head()

### Sorting by event time stamp

In [None]:
# Order the events by timestamp so that the per-session click lists built
# from this frame keep the order in which the clicks happened.
data = data.sort_values('event_time_stamp')
data.head()

### Representing the sequence of clicks

In [None]:
# Product ids become string tokens so they can be written out as "words".
data['product_id'] = data['product_id'].astype(str)

# One row per session, holding the list of clicked product ids in row order.
clicks_per_session = data.groupby('session_id')['product_id'].apply(list)
session_seq = clicks_per_session.reset_index(name="sequence_of_clicks")
session_seq.head()

### Visualizing the sequence length

In [None]:
# Number of clicks recorded in each session.
session_seq['sequence_length'] = session_seq['sequence_of_clicks'].map(len)
session_seq.head()

In [None]:
# Box plot of the number of clicks per session, to spot unusually long sessions.
session_seq['sequence_length'].plot.box()

In [None]:
# 95th percentile of the session length; the outlier filter further down
# uses this same quantile as its upper cut-off.
session_seq['sequence_length'].quantile(0.95)

### Removing the outliers

In [None]:
# Keep sessions with at least two clicks (a single click yields no
# target/context pair) and at most the 95th-percentile length, where the
# percentile is taken over the unfiltered sessions.
length_cap = session_seq['sequence_length'].quantile(0.95)
within_bounds = session_seq['sequence_length'].between(2, length_cap)
session_seq = session_seq[within_bounds]
session_seq['sequence_length'].plot.box()

In [None]:
# Bar chart of how many of the remaining sessions have each sequence length.
session_seq['sequence_length'].value_counts().to_frame().plot.bar()

In [None]:
# Sanity check on product ids: with a window of 2 every click in a
# three-click session is paired with both of the other clicks.
sample_data(['1463503', '1418365', '1531480'],  2)

### Running SkipGram (using fasttext) on the sequences

In [None]:
# Keyword arguments forwarded unchanged to fasttext.train_unsupervised.
fasttext_params = dict(
    model="skipgram",
    lr=0.05,
    dim=100,
    ws=3,
    epoch=300,
    minCount=1,
    minn=3,
    maxn=0,
    neg=5,
    wordNgrams=1,
    loss="ns",
    bucket=2000000,
    thread=24,
    lrUpdateRate=100,
    t=0.0001,
    verbose=2,
)

# fastText trains from a text file: write one session per line, with the
# clicked product ids separated by single spaces.
sequence_txt_file = 'data/seq.txt'
sequence = list(map(' '.join, session_seq['sequence_of_clicks'].values))
np.savetxt(sequence_txt_file, sequence, fmt="%s", encoding="utf-8")

model = fasttext.train_unsupervised(sequence_txt_file, **fasttext_params)

### Generating Embeddings

In [None]:
# Vocabulary of the trained model (here: product ids) and a matrix holding
# one embedding per row, in the same order as the vocabulary.
vocabs = model.words
vectors = np.vstack([model[word] for word in vocabs])

# Direct lookup from product id to its embedding row.
vectors_dict = dict(zip(vocabs, vectors))

In [None]:
# Inspect the learned embedding of a single product id.
vectors_dict['1531480']

## Cosine similarity

In [None]:
import numpy as np


def cos_sim(a, b):
    """
    Return the cosine similarity between one vector and every row of a matrix.

    The similarity follows from the definition of the dot product:
    ``cos(a, b_i) = (a . b_i) / (|a| * |b_i|)``.

    Parameters:
        a: a single 1-d ndarray of length ``d``.
        b: a 2-d ndarray of shape ``(n, d)``; each row is compared with ``a``.

    Returns:
        A 1-d ndarray of length ``n`` holding the similarity of ``a`` to each
        row of ``b``. Where ``a`` or a row of ``b`` has zero norm the cosine
        is undefined; 0.0 is returned for those entries instead of NaN, so
        the result can always be sorted and compared.
    """

    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b, axis=1)
    denom = norm_a * norm_b
    dots = np.dot(a, b.T)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial value of 0. This avoids the division-by-zero
    # warning and the NaN values that would otherwise break any ranking.
    sims = np.zeros_like(dots, dtype=np.result_type(dots, denom))
    np.divide(dots, denom, out=sims, where=denom != 0)
    return sims

### Build a hash table for (product_id, image)

In [None]:
# Map each product id (the image's file name without extension) to the
# path of its image on disk.
files = glob.glob('images/*.jpeg')

# Raw, anchored pattern: the dot is escaped so only a literal ".jpeg" suffix
# matches, and either path separator is accepted because glob returns paths
# with the OS-native separator (a backslash on Windows).
image_name_pattern = re.compile(r'images[\\/](.*)\.jpeg$')

file_dict = {}
for file in files:
    result = image_name_pattern.search(file)
    file_dict[result.group(1)] = file

### Finding similar items to a given product

In [None]:
# Score every product in the vocabulary against the query product and keep
# the nine highest-scoring (product_id, similarity) pairs.
query_vector = vectors_dict['1556752']
scored = zip(vocabs, cos_sim(query_vector, vectors))
sims = sorted(scored, key=lambda pair: pair[1], reverse=True)[:9]
print(sims)

In [None]:
# Image path of every product in the ranking, in ranking order.
images = [file_dict[pid] for pid, _ in sims]

In [None]:
# Show the top-ranked image on its own. OpenCV loads images in BGR channel
# order, so the last axis is reversed to get the RGB order matplotlib expects.
img = cv2.imread(images[0], cv2.IMREAD_COLOR)
plt.imshow(img[..., ::-1])


# Lay the remaining images out on a 3x3 grid, filled in ranking order.
fig = plt.figure(figsize=(10, 7))
for i, image in enumerate(images[1:], start=1):
    img = cv2.imread(image, cv2.IMREAD_COLOR)
    ax = fig.add_subplot(3, 3, i)
    plt.imshow(img[..., ::-1])

### Approximate Nearest Neighbor (ANN)