## Word Embeddings


Similarity between words is calculated using **Cosine Similarity**.
For two words represented by vectors u,v:


![Alt text](image-1.png)


If u and v are very similar, their cosine similarity will be close to 1; if they are dissimilar, the cosine similarity will take a smaller value.


![Alt text](image-2.png)

## Loading Data

In [2]:
import tensorflow as tf



url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                  untar=True, cache_dir='.',       #download the data
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 4us/step


In [2]:
import glob

pathPos = "aclImdb/test/pos"  #location of folder containing positive movie reviews
pathNeg = "aclImdb/test/neg"  #location of folder containing negative movie reviews
filepathsPos = glob.glob(os.path.join(pathPos, '*.txt'))
filepathsNeg = glob.glob(os.path.join(pathNeg, '*.txt'))

In [13]:
#importing necessary libraries

import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Input,Embedding,Flatten


## Preprocessing Data


### Creating a vocabulary

In [6]:
vocab = {}   #dict storing all unique words 
punctuation = ['.',',']
vocab['<pad>'] = 0  #padding token
vocab['<UNK>'] = 1  #unknown words token
index = 2  # Starting index from 2
for comment in comments:
    words = comment.split()  #Split individual sentence to words 
    for word in words:
        word = word.lower()  # Convert to lowercase for consistency
        word = word.strip(string.punctuation)  # Remove punctuation
        if len(word) > 1:
            if word not in vocab:
                vocab[word] = index
                index += 1


In [7]:
reverse_vocab = {index: word for word, index in vocab.items()}
reverse_vocab[45]

'twins'

### Encoding the sentences

In [8]:
encoded_texts = []   #list storing all sentenced encoded
for comment in comments:
    encoded_text = []
    words = comment.split()
    for word in words:
        # Remove punctuation from the word
        word = word.strip(string.punctuation)
        # Encode the word using the vocabulary, defaulting to index 1 for unknown words ('<UNK>')
        encoded_word = vocab.get(word, 1)
        encoded_text.append(encoded_word)
    encoded_texts.append(encoded_text)

In [9]:
encoded_texts

[[1,
  2,
  3,
  4,
  5,
  6,
  1,
  7,
  8,
  9,
  10,
  4,
  5,
  11,
  1,
  12,
  8,
  13,
  14,
  15,
  16,
  17,
  18,
  1,
  19,
  8,
  20,
  4,
  21,
  6,
  22,
  23,
  24,
  1,
  19,
  8,
  20,
  16,
  25,
  26,
  11,
  18,
  1,
  19,
  8,
  27,
  28,
  29,
  30,
  12,
  1,
  31,
  16,
  32,
  33,
  1,
  34,
  31,
  16,
  35,
  36,
  37,
  38,
  39,
  35,
  6,
  37,
  38,
  39,
  40,
  37,
  41,
  1,
  42,
  43,
  37,
  44,
  45,
  19,
  8,
  27,
  37,
  26,
  29,
  46,
  20,
  47,
  48,
  1,
  49,
  8,
  50,
  37,
  32,
  51,
  52,
  53,
  54,
  1,
  55,
  56,
  57,
  58,
  16,
  37,
  38,
  39,
  6,
  59,
  60,
  61,
  6,
  62,
  63,
  64,
  65,
  66,
  67,
  1,
  63,
  68,
  69,
  67,
  70],
 [1,
  71,
  20,
  4,
  72,
  73,
  74,
  75,
  76,
  6,
  77,
  70,
  78,
  79,
  80,
  1,
  81,
  82,
  6,
  1,
  81,
  1,
  83,
  84,
  85,
  1,
  86,
  87,
  88,
  89,
  90,
  6,
  91,
  92,
  93,
  37,
  72,
  1,
  2,
  94,
  95,
  96,
  37,
  97,
  98,
  37,
  99,
  100,
  101,
  8

In [12]:
max_length = 75   #maximum length of a sentence
padded_reviews = pad_sequences(encoded_texts, maxlen=max_length, padding='post') #padding sentence to make length 75.
print(padded_reviews)

[[   12     1    31 ...    69    67    70]
 [    6     8     1 ...   146   147   148]
 [  141   331   146 ...     6   358   365]
 ...
 [    1   371    37 ...     0     0     0]
 [  190    56 26218 ...  5932 10916 10917]
 [    6    37  5337 ...  1254   133  3706]]


## Model

In [23]:
embeded_vector_size = 16   #dimensions of embedding vector
vocab_size = len(vocab)

model = Sequential()
model.add(Input(shape=(75,)))
model.add(Embedding(vocab_size, embeded_vector_size, name="embedding"))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [24]:
X = tf.convert_to_tensor(padded_reviews)
y = tf.convert_to_tensor(labels)

In [25]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

None


In [21]:
model.fit(X, y, epochs=50, verbose=0)

<keras.src.callbacks.history.History at 0x17f09b9d0>

In [22]:
loss, accuracy = model.evaluate(X, y)
accuracy

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 464us/step - accuracy: 1.0000 - loss: 5.9517e-05


1.0

In [26]:
weights = model.get_layer('embedding').get_weights()[0]     #get the embeddings of all the words in dictionary
len(weights)

52033

In [57]:
weights[100]

array([-0.01382252,  0.0206104 , -0.02311552, -0.02308308,  0.01610355,
        0.03104476,  0.03148444,  0.03826911, -0.01811688,  0.02804099,
        0.01905943, -0.02987733, -0.0172006 , -0.01985987,  0.02242563,
        0.02372732], dtype=float32)

## Visualizing the Embeddings

In [103]:
i,j,k,l = vocab['man'],vocab['france'],vocab['woman'],vocab['america']  #getting indices for given words for plotting
print(i,j,k,l)
indx = [i,j,k,l]

814 32 424 923


In [104]:
vectors = [weights[a] for a in indx]


In [107]:
import plotly.figure_factory as ff
from sklearn.decomposition import PCA
import numpy as np

data = np.array(vectors[:])

pca = PCA(n_components=2)  
reduced_vectors = pca.fit_transform(data)   #reducing vector dimensions from 16 to 2
x = [item[0] for item in reduced_vectors]   
y = [item[1] for item in reduced_vectors]
u = x  #projection of vector along x-axis
v = y  #projection of vector along y-axis

fig = ff.create_quiver(x, y, u, v)

labels = ['Man', 'France', 'Woman', 'America']
for i, label in enumerate(labels):
    fig.add_annotation(x=x[i], y=y[i], text=label, showarrow=True, arrowhead=2, ax=0, ay=-40)

fig.show()



## Results

France and America(both being countries) are nearly parallel i.e angle between them is almost zero. Also Man and Woman have a large angle between them (greater than 120 degrees).