### Data Featurization/Vectorization using embeddings

In [1]:
# installing transformers library
!pip install -U sentence-transformers

Collecting sentence-transformers
  Using cached sentence-transformers-2.2.2.tar.gz (85 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torchvision
  Downloading torchvision-0.14.0-cp39-cp39-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 7.0 MB/s eta 0:00:00
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 11.9 MB/s eta 0:00:00
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125925 sha256=48178f4b3512ba9d2c6960c979238ba567416b9a48bfc8b3417b7d25cc1e316f
  Stored in directory: c:\users\rishikesh\appdata\local\pip\cache\wheels\71\67\

In [7]:
# Import all the required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
pd.options.display.max_colwidth = None

In [3]:
# SentenceTransformer model
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [4]:
# Reading the cleaned and preprocessed training and testing data
X_train = pd.read_csv('X_train.tsv', sep='\t')
X_test = pd.read_csv('X_test.tsv', sep='\t')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(394179, 2) (10108, 2) (394179,) (10108,)


In [15]:
len(vectorizer.encode(['this is how it is done', 'ok cool am fine with it']))

2

In [17]:
X_train.shape

(394179, 2)

In [21]:
len(vectorizer.encode(X_train.question1.values[:10])[0])

384

In [23]:
# Creating embeddings for vectorizing the data
def getEmbeddings(data):
    q1_list = data['question1'].values
    q2_list = data['question2'].values
    embeddingQ1 = vectorizer.encode(q1_list)
    embeddingQ2 = vectorizer.encode(q2_list)
    print(f'shape of embeddingQ1 is {embeddingQ1.shape} and that of embeddingQ2 is {embeddingQ2.shape}')
    combinedEmbeddings = np.hstack([embeddingQ1, embeddingQ2])
    print(f'shape of combinedEmbeddings is {combinedEmbeddings.shape}')
    return combinedEmbeddings

In [26]:
%%time
# vectorizing the data
#X_train_vectorized = getEmbeddings(X_train)
#X_test_vectorized = getEmbeddings(X_test)

shape of embeddingQ1 is (394179, 384) and that of embeddingQ2 is (394179, 384)
shape of combinedEmbeddings is (394179, 768)
Wall time: 49min 9s


In [35]:
type(X_train_vectorized), X_train_vectorized.shape, y_train.shape

(numpy.ndarray, (394179, 768), (394179,))

In [36]:
type(X_test_vectorized), X_test_vectorized.shape, y_test.shape

(numpy.ndarray, (10108, 768), (10108,))

In [34]:
# display the generate embeddings for a sample record
X_train_vectorized[0]

array([-1.73077732e-02, -1.15619944e-02, -4.47018966e-02, -4.03273925e-02,
        1.55057255e-02, -3.03726848e-02, -3.34449820e-02, -2.45147198e-02,
       -4.14966196e-02,  1.71340071e-02,  4.26927209e-02,  3.55424024e-02,
       -9.32996944e-02,  3.53590213e-02, -3.63464728e-02, -1.16587244e-02,
       -3.65620442e-02, -1.62451006e-02,  5.72149828e-02, -5.28689176e-02,
        9.97329783e-03,  4.66389507e-02, -3.38801593e-02, -1.73442084e-02,
       -1.77388988e-03,  3.48255448e-02,  1.29412598e-04, -7.63145043e-03,
        5.35183325e-02, -5.82148135e-02,  9.54163319e-04,  8.08919817e-02,
        2.01608576e-02, -2.22493093e-02, -6.89876452e-02,  7.50955790e-02,
        1.03642575e-01, -2.19306834e-02,  9.11987424e-02,  1.42452875e-02,
        4.99857143e-02, -8.62038210e-02, -6.05892837e-02, -6.08597100e-02,
        2.70988094e-04, -1.57031417e-02, -3.09687424e-02, -1.18762076e-01,
       -3.04717803e-03, -3.61866355e-02,  1.16090495e-02, -4.72569354e-02,
        5.82347345e-03, -

In [31]:
# Saving the vectorized data to numpy files
np.save('X_train_vectors.npy', X_train_vectorized)
np.save('X_test_vectors.npy', X_test_vectorized)

In [20]:
# copying files from current session in colab to google drive
# !cp -r /content/y_train.npy /content/drive/MyDrive/ML_Project/