### Data Featurization/Vectorization using embeddings

In [1]:
# installing transformers library
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 34.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 57.5 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 74.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 67.1 MB/s 
Building wheels for collected 

In [2]:
# Import all the required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [3]:
# Load the pretrained sentence-embedding model used as the vectorizer.
# The checkpoint identifier is kept in one place so it is easy to spot and swap.
MODEL_NAME = 'all-MiniLM-L6-v2'
vectorizer = SentenceTransformer(MODEL_NAME)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
# Load the cleaned/preprocessed splits: tab-separated question frames and
# NumPy label arrays (train first, then test).
X_train, X_test = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in ('X_train.tsv', 'X_test.tsv')
)
y_train, y_test = (
    np.load(npy_path) for npy_path in ('y_train.npy', 'y_test.npy')
)

In [6]:
# Creating embeddings for vectorizing the data
def getEmbeddings(data, model=None):
  """Embed the two question columns and join them feature-wise.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing the text columns 'question1' and 'question2'.
  model : object, optional
      Encoder exposing ``encode(list_of_str)`` and returning one row of
      features per input string. When omitted, the notebook-level
      ``vectorizer`` is used, which matches the original behaviour.

  Returns
  -------
  numpy.ndarray
      One row per record: the question1 embedding followed by the
      question2 embedding (horizontally stacked).

  Raises
  ------
  KeyError
      If either question column is missing from ``data``.
  """
  encoder = vectorizer if model is None else model

  def _as_text(column):
    # read_csv parses empty fields as NaN (a float); normalise those to an
    # empty string and force everything to str so the encoder only sees text.
    return data[column].fillna('').astype(str).tolist()

  embeddingQ1 = encoder.encode(_as_text('question1'))
  embeddingQ2 = encoder.encode(_as_text('question2'))
  # Side-by-side concatenation: columns of Q1 first, then columns of Q2.
  return np.hstack([embeddingQ1, embeddingQ2])

In [7]:
%%time
# Embed both splits with the same routine (train first, then test).
X_train_vectorized, X_test_vectorized = map(getEmbeddings, (X_train, X_test))

CPU times: user 54 s, sys: 2.35 s, total: 56.4 s
Wall time: 56.4 s


In [8]:
# Sanity check: container type and (n_rows, n_features) of the training matrix.
# getEmbeddings stacks the question1 and question2 embeddings side by side,
# so the feature count is the sum of the two embedding widths.
type(X_train_vectorized), X_train_vectorized.shape

(numpy.ndarray, (76564, 768))

In [9]:
# Same sanity check for the test matrix; its feature count should match
# the training matrix because both come from getEmbeddings.
type(X_test_vectorized), X_test_vectorized.shape

(numpy.ndarray, (10108, 768))

In [10]:
# Display the generated embedding for a sample record (the first training row)
X_train_vectorized[0]

array([-1.73077248e-02, -1.15620522e-02, -4.47018556e-02, -4.03273813e-02,
        1.55057311e-02, -3.03726755e-02, -3.34449671e-02, -2.45147292e-02,
       -4.14966680e-02,  1.71340127e-02,  4.26926948e-02,  3.55423726e-02,
       -9.32996720e-02,  3.53590287e-02, -3.63464653e-02, -1.16587095e-02,
       -3.65620442e-02, -1.62450802e-02,  5.72150201e-02, -5.28688803e-02,
        9.97331738e-03,  4.66389507e-02, -3.38801593e-02, -1.73442215e-02,
       -1.77384645e-03,  3.48254964e-02,  1.29439562e-04, -7.63141457e-03,
        5.35183288e-02, -5.82148060e-02,  9.54154995e-04,  8.08920041e-02,
        2.01608539e-02, -2.22493410e-02, -6.89876005e-02,  7.50955865e-02,
        1.03642568e-01, -2.19307225e-02,  9.11987722e-02,  1.42452987e-02,
        4.99857329e-02, -8.62038583e-02, -6.05892688e-02, -6.08596988e-02,
        2.70989171e-04, -1.57031175e-02, -3.09687145e-02, -1.18762046e-01,
       -3.04719293e-03, -3.61865982e-02,  1.16090355e-02, -4.72569726e-02,
        5.82350092e-03, -

In [11]:
# Persist both embedding matrices as .npy files (train first, then test).
for npy_path, matrix in (
    ('X_train_vectorized.npy', X_train_vectorized),
    ('X_test_vectorized.npy', X_test_vectorized),
):
    np.save(npy_path, matrix)

In [20]:
# copying files from current session in colab to google drive
# !cp -r /content/y_train.npy /content/drive/MyDrive/ML_Project/