In [5]:
# Imports

from random import randint

import numpy as np
import torch
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pandas as pd
from tqdm import tqdm

## Using Infersent ( glove & word2vec)

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
!wget https://raw.githubusercontent.com/facebookresearch/InferSent/master/models.py
    

--2019-04-27 16:31:08--  https://raw.githubusercontent.com/facebookresearch/InferSent/master/models.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29786 (29K) [text/plain]
Saving to: ‘models.py’


2019-04-27 16:31:08 (93.2 MB/s) - ‘models.py’ saved [29786/29786]



In [30]:
!wget https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl
!wget https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  63.8M      0  0:00:02  0:00:02 --:--:-- 63.8M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  55.7M      0  0:00:02  0:00:02 --:--:-- 55.7M


### Load model

In [3]:

from models import InferSent
model_version = 1
MODEL_PATH = "../models/encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

In [4]:
# Keep it on CPU or put it on GPU
use_cuda = True
model = model.cuda() if use_cuda else model

In [5]:
# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = '../data/glove/glove.840B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [6]:
%%time
# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000
CPU times: user 6.91 s, sys: 716 ms, total: 7.62 s
Wall time: 15.4 s


## load sentences

In [6]:
df = pd.read_csv("../data/train_data_1000.csv")
df.head()

Unnamed: 0,image_id,ques_id,question,answer
0,458752,458752000,What is this photo taken looking through?,net
1,458752,458752001,What position is this man playing?,pitcher
2,458752,458752002,What color is the players shirt?,orange
3,458752,458752003,Is this man a professional baseball player?,yes
4,262146,262146000,What color is the snow?,white


In [7]:
vocab=[]
sentences=[]
for _,row in tqdm(df.iterrows()):
    vocab.append(row.question + " " + str(row.answer))
    sentences.append(row.question)

368900it [00:36, 10077.02it/s]


In [8]:
sentences[:5]

['What is this photo taken looking through?',
 'What position is this man playing?',
 'What color is the players shirt?',
 'Is this man a professional baseball player?',
 'What color is the snow?']

## Encode sentences

In [10]:
%%time

embeddings = model.encode(sentences, bsize=1500, tokenize=True, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 3368208/3372447 (99.9%)
Speed : 3150.0 sentences/s (gpu mode, bsize=1500)
nb sentences encoded : 368900
CPU times: user 1min 30s, sys: 32.4 s, total: 2min 2s
Wall time: 1min 57s


In [11]:
%%time
# Save the model
np.save("../data/features_infersent_1000.npy",embeddings)

CPU times: user 0 ns, sys: 3.78 s, total: 3.78 s
Wall time: 3.95 s


In [12]:
len(embeddings)

368900

## Measure distance between two sentences

In [9]:

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [139]:

cosine(model.encode(['What color is the players shirt?'])[0], model.encode(["What color shirt is the player wearing?"])[0])

0.91577494

# Using BERT ( use keras kernel/env to run the below code )

In [1]:
# imports
from bert_serving.client import BertClient
import numpy as np



In [2]:
bc = BertClient()

In [3]:
a=bc.encode(['What color is the players shirt?', 'What color shirt is the player wearing?'])

In [10]:
cosine(a[0],a[1])

0.93905

In [11]:

# Get sentence embeddings
embeddings=bc.encode(sentences)
np.save("../data/features_bert_1000.npy",embeddings)

In [None]:
https://www.analyticsvidhya.com/blog/2019/03/learn-to-use-elmo-to-extract-features-from-text/

In [None]:
https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

In [None]:
https://github.com/strongio/keras-bert/blob/master/keras-bert.ipynb

In [None]:
https://github.com/strongio/keras-elmo/blob/master/Elmo%20Keras.ipynb