In [1]:
# Configure the TensorFlow session used by Keras so that GPU memory is
# allocated on demand (allow_growth) rather than reserved up front.
# NOTE(review): the remaining cells use PyTorch; presumably this keeps
# TensorFlow from claiming the whole GPU that the encoder also needs -- confirm
# this cell is still required.
import keras.backend as K
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

Using TensorFlow backend.


In [1]:
from InferSent.models import InferSent
import time
import torch
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# V is interpolated into the checkpoint filename and also passed to the
# encoder as its 'version' setting.
# NOTE(review): the InferSent README pairs version 1 with GloVe vectors and
# version 2 with fastText vectors; this cell loads version 2 together with a
# GloVe file -- confirm that combination is intended.
V = 2
MODEL_PATH = './InferSent/encoder/infersent{}.pkl'.format(V)
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)

# Instantiate the encoder and restore its weights from the checkpoint file.
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

# Tell the encoder where the word-vector file lives; the vocabulary itself is
# built in later cells.
W2V_PATH = './InferSent/dataset/GloVe/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

In [3]:
# Load the dataset. Later cells read its 'question', 'reference_ans' and
# 'student_ans' columns. The preview below also shows an 'Unnamed: 0' column,
# presumably a row index that was written out when the CSV was saved.
data = pd.read_csv("SemEval Full Data.csv")

In [4]:
# Preview the first rows to check the column names used by the cells below.
data.head()

Unnamed: 0,qid,ref_id,category,question,reference_ans,student_ans,acccuracy
0,BULB_C_VOLTAGE_EXPLAIN_WHY1,answer204,BEST,Explain why you got a voltage reading of 1.5 f...,Terminal 1 and the positive terminal are separ...,The positive battery terminal is separated by ...,correct
1,BULB_C_VOLTAGE_EXPLAIN_WHY1,answer204,BEST,Explain why you got a voltage reading of 1.5 f...,Terminal 1 and the positive terminal are separ...,terminal one was positive,incorrect
2,BULB_C_VOLTAGE_EXPLAIN_WHY1,answer204,BEST,Explain why you got a voltage reading of 1.5 f...,Terminal 1 and the positive terminal are separ...,terminal 1 is positive,incorrect
3,BULB_C_VOLTAGE_EXPLAIN_WHY1,answer204,BEST,Explain why you got a voltage reading of 1.5 f...,Terminal 1 and the positive terminal are separ...,i have absolutely no idea,incorrect
4,BULB_C_VOLTAGE_EXPLAIN_WHY1,answer204,BEST,Explain why you got a voltage reading of 1.5 f...,Terminal 1 and the positive terminal are separ...,the terminal is separated from the positive ba...,correct


In [5]:
# Cast the three text columns to str before they are handed to the encoder.
# Note that astype(str) turns missing values into the literal string 'nan'.
for text_col in ('question', 'reference_ans', 'student_ans'):
    data[text_col] = data[text_col].astype(str)

In [6]:
# Initialise the encoder vocabulary from the question texts; the output
# reports how many of the words have a word vector.
infersent.build_vocab(data['question'], tokenize=True)

Found 1390(/1406) words with w2v vectors
Vocab size : 1390


In [7]:
# Extend the vocabulary with words from the reference answers; the output
# reports how many words were added.
infersent.update_vocab(data['reference_ans'], tokenize=True)

Found 318(/323) words with w2v vectors
New vocab size : 1708 (added 318 words)


In [8]:
# Extend the vocabulary with words from the student answers; the output
# reports how many words were added.
infersent.update_vocab(data['student_ans'], tokenize=True)

Found 2846(/3010) words with w2v vectors
New vocab size : 4554 (added 2846 words)


In [9]:
# Move the encoder to the GPU when CUDA is available. Without this guard the
# cell raises on a machine that has no usable GPU; with it the encoder simply
# stays where it was loaded.
if torch.cuda.is_available():
    infersent = infersent.cuda()

In [11]:
def get_embeddings(data, batch_size, ltype=None):
    """Encode the question, reference-answer and student-answer columns in batches.

    Rows of ``data`` are processed in consecutive slices of ``batch_size``
    rows. For each slice, the values of the ``'question'``,
    ``'reference_ans'`` and ``'student_ans'`` columns are passed to the
    module-level ``infersent.encode(..., tokenize=True)`` and the result is
    stored as float16. Progress (start offset, seconds since the previous
    batch started, cumulative shapes) is printed for every batch.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three text columns named above.
    batch_size : int
        Number of rows encoded per step; must be positive.
    ltype : object, optional
        Not used by this function. Kept (now with a default) so existing
        three-argument calls continue to work.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(q_embed, r_embed, s_embed)`` with one row per input row. The row
        index restarts at 0 for every batch; call ``reset_index(drop=True)``
        if a continuous index is needed. Three empty frames are returned when
        ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    columns = ('question', 'reference_ans', 'student_ans')
    # Per-batch frames are collected here and concatenated once at the end;
    # concatenating inside the loop would re-copy all earlier rows each time.
    frames = {col: [] for col in columns}
    rows_done = 0

    prev = time.time()
    for i in range(0, data.shape[0], batch_size):
        print(i)
        print("Time : ", time.time()-prev)
        prev = time.time()

        for col in columns:
            batch = infersent.encode(data[col].values[i:i+batch_size], tokenize=True)
            # float16 halves the memory of the stored embeddings.
            frames[col].append(pd.DataFrame(np.asarray(batch, dtype=np.float16)))

        # Release cached GPU memory between batches.
        torch.cuda.empty_cache()

        rows_done += frames['question'][-1].shape[0]
        print(*((rows_done, frames[col][-1].shape[1]) for col in columns))

    if not frames['question']:
        # No rows at all: nothing was encoded, so hand back empty frames.
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    q_embed, r_embed, s_embed = (pd.concat(frames[col], axis=0) for col in columns)
    return q_embed, r_embed, s_embed

In [12]:
# Encode every row in batches of 500. The third argument is accepted by
# get_embeddings but is not used inside it.
q_embed, r_embed, s_embed = get_embeddings(data,500,'Train_3rd')

0
Time :  0.0001347064971923828
(500, 4096) (500, 4096) (500, 4096)
500
Time :  2.3983142375946045
(1000, 4096) (1000, 4096) (1000, 4096)
1000
Time :  2.1294097900390625
(1500, 4096) (1500, 4096) (1500, 4096)
1500
Time :  3.2153069972991943
(2000, 4096) (2000, 4096) (2000, 4096)
2000
Time :  4.707529067993164
(2500, 4096) (2500, 4096) (2500, 4096)
2500
Time :  4.334707975387573
(3000, 4096) (3000, 4096) (3000, 4096)
3000
Time :  2.417628288269043
(3500, 4096) (3500, 4096) (3500, 4096)
3500
Time :  2.780672311782837
(4000, 4096) (4000, 4096) (4000, 4096)
4000
Time :  2.897702217102051
(4500, 4096) (4500, 4096) (4500, 4096)
4500
Time :  2.8903608322143555
(5000, 4096) (5000, 4096) (5000, 4096)
5000
Time :  2.895291328430176
(5500, 4096) (5500, 4096) (5500, 4096)
5500
Time :  3.7608535289764404
(6000, 4096) (6000, 4096) (6000, 4096)
6000
Time :  3.514214038848877
(6500, 4096) (6500, 4096) (6500, 4096)
6500
Time :  3.575150489807129
(7000, 4096) (7000, 4096) (7000, 4096)
7000
Time :  3.828

In [13]:
# Check that the three embedding frames have matching shapes.
q_embed.shape, r_embed.shape, s_embed.shape

((48249, 4096), (48249, 4096), (48249, 4096))

In [14]:
# Write the question embeddings as comma-separated values without the row index.
q_embed.to_csv("SemEval_Question_Infersent_Embeddings.csv", index=False)

In [15]:
# Write the reference-answer embeddings as comma-separated values without the row index.
r_embed.to_csv("SemEval_Ref_Ans_Infersent_Embeddings.csv", index=False)

In [16]:
# Write the student-answer embeddings as comma-separated values without the row index.
s_embed.to_csv("SemEval_Stu_Ans_Infersent_Embeddings.csv", index=False)