In [1]:
# Mount Google Drive (Colab-only API) and change into the project folder.
# Later cells open data files by relative path, so they presumably expect this
# folder to be the working directory — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')
%cd drive/'My Drive'/UniversalEmb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/UniversalEmb


In [0]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import time
from math import sqrt
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

In [3]:
# libomp-dev is installed before faiss, presumably as a native dependency of it — TODO confirm.
!apt install libomp-dev
# NOTE(review): both `faiss` and `faiss-gpu` are installed here, unpinned. They
# presumably ship the same `faiss` module, so consider installing only one of
# them with a pinned version to keep the environment reproducible.
!python -m pip install --upgrade faiss faiss-gpu

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.
Requirement already up-to-date: faiss in /usr/local/lib/python3.6/dist-packages (1.5.3)
Requirement already up-to-date: faiss-gpu in /usr/local/lib/python3.6/dist-packages (1.6.3)


In [0]:
import faiss
import torch
# Fix the global PyTorch RNG seed.
torch.manual_seed(0)
import numpy as np
# Fix the global NumPy RNG seed.
np.random.seed(0)

import tensorflow as tf
import tensorflow_hub as hub

In [0]:
# TF-Hub handle of the sentence encoder; hub.load() fetches it and returns a
# callable model object that is kept at module level for reuse.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model_USE = hub.load(module_url)


def USE_Embeder(inp):
  """Run the module-level `model_USE` on `inp` and return whatever it returns.

  Callers in this notebook pass a column of texts and call `.numpy()` on the
  result.
  """
  embeddings = model_USE(inp)
  return embeddings

In [0]:
def get_k_near_neibs(train_vectors, augmented_vectors, k=5, gpu=False):
  """Find, for every train vector, its `k` nearest augmented vectors.

  An exact (brute-force) L2 index is built over `augmented_vectors` and then
  queried with `train_vectors`.

  Args:
    train_vectors: 2-D array-like of query vectors, one row per train sample.
    augmented_vectors: 2-D array-like of vectors to search in; must have the
      same number of columns as `train_vectors`.
    k: number of neighbours to return per train vector.
    gpu: when True and the installed faiss build reports at least one GPU,
      the index is moved to the GPU(s); otherwise the CPU index is used.

  Returns:
    (D, I): two arrays of shape (len(train_vectors), k). `I` holds row
    positions in `augmented_vectors`, `D` the matching distances. Per the
    faiss documentation, IndexFlatL2 reports squared L2 distances, and slots
    without a neighbour (fewer than `k` indexed vectors) get index -1.
  """
  # faiss only accepts C-contiguous float32 matrices; coerce so float64 or
  # non-contiguous inputs do not fail inside the native code.
  train_vectors = np.ascontiguousarray(train_vectors, dtype=np.float32)
  augmented_vectors = np.ascontiguousarray(augmented_vectors, dtype=np.float32)

  index = faiss.IndexFlatL2(train_vectors.shape[1])   # build the index
  # Honour the `gpu` flag only when this faiss build actually exposes GPUs,
  # so gpu=True on a CPU-only runtime keeps working as before.
  if gpu and hasattr(faiss, 'get_num_gpus') and faiss.get_num_gpus() > 0:
    index = faiss.index_cpu_to_all_gpus(index)
  index.add(augmented_vectors)
  D, I = index.search(train_vectors, k)
  print("Number of neibs : {}".format(I.shape[0]*I.shape[1]))
  return D, I

In [0]:
def filter_samples_by_dist(distances, indices, augs, train, max_dist=1.1):
  """Keep the augmented samples whose neighbour distance is below a threshold.

  Args:
    distances: 2-D array of neighbour distances, one row per train sample.
    indices: 2-D integer array of the same shape holding row positions in
      `augs`. Negative entries are treated as "no neighbour" and skipped.
    augs: DataFrame with `text`, `labels` and `lbl_index` columns. Its index
      is reset first, so `indices` are interpreted as positions.
    train: the train samples the rows of `distances` belong to. Only its
      length is used, to cap how many rows are processed.
    max_dist: exclusive upper bound on the accepted distance.

  Returns:
    DataFrame with columns `text`, `lbl_index`, `labels`: one row per kept
    (train row, neighbour) pair, in row-major order of `indices`. A sample
    that is close to several train rows therefore appears several times.
  """
  distances = np.asarray(distances)
  indices = np.asarray(indices)

  # Never look at more rows than there are train samples.
  n_rows = min(len(indices), len(train))
  distances = distances[:n_rows]
  indices = indices[:n_rows]

  # A boolean mask is used instead of multiplying the indices by +1/-1:
  # a sign flip cannot mark index 0 as rejected (0 * -1 == 0) and would turn
  # a -1 "no neighbour" placeholder into the valid index 1.
  keep = (distances < max_dist) & (indices >= 0)
  kept_ind = indices[keep]  # flattened in row-major order

  augs = augs.reset_index(drop=True)
  selected = augs.iloc[kept_ind]
  augmented_text = selected['text'].tolist()
  augmented_labels = selected['labels'].tolist()
  augmented_lbl_index = selected['lbl_index'].tolist()
  print('Number of samples after filting by distance : {}'.format(len(augmented_text)))
  return pd.DataFrame({'text': augmented_text, 'lbl_index':augmented_lbl_index, 'labels':augmented_labels})

In [0]:
from tqdm import tqdm
def filter_samples_with_USE(train, all_aug, name, k=2, gpu=False, batch=15000, n=2):
  """Filter augmented samples by embedding distance to the train samples.

  The train texts are embedded once with `USE_Embeder`. The augmented frame is
  then processed in chunks of `batch` rows: each chunk is embedded, the `k`
  nearest chunk rows are looked up for every train vector, and the hits are
  filtered with `filter_samples_by_dist`. The filtered chunks are concatenated
  and written to 'USE_filtring_sst2_<name[0]>_<name[1]>.csv' in the working
  directory.

  Args:
    train: DataFrame with a `text` column (and whatever
      `filter_samples_by_dist` needs from it).
    all_aug: DataFrame of augmented samples with `text`, `labels` and
      `lbl_index` columns.
    name: indexable with at least two items, used in the output file name.
    k: number of neighbours searched per train vector.
    gpu: forwarded to `get_k_near_neibs`.
    batch: number of augmented rows embedded and searched at once.
    n: number of slices the train vectors are split into for the search.

  Returns:
    The concatenated DataFrame that was written to the CSV file.
  """
  filtred_augs = []
  train_vectors = USE_Embeder(train.text).numpy()
  # array_split keeps every train vector even when len(train) % n != 0.
  train_parts = [part for part in np.array_split(train_vectors, n) if len(part)]

  for b in tqdm(range(0, len(all_aug), batch)):
    # Slice the argument, not the notebook-level `all_augs` global.
    part_of_aug = all_aug.iloc[b:b+batch]
    aug_vectors = USE_Embeder(part_of_aug.text).numpy()

    # Separate names for the accumulators: reusing the loop variable for the
    # index list would overwrite the list with an int.
    dists, inds = [], []
    for tr_vect_part in train_parts:
      dist, ind = get_k_near_neibs(tr_vect_part, aug_vectors, k, gpu)
      dists.append(dist)
      inds.append(ind)
    if not dists:
      # No train vectors at all: nothing can be matched in this chunk.
      continue

    sample = filter_samples_by_dist(np.concatenate(dists), np.concatenate(inds), part_of_aug, train)
    filtred_augs.append(sample)

  if filtred_augs:
    result = pd.concat(filtred_augs)
  else:
    # pd.concat([]) raises; write an empty, correctly shaped file instead.
    result = pd.DataFrame({'text': [], 'lbl_index': [], 'labels': []})
  result.to_csv('USE_filtring_sst2_{}_{}.csv'.format(name[0], name[1]))
  return result

In [0]:
import pandas as pd
def read_trec6(file_name, typ=0, test_file='TREC_10.label'):
  """Load a TREC train CSV and the TREC test label file.

  Args:
    file_name: path to the train CSV. Its first column is used as the index
      and the two remaining columns are renamed to `text` and `labels`.
    typ: which part of the colon-separated label tag of each test line is
      used as the label: 0 is the part before the colon, 1 the part after it
      (presumably coarse vs. fine class — confirm against the data).
    test_file: path to the test file, one "<tag> <text>" entry per line,
      decoded as windows-1252.

  Returns:
    (train, test, lbl2indx): two DataFrames with columns `text`, `labels`,
    `lbl_index`, and the label -> integer mapping built from the train labels
    in order of first appearance.

  Raises:
    KeyError: if the test file contains a label that is absent from train.
    IndexError: if a non-blank test line has no space-separated text part.
  """
  train = pd.read_csv(file_name, index_col=0)
  # Rename before touching `train.labels`, so the lookup does not depend on
  # the CSV header already using that name.
  train.columns = ['text', 'labels']
  lbl2indx = {i:j for j, i in enumerate(train.labels.unique())}
  train['lbl_index'] = train.labels.apply(lambda x:lbl2indx[x])

  labels_t, texts_t = [], []
  # Read the file once and close it deterministically.
  with open(test_file, encoding = 'windows-1252') as test_lines:
    for line in test_lines:
      # Strip only the newline: a final line without one keeps its last char.
      line = line.rstrip('\n')
      if not line.strip():
        continue  # skip blank lines
      parts = line.split(' ', 1)
      labels_t.append(parts[0].split(':')[typ])
      texts_t.append(parts[1])
  test  = pd.DataFrame({'text':texts_t, 'labels': labels_t})
  test['lbl_index'] = test.labels.apply(lambda x:lbl2indx[x])

  return train, test, lbl2indx

In [0]:
import pandas as pd
def read_trec50(file_name, typ=1, test_file='TREC_10.label'):
  """Load a TREC train CSV and the TREC test label file (second tag part by default).

  Same contract as `read_trec6`, but `typ` defaults to 1.

  Args:
    file_name: path to the train CSV. Its first column is used as the index
      and the two remaining columns are renamed to `text` and `labels`.
    typ: which part of the colon-separated label tag of each test line is
      used as the label: 0 is the part before the colon, 1 the part after it
      (presumably coarse vs. fine class — confirm against the data).
    test_file: path to the test file, one "<tag> <text>" entry per line,
      decoded as windows-1252.

  Returns:
    (train, test, lbl2indx): two DataFrames with columns `text`, `labels`,
    `lbl_index`, and the label -> integer mapping built from the train labels
    in order of first appearance.

  Raises:
    KeyError: if the test file contains a label that is absent from train.
    IndexError: if a non-blank test line has no space-separated text part.
  """
  train = pd.read_csv(file_name, index_col=0)
  # Rename before touching `train.labels`, so the lookup does not depend on
  # the CSV header already using that name.
  train.columns = ['text', 'labels']
  lbl2indx = {i:j for j, i in enumerate(train.labels.unique())}
  train['lbl_index'] = train.labels.apply(lambda x:lbl2indx[x])

  labels_t, texts_t = [], []
  # Read the file once and close it deterministically.
  with open(test_file, encoding = 'windows-1252') as test_lines:
    for line in test_lines:
      # Strip only the newline: a final line without one keeps its last char.
      line = line.rstrip('\n')
      if not line.strip():
        continue  # skip blank lines
      parts = line.split(' ', 1)
      labels_t.append(parts[0].split(':')[typ])
      texts_t.append(parts[1])
  test  = pd.DataFrame({'text':texts_t, 'labels': labels_t})
  test['lbl_index'] = test.labels.apply(lambda x:lbl2indx[x])

  return train, test, lbl2indx

In [0]:
import pandas as pd
def read_sst(file_name):
  """Load the SST-2 train CSV plus the fixed test split.

  The train file is read with its first column as index; the test split is
  always read from "sst2-splited/test.csv" as a tab-separated file. In both
  frames the columns are renamed to `text` and `lbl_index`, and a `labels`
  column ("pos" for 1, "neg" for 0) is added.

  Returns:
    (train_data, test_data, label_to_index), where label_to_index maps
    "pos" -> 1 and "neg" -> 0.
  """
  index_to_label = {1:"pos", 0:"neg"}

  def _load(path, **read_kwargs):
    # Read one split and attach the textual label column.
    frame = pd.read_csv(path, **read_kwargs)
    frame.columns = ['text', 'lbl_index']
    frame["labels"] = frame.lbl_index.apply(lambda idx: index_to_label[idx])
    return frame

  train_data = _load(file_name, index_col=0)
  test_data = _load("sst2-splited/test.csv", sep = '\t')

  label_to_index = {}
  for idx, label in index_to_label.items():
    label_to_index[label] = idx
  return train_data, test_data, label_to_index
  

In [0]:
from glob import glob 
import pandas as pd
# Sort the matches: glob() returns paths in arbitrary, filesystem-dependent
# order, which would make the row order of `all_augs` (and any positional
# slice of it) differ between runs.
files = sorted(glob('./sst2-splited/sst_Splited_3k_bert_*'))
# Each file keeps its own index (first CSV column), so index values repeat
# across files in the concatenated frame.
all_augs = pd.concat([pd.read_csv(i, index_col=0) for i in files])

In [0]:
# Load the SST-2 train/test frames and the label -> index mapping.
train_data, test_data, l2i = read_sst('sst2-splited/sst2_full_rev_3k.csv')

In [0]:
# Run the USE-based filter, passing the first 10000 augmented rows; with
# name=('3','0') the result is written to 'USE_filtring_sst2_3_0.csv'.
a = filter_samples_with_USE(train_data, all_augs[:10000], name=('3','0'))
a

In [0]:
# Show the GPU status of the current runtime.
!nvidia-smi

In [17]:
# Total number of augmented rows across all loaded files.
len(all_augs)

472482

In [0]:
# Character length of every augmented text, stored as a new column.
all_augs['text_len'] = all_augs.text.apply(len)

In [0]:
# Copy the text where text_len > 200; every other row gets the placeholder string '0'.
all_augs['text_f'] = np.where(all_augs.text_len>200, all_augs.text, '0')

In [42]:
# Shape of the subset whose text_f is the '0' placeholder (rows with text_len <= 200).
all_augs[all_augs.text_f == '0'].shape

(416440, 5)

In [24]:
# Boolean Series: True where text_f is the '0' placeholder.
all_augs.text_f == '0'

0        False
1        False
2        False
3        False
4        False
         ...  
94310    False
94311    False
94312    False
94313    False
94314    False
Name: text_f, Length: 472482, dtype: bool