In [1]:
import pandas as pd
import torch
from tqdm import tqdm 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training and validation splits from their JSON dumps.
tdf, vdf = (
    pd.read_json(path)
    for path in ("../dataset/traindf.json", "../dataset/valdf.json")
)

In [3]:
def returnSynthSentences(df):
    """Build one space-joined sentence per row of ``df["Hinglish_csnliop"]``.

    Every row is iterated as a sequence of token records, and the element at
    index 1 of each record is used as the word (presumably the surface form
    of the token -- confirm against the dataset schema).

    Returns:
        list[str]: one sentence per row, in row order. A row without tokens
        yields an empty string.
    """
    return [
        " ".join(record[1] for record in row)
        for row in df["Hinglish_csnliop"]
    ]

In [4]:
def returnHumSentences(df):
    """Build the reference sentences for each row of ``df["hum_gen_csnliop"]``.

    Every row holds several tokenised references. Each reference is iterated
    as a sequence of token records whose element at index 1 is used as the
    word (presumably the surface form -- confirm against the dataset schema),
    and the words are joined with single spaces.

    Returns:
        list[list[str]]: for every row, the list of its reference sentences,
        preserving both row order and reference order.
    """
    return [
        [" ".join(record[1] for record in reference) for reference in row]
        for row in df["hum_gen_csnliop"]
    ]

In [5]:
# Turn the token columns of both splits into plain sentences: one string per
# row for the synthetic text, a list of strings per row for the human
# references.
trainSynth, validSynth = (returnSynthSentences(frame) for frame in (tdf, vdf))
trainHuman, validHuman = (returnHumSentences(frame) for frame in (tdf, vdf))

# Sanity check: print the number of rows extracted for each collection.
print(*(len(group) for group in (trainSynth, trainHuman, validSynth, validHuman)))

2766 2766 395 395


In [6]:
# Load the pretrained XLM-R large checkpoint through torch.hub. Use the GPU
# when one is available and fall back to the CPU otherwise, so the notebook
# still runs (slowly) on machines without CUDA instead of failing at `.cuda()`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large').to(device)
# Put the model in evaluation mode; as the last expression of the cell this
# also displays the model summary.
xlmr.eval()

Using cache found in /home/akshay.goindani/.cache/torch/hub/pytorch_fairseq_main
2022-05-07 17:34:35 | INFO | fairseq.file_utils | loading archive file http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz from cache at /home/akshay.goindani/.cache/torch/pytorch_fairseq/3f864e15bb396f062dd37494309dbc4238416edd1f8ef293df18b1424813f2fe.cf46c7deb6b9eaa3e47c17b9fc181669c52bc639c165fbc69166a61487662ac9
2022-05-07 17:34:40 | INFO | fairseq.tasks.multilingual_masked_lm | dictionary: 250001 types


RobertaHubInterface(
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (sentence_encoder): TransformerEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(250002, 1024, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 1024, padding_idx=1)
        (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0): TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (d

In [7]:
def _embedSentence(text):
    """Return the mean-pooled XLM-R feature vector of one sentence.

    The sentence is stripped of surrounding whitespace, encoded with the
    notebook-level ``xlmr`` model, and the extracted features are averaged
    over dim 1 (presumably the token axis of a (1, seq_len, hidden) tensor --
    TODO confirm against the fairseq hub interface).
    """
    tokens = xlmr.encode(text.strip())
    features = xlmr.extract_features(tokens)
    return features.mean(dim=1)


def returnSentenceVectors(sents):
    """Embed every entry of ``sents`` and stack the results along dim 0.

    Args:
        sents: iterable whose items are either a single sentence (``str``) or
            a ``list`` of reference sentences. A list is reduced to a single
            row by averaging the vectors of its sentences.

    Returns:
        torch.Tensor: one row per entry of ``sents``, in input order.

    Raises:
        ValueError: if ``sents`` is empty or one of its entries is an empty
            list. Both cases used to fail inside ``torch.cat`` with an
            error that did not say which entry was at fault.
    """
    with torch.no_grad():
        vectors = []
        for idx, line in enumerate(tqdm(sents)):
            if isinstance(line, list):
                if not line:
                    raise ValueError(
                        f"entry {idx} has no reference sentences to average"
                    )
                # Stack the per-reference vectors, then collapse them into a
                # single row so list entries and plain sentences line up.
                refs = torch.cat([_embedSentence(st) for st in line], dim=0)
                vectors.append(refs.mean(dim=0, keepdim=True))
            else:
                vectors.append(_embedSentence(line))

        if not vectors:
            raise ValueError("sents is empty; nothing to embed")
        return torch.cat(vectors, dim=0)

In [8]:
# Embed the synthetic sentences of the training and validation splits.
trainSynthVecs, validSynthVecs = (
    returnSentenceVectors(split) for split in (trainSynth, validSynth)
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2766/2766 [00:53<00:00, 51.50it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 395/395 [00:07<00:00, 52.16it/s]


In [9]:
# Persist the synthetic-sentence embeddings to the working directory.
torch.save(obj=trainSynthVecs, f="train.synth.pt")
torch.save(obj=validSynthVecs, f="valid.synth.pt")

In [10]:
# Embed the human references of the training split; each row's references
# are averaged into a single vector by returnSentenceVectors.
trainHumanVecs = returnSentenceVectors(sents=trainHuman)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2766/2766 [02:08<00:00, 21.60it/s]


In [None]:
# Embed the human references of the validation split; each row's references
# are averaged into a single vector by returnSentenceVectors.
validHumanVecs = returnSentenceVectors(sents=validHuman)

 68%|████████████████████████████████████████████████████████████████████████████████████████▌                                         | 269/395 [00:12<00:05, 21.72it/s]

In [None]:
# Persist the human-reference embeddings to the working directory.
torch.save(obj=trainHumanVecs, f="train.human.pt")
torch.save(obj=validHumanVecs, f="valid.human.pt")