In [None]:
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

This notebook can be run directly on Google Colab. 

Note that you need to specify the directory and filename where the document projection vectors will be saved.

To read or write files on Google Drive, you can run the code below to mount your Google Drive in the notebook:

```
from google.colab import drive
drive.mount('/content/drive')
```
You can then refer to a file on Google Drive with a path string that starts with "/content/drive/MyDrive/".

In [None]:
# Uncomment if you use option 1 below (downloading MNLI with the `datasets` library):
# !pip install datasets

In [None]:
# Load the documents to embed.
# Either download MNLI again (option 1) or reload the doc.jsonl file that
# 02-doc_process saved (train+test) (option 2). In both cases each document is
# the two sentences of a pair joined by the '<S/>' separator.
doc_list = []

# Option 1: download MNLI and reconstruct the input.
# from datasets import load_dataset
# dataset = load_dataset('multi_nli', split='validation_matched')
# dataset = dataset.filter(lambda x: x['genre']=='travel')
# for item in dataset:
#     doc_list.append(item['premise']+'<S/>'+item['hypothesis'])

# Option 2: read the JSON file and reconstruct the input.
import pandas as pd
# NOTE(review): the file is named .jsonl; if it is newline-delimited JSON,
# read_json also needs lines=True -- confirm against how 02-doc_process wrote it.
df = pd.read_json(path_or_buf="<specify your path>/mnli_government_travel/doc.jsonl", orient="records")
# Join the two sentence columns pairwise instead of walking the frame with
# iterrows(), which builds a full Series object for every row.
doc_list.extend(
    first + '<S/>' + second
    for first, second in zip(df['sentence1'], df['sentence2'])
)

In [None]:
# Show the first two reconstructed documents to check the '<S/>' join.
doc_list[:2]

["Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself.<S/>Most of Mrinal Sen's work can be found in European collections.",
 'The most important directions are simply up and up leads eventually to the cathedral and fortress commanding the hilltop, and down inevitably leads to one of three gates through the wall to the new town.<S/>Go downwards to one of the gates, all of which will lead you into the cathedral.']

## Generate Document Embeddings

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.4 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 65.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 45.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 48.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (59

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding checkpoint to load.
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Encode every document in doc_list with the sentence-transformer model.
sentence_embeddings = model.encode(doc_list)

In [None]:
# Sanity check: display the shape of the embedding array.
sentence_embeddings.shape

(1976, 768)

## Dimensionality Reduction
Run the t-SNE algorithm to map the document embeddings to 2D space.

In [None]:
from sklearn.manifold import TSNE

In [None]:
# t-SNE is stochastic: without a fixed seed every run produces a different 2D
# layout. Pin random_state so the saved projection is reproducible.
TSNE_SEED = 42
X_embedded = TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(sentence_embeddings)

In [None]:
# numpy is not imported anywhere else in this notebook, so import it here;
# otherwise this cell fails with a NameError on a fresh kernel.
import numpy as np

# Write the 2D t-SNE coordinates to a comma-delimited text file.
np.savetxt("<specify your path>/mnli_government_travel/sentence_tsne.csv", X_embedded, delimiter=",")