In [None]:
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

This notebook can be run directly on Google Colab. 

Note that you need to specify the directory and filename where the processed documents and tokens will be saved.

To read or write files on Google Drive, you can run the code below to mount your Google Drive in the notebook:

```
from google.colab import drive
drive.mount('/content/drive')
```


You can then refer to a file on Google Drive with a path string starting with "/content/drive/MyDrive/".

In [None]:
!pip install datasets



In [None]:
# Load the MNLI data: fetch the `validation_matched` split with the `datasets` library.
from datasets import load_dataset

dataset = load_dataset("multi_nli", split="validation_matched")

# 1. Testing Data processing
Here we transform the original test set (MNLI, travel genre) into a list of tokens (unigrams, bigrams, and trigrams) and a matrix recording the occurrence of each token in each instance.

The tokens extracted from the test set in this notebook will then be used to generate error rules.

In [None]:
# Genres to process; the test-set and training-set loops below iterate over this list.
genre_to_test = ["travel"]

In [None]:
# Validation examples grouped by genre: one filtered dataset per entry of
# `genre_to_test`, so the genre list is configured in a single place instead of
# being repeated here as a hard-coded key.
data_by_genre = {
    genre: dataset.filter(lambda x: x['genre'] == genre)
    for genre in genre_to_test
}

 #### Save doc.jsonl
With the test data saved in a file, the data can be reused by reading doc.jsonl instead of downloading. 
  
`doc.jsonl` can also be used for the document detail view in the user interface by wrapping it in a regular JSON object, that is, `{"content": docs}`, where `docs` is the output of the `df.to_json()` call below.

In [None]:
import pandas as pd

# Maps the integer values of the dataset's `label` column to label names.
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}

# Iterate over the configured genres (instead of a second hard-coded list) so this
# cell stays in sync with the processing loops further down.
for genre in genre_to_test:
  genre_data = data_by_genre[genre]
  label_list = [label_map[x] for x in genre_data['label']]

  df = pd.DataFrame({
      'sentence1': genre_data['premise'],
      'sentence2': genre_data['hypothesis'],
      'gold_label': label_list,
  })

  # NOTE: despite the `.jsonl` name, orient="records" without `lines=True` writes a
  # single JSON array; the cell that reads the file back uses the same orient.
  # NOTE(review): the path does not depend on `genre`, so with several genres each
  # iteration would overwrite the previous file.
  df.to_json(path_or_buf="<specify your path>/mnli_government_travel/doc.jsonl", orient="records")


### Tokenization

In [None]:
import numpy as np

import nltk
import spacy

import os
import json

from gensim.models.phrases import Phrases
from gensim.utils import simple_preprocess

''' you can remove stop words, but we keep them in this example'''
# nltk.download('stopwords')

import gensim.corpora as corpora
from gensim.models import TfidfModel
from gensim.matutils import corpus2dense, corpus2csc

from scipy import sparse

In [None]:
# Load the small English spaCy pipeline. Only `token.lemma_` is read from the parsed
# documents in `tokenization`, so the dependency parser and the named-entity
# recognizer are disabled to avoid running them on every document.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def tokenization(df):
  '''Turn sentence pairs into per-document token lists with merged phrases.

  Each row's `sentence1` and `sentence2` are joined with " <S> ", tokenized with
  gensim's `simple_preprocess`, lemmatized with the module-level spaCy pipeline
  `nlp`, and passed through two successive gensim `Phrases` models (the second
  one trained on the output of the first).

  Stop words are NOT removed and no part-of-speech filtering is applied.

  Args:
    df: pandas DataFrame with string columns `sentence1` and `sentence2`.

  Returns:
    A list with one token list per row of `df`, in row order. An empty `df`
    yields an empty list.
  '''
  # Pair the two columns directly; this yields the same strings as iterating
  # over rows without building a Series for every row.
  linked = [s1 + " <S> " + s2 for s1, s2 in zip(df['sentence1'], df['sentence2'])]

  # Tokenization
  data_word_list = [simple_preprocess(sentence) for sentence in linked]

  print("length of data_word_list: " , len(data_word_list))
  if not data_word_list:
    # Nothing to lemmatize or to train the phrase models on; also avoids the
    # IndexError the next print would raise.
    return []
  print("length of data_word_list[0]: " , len(data_word_list[0]))

  # Lemmatization: re-join each token list into a string, parse it with `nlp`
  # and keep the lemma of every token.
  data_ready = []
  for sent in data_word_list:
      doc = nlp(" ".join(sent))
      data_ready.append([token.lemma_ for token in doc])

  # Run simple_preprocess once more, this time on the stringified lemma list of
  # each document, so the lemmas go through the same tokenizer as the raw text.
  data_ready = [simple_preprocess(str(doc)) for doc in data_ready]

  # First Phrases pass is trained on the lemmatized documents ...
  bigram = Phrases(data_ready)
  bigram_sentences = [bigram[sent] for sent in data_ready]

  # ... the second pass is trained on the output of the first.
  trigram = Phrases(bigram_sentences)
  trigram_sent = [trigram[sent] for sent in bigram_sentences]
  return trigram_sent

### Create dictionary and corpus

In [None]:
def corpus_matrix(trigram_sent):
  '''Build a sparse binary document-token matrix from tokenized documents.

  A gensim dictionary and bag-of-words corpus are created from `trigram_sent`
  and weighted with TF-IDF. Only tokens whose TF-IDF weight summed over all
  documents is at least the median of those sums are kept. For the kept tokens
  an entry is 1 where the token's TF-IDF weight in the document is positive
  and 0 otherwise.

  Prints the number of documents/terms and the shape of the dense TF-IDF matrix.

  Args:
    trigram_sent: list of token lists, one per document.

  Returns:
    A tuple `(matrix, input_columns)` where `matrix` is a
    `scipy.sparse.csr_matrix` with one row per document and one column per
    kept token, and `input_columns` is the list of kept tokens in column order.
  '''
  # Create Dictionary
  id2word = corpora.Dictionary(trigram_sent)
  # Create Corpus: Term Document Frequency
  corpus = [id2word.doc2bow(text) for text in trigram_sent]

  num_docs = id2word.num_docs
  num_terms = len(id2word.keys())
  print("num_docs, num_terms:", num_docs, num_terms)

  model = TfidfModel(corpus)
  corpus_tfidf = model[corpus]
  # Dense array with one row per term and one column per document.
  corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms, num_docs)
  print("tfidf shape:", corpus_tfidf_dense.shape)

  # Token string for every row of the dense matrix.
  vocabulary = [id2word[token_id] for token_id in id2word.keys()]

  # Keep the tokens whose total TF-IDF weight reaches the median total weight.
  tfidf_sum = corpus_tfidf_dense.sum(axis=1)
  median_val = np.median(tfidf_sum)
  keep = tfidf_sum >= median_val
  input_columns = [token for token, kept in zip(vocabulary, keep) if kept]

  # Select the kept rows with the boolean mask (no DataFrame label lookup
  # needed) and binarize: 1 where the weight is positive.
  corpus_binary = (corpus_tfidf_dense[keep] > 0).astype(int)

  # Transpose to documents x tokens. csr_matrix accepts the ndarray directly,
  # so the discouraged np.matrix wrapper is not required.
  sparse_occurrence = sparse.csr_matrix(corpus_binary.T)
  return sparse_occurrence, input_columns

In [None]:

for genre in genre_to_test:
  # Read the documents back from the jsonl file. The data can also be loaded with
  # the datasets library, as done in the 01-model_output+shap.ipynb notebook.
  print("=" * 10, genre, "=" * 10)
  df = pd.read_json(
      path_or_buf="<specify your path>/mnli_government_travel/doc.jsonl",
      orient="records",
  )

  # Tokenize the sentence pairs, then build the token-occurrence matrix.
  grams = tokenization(df)
  denseA, input_columns = corpus_matrix(grams)

  # Persist the tokens that label the matrix columns ...
  with open("<specify your path>/mnli_government_travel/input_columns.json", 'w') as output:
    json.dump({'input_columns': input_columns}, output)

  # ... and the sparse matrix itself.
  sparse.save_npz("<specify your path>/mnli_government_travel/corpus_mat.npz", denseA)

num_docs, num_terms: 1976 4917
tfidf shape: (4917, 1976)


# 2. Training Data Process
Here we count, for each token in the training set, how many training instances of each ground-truth class contain it.

The information about training data will then be used for interpreting errors in the user interface.

In [None]:
# Load the MNLI training split.
dataset_train = load_dataset("multi_nli", split="train")

In [None]:
for genre in genre_to_test:
  print("="*10, genre, "="*10)
  data = dataset_train.filter(lambda x: x['genre']==genre)

  df = pd.DataFrame()
  df['sentence1'] = data['premise']
  df['sentence2'] = data['hypothesis']
  trigram_sent = tokenization(df)

  id2word = corpora.Dictionary(trigram_sent)

  # Create Corpus: Term Document Frequency
  corpus = [id2word.doc2bow(text) for text in trigram_sent]
  num_docs = id2word.num_docs
  num_terms = len(id2word.keys())

  # Read the label column once instead of looking up a whole row
  # (`data[doc_idx]`) for every document inside the loop.
  doc_labels = data['label']

  # token_labels[t][c] counts the training documents of class c that contain token t.
  # use 3 b/c len(labels)==3
  token_labels = np.zeros(shape=(num_terms, 3))
  for doc_idx in range(num_docs):
    doc_label = doc_labels[doc_idx]
    # Each bag-of-words entry is a (token id, frequency) pair; only the id is
    # needed. `token_id` is used as the name so the builtin `id` is not
    # overwritten in the notebook's global namespace.
    for token_id, _ in corpus[doc_idx]:
      token_labels[token_id][doc_label] += 1

  # Token strings in the same order as the rows of `token_labels`.
  columns = [id2word[i] for i in id2word.keys()]

  # save stat for training set
  token_stat = {
      "token_labels": token_labels.tolist(),
      "token_list": columns
  }
  with open('<specify your path>/mnli_government_travel/train_token_stat.json', 'w') as json_output:
    json.dump(token_stat, json_output)

Loading cached processed dataset at /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-12a2fb40d31d6a28.arrow


length of data_word_list:  77350
length of data_word_list[0]:  46


