In [None]:
%%capture
!pip install datasets
!pip install transformers
!pip install GPUtil


In [None]:
# Mount Google Drive at /content/drive; the models and CSV files used below
# are read from (and written to) paths under /content/drive/MyDrive.
# force_remount=True remounts even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import datasets
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AutoTokenizer
from transformers import TextClassificationPipeline
from GPUtil import showUtilization as gpu_usage
import pandas as pd
import numpy as np
import tqdm
import sys, os

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Release cached CUDA memory, then print the current GPU utilisation table.
torch.cuda.empty_cache()
gpu_usage()

# Prefer the first CUDA device when one is available; otherwise use the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')
print(device)

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
cuda:0


In [None]:
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

In [None]:
def test_eval(language):
  print('evaluating on',language,'data')
  test_df = pd.read_csv('/content/drive/MyDrive/final data and shapley values/'+language+'_test_4-14.csv')
  if language == 'english':
    model_name = '/content/drive/MyDrive/testing_politeness_models/english/english_model'
    tok_name = 'roberta-base'
  elif language == 'spanish':
    model_name = '/content/drive/MyDrive/testing_politeness_models/spanish/sp_model'
    tok_name = "bertin-project/bertin-roberta-base-spanish"
  elif language == 'chinese':
    model_name = '/content/drive/MyDrive/testing_politeness_models/chinese/ch_model'
    tok_name = 'hfl/chinese-roberta-wwm-ext'
  elif language == 'japanese':
    model_name = '/content/drive/MyDrive/testing_politeness_models/japanese/ja_model'
    tok_name = 'rinna/japanese-roberta-base'
  elif language == 'all':
    model_name = '/content/drive/MyDrive/testing_politeness_models/all/all_model'
    tok_name = 'xlm-roberta-base'
  tokenizer = AutoTokenizer.from_pretrained(tok_name)
  if language == 'japanese':
    tokenizer.do_lower_case = True
  tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type="regression")
  ppl = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0, function_to_apply="none", batch_size=8,**tokenizer_kwargs)
  predictions = ppl(list(test_df['Utterance']))
  predictions = [p['score'] for p in predictions]
  rmse = mean_squared_error(test_df['labels'],predictions,squared=False)
  print('test rmse:',rmse)
  corr = pearsonr(predictions,test_df['labels'])
  print('corr btwn labels and predictions:',corr)
  test_df['prediction'] = predictions
  test_df.to_csv('/content/drive/MyDrive/final data and shapley values/'+language+'_test_with_predictions.csv',index=False)

In [None]:
# Score the English model on english_test_4-14.csv; prints RMSE and Pearson r
# and saves the predictions CSV to Drive.
test_eval('english')

evaluating on english data
test rmse: 0.6407014169774128
corr btwn labels and predictions: PearsonRResult(statistic=0.6879448342141262, pvalue=4.1460139444069537e-81)


In [None]:
# Score the Spanish model on spanish_test_4-14.csv; prints RMSE and Pearson r
# and saves the predictions CSV to Drive.
test_eval('spanish')

evaluating on spanish data


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

test rmse: 0.7187393586943718
corr btwn labels and predictions: PearsonRResult(statistic=0.624081840856454, pvalue=7.367025977300054e-63)


In [None]:
# Score the Chinese model on chinese_test_4-14.csv; prints RMSE and Pearson r
# and saves the predictions CSV to Drive.
test_eval('chinese')

evaluating on chinese data


Downloading (…)okenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

test rmse: 0.5465328265514258
corr btwn labels and predictions: PearsonRResult(statistic=0.7246460930412951, pvalue=6.541424095240257e-94)


In [None]:
# Score the Japanese model on japanese_test_4-14.csv; prints RMSE and Pearson r
# and saves the predictions CSV to Drive.
test_eval('japanese')

evaluating on japanese data


Downloading (…)okenizer_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/806k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

test rmse: 0.6505838192876758
corr btwn labels and predictions: PearsonRResult(statistic=0.6403534049238702, pvalue=4.246829588899106e-67)


In [None]:
# Score the 'all' model (xlm-roberta-base tokenizer) on all_test_4-14.csv;
# prints RMSE and Pearson r and saves the predictions CSV to Drive.
test_eval('all')

evaluating on all data


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

test rmse: 0.6786939369655923
corr btwn labels and predictions: PearsonRResult(statistic=0.6811063998116996, pvalue=9.889753647854e-311)


In [None]:
import pickle

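## Evaluate by sentence

Run each model on its sentence-level file (`<language>_sent.csv`) and download the per-sentence predictions.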

In [None]:
from google.colab import files

In [None]:
def sent_eval(language):
  """Predict a score for every sentence in a language's sentence-level CSV.

  Reads ``<language>_sent.csv`` from the Drive data folder, scores the
  ``Sentence`` column with the model registered for ``language``, stores the
  scores in a new ``predicted`` column, writes
  ``<language>_sent_predictions.csv`` to the working directory and triggers a
  Colab browser download of that file.

  Args:
    language: One of 'english', 'spanish', 'chinese', 'japanese', 'all'.

  Raises:
    ValueError: If ``language`` is not one of the supported keys. Raised
      before any file or model is loaded.
  """
  data_dir = '/content/drive/MyDrive/final data and shapley values/'
  model_dir = '/content/drive/MyDrive/testing_politeness_models/'
  # language -> (checkpoint path relative to model_dir, tokenizer identifier)
  supported = {
      'english': ('english/english_model', 'roberta-base'),
      'spanish': ('spanish/sp_model', 'bertin-project/bertin-roberta-base-spanish'),
      'chinese': ('chinese/ch_model', 'hfl/chinese-roberta-wwm-ext'),
      'japanese': ('japanese/ja_model', 'rinna/japanese-roberta-base'),
      'all': ('all/all_model', 'xlm-roberta-base'),
  }
  # Reject unknown languages up front rather than failing later with an
  # unrelated FileNotFoundError or UnboundLocalError.
  if language not in supported:
    raise ValueError(
        f"unsupported language {language!r}; expected one of {sorted(supported)}")
  checkpoint, tok_name = supported[language]
  model_name = model_dir + checkpoint

  print('evaluating on',language,'data')
  test_df = pd.read_csv(data_dir+language+'_sent.csv')

  tokenizer = AutoTokenizer.from_pretrained(tok_name)
  if language == 'japanese':
    # NOTE(review): presumably works around the rinna tokenizer config not
    # enabling lower-casing on load -- confirm against the model card.
    tokenizer.do_lower_case = True
  tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type="regression")

  # Use the first GPU when present, otherwise fall back to CPU (-1) so the
  # function also runs on a CPU-only runtime.
  pipeline_device = 0 if torch.cuda.is_available() else -1
  # function_to_apply="none" keeps the raw regression output as the score.
  ppl = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=pipeline_device, function_to_apply="none", batch_size=8,**tokenizer_kwargs)
  predictions = ppl(list(test_df['Sentence']))
  predictions = [p['score'] for p in predictions]
  test_df['predicted'] = predictions

  # Written to the runtime's working directory (not Drive) and then handed to
  # the browser for download.
  out_name = language+'_sent_predictions.csv'
  test_df.to_csv(out_name,index=False)
  files.download(out_name)

In [None]:
# Score every sentence in english_sent.csv and download
# english_sent_predictions.csv.
sent_eval('english')

evaluating on english data


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Score every sentence in spanish_sent.csv and download
# spanish_sent_predictions.csv.
sent_eval('spanish')

evaluating on spanish data


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Score every sentence in chinese_sent.csv and download
# chinese_sent_predictions.csv.
sent_eval('chinese')

evaluating on chinese data


Downloading (…)okenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Score every sentence in japanese_sent.csv and download
# japanese_sent_predictions.csv.
sent_eval('japanese')

evaluating on japanese data


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>