In [1]:
import re
import numpy as np
from tqdm import tqdm
import pandas as pd
from sqlalchemy import create_engine
import pickle 
from politenessr import Politenessr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connect to the local `wikipedia_talk` MySQL database; connection options are
# read from ~/.my.cnf (see `read_default_file` in the URL).
con = create_engine('mysql://127.0.0.1/wikipedia_talk?read_default_file=~/.my.cnf&charset=utf8')

# Pull every row of each per-language turns table into its own DataFrame.
en_full, es_full, ja_full, zh_full = [
    pd.read_sql(query, con)
    for query in (
        'select * from msgs_en_turns',
        'select * from msgs_es_turns',
        'select * from msgs_ja_turns',
        'select * from msgs_zh_turns',
    )
]

In [3]:
# Clean the raw turn text before it is exported or scored.
def process_turns(turns):
    """Normalize raw turn strings and drop the ones that cannot be used.

    For every entry in ``turns``:
      * missing values (``None`` / NaN) are skipped;
      * strings shorter than 5 characters (measured before cleaning) are skipped;
      * each newline, carriage return and tab becomes a single space;
      * a doubled double quote (``""``) collapses to one ``"`` and every
        remaining lone ``"`` is removed;
      * leading and trailing whitespace is stripped.

    Args:
        turns: iterable of turn texts (for example a pandas Series); may
            contain missing values.

    Returns:
        list: the cleaned turns, in input order.
    """
    whitespace_to_space = str.maketrans({'\n': ' ', '\r': ' ', '\t': ' '})
    processed = []
    for t in turns:
        # pandas may hand NULLs back as None or as a float NaN; neither has a
        # length, so both are skipped here.
        if pd.isna(t):
            continue
        if len(t) < 5:
            continue
        t = t.translate(whitespace_to_space)
        # Single left-to-right pass: '""' -> '"', lone '"' -> ''. Doing this in
        # one substitution avoids a placeholder string, so a turn that happens
        # to contain the old placeholder text is no longer altered.
        t = re.sub(r'""|"', lambda m: '"' if len(m.group()) == 2 else '', t)
        processed.append(t.strip())
    return processed


# Clean the `turn` column of every language's frame (same order: en, es, ja, zh).
en_turns, es_turns, ja_turns, zh_turns = [
    process_turns(frame['turn'])
    for frame in (en_full, es_full, ja_full, zh_full)
]


In [None]:
#make them csvs
# Wrap each language's cleaned turns in a single-column ('message') frame.
en_trimmed = pd.DataFrame(en_turns, columns=['message'])
es_trimmed = pd.DataFrame(es_turns, columns=['message'])
ja_trimmed = pd.DataFrame(ja_turns, columns=['message'])
zh_trimmed = pd.DataFrame(zh_turns, columns=['message'])

# Write every frame under full_data/, keeping the row index as a CSV column.
for trimmed, csv_path in (
    (en_trimmed, 'full_data/en_trimmed.csv'),
    (es_trimmed, 'full_data/es_trimmed.csv'),
    (ja_trimmed, 'full_data/ja_trimmed.csv'),
    (zh_trimmed, 'full_data/zh_trimmed.csv'),
):
    trimmed.to_csv(csv_path, index=True)

In [16]:
# Try the PolitenessR model on a small sample of English turns.
# NOTE(review): the slice starts at index 1, so the first turn is not scored — confirm that is intended.
pr = Politenessr()
en_sample = en_turns[1:10]
en_labels = pr.predict(en_sample)

06/13/2023 21:26:36 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at /home/shreyah/.cache/torch/pytorch_transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391
06/13/2023 21:26:36 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vo

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from datasets import Dataset

class MyPipeline(TextClassificationPipeline):
    """Text-classification pipeline whose post-processing returns a raw logit.

    NOTE(review): no cell visible in this notebook instantiates this class.
    """

    def postprocess(self, model_outputs, return_all_scores=False, **kwargs):
        """Return the first value of the first row of ``model_outputs["logits"]``.

        Args:
            model_outputs: mapping with a ``"logits"`` entry; assumed to be a
                tensor shaped (batch, num_labels) — TODO confirm.
            return_all_scores: accepted for signature compatibility; unused.
            **kwargs: ignored. Newer ``transformers`` releases forward other
                post-processing options (e.g. ``top_k``, ``function_to_apply``)
                to this method, which would otherwise raise ``TypeError``.

        Returns:
            The first element of ``logits[0].tolist()``.
        """
        first_row = model_outputs["logits"][0]
        return first_row.tolist()[0]
    
# Checkpoints used for the quick sanity check below.
# NOTE(review): the tokenizer and the model come from different checkpoints —
# confirm that "roberta-base" is the tokenizer this model was fine-tuned with.
TOKENIZER_CHECKPOINT = "roberta-base"
MODEL_CHECKPOINT = "mpressi/english_xlm-False"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# Two short probe sentences.
test = ["I hate you", "I love you"]

pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    truncation=True,
)

results = pipe(test)

In [37]:
# Shift and scale each score: (score - 0.5) * 4.
# Presumably this maps a [0, 1] score onto a [-2, 2] scale — TODO confirm.
for r in results:
    first_label_score = r[0]['score']
    print((first_label_score - 0.5) * 4)

# Show the raw pipeline output as well.
results

-0.8569754362106323
1.474205732345581


[[{'label': 'LABEL_0', 'score': 0.2857561409473419}],
 [{'label': 'LABEL_0', 'score': 0.8685514330863953}]]

In [None]:
# Score a small sample of real English turns.
# truncation=True (as in the probe-sentence pipeline earlier in this notebook)
# clips turns that exceed the model's maximum sequence length instead of
# letting them fail inside the model.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, truncation=True)
pipe(en_turns[1:10])

In [11]:
def read_pickle(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in binary mode and closed before returning.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Load the per-language pickles stored under dialog_acts/dialog_act_data/.
en_dialog, es_dialog, ja_dialog, zh_dialog = [
    read_pickle(path)
    for path in (
        "dialog_acts/dialog_act_data/english_dialog_acts_updated.p",
        "dialog_acts/dialog_act_data/spanish_dialog_acts_updated.p",
        "dialog_acts/dialog_act_data/japanese_dialog_acts_updated.p",
        "dialog_acts/dialog_act_data/chinese_dialog_acts_updated.p",
    )
]

# Load the per-language pickles stored under shapley/xlm_shap_values/.
en_shap, es_shap, ja_shap, zh_shap = [
    read_pickle(path)
    for path in (
        "shapley/xlm_shap_values/english.p",
        "shapley/xlm_shap_values/spanish.p",
        "shapley/xlm_shap_values/japanese.p",
        "shapley/xlm_shap_values/chinese.p",
    )
]

  from .autonotebook import tqdm as notebook_tqdm


['' 'That ' 'is ' 'why ' 'he ' 'is ' 'a ' '‘' 'pre' 'tender' '’' '. '
 'He ' 'has ' 'never ' 'claim' 'ed ' 'to ' 'be ' 'a ' 'King ' '- ' 'or '
 'a ' 'Kaiser' ', ' 'for ' 'that ' 'matter' '. ' 'He ' 'is ' 'in ' 'the '
 'same ' 'class ' 'as ' 'the ' 'Com' 'te ' 'de ' 'Paris' ', ' 'who ' 'is '
 'not ' 'the ' 'King ' 'of ' 'France' ', ' 'but ' 'would ' 'be ' 'if '
 'the ' 'Bour' 'bon' 's ' 'were ' 'placed ' 'on ' 'a ' 'resto' 'red '
 'French ' 't' 'thro' 'ne' '.' '']
['That is why he is a ‘pretender’.', 'He has never claimed to be a King - or a Kaiser, for that matter.', 'He is in the same class as the Comte de Paris, who is not the King of France, but would be if the Bourbons were placed on a restored French throne.']
['' 'Tengo ' 'entend' 'ido ' 'que ' 'desde ' '1929' ', ' 'y ' 'sobre '
 'todo ' 'desde ' '1948' ', ' 'la ' 'electro' 'di' 'ná' 'mica ' 'cu' 'án'
 'tica ' 'de' 'ster' 'ró ' 'la ' 'teoría ' 'dual' '. ' 'Lo ' 'de ' 'la '
 'teoría ' 'onda' '-' 'corp' 'ús' 'culo ' 'está ' 'histór