In [1]:
import os
import random
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import torch
from openai import OpenAI
from anthropic import Anthropic

# NOTE: "__file__" is a string literal here, not the __file__ variable, so
# abspath() resolves it against the current working directory and `path`
# ends up being that working directory.
path = os.path.dirname(os.path.abspath("__file__"))
# Parent of the working directory.
ppath = os.path.abspath(os.path.join(path, os.pardir))
# Make the parent directory importable - presumably so the `summary_eval`
# package imported below can be found; verify against the repo layout.
sys.path.append(ppath)

from summary_eval.data import summary_df, prompts_df
from summary_eval.settings import TRAIN_SIZE

2024-05-12 00:21:42,704 - INFO - Read 7165 summaries from D:\summary-eval\data\summaries_train.csv
2024-05-12 00:21:42,707 - INFO - Read 4 prompts from D:\summary-eval\data\prompts_train.csv


In [2]:
# Hold out a test split; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(
    summary_df,
    train_size=TRAIN_SIZE,
    random_state=42,
)
# Display-only preview without the identifier columns (the result is not
# assigned, so train_df itself keeps those columns).
train_df.drop(columns=["student_id", "prompt_id"])

Unnamed: 0,text,content,wording
2441,"the article by UShistory. Org ""Egypt social s...",0.846025,-0.344397
3901,In the upper class There was the royal famile...,-0.456956,-0.042516
4267,In the Egyptian government system only the nob...,0.376374,0.463619
6189,Aristotle describes many elements of an ideal ...,1.690740,1.461055
4119,They would use chemicals to get rid of mold an...,0.050689,0.260165
...,...,...,...
3772,The social classes were involved due to how th...,-0.970237,-0.417058
5191,Jonas had told them how the meat that was take...,-1.547163,-1.461245
5226,"In The Jungle, when meat was spoiled it could ...",1.038163,0.928848
5390,One element is that a tragedy should be arrang...,0.531368,0.583991


In [7]:
def random_chars(df, **kwargs):
    """
    Augment the 'text' column with random character-level noise.

    actions (pass as ``action=...``):
    - 'insert' random chars into the text
    - 'substitute' chars randomly with other chars
    - 'swap' chars with another char in the text
    - 'delete' chars randomly
    https://nlpaug.readthedocs.io/en/latest/augmenter/char/random.html

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.
        **kwargs: forwarded as keyword arguments to ``RandomCharAug``.

    Returns:
        np.ndarray built from the augmenter's output for every text.
    """
    text = df['text'].tolist()

    # Unpack the options: passing the dict positionally would bind the whole
    # dict to the constructor's first parameter instead of forwarding them.
    aug = nac.random.RandomCharAug(**kwargs)
    return np.array(aug.augment(text))

def ocr_aug(df, **kwargs):
    """
    adds typical errors usually seen from ocr
    https://nlpaug.readthedocs.io/en/latest/augmenter/char/ocr.html

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.
        **kwargs: forwarded as keyword arguments to ``OcrAug``.

    Returns:
        np.ndarray built from the augmenter's output for every text.
    """
    text = df['text'].tolist()

    # Unpack the options: passing the dict positionally would bind the whole
    # dict to the constructor's first parameter instead of forwarding them.
    aug = nac.ocr.OcrAug(**kwargs)
    return np.array(aug.augment(text))

def spelling_aug(df, **kwargs):
    """
    adds spelling mistakes
    https://nlpaug.readthedocs.io/en/latest/augmenter/word/spelling.html

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.
        **kwargs: forwarded as keyword arguments to ``SpellingAug``.

    Returns:
        np.ndarray built from the augmenter's output for every text.
    """
    text = df['text'].tolist()

    # Unpack the options: passing the dict positionally would bind the whole
    # dict to the constructor's first parameter instead of forwarding them.
    aug = naw.spelling.SpellingAug(**kwargs)
    return np.array(aug.augment(text))

def random_words(df, **kwargs):
    """
    Augment the 'text' column with random word-level edits.

    actions (pass as ``action=...``):
    - 'swap' word with another word (randomly) in the text
    - 'delete' words randomly
    https://nlpaug.readthedocs.io/en/latest/augmenter/word/random.html

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.
        **kwargs: forwarded as keyword arguments to ``RandomWordAug``.

    Returns:
        np.ndarray built from the augmenter's output for every text.
    """
    text = df['text'].tolist()

    # Unpack the options: passing the dict positionally would bind the whole
    # dict to the constructor's first parameter instead of forwarding them.
    aug = naw.random.RandomWordAug(**kwargs)
    return np.array(aug.augment(text))

def synonym_replacement(df, **kwargs):
    """
    replace words with their synonyms
    either use wordnet or ppdb
    https://nlpaug.readthedocs.io/en/latest/augmenter/word/synonym.html

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.
        **kwargs: forwarded as keyword arguments to ``SynonymAug`` (e.g. to
            choose the synonym source); with no kwargs the augmenter's
            defaults are used, exactly as before.

    Returns:
        np.ndarray built from the augmenter's output for every text.
    """
    text = df['text'].tolist()

    # Forward caller options; previously they were accepted but silently dropped.
    aug = naw.SynonymAug(**kwargs)
    return np.array(aug.augment(text))

def back_translate(df,
                   from_model_name="Helsinki-NLP/opus-mt-en-de",
                   to_model_name="Helsinki-NLP/opus-mt-de-en"):
    """
    Paraphrase the 'text' column by translating it to a pivot language and back.

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.
        from_model_name: translation model used for the outbound direction
            (defaults to the English->German model used originally).
        to_model_name: translation model used for the return direction
            (defaults to the German->English model used originally).

    Returns:
        np.ndarray built from the augmenter's output for every text.
    """
    text = df['text'].tolist()

    aug = naw.back_translation.BackTranslationAug(from_model_name=from_model_name,
                                                  to_model_name=to_model_name)
    # The texts must be handed to augment(); calling it with no argument
    # fails before any translation happens.
    return np.array(aug.augment(text))

In [4]:
# Tokenizer and masked-LM head for bert-base-cased, both fetched through
# torch.hub from the huggingface/pytorch-transformers entry points.
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'bert-base-cased',
)
masked_lm_model = torch.hub.load(
    'huggingface/pytorch-transformers',
    'modelForMaskedLM',
    'bert-base-cased',
)

Using cache found in C:\Users\Daniel/.cache\torch\hub\huggingface_pytorch-transformers_main
Using cache found in C:\Users\Daniel/.cache\torch\hub\huggingface_pytorch-transformers_main
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def insert_random_mask(text):
    """
    Insert one "[MASK]" token at a random word boundary of `text`.

    The text is split on whitespace and re-joined with single spaces, so the
    original spacing is not preserved.

    Returns:
        The re-joined string containing the extra "[MASK]" word.
    """
    pieces = text.split()
    # randint is inclusive on both ends: position len(pieces) appends the mask.
    position = random.randint(0, len(pieces))
    pieces.insert(position, "[MASK]")
    return " ".join(pieces)

def substitute_random_mask(text):
    """
    Replace one randomly chosen word of `text` with "[MASK]".

    The text is split on whitespace and re-joined with single spaces, so the
    original spacing is not preserved.

    Args:
        text: input string.

    Returns:
        The re-joined string with exactly one word replaced by "[MASK]".
        If `text` contains no words, "[MASK]" alone is returned so the result
        always carries exactly one mask token.
    """
    words = text.split()
    if not words:
        # Nothing to substitute; still hand back a single mask token.
        return "[MASK]"

    # randrange excludes the upper bound. randint(0, len(words)) is inclusive
    # and could return len(words), which is out of range for indexing.
    mask_idx = random.randrange(len(words))
    words[mask_idx] = "[MASK]"

    return " ".join(words)

def bert(text):
    """
    Fill the first mask token in `text` with the masked-LM's top prediction.

    Uses the module-level `tokenizer` and `masked_lm_model`.

    Args:
        text: string expected to contain a "[MASK]" token.

    Returns:
        The decoded text (special tokens skipped) with the first mask position
        replaced by the highest-scoring token. If the tokenized text contains
        no mask token, `text` is returned unchanged.
    """
    # NOTE(review): no truncation is requested here - confirm that inputs fit
    # within the model's maximum sequence length.
    tokens = tokenizer(text, padding=True)
    mask_index = [i for i, token_id in enumerate(tokens["input_ids"]) if token_id == tokenizer.mask_token_id]
    if not mask_index:
        # Nothing to predict; avoid indexing into an empty list.
        return text

    segments_tensors = torch.tensor([tokens["token_type_ids"]])
    tokens_tensor = torch.tensor([tokens["input_ids"]])

    with torch.no_grad():
        predictions = masked_lm_model(tokens_tensor, token_type_ids=segments_tensors)

    # predictions[0][0]: scores for the single sequence in the batch; argmax
    # over dim=1 picks the best-scoring token id at each position.
    pred_token = torch.argmax(predictions[0][0], dim=1)
    # .item() stores a plain int so the id list stays homogeneous.
    tokens["input_ids"][mask_index[0]] = pred_token[mask_index[0]].item()

    return tokenizer.decode(tokens["input_ids"], skip_special_tokens=True)

def bert_insert_random(df):
    """
    Augment every text by inserting a mask at a random position and letting
    `bert` fill it in.

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.

    Returns:
        np.ndarray with one augmented string per input text.
    """
    summaries = map(insert_random_mask, df['text'].tolist())
    # numpy is imported as `np` in this notebook; the bare name `numpy` is undefined.
    return np.array([bert(s) for s in summaries])

def bert_substitute_random(df):
    """
    Augment every text by replacing a random word with a mask and letting
    `bert` fill it in.

    Args:
        df: object whose ``df['text']`` supports ``.tolist()``.

    Returns:
        np.ndarray with one augmented string per input text.
    """
    summaries = map(substitute_random_mask, df['text'].tolist())
    # numpy is imported as `np` in this notebook; the bare name `numpy` is undefined.
    return np.array([bert(s) for s in summaries])

# can also test DistilBERT & RoBERTa equivalents

In [9]:
# API keys are read from the environment so they never need to be written into
# the notebook. A .env file only takes effect if something loads it into the
# process environment before this cell runs. When a variable is unset the key
# falls back to "" (the previous hard-coded value).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
openAiClient = OpenAI(api_key=OPENAI_API_KEY)
anthropicClient = Anthropic(api_key=ANTHROPIC_API_KEY)

def single_turn_openai(text):
    """
    Ask gpt-3.5-turbo to rephrase `text` using a single user message.

    Returns:
        The message content of the first choice in the completion.
    """
    prompt = f"Please rephrase the following sentence: {text}"
    completion = openAiClient.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def multi_turn_openai(text):
    """
    Ask gpt-3.5-turbo to rephrase `text` via a scripted multi-turn conversation.

    The conversation sets a system role, requests six rephrasings, includes a
    canned assistant acknowledgement, and finally supplies `text` as the
    sample.

    Returns:
        The message content of the first choice in the completion.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that rephrase text and make sentence smooth"},
        {"role": "user", "content": "I will give you a sample, please rephrase it, then give me 6 rephrased answers"},
        {"role": "assistant", "content": "Sure, please provide the sentence you would like me to rephrase."},
    ]
    # The sample to rephrase goes last, as the final user turn.
    conversation.append({"role": "user", "content": text})

    completion = openAiClient.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    return completion.choices[0].message.content

def single_turn_anthropic(text):
    """
    Ask claude-3-opus to rephrase `text` using a single user message.

    Args:
        text: the sentence to rephrase.

    Returns:
        The text of the first content block in the response.
    """
    response = anthropicClient.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1000,
        messages=[
            {"role": "user", "content": f"Please rephrase the following sentence: {text}"}
        ]
    )
    # The Anthropic Messages API returns a list of content blocks on
    # `.content`; it has no OpenAI-style `.choices[...].message`.
    return response.content[0].text

def multi_turn_anthropic(text):
    """
    Ask claude-3-opus to rephrase `text` via a scripted multi-turn conversation.

    The request sets a system prompt, asks for six rephrasings, includes a
    canned assistant acknowledgement, and finally supplies `text` as the
    sample.

    Args:
        text: the sample to rephrase.

    Returns:
        The text of the first content block in the response.
    """
    response = anthropicClient.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1000,
        system="You are a helpful assistant that rephrase text and make sentence smooth",
        messages=[
            {"role": "user", "content": "I will give you a sample, please rephrase it, then give me 6 rephrased answers"},
            {"role": "assistant", "content": "Sure, please provide the sentence you would like me to rephrase."},
            {"role": "user", "content": text},
        ]
    )
    # The Anthropic Messages API returns a list of content blocks on
    # `.content`; it has no OpenAI-style `.choices[...].message`.
    return response.content[0].text