In [None]:
pip -q install bitsandbytes

In [None]:
pip show iesta


In [None]:
pip install ../

In [None]:
from iesta.llms.generate import Generator

In [None]:
# Build the generator for the "liberal" ideology using the Alpaca model constant.
# NOTE(review): `out_file` is given a directory path here — confirm that
# Generator derives the actual file name itself.
generator = Generator(
    ideology="liberal",
    model_name=Generator._MODEL_ALPACA_,
    out_file="../data/llms_out/",
)

In [None]:
# Use one sample of the generator's filtered dataset as the running example.
ineffective_argument = generator.filtered_dataset[123]["text"]
# A bare expression in the middle of a cell is discarded (only the last one is
# displayed), so the preview has to be printed explicitly.
print(ineffective_argument[:100])
generator.generate_for_prompts(ineffective_argument)

In [None]:
def init_prompts():
    """Build the instruction prompts used to rewrite an argument.

    Every prompt starts with the same base instruction and is extended with
    optional clauses: target ideology, content preservation, style-only change.
    Prompts that mention the ideology keep a literal ``{ideology}`` placeholder
    that can be filled in afterwards with ``str.format``.

    Returns:
        dict: prompt name -> prompt text ending in a colon, in the order
        basic, ideology, content, style, ideology-content, ideology-style, all.
    """
    base = "Transform the following argument to an effective argument by maintaining the original length"
    for_ideology = "for readers with a {ideology} political ideology"
    keep_content = "by preserving the content of the argument"
    restyle_only = "by only changing the style of the text"

    # Clauses appended (space-separated) to the base instruction per variant.
    clauses_by_name = {
        "basic": (),
        "ideology": (for_ideology,),
        "content": (keep_content,),
        "style": (restyle_only,),
        "ideology-content": (for_ideology, keep_content),
        "ideology-style": (for_ideology, restyle_only),
        "all": (for_ideology, keep_content, "and", restyle_only),
    }
    return {
        name: " ".join((base, *clauses)) + ":"
        for name, clauses in clauses_by_name.items()
    }


def create_prompt_template(prompt):
    """Return the [system, human] message templates for one rewrite prompt.

    Args:
        prompt: instruction text used as the system message template.

    Returns:
        list: the system message template built from ``prompt`` followed by a
        human message template that expects an ``ineffective_argument``
        variable, prefixed with "Argument: ".
    """
    return [
        SystemMessagePromptTemplate.from_template(prompt),
        HumanMessagePromptTemplate.from_template(
            "Argument: {ineffective_argument}"
        ),
    ]

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain import PromptTemplate, LLMChain


# Single-string prompt (no system/human split) with one input variable.
# NOTE: the indentation inside the triple-quoted string is part of the prompt
# text that is sent to the model.
template = """Transform the following argument to an effective argument by maintaining the original length:
    Argument: {ineffective_argument}
    """
prompt = PromptTemplate(
    template=template, input_variables=["ineffective_argument"]
)


# Run the sample argument through the generator's local LLM.
llm_chain = LLMChain(llm=generator.local_llm, prompt=prompt)
llm_chain.run(ineffective_argument)

In [None]:
pip show iesta

In [None]:

pip install -q transformers 


In [None]:
pip install -U srsly

In [None]:
pip install -q langchain 


In [None]:
pip install -q xformers
pip show ydata-profiling

In [None]:
from langchain.llms import OpenAIChat
from langchain import LLMChain
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts.few_shot import FewShotPromptTemplate


import os
from dotenv import load_dotenv

load_dotenv()

import pandas as pd

pd.set_option("display.max_colwidth", None)
from tqdm.notebook import tqdm

tqdm.pandas()
from ast import literal_eval
import random
import re
from langchain.chat_models import ChatOpenAI

In [None]:
# !source ../../../elbaff_iesta_venv/bin/activate
# %pip install langchain
# %pip install python-dotenv
# %pip install openai

In [None]:
# Disabled experiment kept as a bare string literal: nothing inside it runs;
# the cell merely evaluates to this string.
"""

liberal_chat_prompt = ChatPromptTemplate.from_messages(create_prompt_template(prompt_dict["all"].format(ideology ="liberal")))
llm_chain = LLMChain(llm=chat, prompt=liberal_chat_prompt)
result = llm_chain.run(ineffective_argument="If there was no Kryptonite, can Superman defeat the Silver Surfer?")
print(result)
cons_chat_prompt = ChatPromptTemplate.from_messages(create_prompt_template(prompt_dict["all"].format(ideology ="conservative")))
llm_chain = LLMChain(llm=chat, prompt=cons_chat_prompt)
result = llm_chain.run(ineffective_argument="If there was no Kryptonite, can Superman defeat the Silver Surfer?")
print(result)
"""

In [None]:
from torch import cuda, bfloat16
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM


class FalconInit:
    """Load Falcon-7B-Instruct and wrap it in a text-generation pipeline.

    Attributes set by the constructor:
        device: ``"cuda:<index>"`` of the current CUDA device when CUDA is
            available, otherwise ``"cpu"``.
        fmodel: the causal language model, switched to eval mode and sent to
            ``device``.
        tokenizer: tokenizer loaded from the same checkpoint.
        text_generation_pipeline: transformers pipeline built from both.
    """

    def __init__(self):
        checkpoint = "tiiuae/falcon-7b-instruct"

        if cuda.is_available():
            self.device = f"cuda:{cuda.current_device()}"
        else:
            self.device = "cpu"

        # Model weights are requested in bfloat16; the checkpoint is loaded
        # with trust_remote_code enabled.
        self.fmodel = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            trust_remote_code=True,
            torch_dtype=bfloat16,
        )
        self.fmodel.eval()
        self.fmodel.to(self.device)

        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        sampling_options = dict(
            do_sample=True,
            temperature=0.1,
            # nucleus sampling: keep the top tokens whose probability adds up to 15%
            top_p=0.15,
            # NOTE(review): in transformers, top_k=0 is understood to switch
            # top-k filtering off rather than "sample from 0 tokens" — confirm.
            top_k=0,
            # without a penalty, output starts to repeat
            repetition_penalty=1.1,
            num_return_sequences=1,
        )
        self.text_generation_pipeline = transformers.pipeline(
            task="text-generation",
            model=self.fmodel,
            tokenizer=self.tokenizer,
            device=self.device,
            return_full_text=True,
            max_length=10000,
            eos_token_id=self.tokenizer.eos_token_id,
            **sampling_options,
        )

In [None]:
from dotenv import load_dotenv, find_dotenv

found = load_dotenv(find_dotenv())

from langchain import HuggingFaceHub


def get_model(model_name):
    """Return the LangChain model wrapper for a short model name.

    Args:
        model_name: ``"chatgpt"`` for ``gpt-3.5-turbo`` through ``ChatOpenAI``
            (temperature 0), or ``"falcon"`` for
            ``tiiuae/falcon-7b-instruct`` through ``HuggingFaceHub``
            (temperature 0.6, at most 1000 new tokens). The Falcon branch
            reads its API token from the ``HUGGINGFACE_TOKEN`` environment
            variable.

    Returns:
        The constructed model wrapper.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
            Previously the function silently returned ``None``, which only
            failed later when the model was used.
    """
    if model_name == "chatgpt":
        return ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    if model_name == "falcon":
        return HuggingFaceHub(
            huggingfacehub_api_token=os.getenv("HUGGINGFACE_TOKEN"),
            repo_id="tiiuae/falcon-7b-instruct",
            model_kwargs={"temperature": 0.6, "max_new_tokens": 1000},
        )
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected 'chatgpt' or 'falcon'"
    )


# Module-level chat model; get_generations (defined further down) reads this global.
chat = get_model("chatgpt")

In [None]:
# Building blocks of the rewrite instructions. `ideology_str` keeps a literal
# "{ideology}" placeholder that is filled in later via str.format.
basic_str = "Transform the following argument to an effective argument by maintaining the original length"
ideology_str = "for readers with a {ideology} political ideology"
content_str = "by preserving the content of the argument"
style_str = "by only changing the style of the text"

# Prompt name -> full instruction: the base sentence plus optional clauses,
# joined by single spaces and terminated with a colon.
prompt_dict = {
    "basic": basic_str + ":",
    "ideology": " ".join((basic_str, ideology_str)) + ":",
    "content": " ".join((basic_str, content_str)) + ":",
    "style": " ".join((basic_str, style_str)) + ":",
    "ideology-content": " ".join((basic_str, ideology_str, content_str)) + ":",
    "ideology-style": " ".join((basic_str, ideology_str, style_str)) + ":",
    "all": " ".join((basic_str, ideology_str, content_str, "and", style_str))
    + ":",
}

In [None]:
# Number of space-separated words in the "all" prompt (the one with every clause).
len(prompt_dict["all"].split(" "))

In [None]:
def create_prompt_template(prompt):
    """Return the [system, human] message templates for one rewrite prompt.

    NOTE: this redefines the earlier ``create_prompt_template`` of this
    notebook; here the human message is the bare argument, without the
    "Argument: " prefix.

    Args:
        prompt: instruction text used as the system message template.

    Returns:
        list: the system message template built from ``prompt`` followed by a
        human message template that expects an ``ineffective_argument``
        variable.
    """
    return [
        SystemMessagePromptTemplate.from_template(prompt),
        HumanMessagePromptTemplate.from_template("{ineffective_argument}"),
    ]

In [None]:
pip -q install langdetect


In [None]:
# pip install -U transformers datasets

In [None]:
from datasets import load_dataset, Dataset
from datasets.combine import concatenate_datasets
from iesta.data.huggingface_loader import IESTAHuggingFace
from ydata_profiling import ProfileReport
from langdetect import detect

# Length budget left for an argument: 4097 minus 36, the word count of the
# longest prompt. NOTE(review): 4097 is presumably the model's context limit —
# confirm; this constant is not referenced in the visible code.
_MAX_LMIT = 4061  # 4097 - 36 the longest prompt


def get_data(
    ideology,
    effect="ineffective",
    limit=500,
    profile: bool = False,
    save: bool = True,
):
    """Sample up to ``limit`` usable test arguments carrying the given effect label.

    The test split of ``notaphoenix/debateorg_w_effect_for_{ideology}`` is
    filtered by label, shuffled with a fixed seed and truncated to ``limit``
    rows. Rows that fail the quality filter are then dropped and, if that
    leaves fewer than ``limit`` rows, the sample is topped up with further
    usable rows of the same label that are not already in it.

    Args:
        ideology: suffix of the dataset name to load.
        effect: key into ``IESTAHuggingFace._LABEL2ID_`` selecting the label.
        limit: maximum number of rows to return.
        profile: when True, write a ydata-profiling report of the sample.
        save: when True, write the sample to CSV.

    Returns:
        The sampled dataset. It holds fewer than ``limit`` rows only when the
        split does not contain enough usable rows (a warning is printed).

    Side effects:
        Report and CSV are written to the working directory and are named
        ``{ideology}_test_{limit}_seed_{seed}`` (the CSV with a ``.csv``
        suffix).
    """
    seed = 2062021
    name: str = f"notaphoenix/debateorg_w_effect_for_{ideology}"
    label_id = IESTAHuggingFace._LABEL2ID_[effect]

    def _is_usable(example) -> bool:
        """Keep English texts of 11 to 1024 space-separated words."""
        n_words = len(example["text"].split(" "))
        return (
            10 < n_words <= 1024
            # hard-coded exclusion of one row; the reason is not visible here
            and example["idx"] != 64707
            # NOTE(review): langdetect is documented as non-deterministic
            # unless DetectorFactory.seed is set — consider seeding it.
            and detect(example["text"]) == "en"
        )

    dataset: Dataset = load_dataset(name, split="test")
    dataset = dataset.filter(lambda x: x["label"] == label_id).shuffle(
        seed=seed
    )

    if len(dataset) > limit:
        dataset = dataset.select(range(limit))

    print(f"{len(dataset)} before len filter")
    dataset = dataset.filter(_is_usable)
    print(f"{len(dataset)} after len filter")

    # One refill pass is enough: it takes every missing row that is available.
    if len(dataset) < limit:
        taken = set(dataset.to_pandas()["idx"].values.tolist())
        dataset_extra: Dataset = load_dataset(name, split="test")
        # Cheap checks first so language detection only runs on candidates.
        dataset_extra = dataset_extra.filter(
            lambda x: x["label"] == label_id
            and x["idx"] not in taken
            and _is_usable(x)
        ).shuffle(seed=seed)

        # Never select more rows than exist; that used to raise IndexError.
        n_extra = min(limit - len(dataset), len(dataset_extra))
        if n_extra > 0:
            dataset_extra = dataset_extra.select(range(n_extra))
            print(f"{len(dataset_extra)} of extra")
            dataset = concatenate_datasets([dataset, dataset_extra])
            print(f"{len(dataset)} new length")
        if len(dataset) < limit:
            print(
                f"Warning: only {len(dataset)} of the requested {limit} "
                "examples are available"
            )
    print(f"Return dataset {name} with {len(dataset)} ")

    df = dataset.to_pandas().copy()
    # File names carry the requested limit instead of a hard-coded 500.
    if profile:
        report = ProfileReport(df=df, minimal=False)
        report.to_file(f"{ideology}_test_{limit}_seed_{seed}")

    if save:
        df.to_csv(f"{ideology}_test_{limit}_seed_{seed}.csv")
    return dataset

In [None]:
# Build the liberal sample (also writes the profile report and the CSV) and keep it as a DataFrame.
lib = get_data("liberal", save=True, profile=True).to_pandas()

In [None]:
# Build the conservative sample (also writes the profile report and the CSV) and keep it as a DataFrame.
cons = get_data("conservative", save=True, profile=True).to_pandas()
# cons.head()

In [None]:
def get_generations(ineffective_argument: str, ideology: str):
    """Rewrite one argument with every prompt variant in ``prompt_dict``.

    For each prompt the ``{ideology}`` placeholder is filled in, a chat prompt
    is built through ``create_prompt_template`` and the module-level ``chat``
    model is run on the argument.

    Args:
        ineffective_argument: text of the argument to rewrite.
        ideology: value substituted for ``{ideology}`` in the prompts.

    Returns:
        dict: for every prompt name ``k`` the generated text under ``k`` and
        its ``len()`` under ``len_k``, plus ``len_orig`` with the ``len()`` of
        the input argument.
    """
    generations = {}
    for prompt_name in prompt_dict:
        system_prompt = prompt_dict[prompt_name].format(ideology=ideology)
        messages = create_prompt_template(system_prompt)
        chain = LLMChain(
            llm=chat, prompt=ChatPromptTemplate.from_messages(messages)
        )
        rewritten = chain.run(ineffective_argument=ineffective_argument)

        generations[prompt_name] = rewritten
        generations[f"len_{prompt_name}"] = len(rewritten)
        # Same value on every pass; set here so the key keeps its position
        # right after the first prompt's entries.
        generations["len_orig"] = len(ineffective_argument)
    return generations

In [None]:
import json
import pandas as pd
from tqdm import tqdm
from os.path import exists


def generate_args(
    ideology: str,
    out_file: str = "../data/llms_out/",
    model_name: str = "gpt3.5turbo",
    profile: bool = False,
) -> None:
    """Generate rewrites for the ineffective sample and append them as JSON lines.

    The run is resumable: records already present in the output file are
    detected through their ``idx`` and skipped.

    Args:
        ideology: ideology passed to ``get_data`` and ``get_generations``.
        out_file: path prefix; the records go to
            ``{out_file}{ideology}_{model_name}_2.jsonl``.
        model_name: only used to name the output file. NOTE(review): it does
            not select the model — ``get_generations`` is called without it.
        profile: forwarded to ``get_data``.

    Returns:
        None. The results are written to the output file, one JSON object per
        line: the ``get_generations`` output updated with the datapoint's
        fields.
    """
    out_file = f"{out_file}{ideology}_{model_name}_2.jsonl"

    # Collect already-generated ids line by line so that an empty file left
    # behind by a fully failed run does not break resuming.
    existing_indices = set()
    needs_separator = False
    if exists(out_file):
        with open(out_file) as previous:
            for raw_line in previous:
                # After the loop this describes the last line: files written
                # by earlier versions do not end with a newline.
                needs_separator = not raw_line.endswith("\n")
                if raw_line.strip():
                    existing_indices.add(json.loads(raw_line)["idx"])

    filtered_dataset = get_data(
        ideology, effect="ineffective", limit=500, profile=profile, save=True
    )
    if existing_indices:
        print(f"filtering out existing indices ({len(existing_indices)})")
        filtered_dataset = filtered_dataset.filter(
            lambda example: example["idx"] not in existing_indices
        )
        print(f"{filtered_dataset.num_rows} to go...")

    with open(out_file, "a") as file:
        if needs_separator:
            file.write("\n")
        for datapoint in tqdm(filtered_dataset):
            try:
                promt_generated_dict = get_generations(
                    datapoint["text"], ideology
                )
                promt_generated_dict.update(datapoint)

                file.write(f"{json.dumps(promt_generated_dict)}\n")
                # Persist every record right away so an interrupted run
                # keeps what it already produced.
                file.flush()
            except Exception as e:
                # Best effort: report the failure and continue with the next
                # datapoint; a later run retries the missing ids.
                print(e)
                print(f"Failed to get a response for ID: {datapoint['idx']}")

### Generate for ChatGPT

In [None]:
# Liberal rewrites; with the defaults, records are appended to ../data/llms_out/liberal_gpt3.5turbo_2.jsonl.
generate_args(ideology="liberal", profile=False)

In [None]:
# Conservative rewrites; with the defaults, records are appended to ../data/llms_out/conservative_gpt3.5turbo_2.jsonl.
generate_args(ideology="conservative", profile=False)

### Generate for Falcon (falcon-7b-instruct)

In [None]:
# NOTE(review): `model_name` only changes the output file name. Generation
# still goes through the global `chat`, which the notebook sets with
# get_model("chatgpt") — reassign `chat` (e.g. get_model("falcon")) before
# running this cell if Falcon output is intended.
generate_args(ideology="liberal", model_name="falconinstruct7b")

In [None]:
# Inspect the first record of the liberal test split.
load_dataset("notaphoenix/debateorg_w_effect_for_liberal", split="test")[0]

In [None]:
# Show a redline diff between the original argument and every entry of result_dict.
# NOTE(review): `result_dict`, `Redlines` and `Markdown` are not defined or
# imported anywhere in the visible notebook — this cell relies on earlier
# kernel state.
for k, v in result_dict.items():
    print(f"\n{k} - {prompt_dict[k]}")
    diff = Redlines(ineffective_argument, v)
    display(Markdown(diff.output_markdown))

In [None]:
import itertools

# Redline diff for every unordered pair of distinct prompt types.
# NOTE(review): `result_dict`, `Redlines` and `Markdown` are not defined or
# imported anywhere in the visible notebook.
for ptype1, ptype2 in itertools.combinations(prompt_dict.keys(), 2):
    print(f"\n{ptype1} VS. {ptype2}")
    # underline as long as the heading text
    print("-" * len(f"{ptype1} VS. {ptype2}"))

    diff = Redlines(result_dict[ptype1], result_dict[ptype2])
    display(Markdown(diff.output_markdown))

In [None]:
# Show a redline diff between the original argument and every entry of result_dict.
# The duplicated, body-less inner `for` header made this cell an
# IndentationError; a single loop is what the body needs.
# NOTE(review): `result_dict`, `Redlines` and `Markdown` are not defined or
# imported anywhere in the visible notebook.
for k, v in result_dict.items():
    print(f"\n{k} - {prompt_dict[k]}")
    diff = Redlines(ineffective_argument, v)
    display(Markdown(diff.output_markdown))