In [None]:
# t.start()
from transformers import pipeline

# Emotion classifier built from the j-hartmann DistilRoBERTa checkpoint.
# top_k=None is passed so that the pipeline is not limited to the single best
# label (see the transformers text-classification docs).
# NOTE: the following cell rebinds `transformer_nlp` to a toxicity classifier,
# so this object is only reachable until that cell runs.
transformer_nlp = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    truncation=True,
)

In [None]:
from transformers.utils import logging
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Toxicity classifier: the RoBERTa weights and the matching tokenizer are
# loaded explicitly from the same checkpoint and handed to the pipeline.
model_path = "SkolkovoInstitute/roberta_toxicity_classifier"
model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# This rebinds `transformer_nlp`, replacing whatever an earlier cell stored
# under that name.
transformer_nlp = pipeline(
    task="text-classification",
    tokenizer=tokenizer,
    model=model,
    top_k=None,
    truncation=True,
)

In [1]:
from simpletransformers.ner import NERModel, NERArgs

# Token-classification model loaded from the `jeniakim/hedgehog` checkpoint,
# running on CPU with the five tag labels listed below.
ner_args = NERArgs()
ner_args.silent = True  # presumably suppresses progress output - confirm in simpletransformers docs

# NOTE: this rebinds the notebook-level name `model`.
model = NERModel(
    model_type="bert",
    model_name="jeniakim/hedgehog",
    args=ner_args,
    labels=["C", "D", "E", "I", "N"],
    use_cuda=False,
)

KeyboardInterrupt: 

In [2]:
class TransformerBasedFeaturePipeline(Pipeline):
    """Pipeline that registers the emotion, hedge and toxicity annotation pipes.

    The base class ``Pipeline`` has to be imported before this cell is run;
    it is not imported in this cell.
    """

    def __init__(
        self,
        input=None,
        load_default_pipe_configs=True,
        extended_pipe_configs: dict = None,
        save_output=False,
        out_path=None,
        argument_col: str = "argument",
    ):
        """Forward the generic options to ``Pipeline`` and remember the text column.

        Args:
            input: Passed through to ``Pipeline.__init__``; ``process_input``
                indexes it like a DataFrame with an ``"id"`` column and the
                column named by ``argument_col``.
            load_default_pipe_configs: Passed through to ``Pipeline.__init__``.
            extended_pipe_configs: Passed through to ``Pipeline.__init__``.
            save_output: Passed through to ``Pipeline.__init__``.
            out_path: Passed through to ``Pipeline.__init__``.
            argument_col: Name of the input column that holds the text.
        """
        super().__init__(
            input,
            load_default_pipe_configs,
            extended_pipe_configs,
            save_output,
            out_path,
        )
        self.argument_col = argument_col

    def process_input(self) -> list:
        """Turn ``self.input`` into ``(text, {"input_id": id})`` tuples.

        Returns:
            One tuple per input row, in row order.
        """
        renamed = (
            self.input[["id", self.argument_col]]
            .copy()
            .rename(columns={"id": "input_id", self.argument_col: "text"})
        )
        return [
            (record.text, {"input_id": record.input_id})
            for _, record in renamed.iterrows()
        ]

    def init_and_run(self):
        """Register the annotation pipes, then annotate and save."""
        # "senter" is the only pipe registered as native and without saved output.
        self.add_annotation_pipe(
            name="senter", save_output=False, is_spacy=True, is_native=True
        )
        for orchestrator in (
            "EmotionPipeOrchestrator",
            "HedgePipeOrchestrator",
            "ToxicityOrchestrator",
        ):
            self.add_annotation_pipe(
                name=orchestrator, save_output=True, is_spacy=True
            )

        # self.set_spacy_language_model("en_core_web_md")
        self.annotate()
        # Per the original note: annotations are saved when "save_output" is True.
        self.save()

NameError: name 'Pipeline' is not defined

In [None]:
import pandas as pd

# Four hand-written arguments used as a smoke-test input
# (columns: "id" and "argument").
test_df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "argument": [
            "This is a very hard time, I am devastated!",
            "You can not get your tiny brain to work on this so stupid!!",
            "The amonium Nitrate was sitting there for ages.",
            "I love you and I love how you look",
        ],
    }
)

In [None]:
import sklearn.utils.random

In [None]:
# Run the feature pipeline on the smoke-test frame and preview the result.
# The instance is named `feature_pipeline` rather than `pipeline` so that the
# `pipeline` factory imported from `transformers` elsewhere in this notebook
# is not shadowed in the shared kernel namespace.
feature_pipeline = TransformerBasedFeaturePipeline(
    save_output=True, out_path="../data/extracted_features/test.parquet"
)
feature_pipeline.set_input(test_df)
feature_pipeline.init_and_run()
feature_pipeline.out_df.head()

In [4]:
import pandas as pd
# Export the two training splits from parquet to CSV, next to the originals.
liberal_parquet = "../data/splitted/methodology_each/liberal_training.parquet"
conservative_parquet = "../data/splitted/methodology_each/conservative_training.parquet"

for parquet_path in (liberal_parquet, conservative_parquet):
    # str.replace swaps every "parquet" in the path; in these two paths the
    # only occurrence is the file extension.
    pd.read_parquet(parquet_path).to_csv(parquet_path.replace("parquet", "csv"))

In [11]:
# Sanity check on the exported CSV: which values does the "split" column hold?
(
    pd.read_csv(liberal_parquet.replace("parquet", "csv"), index_col="idx")
    .loc[:, "split"]
    .unique()
)

array(['training'], dtype=object)

In [12]:

import pandas as pd
from tqdm.notebook import tqdm

from langchain.chat_models import ChatOpenAI
import transformers
from dotenv import load_dotenv, find_dotenv

from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
import torch

from datasets import load_dataset, Dataset

from iesta.machine_learning.huggingface_loader import IESTAHuggingFace
from ydata_profiling import ProfileReport
import json
import pandas as pd
from tqdm import tqdm
from os.path import exists
import dataclasses

from datasets import load_dataset, Dataset
from datasets.combine import concatenate_datasets
from iesta.machine_learning.huggingface_loader import IESTAHuggingFace
from ydata_profiling import ProfileReport
from langdetect import detect
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM 
from typing import ClassVar


def get_data(ideology, effect="ineffective"):
    """Select the evaluation sample for one ideology and dump it to CSV.

    Loads the ``test`` split of
    ``notaphoenix/debateorg_w_effect_for_{ideology}``, keeps the rows whose
    label matches ``effect``, shuffles them with a fixed seed and keeps the
    first 500. Rows are then dropped unless their text has more than 10 and at
    most 1024 space-separated tokens, is detected as English, and their
    ``idx`` is not 64707. If that leaves fewer than 500 rows, the sample is
    topped up with further rows of the same label that pass the same text
    filter and are not already part of the sample.

    Args:
        ideology: Suffix of the dataset name; also used in the CSV file name.
        effect: Key into ``IESTAHuggingFace._LABEL2ID_`` selecting the label
            to keep.

    Returns:
        The selected dataset. It holds fewer than 500 rows when the split
        does not contain enough usable rows.

    Side effects:
        Prints progress counts and writes the sample to
        ``data/splitted/methodology_each/{ideology}_test.csv``, relative to
        the current working directory.
    """
    limit = 500
    seed = 2062021
    name: str = f"notaphoenix/debateorg_w_effect_for_{ideology}"
    wanted_label = IESTAHuggingFace._LABEL2ID_[effect]

    def _is_usable(example) -> bool:
        # Single definition of the text filter, shared by the first pass and
        # the top-up. The order matters: the cheap checks short-circuit before
        # the language detector is called.
        n_tokens = len(example["text"].split(" "))
        return (
            10 < n_tokens <= 1024
            and example["idx"] != 64707
            and detect(example["text"]) == "en"
        )

    dataset: Dataset = load_dataset(name, split="test")
    dataset = dataset.filter(lambda x: x["label"] == wanted_label).shuffle(
        seed=seed
    )

    if len(dataset) > limit:
        dataset = dataset.select(range(limit))

    print(f"{len(dataset)} before len filter")
    dataset = dataset.filter(_is_usable)
    print(f"{len(dataset)} after len filter")

    if len(dataset) < limit:
        # Rows already selected must not be drawn a second time.
        taken = set(dataset.to_pandas()["idx"].values.tolist())

        dataset_extra: Dataset = load_dataset(name, split="test")
        # Cheap label/idx test first, so language detection only runs on
        # rows that are still candidates.
        dataset_extra = (
            dataset_extra.filter(
                lambda x: x["label"] == wanted_label and x["idx"] not in taken
            )
            .filter(_is_usable)
            .shuffle(seed=seed)
        )

        # Never ask for more rows than the candidate pool holds.
        n_extra = min(limit - len(dataset), len(dataset_extra))
        print(f"{n_extra} of extra")
        if n_extra > 0:
            dataset = concatenate_datasets(
                [dataset, dataset_extra.select(range(n_extra))]
            )
        print(f"{len(dataset)} new length")

        if len(dataset) < limit:
            print(
                f"Only {len(dataset)} usable rows available; "
                f"fewer than the requested {limit}"
            )

    print(f"Return dataset {name} with {len(dataset)} ")

    dataset.to_pandas().to_csv(
        f"data/splitted/methodology_each/{ideology}_test.csv"
    )
    return dataset



dotenv was True
