In [1]:
# python3 -m venv elbaff_iesta_venv --prompt="elbaff_iesta_venv"
# source elbaff_iesta_venv/bin/activate

In [1]:
from transformers.utils import logging
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# Hub identifier of the RoBERTa toxicity checkpoint; tokenizer and weights are
# both loaded from it so they stay in sync.
model_path = 'SkolkovoInstitute/roberta_toxicity_classifier'

tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

# Text-classification pipeline around the loaded model. With top_k=None the
# recorded output below contains a score for every label, not only the best one.
transformer_nlp = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    top_k=None,
)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
from simpletransformers.ner import NERModel, NERArgs

# Model arguments with `silent` switched on (presumably suppresses progress output).
ner_args = NERArgs()
ner_args.silent = True

# CPU-only BERT token classifier loaded from the 'jeniakim/hedgehog' checkpoint.
# NOTE: this rebinds `model`, which an earlier cell bound to the RoBERTa
# toxicity classifier.
model = NERModel(
    model_type='bert',
    model_name='jeniakim/hedgehog',
    labels=["C", "D", "E", "I", "N"],
    args=ner_args,
    use_cuda=False,
)


In [None]:
# Smoke test: run prediction on a single sentence. `model` is expected to be the
# NERModel built in the preceding cell (the same name is bound to the RoBERTa
# classifier earlier in the notebook, so execution order matters).
model.predict(["This is a text"])

In [4]:
%timeit
transformer_nlp("This is a toxic sentence!")


[{'label': 'neutral', 'score': 0.9998080134391785},
 {'label': 'toxic', 'score': 0.00019199376401957124}]

In [5]:
from textmining_utility.annotator.pipeline.pipeline_base import Pipeline
import pandas as pd

import pandas as pd
import numpy as np
import sys, os
import importlib

import iesta.loader as loader
import iesta.properties as prop  
import iesta.processor as proc  
import iesta.feature_extractor as fe
from iesta.machine_learning.dataloader import IESTAData, METHODOLOGY

from collections import Counter




ModuleNotFoundError: No module named 'textmining_utility'

In [3]:
# Data loader configured for the liberal ideology with the EACH methodology.
liberal_each_dataloader = IESTAData(
    ideology=prop.LIBERAL_IDEOLOGY,
    methodology=METHODOLOGY.EACH,
)





In [4]:
# Populate the loader. The cells below read `data_df` and the pivot tables from
# it, so they presumably become available after this call — TODO confirm.
liberal_each_dataloader.load()

textmining-utility INFO     File already created. Loading file...
textmining-utility INFO     File already created. Loading file...


In [5]:
# Show the per-split counts for each `effect` label.
liberal_each_dataloader.pivot_df

effect,effective,ineffective,okay,provocative
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,248,5349,969,92
training,1722,37713,6835,522
validation,613,10749,1828,204


In [6]:
# Show the per-split counts for the two-class `binary_effect` label.
liberal_each_dataloader.pivot_binary_effect

binary_effect,effective,ineffective
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,248,6410
training,1722,45070
validation,613,12781


In [7]:
# Keep only the rows assigned to the training split; `.copy()` detaches the
# result from the loader's frame.
training_mask = liberal_each_dataloader.data_df["split"] == "training"
liberal_training_each_df = liberal_each_dataloader.data_df.loc[training_mask].copy()
len(liberal_training_each_df)

46792

In [8]:
# List the column names available in the training frame.
liberal_training_each_df.columns.tolist()

['id',
 'debate_id',
 'p_name',
 'effects',
 'effect_count',
 'top_effect',
 'effect',
 'category',
 'round',
 'argument',
 'split',
 'binary_effect']

In [9]:
import dataclasses
class TransformerBasedFeaturePipeline(Pipeline):
    """Annotation pipeline that runs the emotion, hedge and toxicity pipes over arguments.

    The input is expected to be a pandas DataFrame with an ``id`` column and a
    text column whose name is given by ``argument_col``.
    """

    def __init__(self,
                 input= None,
                 load_default_pipe_configs = True,
                 extended_pipe_configs:dict = None,
                 save_output= False,
                 out_path = None,
                 argument_col:str = "argument"
                 ):
        """Forward the generic options to ``Pipeline`` and remember the text column.

        Args:
            input: Forwarded unchanged to ``Pipeline.__init__``.
            load_default_pipe_configs: Forwarded unchanged to ``Pipeline.__init__``.
            extended_pipe_configs: Forwarded unchanged to ``Pipeline.__init__``.
            save_output: Forwarded unchanged to ``Pipeline.__init__``.
            out_path: Forwarded unchanged to ``Pipeline.__init__``.
            argument_col: Name of the input column that holds the text to annotate.
        """
        super().__init__(input,
                 load_default_pipe_configs,
                 extended_pipe_configs,
                 save_output,
                 out_path)
        self.argument_col= argument_col

    def process_input(self) -> list:
        """Convert the input frame into ``(text, context)`` tuples.

        Returns:
            One ``(text, {"input_id": id})`` tuple per input row, in row order.
        """
        # Iterate the two columns directly: this avoids building a Series per
        # row (as ``iterrows`` does) and keeps each column's own dtype.
        # Assumes ``self.input`` holds the DataFrame to annotate — TODO confirm
        # against the Pipeline base class.
        texts = self.input[self.argument_col]
        input_ids = self.input["id"]
        return [
            (text, {"input_id": input_id})
            for text, input_id in zip(texts, input_ids)
        ]

    def init_and_run(self):
        """Register the annotation pipes, annotate the input and save the result."""
        # "senter" is registered as a native pipe without saved output; the
        # three orchestrator pipes have their output saved.
        self.add_annotation_pipe(name = "senter",                  save_output= False,is_spacy=True, is_native=True)
        self.add_annotation_pipe(name = "EmotionPipeOrchestrator", save_output= True, is_spacy=True)
        self.add_annotation_pipe(name = "HedgePipeOrchestrator",   save_output= True, is_spacy=True)
        self.add_annotation_pipe(name = "ToxicityOrchestrator",    save_output= True, is_spacy=True)

        # annotate the input
        #self.set_spacy_language_model("en_core_web_md")
        self.annotate()
        # save annotations when "save_output" is set to True
        self.save()

In [None]:
import pandas  as pd
# Four-row toy frame with the same `id` / `argument` columns the feature
# pipeline reads, used for a quick dry run.
test_df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "argument": [
            "This is a very hard time, I am devastated!",
            "You can not get your tiny brain to work on this so stupid!!",
            "The amonium Nitrate was sitting there for ages.",
            "I love you and I love how you look",
        ],
    }
)

In [None]:
# Dry run of the feature pipeline on the toy frame `test_df`.
# NOTE: this rebinds `pipeline`, shadowing the `transformers.pipeline` factory
# imported at the top of the notebook.
pipeline = TransformerBasedFeaturePipeline(
    save_output=True,
    out_path="../data/extracted_features/test.parquet",
)
pipeline.set_input(test_df)
pipeline.init_and_run()
pipeline.out_df.head()

In [10]:
# Full run of the feature pipeline on the liberal training split.
# NOTE: this rebinds `pipeline`, shadowing the `transformers.pipeline` factory
# imported at the top of the notebook.
pipeline = TransformerBasedFeaturePipeline(
    save_output=True,
    out_path="../data/extracted_features/features_liberal_training_each.parquet",
)
pipeline.set_input(liberal_training_each_df)
pipeline.init_and_run()
pipeline.out_df.head()

textmining-utility INFO     adding pipe with name EmotionPipeOrchestrator
textmining-utility INFO     adding pipe with name EmotionPipeOrchestrator
textmining-utility DEBUG    loading textmining_utility.annotator.pipe.linguistic.emotion
textmining-utility DEBUG    loading textmining_utility.annotator.pipe.linguistic.emotion
textmining-utility INFO     orchestrator was initialized successfully
textmining-utility INFO     orchestrator was initialized successfully
textmining-utility INFO     adding pipe with code emotion_hartmann_component
textmining-utility INFO     adding pipe with code emotion_hartmann_component
textmining-utility INFO     adding pipe with name HedgePipeOrchestrator
textmining-utility INFO     adding pipe with name HedgePipeOrchestrator
textmining-utility DEBUG    loading textmining_utility.annotator.pipe.linguistic.hedge
textmining-utility DEBUG    loading textmining_utility.annotator.pipe.linguistic.hedge
textmining-utility INFO     orchestrator was initialized succe

In [None]:
# Inspect the spaCy language model configured on the pipeline object
# (attribute presumably provided by the Pipeline base class — TODO confirm).
pipeline.spacy_language_model

In [None]:
# Reload the extracted features from disk so the analysis does not require
# re-running the extraction.
lib_transformers_df = pd.read_parquet("../data/extracted_features/features_liberal_training_each.parquet")

# A bare `.head()` in the middle of a cell is evaluated but never rendered;
# display() (provided by the IPython kernel) shows the preview explicitly.
display(lib_transformers_df.head())

# Label frequencies for each categorical feature, one bar chart per column.
for label_col in ("hedge_dominant", "emotion_hartmann_label", "toxicity_dominant"):
    lib_transformers_df[label_col].value_counts().to_frame().plot(kind="bar", title=label_col)

# Overlaid histograms of the per-emotion score columns.
emotion_score_cols = [
    "emotion_hartmann_anger",
    "emotion_hartmann_disgust",
    "emotion_hartmann_fear",
    "emotion_hartmann_joy",
    "emotion_hartmann_neutral",
    "emotion_hartmann_surprise",
    "emotion_hartmann_sadness",
]
# Trailing `;` suppresses the Axes repr so only the figure is shown.
lib_transformers_df[emotion_score_cols].plot.hist(bins=12, alpha=0.5, title="emotion_hartmann scores");