In [1]:
import pandas as pd
import numpy as np
from glob import glob
from iesta.machine_learning.dataloader import IESTAData, METHODOLOGY

from iesta.machine_learning.feature_extraction import get_features_df
import iesta.loader as loader
import iesta.properties as prop  
import iesta.processor as proc  
from iesta.machine_learning.dataloader import IESTAData, METHODOLOGY

#from .autonotebook import tqdm as notebook_tqdm

import matplotlib.pyplot as plt

%matplotlib inline
import seaborn as sns

sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(3,2)}, 
    #style="white" # nicer layout
)




In [2]:
# Lower-cased ideology key; the cells below reuse it to build the data loader
# and the names of the extracted-feature directories.
ideology = prop.LIBERAL_IDEOLOGY.lower()
ideology

'liberal'

In [3]:
# Root directory of the pre-extracted feature batches (globbed in a later cell).
# NOTE(review): the value ends with "/", and the glob patterns add another "/",
# which yields the harmless "//" seen in the printed file paths.
path = "../data/extracted_features/"



# Loader for the selected ideology. METHODOLOGY.EACH comes from
# iesta.machine_learning.dataloader; its meaning is not visible in this notebook.
dataloader = IESTAData(ideology=ideology, methodology=METHODOLOGY.EACH)

# get_training_data() returns a pair; the second item is the parquet path read below.
# NOTE(review): training_df is not used in the visible cells — confirm whether it
# is the same data as training_data, in which case the re-read is redundant.
training_df, training_data_path = dataloader.get_training_data()
training_data = pd.read_parquet(training_data_path)

/home/elba_ro/repos/conf22-style-transfer/iesta/../data/splitted_liberal_debate_arguments_effect_test0.3_random2.parquet


iesta        INFO     File already created. Loading file...


In [4]:
len(training_data)

46792

## Fetching MPQA and EMPATH style features and transformer features

In [5]:
# Collect the per-batch parquet files of both feature families in one step:
# first the style features, then the transformer features.
style_features_path, transformer_features_path = (
    glob(f"{path}/{ideology}_style-features_1000/*.parquet"),
    glob(f"{path}/{ideology}_transformer-features_100/*.parquet"),
)

In [6]:
# Build one feature frame per family from the globbed batch files and the training data.
# The integer argument matches the number embedded in each directory name
# ("..._1000" / "..._100") — presumably the per-file batch size; confirm against
# get_features_df.
style_features_df = get_features_df(style_features_path, 1000, training_data)
transformer_features_df = get_features_df(transformer_features_path, 100, training_data)


792: ../data/extracted_features//liberal_style-features_1000/liberal_batch1000_47_style-features.parquet
92: ../data/extracted_features//liberal_transformer-features_100/liberal_batch100_468_transformer-features.parquet


## Running significance tests

### All Effects

In [7]:
import iesta.stats.significance

# Significance tests with the "effect" column as the independent variable and
# no class excluded, run once per feature family.
# `ideology` (defined near the top of the notebook) replaces the previously
# hard-coded "liberal", so the notebook can be re-targeted by editing one cell.
significance_empath_mpqa_effects_df = iesta.stats.significance.calc_sign_effects(
    style_features_df,
    ideology,
    "empath-mpqa",
    "effect",
    exclude_iv_vals=[],
)

significance_transformers_effects_df = iesta.stats.significance.calc_sign_effects(
    transformer_features_df,
    ideology,
    "transformers",
    "effect",
    exclude_iv_vals=[],
)

data has 46792 instances
bonforrini_threshold:  0.008333333333333333
there are 408 features.
Feature: mpqa_assessments
Feature: mpqa_doubt
Feature: mpqa_authority
Feature: mpqa_emphasis
Feature: mpqa_necessity
Feature: mpqa_causation
Feature: mpqa_generalization
Feature: mpqa_structure
Feature: mpqa_conditionals
Feature: mpqa_inconsistency
Feature: mpqa_possibility
Feature: mpqa_wants
Feature: mpqa_contrast
Feature: mpqa_priority
Feature: mpqa_difficulty
Feature: mpqa_inyourshoes
Feature: mpqa_rhetoricalquestion
Feature: mpqa_argumentative
Feature: mpqa_token_ratio
Feature: mpqa_args_count
Feature: empath_cold_count
Feature: empath_cold_ratio
Feature: empath_aggression_count
Feature: empath_aggression_ratio
Feature: empath_vacation_count
Feature: empath_vacation_ratio
Feature: empath_dispute_count
Feature: empath_dispute_ratio
Feature: empath_nervousness_count
Feature: empath_nervousness_ratio
Feature: empath_leisure_count
Feature: empath_leisure_ratio
Feature: empath_journalism_count


In [8]:
significance_empath_mpqa_effects_df

Unnamed: 0_level_0,effective ineffective,effective okay,effective provocative,ineffective okay,ineffective provocative,okay provocative
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mpqa_assessments,,,,-0.01,,
mpqa_doubt,,,,-0.01,,
mpqa_authority,0.01,,,-0.01,,
mpqa_emphasis,0.01,,,-0.02,,
mpqa_necessity,0.02,,,-0.03,-0.02,
mpqa_causation,0.02,,,-0.04,-0.01,
mpqa_structure,,,,-0.02,,
mpqa_conditionals,0.01,,,-0.01,,
mpqa_inconsistency,0.03,,,-0.03,,
mpqa_possibility,0.01,,,-0.02,,


In [9]:
significance_transformers_effects_df

Unnamed: 0_level_0,effective ineffective,effective okay,effective provocative,ineffective okay,ineffective provocative,okay provocative
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
emotion_hartmann_anger,,,-0.06,,-0.01,
emotion_hartmann_anger_count,0.02,,,-0.03,,
emotion_hartmann_disgust,0.02,,,-0.04,-0.02,
emotion_hartmann_disgust_count,0.03,,,-0.05,-0.01,
emotion_hartmann_disgust_ratio,0.02,,,-0.04,-0.01,
emotion_hartmann_fear,,,,,,
emotion_hartmann_fear_count,,,,-0.01,,
emotion_hartmann_joy,0.03,0.05,,,,
emotion_hartmann_joy_count,0.03,,,-0.03,,
emotion_hartmann_joy_ratio,0.02,,,-0.02,,


### All Effects - Excluding *Okay*

In [10]:
# Same tests as the "All Effects" section, but rows whose "effect" value is
# "okay" are excluded through `exclude_iv_vals` before testing.
# `ideology` (defined near the top of the notebook) replaces the previously
# hard-coded "liberal".
significance_empath_mpqa_NOOKAY_effects_df = iesta.stats.significance.calc_sign_effects(
    style_features_df,
    ideology,
    "empath-mpqa",
    "effect",
    exclude_iv_vals=["okay"],
)

significance_transformers_NOOKAY_effects_df = iesta.stats.significance.calc_sign_effects(
    transformer_features_df,
    ideology,
    "transformers",
    "effect",
    exclude_iv_vals=["okay"],
)

The IV has
 ineffective    37713
okay            6835
effective       1722
provocative      522
Name: effect, dtype: int64. Excluding ['okay']...
After exclusion: 
 ineffective    37713
effective       1722
provocative      522
Name: effect, dtype: int64. Excluding ['okay']...
data has 39957 instances
bonforrini_threshold:  0.016666666666666666
there are 408 features.
Feature: mpqa_assessments
Feature: mpqa_doubt
Feature: mpqa_authority
Feature: mpqa_emphasis
Feature: mpqa_necessity
Feature: mpqa_causation
Feature: mpqa_generalization
Feature: mpqa_structure
Feature: mpqa_conditionals
Feature: mpqa_inconsistency
Feature: mpqa_possibility
Feature: mpqa_wants
Feature: mpqa_contrast
Feature: mpqa_priority
Feature: mpqa_difficulty
Feature: mpqa_inyourshoes
Feature: mpqa_rhetoricalquestion
Feature: mpqa_argumentative
Feature: mpqa_token_ratio
Feature: mpqa_args_count
Feature: empath_cold_count
Feature: empath_cold_ratio
Feature: empath_aggression_count
Feature: empath_aggression_ratio
Featu

In [11]:
significance_empath_mpqa_NOOKAY_effects_df

Unnamed: 0_level_0,effective ineffective,effective provocative,ineffective provocative
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mpqa_authority,0.01,,
mpqa_emphasis,0.01,,
mpqa_necessity,0.02,,-0.02
mpqa_causation,0.02,,-0.01
mpqa_conditionals,0.01,,
mpqa_inconsistency,0.03,,
mpqa_possibility,0.01,,
mpqa_contrast,0.02,,
mpqa_priority,0.01,,
mpqa_argumentative,0.03,,-0.01


In [12]:
significance_transformers_NOOKAY_effects_df

Unnamed: 0_level_0,effective ineffective,effective provocative,ineffective provocative
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
emotion_hartmann_anger,,-0.06,-0.01
emotion_hartmann_anger_count,0.02,,
emotion_hartmann_disgust,0.02,,-0.02
emotion_hartmann_disgust_count,0.03,,-0.01
emotion_hartmann_disgust_ratio,0.02,,-0.01
emotion_hartmann_joy,0.03,,
emotion_hartmann_joy_count,0.03,,
emotion_hartmann_joy_ratio,0.02,,
emotion_hartmann_neutral,0.02,0.06,
emotion_hartmann_neutral_count,0.04,0.06,


### Binary Effects

In [13]:
# Significance tests with the "binary_effect" column as the independent
# variable and no class excluded, run once per feature family.
# `ideology` (defined near the top of the notebook) replaces the previously
# hard-coded "liberal".
significance_empath_mpqa_binaryeffects_df = iesta.stats.significance.calc_sign_effects(
    style_features_df,
    ideology,
    "empath-mpqa",
    "binary_effect",
    exclude_iv_vals=[],
)

significance_transformers_binaryeffects_df = iesta.stats.significance.calc_sign_effects(
    transformer_features_df,
    ideology,
    "transformers",
    "binary_effect",
    exclude_iv_vals=[],
)

data has 46792 instances
bonforrini_threshold:  0.05
there are 408 features.
Feature: mpqa_assessments
Feature: mpqa_doubt
Feature: mpqa_authority
Feature: mpqa_emphasis
Feature: mpqa_necessity
Feature: mpqa_causation
Feature: mpqa_generalization
Feature: mpqa_structure
Feature: mpqa_conditionals
Feature: mpqa_inconsistency
Feature: mpqa_possibility
Feature: mpqa_wants
Feature: mpqa_contrast
Feature: mpqa_priority
Feature: mpqa_difficulty
Feature: mpqa_inyourshoes
Feature: mpqa_rhetoricalquestion
Feature: mpqa_argumentative
Feature: mpqa_token_ratio
Feature: mpqa_args_count
Feature: empath_cold_count
Feature: empath_cold_ratio
Feature: empath_aggression_count
Feature: empath_aggression_ratio
Feature: empath_vacation_count
Feature: empath_vacation_ratio
Feature: empath_dispute_count
Feature: empath_dispute_ratio
Feature: empath_nervousness_count
Feature: empath_nervousness_ratio
Feature: empath_leisure_count
Feature: empath_leisure_ratio
Feature: empath_journalism_count
Feature: empath_

In [14]:
significance_empath_mpqa_binaryeffects_df

Unnamed: 0_level_0,effective ineffective
feature,Unnamed: 1_level_1
mpqa_authority,0.01
mpqa_emphasis,0.01
mpqa_necessity,0.02
mpqa_causation,0.02
mpqa_conditionals,0.01
mpqa_inconsistency,0.02
mpqa_possibility,0.01
mpqa_contrast,0.01
mpqa_priority,0.01
mpqa_argumentative,0.02


In [15]:
significance_transformers_binaryeffects_df

Unnamed: 0_level_0,effective ineffective
feature,Unnamed: 1_level_1
emotion_hartmann_anger_count,0.01
emotion_hartmann_disgust,0.01
emotion_hartmann_disgust_count,0.02
emotion_hartmann_disgust_ratio,0.01
emotion_hartmann_joy,0.03
emotion_hartmann_joy_count,0.02
emotion_hartmann_joy_ratio,0.02
emotion_hartmann_neutral,0.01
emotion_hartmann_neutral_count,0.03
emotion_hartmann_neutral_ratio,0.01


### Binary Effects - Excluding *Okay*

In [16]:
# Binary-effect tests on the data without "okay" arguments.
# The rows are filtered on the "effect" column up front (instead of using
# `exclude_iv_vals`) because the independent variable here is "binary_effect";
# presumably "okay" is not a value of that column — confirm against the data.
# `ideology` (defined near the top of the notebook) replaces the previously
# hard-coded "liberal".
significance_empath_mpqa_NOOKAY_binaryeffects_df = iesta.stats.significance.calc_sign_effects(
    style_features_df[style_features_df["effect"] != "okay"],
    ideology,
    "empath-mpqa",
    "binary_effect",
    exclude_iv_vals=[],
)

significance_transformers_NOOKAY_binaryeffects_df = iesta.stats.significance.calc_sign_effects(
    transformer_features_df[transformer_features_df["effect"] != "okay"],
    ideology,
    "transformers",
    "binary_effect",
    exclude_iv_vals=[],
)

data has 39957 instances
bonforrini_threshold:  0.05
there are 408 features.
Feature: mpqa_assessments
Feature: mpqa_doubt
Feature: mpqa_authority
Feature: mpqa_emphasis
Feature: mpqa_necessity
Feature: mpqa_causation
Feature: mpqa_generalization
Feature: mpqa_structure
Feature: mpqa_conditionals
Feature: mpqa_inconsistency
Feature: mpqa_possibility
Feature: mpqa_wants
Feature: mpqa_contrast
Feature: mpqa_priority
Feature: mpqa_difficulty
Feature: mpqa_inyourshoes
Feature: mpqa_rhetoricalquestion
Feature: mpqa_argumentative
Feature: mpqa_token_ratio
Feature: mpqa_args_count
Feature: empath_cold_count
Feature: empath_cold_ratio
Feature: empath_aggression_count
Feature: empath_aggression_ratio
Feature: empath_vacation_count
Feature: empath_vacation_ratio
Feature: empath_dispute_count
Feature: empath_dispute_ratio
Feature: empath_nervousness_count
Feature: empath_nervousness_ratio
Feature: empath_leisure_count
Feature: empath_leisure_ratio
Feature: empath_journalism_count
Feature: empath_

In [17]:
significance_empath_mpqa_NOOKAY_binaryeffects_df

Unnamed: 0_level_0,effective ineffective
feature,Unnamed: 1_level_1
mpqa_authority,0.01
mpqa_emphasis,0.01
mpqa_necessity,0.02
mpqa_causation,0.02
mpqa_structure,0.01
mpqa_conditionals,0.01
mpqa_inconsistency,0.02
mpqa_possibility,0.01
mpqa_contrast,0.02
mpqa_priority,0.01


In [18]:
significance_transformers_NOOKAY_binaryeffects_df

Unnamed: 0_level_0,effective ineffective
feature,Unnamed: 1_level_1
emotion_hartmann_anger_count,0.02
emotion_hartmann_disgust,0.02
emotion_hartmann_disgust_count,0.03
emotion_hartmann_disgust_ratio,0.02
emotion_hartmann_joy,0.03
emotion_hartmann_joy_count,0.03
emotion_hartmann_joy_ratio,0.02
emotion_hartmann_neutral,0.02
emotion_hartmann_neutral_count,0.04
emotion_hartmann_neutral_ratio,0.01


In [None]:
import imblearn

def undersample(df, strategy='.5'):
    """Randomly under-sample ``df`` with imbalanced-learn's ``RandomUnderSampler``.

    The last column of ``df`` is used as the class label and all preceding
    columns as features. The resulting class counts are printed.

    Args:
        df: DataFrame whose last column holds the class labels.
        strategy: Currently ignored. It is kept so existing calls keep working;
            the sampler is created without a ``sampling_strategy`` argument and
            therefore uses the library default.

    Returns:
        A DataFrame with the resampled feature columns followed by the
        resampled label column.

    Raises:
        ValueError: If ``df`` has fewer than two columns, i.e. there is no
            feature column next to the label column.
    """
    # Imported locally because the notebook never imports Counter at the top;
    # without this the print below raised NameError.
    from collections import Counter

    if df.shape[1] < 2:
        raise ValueError(
            "undersample() needs at least one feature column plus a label column"
        )

    # Positional selection keeps working even if column labels are duplicated.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Named `sampler` so the local no longer shadows this function's own name.
    # random_state=0 keeps the selection reproducible across runs.
    sampler = imblearn.under_sampling.RandomUnderSampler(random_state=0)
    X_under, y_under = sampler.fit_resample(X, y)
    print(Counter(y_under))
    return pd.concat([X_under, y_under], axis=1)