In [6]:

from comet_ml import Experiment
import GPUtil
import torch

import os   
import codecarbon
from iesta.machine_learning.dataloader import IESTAData, LABELS
from iesta.machine_learning.huggingface_loader import IESTAHuggingFace
from nlpaf.transformers.text_classification import TextClassification
from dotenv import load_dotenv, find_dotenv
import argparse
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict


In [7]:
import re

import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
def _apply_no_punc(row):
    row["text_no_punc"] = re.sub(r'[^\w\s]', '', row["text"])
    return row

def profile_datasets(ds_name:str = "debateorg_w_effect_for_liberal", lower:bool = False):
    """Profile the text column of a ``notaphoenix/{ds_name}`` dataset.

    All splits of the dataset are loaded and concatenated. Rows whose
    punctuation-stripped, lower-cased text matches an entry of
    ``../data/dismiss_text.txt`` (normalised the same way) are dropped, and a
    ``ProfileReport`` of the remaining text is written as HTML under
    ``../data/profilers/``.

    Args:
        ds_name: Dataset name below the ``notaphoenix/`` namespace.
        lower: When True, a lower-cased ``text_low`` column is added and
            profiled, and the report is saved as ``{ds_name}_low.html``.
            Otherwise the raw ``text`` column is profiled and the report is
            saved as ``{ds_name}.html``.

    Returns:
        A ``(df, profile)`` tuple: the filtered DataFrame (including the
        helper column ``text_no_punc``) and the generated report.
    """
    dataset_dict: DatasetDict = load_dataset(f"notaphoenix/{ds_name}")
    dfs = []
    for split, ds in dataset_dict.items():
        print(split)
        dfs.append(ds.to_pandas())
    df = pd.concat(dfs)

    # Normalise the dismiss list exactly like the texts (lower-case, no
    # punctuation). A set comprehension also copes with an empty file, where
    # the `.str` accessor on an empty Series can fail.
    with open("../data/dismiss_text.txt", "r") as dismissedf:
        dismissed = {
            re.sub(r'[^\w\s]', '', line.lower())
            for line in dismissedf.read().splitlines()
        }

    # Vectorised equivalent of applying the punctuation regex row by row.
    df["text_no_punc"] = df["text"].str.replace(r'[^\w\s]', '', regex=True)
    # `.copy()` so that adding columns below does not write to a slice.
    df = df[~df["text_no_punc"].str.lower().isin(dismissed)].copy()

    if lower:
        df["text_low"] = df["text"].str.lower()
        profiled_column, suffix = "text_low", "_low"
    else:
        profiled_column, suffix = "text", ""

    profile = ProfileReport(df[[profiled_column]], title="Profiling Report")
    profile.to_file(f"../data/profilers/{ds_name}{suffix}.html")
    return df, profile

In [8]:
# This is typically needed once per notebook
#%matplotlib inline 
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize

# seaborn documents `set` as an alias of `set_theme`, which is the preferred spelling.
sns.set_theme(style="darkgrid")
import re


In [9]:
def plot_datasets(ds_name:str = "debateorg_w_effect_for_liberal", lower:bool = False, column:str = "num_chars"):
    """Plot per-label length histograms for a ``notaphoenix/{ds_name}`` dataset.

    All splits of the dataset are loaded and concatenated. Rows whose
    punctuation-stripped, lower-cased text matches an entry of
    ``../data/dismiss_text.txt`` (normalised the same way) are dropped. The
    columns ``num_tokens`` (NLTK ``word_tokenize`` count) and ``num_chars``
    (string length) are added, and one histogram per ``label`` value is drawn
    on a shared axis with ``stat="percent"``.

    Args:
        ds_name: Dataset name below the ``notaphoenix/`` namespace.
        lower: Not read by this function; kept so existing calls stay valid.
        column: Length column to plot, ``"num_chars"`` (default, the column
            that was always plotted) or ``"num_tokens"``. Title and axis
            labels are derived from it so they describe the plotted data.

    Returns:
        The filtered DataFrame including ``text_no_punc``, ``num_tokens`` and
        ``num_chars``.

    Raises:
        ValueError: If ``column`` is not one of the two length columns.
    """
    units = {"num_chars": "Character", "num_tokens": "Token"}
    if column not in units:
        raise ValueError(f"column must be one of {sorted(units)}, got {column!r}")
    unit = units[column]

    dataset_dict: DatasetDict = load_dataset(f"notaphoenix/{ds_name}")
    dfs = []
    for split, ds in dataset_dict.items():
        print(split)
        dfs.append(ds.to_pandas())
    df = pd.concat(dfs)

    # Normalise the dismiss list exactly like the texts (lower-case, no
    # punctuation). A set comprehension also copes with an empty file.
    with open("../data/dismiss_text.txt", "r") as dismissedf:
        dismissed = {
            re.sub(r'[^\w\s]', '', line.lower())
            for line in dismissedf.read().splitlines()
        }

    # Vectorised equivalent of applying the punctuation regex row by row.
    df["text_no_punc"] = df["text"].str.replace(r'[^\w\s]', '', regex=True)
    # `.copy()` so that the length columns below are not written to a slice.
    df = df[~df["text_no_punc"].str.lower().isin(dismissed)].copy()

    df['num_tokens'] = df['text'].apply(lambda x: len(word_tokenize(x)))
    df['num_chars'] = df['text'].str.len()

    sns.set_theme(style="darkgrid")
    fig, ax = plt.subplots(figsize=(10,7))
    # The first label is drawn in olive, every further label in skyblue.
    color = "olive"
    # Group by the bare column name: a one-element list makes pandas >= 2
    # yield 1-tuples as keys, which would end up in the legend as "(x,)".
    for lbl, df_ in df.groupby('label'):
        sns.histplot(df_[column], bins=50, color=color, label=lbl, stat="percent", ax=ax)
        color = "skyblue"

    ax.set_title(f'Histogram of {unit} Counts {ds_name} ')
    ax.set_xlabel(f"Number of {unit}s")
    # The bars are normalised with stat="percent", not raw frequencies.
    ax.set_ylabel("Percent")
    ax.legend()
    plt.show()
    return df
    

In [22]:
# Load the liberal subset and drop the metadata columns in one chained step.
dataset_dict: DatasetDict = load_dataset(
    "notaphoenix/debateorg_w_effect_for_liberal_subset"
).remove_columns(
    ['author', 'original_text', 'category', 'round', 'debate_id', 'idx']
)

Found cached dataset parquet (/home/elba_ro/.cache/huggingface/datasets/notaphoenix___parquet/notaphoenix--debateorg_w_effect_for_liberal_subset-9974cc12ec5dc37a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

: 

In [21]:
# Inspect the validation split; its repr lists the features and the row count.
dataset_dict["validation"]



Dataset({
    features: ['text', 'label'],
    num_rows: 3360
})

In [None]:
def _print_token_bins(frame, edges):
    """Print, for each edge, how many rows have previous edge < num_tokens <= edge.

    The first bin starts at 0 (exclusive); rows outside all bins are not reported.
    """
    lower_edge = 0
    for upper_edge in edges:
        in_bin = (frame['num_tokens'] > lower_edge) & (frame['num_tokens'] <= upper_edge)
        print(f"between {lower_edge} and {upper_edge} (inclusive): ", int(in_bin.sum()))
        lower_edge = upper_edge


# NOTE(review): `lib` is not assigned in any cell of this notebook. It needs
# `label` and `num_tokens` columns — presumably the frame returned by
# plot_datasets for a liberal dataset; confirm and assign it explicitly.
# Grouping by the bare column name keeps `lbl` a scalar on every pandas version.
for lbl, df_ in lib.groupby('label'):
    print(lbl)
    _print_token_bins(df_, range(5, 2001, 295))
    print("<= 600: ", int((df_['num_tokens'] <= 600).sum()))
    print("> 600: ", int((df_['num_tokens'] > 600).sum()))
for lbl, df_ in lib.groupby('label'):
    print(lbl)
    _print_token_bins(df_, range(20, 10000, 500))

In [None]:
# Hub dataset names: the two subsets first, then the two full datasets.
dataset_names = [
    "debateorg_w_effect_for_liberal_subset",
    "debateorg_w_effect_for_conservative_subset",
    "debateorg_w_effect_for_liberal",
    "debateorg_w_effect_for_conservative",
]

# Only the first dataset is plotted. Profiling through
# profile_datasets(ds_name, lower=True) is currently switched off.
for ds_name in dataset_names[:1]:
    plot_datasets(ds_name)
    

In [1]:
from iesta.machine_learning.dataloader import IESTAData, LABELS
from iesta.machine_learning.huggingface_loader import IESTAHuggingFace
# One loader per ideology, both restricted to LABELS.EFF_INEFF; the
# conservative loader is constructed first, then the liberal one.
conservative_data_obj, liberal_data_obj = (
    IESTAData(ideology=ideology, keep_labels=LABELS.EFF_INEFF)
    for ideology in ("conservative", "liberal")
)





In [2]:
# Run the debate-level split for the conservative loader with profiling switched on.
# NOTE(review): the meaning of the positional `True` is not visible in this notebook —
# confirm against IESTAData.split_iesta_dataset_by_debate.
# NOTE(review): the second return value is bound to the generic name `df`, which the
# liberal split cell rebinds; the later `df[0]` cells therefore depend on run order.
cons_df, df = conservative_data_obj.split_iesta_dataset_by_debate(True, profile=True)

/home/elba_ro/repos/github/conf22-style-transfer/iesta/../data/splitted_conservative_debate_arguments_effect_test0.3_random2.parquet
Original df len: 101932
After filtering effects 84711
Adding Cleaned text
After filtering dismissed no_punc df len: 84684
After filtering dismissed df len: 82495
Profiling data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

End of profiling
is_for_eval_classifier is not in the columns, adding it - used to have data for style classification

False


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

effect      effective  ineffective
split                             
test              240         5500
training         1384        39163
validation        430        11034

True


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

effect      effective  ineffective
split                             
test               96         2443
training          611        16727
validation        169         4698
All
effect      effective  ineffective
split                             
test              336         7943
training         1995        55890
validation        599        15732


In [3]:
# Run the debate-level split for the liberal loader with profiling switched on.
# NOTE(review): the meaning of the positional `True` is not visible in this notebook —
# confirm against IESTAData.split_iesta_dataset_by_debate.
# NOTE(review): this rebinds `df`, which the conservative split cell also assigns;
# the later `df[0]` cells therefore depend on run order.
lib_df, df = liberal_data_obj.split_iesta_dataset_by_debate(True, profile=True)

/home/elba_ro/repos/github/conf22-style-transfer/iesta/../data/splitted_liberal_debate_arguments_effect_test0.3_random2.parquet
Original df len: 66844
After filtering effects 56394
Adding Cleaned text
After filtering dismissed no_punc df len: 56366
After filtering dismissed df len: 55353
Profiling data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

End of profiling
is_for_eval_classifier is not in the columns, adding it - used to have data for style classification

False


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

effect      effective  ineffective
split                             
test              167         3719
training         1311        25824
validation        367         7337

True


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

effect      effective  ineffective
split                             
test               81         1609
training          474        11104
validation        145         3215
All
effect      effective  ineffective
split                             
test              248         5328
training         1785        36928
validation        512        10552


In [4]:
# Display the liberal frame returned by the split (the rendering elides the middle rows).
lib_df

Unnamed: 0_level_0,id,debate_id,p_name,effect,category,round,argument,cleaned_text,is_for_eval_classifier,split
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/|Logical-Ma...,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/,Logical-Master,ineffective,Miscellaneous,0,\n \r\nRESOLVED: It is impossible not to cons...,RESOLVED: It is impossible not to consciously ...,True,training
1,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/|Logical-Ma...,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/,Logical-Master,ineffective,Miscellaneous,1,"\n \r\nOkay, just now reading the round, it w...","Okay, just now reading the round, it would app...",True,training
2,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/|Logical-Ma...,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/,Logical-Master,ineffective,Miscellaneous,2,\n \r\n--->RE:Since my opponent has left it u...,"--->RE:Since my opponent has left it up to me,...",True,training
3,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/|Logical-Ma...,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/,Logical-Master,ineffective,Miscellaneous,3,\n \r\nCONSCIOUS SECTION: \n \r\nLadies and...,"CONSCIOUS SECTION: Ladies and gentleman, my op...",True,training
4,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/|beem0r,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/,beem0r,ineffective,Miscellaneous,0,\n \r\nI would eplain the vast differences be...,I would eplain the vast differences between th...,True,training
...,...,...,...,...,...,...,...,...,...,...
55348,zoos/1/|RacH3ll3,zoos/1/,RacH3ll3,ineffective,Miscellaneous,1,\n \r\nThanks to my opponent for accepting th...,Thanks to my opponent for accepting this debat...,True,training
55349,zoos/1/|RacH3ll3,zoos/1/,RacH3ll3,ineffective,Miscellaneous,2,"\n \r\nThank you. \n \r\n""I realize that. T...","Thank you. ""I realize that. The fact is, the m...",True,training
55350,zygote-is-not-A-person/1/|go4broke,zygote-is-not-A-person/1/,go4broke,ineffective,Science,0,\n \r\nA zygote or early embryo cannot alread...,A zygote or early embryo cannot already be A h...,False,test
55351,zygote-is-not-A-person/1/|go4broke,zygote-is-not-A-person/1/,go4broke,ineffective,Science,1,"\n \r\nI am talking about human embryology, c...","I am talking about human embryology, cell biol...",False,test


In [None]:
# Look at the first element of `df`, the second value returned by the split cells.
# NOTE(review): whether this is the conservative or the liberal result depends on
# which split cell ran last.
df[0]

In [None]:
# Keep the first element of `df` under a short name for the cells below.
# NOTE(review): `df` is assigned by both split cells, so its content depends on run order.
df_ = df[0]

In [None]:
# Name the index 'idx' (this modifies `df_` in place), then preview a fixed column subset.
df_.index.name = 'idx'
preview_columns = [
    "id",
    "debate_id",
    "p_name",
    "top_effect",
    "category",
    "round",
    "argument",
    "cleaned_text",
]
df_[preview_columns]

In [None]:
# Write `df_` to "temp.parquet"; the relative path resolves against the current working directory.
df_.to_parquet("temp.parquet")

In [None]:
# Display `df_` as the cell output.
df_