In [None]:

from comet_ml import Experiment
import GPUtil
import torch

import os   
import codecarbon
from iesta.machine_learning.dataloader import IESTAData, LABELS
from iesta.machine_learning.huggingface_loader import IESTAHuggingFace
from nlpaf.transformers.text_classification import TextClassification
from dotenv import load_dotenv, find_dotenv
import argparse
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict


In [None]:
import re

import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
def _apply_no_punc(row):
    row["text_no_punc"] = re.sub(r'[^\w\s]', '', row["text"])
    return row

def profile_datasets(ds_name:str = "debateorg_w_effect_for_liberal", lower:bool = False):
    """Write a profiling report for the ``text`` column of a Hugging Face dataset.

    Every split of ``notaphoenix/{ds_name}`` is loaded and concatenated. Rows
    whose punctuation-stripped, lower-cased text matches an entry of
    ``../data/dismiss_text.txt`` (normalised the same way) are dropped before
    profiling.

    Args:
        ds_name: Dataset name under the ``notaphoenix`` namespace.
        lower: If True, profile a lower-cased copy of the text (stored in a
            new ``text_low`` column) and write ``{ds_name}_low.html``;
            otherwise profile ``text`` and write ``{ds_name}.html``.

    Returns:
        ``(df, profile)``: the filtered DataFrame (including the helper column
        ``text_no_punc``, plus ``text_low`` when ``lower`` is True) and the
        ``ProfileReport`` that was written under ``../data/profilers/``.
    """
    punct_pattern = r'[^\w\s]'

    dataset_dict: DatasetDict = load_dataset(f"notaphoenix/{ds_name}")
    dfs = []
    for split, ds in dataset_dict.items():
        print(split)
        dfs.append(ds.to_pandas())
    df = pd.concat(dfs)

    # Normalise the dismiss list with plain str methods so that an empty file
    # simply yields an empty set instead of going through a pandas accessor.
    with open("../data/dismiss_text.txt", "r") as dismissedf:
        dismissed = {
            re.sub(punct_pattern, '', line.lower())
            for line in dismissedf.read().splitlines()
        }

    # Vectorised punctuation stripping (same regex as _apply_no_punc) instead
    # of a Python-level call per row.
    df = df.assign(
        text_no_punc=df["text"].str.replace(punct_pattern, '', regex=True)
    )
    is_dismissed = df["text_no_punc"].str.lower().isin(dismissed)
    # .copy() makes the filtered frame independent, so adding columns below
    # does not raise SettingWithCopyWarning.
    df = df[~is_dismissed].copy()

    if lower:
        df["text_low"] = df["text"].str.lower()
        profile = ProfileReport(df[["text_low"]], title="Profiling Report")
        profile.to_file(f"../data/profilers/{ds_name}_low.html")
    else:
        profile = ProfileReport(df[["text"]], title="Profiling Report")
        profile.to_file(f"../data/profilers/{ds_name}.html")
    return df, profile

In [None]:
# This is typically needed once per notebook
#%matplotlib inline 
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize

# Apply seaborn's "darkgrid" style to the plots created after this cell runs.
sns.set(style="darkgrid")
import re


In [None]:
def plot_datasets(ds_name:str = "debateorg_w_effect_for_liberal", lower:bool = False, column:str = "num_chars"):
    """Plot per-label length histograms for a Hugging Face dataset.

    Every split of ``notaphoenix/{ds_name}`` is loaded and concatenated. Rows
    whose punctuation-stripped, lower-cased text matches an entry of
    ``../data/dismiss_text.txt`` (normalised the same way) are dropped. Two
    length columns are then added: ``num_tokens`` (NLTK ``word_tokenize``
    count) and ``num_chars`` (string length). One histogram per ``label``
    value is drawn on a shared axes, each normalised to percent.

    Args:
        ds_name: Dataset name under the ``notaphoenix`` namespace.
        lower: Accepted for signature parity with ``profile_datasets``; it is
            not used by this function.
        column: Which length column to plot, ``"num_chars"`` (default, the
            column that was always plotted) or ``"num_tokens"``. The title
            and x-axis label follow this choice.

    Returns:
        The filtered DataFrame with the added ``text_no_punc``,
        ``num_tokens`` and ``num_chars`` columns.

    Raises:
        ValueError: If ``column`` is not one of the two supported names.
    """
    # (title fragment, x-axis label) for each plottable column, so the text on
    # the figure always describes the data that is actually drawn.
    axis_text = {
        "num_chars": ("Character Counts", "Number of Characters"),
        "num_tokens": ("Token Counts", "Number of Tokens"),
    }
    if column not in axis_text:
        raise ValueError(
            f"column must be one of {sorted(axis_text)}, got {column!r}"
        )

    punct_pattern = r'[^\w\s]'

    dataset_dict: DatasetDict = load_dataset(f"notaphoenix/{ds_name}")
    dfs = []
    for split, ds in dataset_dict.items():
        print(split)
        dfs.append(ds.to_pandas())
    df = pd.concat(dfs)

    with open("../data/dismiss_text.txt", "r") as dismissedf:
        dismissed = {
            re.sub(punct_pattern, '', line.lower())
            for line in dismissedf.read().splitlines()
        }

    # Vectorised punctuation stripping (same regex as _apply_no_punc).
    df = df.assign(
        text_no_punc=df["text"].str.replace(punct_pattern, '', regex=True)
    )
    is_dismissed = df["text_no_punc"].str.lower().isin(dismissed)
    # .copy() so the column assignments below do not hit SettingWithCopyWarning.
    df = df[~is_dismissed].copy()

    df['num_tokens'] = df['text'].apply(lambda x: len(word_tokenize(x)))
    df['num_chars'] = df['text'].str.len()

    sns.set(style="darkgrid")
    fig, ax = plt.subplots(figsize=(10,7))

    # First label is drawn in olive, every following label in skyblue.
    color = "olive"
    # Group by the plain column name: grouping by a one-element list yields
    # 1-tuple keys in pandas 2.x, which would show up as tuples in the legend.
    for lbl, group in df.groupby('label'):
        sns.histplot(group[column], bins=50, color=color, label=lbl, stat="percent", ax=ax)
        color = "skyblue"

    title_text, xlabel_text = axis_text[column]
    ax.set_title(f'Histogram of {title_text} {ds_name} ')
    ax.set_xlabel(xlabel_text)
    # Bars are normalised with stat="percent", so the y-axis is a percentage.
    ax.set_ylabel("Percent")
    ax.legend()
    plt.show()
    return df
    

In [None]:
# Plot the liberal subset and keep the filtered frame (with its length
# columns) for further inspection.
lib = plot_datasets(ds_name="debateorg_w_effect_for_liberal_subset")




In [None]:
def _print_token_buckets(frame, edges, threshold=None):
    """Print, for each label, how many rows fall into each token-count bucket.

    Buckets are half-open intervals ``(previous_edge, edge]`` starting from 0.
    A final line reports the rows above the last edge so no row is silently
    left out of the summary.

    Args:
        frame: DataFrame with ``label`` and ``num_tokens`` columns.
        edges: Increasing upper bucket bounds.
        threshold: If given, also print how many rows are ``<=`` and ``>``
            this token count.
    """
    # Group by the plain column name so the printed key is the label itself
    # (a one-element list yields 1-tuple keys in pandas 2.x).
    for lbl, group in frame.groupby('label'):
        print(lbl)
        tokens = group['num_tokens']
        prev = 0
        for x in edges:
            in_bucket = (tokens > prev) & (tokens <= x)
            print(f"between {prev} (exclusive) and {x} (inclusive): ", int(in_bucket.sum()))
            prev = x
        print(f"> {prev}: ", int((tokens > prev).sum()))
        if threshold is not None:
            print(f"<= {threshold}: ", int((tokens <= threshold).sum()))
            print(f"> {threshold}: ", int((tokens > threshold).sum()))


# Fine-grained buckets up to 1775 tokens, plus a split at 600 tokens.
_print_token_buckets(lib, range(5, 2001, 295), threshold=600)
# Coarse buckets covering the long tail.
_print_token_buckets(lib, range(20, 10000, 500))

In [None]:
dataset_names = [
    "debateorg_w_effect_for_liberal_subset",
    "debateorg_w_effect_for_conservative_subset",
    "debateorg_w_effect_for_liberal",
    "debateorg_w_effect_for_conservative",
]

# Only the first dataset is plotted; widen the slice to cover the others.
# profile_datasets(ds_name, lower=True) can be called here as well.
for ds_name in dataset_names[:1]:
    plot_datasets(ds_name)
    

In [None]:
from iesta.machine_learning.dataloader import IESTAData, LABELS
from iesta.machine_learning.huggingface_loader import IESTAHuggingFace
# One IESTAData object per ideology, both created with LABELS.EFF_INEFF
# as the labels to keep.
conservative_data_obj = IESTAData(
    ideology="conservative",
    keep_labels=LABELS.EFF_INEFF,
)
liberal_data_obj = IESTAData(
    ideology="liberal",
    keep_labels=LABELS.EFF_INEFF,
)



In [None]:
# NOTE(review): the result is not bound to a name. Later cells read `df`,
# which is not assigned in the visible cells. Presumably it was meant to
# capture this return value; confirm.
conservative_data_obj.split_iesta_dataset_by_debate(True, profile=True)

In [None]:
# NOTE(review): the result is not bound to a name. A later cell reads
# `df_lib`, which is not assigned in the visible cells. Presumably it was
# meant to capture this return value; confirm.
liberal_data_obj.split_iesta_dataset_by_debate(True, profile=True)

In [None]:
# NOTE(review): `df_lib` is not assigned in the visible cells, so this cell
# depends on kernel state and may fail after Restart & Run All.
df_lib[0]

In [None]:
# NOTE(review): `df` is not assigned in the visible cells, so this cell
# depends on kernel state and may fail after Restart & Run All.
df[0]

In [None]:
# Keep element 0 of `df` for the cells below.
# NOTE(review): `df` is not assigned in the visible cells; confirm its source.
df_ = df[0]

In [None]:
# Name the index 'idx' (this modifies df_ in place), then display a fixed
# selection of columns.
df_.index.name = 'idx'
df_[["id", "debate_id", "p_name", "top_effect", "category", "round", "argument", "cleaned_text"]]

In [None]:
# Write df_ to "temp.parquet", a path relative to the current working directory.
df_.to_parquet("temp.parquet")

In [None]:
# Display df_ as the cell output.
df_