In [None]:
from comet_ml import Experiment
import GPUtil
import torch

import os
import codecarbon
from iesta.data.iesta_data import IESTAData, LABELS
from iesta.data.huggingface_loader import IESTAHuggingFace
from nlpaf.transformers.text_classification import TextClassification
from dotenv import load_dotenv, find_dotenv
import argparse
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
import re

import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport


def _apply_no_punc(row):
    row["text_no_punc"] = re.sub(r"[^\w\s]", "", row["text"])
    return row


def profile_datasets(
    ds_name: str = "debateorg_w_effect_for_liberal", lower: bool = False
):
    """Write a ydata-profiling HTML report for the texts of a dataset.

    All splits of ``notaphoenix/{ds_name}`` are concatenated. Rows whose
    punctuation-free, lower-cased text matches a (likewise normalised) line
    of ``../data/dismiss_text.txt`` are dropped, and a report over the
    remaining texts is written below ``../data/profilers/``.

    Args:
        ds_name: Dataset name under the ``notaphoenix`` namespace.
        lower: If True, profile the lower-cased text (column ``text_low``)
            and write ``{ds_name}_low.html``; otherwise profile ``text`` and
            write ``{ds_name}.html``.

    Returns:
        A ``(df, profile)`` tuple: the filtered DataFrame (including the
        helper column ``text_no_punc`` and, when ``lower`` is True,
        ``text_low``) and the generated ``ProfileReport``.
    """
    from pathlib import Path

    # Anything that is neither a word character nor whitespace.
    punctuation = r"[^\w\s]"

    dataset_dict: DatasetDict = load_dataset(f"notaphoenix/{ds_name}")
    frames = []
    for split, ds in dataset_dict.items():
        print(split)
        frames.append(ds.to_pandas())
    df = pd.concat(frames)

    # Normalise the dismiss list exactly like the texts (lower-case, no
    # punctuation). A set comprehension also copes with an empty file.
    with open("../data/dismiss_text.txt", "r") as dismissed_file:
        dismissed = {
            re.sub(punctuation, "", line.lower())
            for line in dismissed_file.read().splitlines()
        }

    # Vectorised replacement instead of a row-wise ``apply``: much faster and
    # it leaves the dtypes of the other columns untouched.
    df["text_no_punc"] = df["text"].str.replace(punctuation, "", regex=True)

    # ``.copy()`` so that adding columns below never writes into a view of
    # the unfiltered frame (avoids SettingWithCopyWarning).
    df = df[~df["text_no_punc"].str.lower().isin(dismissed)].copy()

    if lower:
        df["text_low"] = df["text"].str.lower()
        text_column, suffix = "text_low", "_low"
    else:
        text_column, suffix = "text", ""

    profile = ProfileReport(df[[text_column]], title="Profiling Report")

    # Make sure the target directory exists before writing the report.
    out_dir = Path("../data/profilers")
    out_dir.mkdir(parents=True, exist_ok=True)
    profile.to_file(out_dir / f"{ds_name}{suffix}.html")
    return df, profile

In [None]:
# This is typically needed once per notebook
# %matplotlib inline
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize

sns.set(style="darkgrid")
import re

In [None]:
def plot_datasets(
    ds_name: str = "debateorg_w_effect_for_liberal",
    lower: bool = False,
    column: str = "num_chars",
):
    """Plot per-label histograms of text length for a dataset.

    All splits of ``notaphoenix/{ds_name}`` are concatenated. Rows whose
    punctuation-free, lower-cased text matches a (likewise normalised) line
    of ``../data/dismiss_text.txt`` are dropped. Two length columns are then
    added: ``num_tokens`` (NLTK ``word_tokenize``) and ``num_chars``.

    Args:
        ds_name: Dataset name under the ``notaphoenix`` namespace.
        lower: Currently unused; kept so existing calls keep working.
        column: Length column to plot. Defaults to ``"num_chars"``, which is
            what this function has always plotted; pass ``"num_tokens"`` for
            token counts.

    Returns:
        The filtered DataFrame including ``text_no_punc``, ``num_tokens`` and
        ``num_chars``.

    Raises:
        ValueError: If ``column`` is not a column of the prepared frame.
    """
    # Anything that is neither a word character nor whitespace.
    punctuation = r"[^\w\s]"

    dataset_dict: DatasetDict = load_dataset(f"notaphoenix/{ds_name}")
    frames = []
    for split, ds in dataset_dict.items():
        print(split)
        frames.append(ds.to_pandas())
    df = pd.concat(frames)

    # Normalise the dismiss list exactly like the texts (lower-case, no
    # punctuation). A set comprehension also copes with an empty file.
    with open("../data/dismiss_text.txt", "r") as dismissed_file:
        dismissed = {
            re.sub(punctuation, "", line.lower())
            for line in dismissed_file.read().splitlines()
        }

    df["text_no_punc"] = df["text"].str.replace(punctuation, "", regex=True)
    # ``.copy()`` so the new columns below are not written into a view.
    df = df[~df["text_no_punc"].str.lower().isin(dismissed)].copy()

    df["num_tokens"] = df["text"].apply(lambda text: len(word_tokenize(text)))
    df["num_chars"] = df["text"].str.len()

    if column not in df.columns:
        raise ValueError(
            f"Unknown column {column!r}; expected e.g. 'num_chars' or "
            "'num_tokens'."
        )

    # Axis label that matches what is actually plotted.
    x_label = {
        "num_chars": "Number of Characters",
        "num_tokens": "Number of Tokens",
    }.get(column, column)

    sns.set(style="darkgrid")
    fig, ax = plt.subplots(figsize=(10, 7))

    # Group by the bare column name so the keys are scalar labels (a
    # one-element list yields 1-tuples on pandas >= 2, which would leak into
    # the legend). The first group is olive, every later group skyblue.
    for position, (label, group) in enumerate(df.groupby("label")):
        sns.histplot(
            group[column],
            bins=50,
            color="olive" if position == 0 else "skyblue",
            label=label,
            stat="percent",
            ax=ax,
        )

    ax.set_title(f"Histogram of {column} per text: {ds_name}")
    ax.set_xlabel(x_label)
    # ``stat="percent"`` normalises bar heights to percentages.
    ax.set_ylabel("Percent")
    ax.legend()
    plt.show()
    return df

In [None]:
# Load every split of the liberal subset (plain string: there is nothing to
# interpolate, so no f-string is needed).
dataset_dict: DatasetDict = load_dataset(
    "notaphoenix/debateorg_w_effect_for_liberal_subset"
)

# Drop the listed metadata columns from all splits.
dataset_dict = dataset_dict.remove_columns(
    ["author", "original_text", "category", "round", "debate_id", "idx"],
)

In [None]:
# Show the validation split of the dataset loaded above.
dataset_dict["validation"]

In [None]:
def _print_token_buckets(frame, upper_edges, split_at=None):
    """Print, per label, how many rows fall into each ``num_tokens`` bucket.

    Buckets are ``(previous_edge, edge]`` for consecutive values of
    ``upper_edges``, with 0 as the first lower bound.

    Args:
        frame: DataFrame with a ``label`` and a ``num_tokens`` column.
        upper_edges: Iterable of inclusive upper bucket bounds, ascending.
        split_at: Optional threshold; when given, the number of rows at or
            below and above it is printed after the buckets of each label.
    """
    # Group by the bare column name so ``label`` is a scalar (a one-element
    # list yields 1-tuples on pandas >= 2, which would be printed as such).
    for label, group in frame.groupby("label"):
        print(label)
        token_counts = group["num_tokens"]
        lower_edge = 0
        for upper_edge in upper_edges:
            in_bucket = (token_counts > lower_edge) & (
                token_counts <= upper_edge
            )
            print(
                f"between {lower_edge} and {upper_edge} (inclusive): ",
                int(in_bucket.sum()),
            )
            lower_edge = upper_edge
        if split_at is not None:
            print(f"<= {split_at}: ", int((token_counts <= split_at).sum()))
            print(f"> {split_at}: ", int((token_counts > split_at).sum()))


# NOTE(review): `lib` is not defined in any visible cell. Presumably it is the
# DataFrame returned by plot_datasets(...) for the liberal dataset (that frame
# carries the `num_tokens` column) -- TODO confirm and define it explicitly.
_print_token_buckets(lib, range(5, 2001, 295), split_at=600)
_print_token_buckets(lib, range(20, 10000, 500))

In [None]:
# Names of the datasets that can be inspected with the helpers above.
dataset_names = [
    "debateorg_w_effect_for_liberal_subset",
    "debateorg_w_effect_for_conservative_subset",
    "debateorg_w_effect_for_liberal",
    "debateorg_w_effect_for_conservative",
]


# Only the first dataset is processed; widen the slice to cover more of them.
for ds_name in dataset_names[:1]:
    # profile_datasets(ds_name, lower=True)
    plot_datasets(ds_name)

In [None]:
from iesta.data.iesta_data import IESTAData, LABELS
from iesta.data.huggingface_loader import IESTAHuggingFace

# One IESTAData object per ideology, both built with the same
# `keep_labels=LABELS.EFF_INEFF` setting.
conservative_data_obj = IESTAData(
    ideology="conservative",
    keep_labels=LABELS.EFF_INEFF,
)
liberal_data_obj = IESTAData(
    ideology="liberal",
    keep_labels=LABELS.EFF_INEFF,
)

In [None]:
# Split the conservative data by debate; the call returns two values.
# NOTE(review): the second value is bound to `df`, which the liberal split in
# the next cell rebinds -- so `df` from this cell is lost once that cell runs.
cons_df, df = conservative_data_obj.split_iesta_dataset_by_debate(
    True, profile=True
)

In [None]:
# Split the liberal data by debate.
# NOTE(review): this rebinds `df`, replacing the value produced by the
# conservative split; later cells that index `df[0]` see the liberal result.
lib_df, df = liberal_data_obj.split_iesta_dataset_by_debate(True, profile=True)

In [None]:
# Display the first value returned by the liberal split.
lib_df

In [None]:
# Display the first element of the second return value of the split.
# NOTE(review): despite its name, `df` is indexed by position here, so it is
# presumably a sequence of frames rather than a single DataFrame -- confirm.
df[0]

In [None]:
# Keep a handle on the first element of `df` for the cells below.
df_ = df[0]

In [None]:
# Name the index "idx". This mutates `df_` in place, so the name is also
# present when `df_` is written to parquet later.
df_.index.name = "idx"
# Display only the listed columns.
df_[
    [
        "id",
        "debate_id",
        "p_name",
        "top_effect",
        "category",
        "round",
        "argument",
        "cleaned_text",
    ]
]

In [None]:
# Write `df_` to "temp.parquet" in the current working directory.
df_.to_parquet("temp.parquet")

In [None]:
pip show iesta

In [None]:
# Quietly install/upgrade the package located in the parent directory.
%pip -q install -U ../



In [4]:
from iesta.data.iesta_data import IESTAData, LABELS
from iesta.data.huggingface_loader import IESTAHuggingFace

# Build the liberal IESTA data object, wrap it in the Hugging Face helper and
# call `upload_w_labels` without forcing a reload.
ideology = "liberal"
data_obj = IESTAData(ideology=ideology, keep_labels=LABELS.EFF_INEFF)
hf = IESTAHuggingFace(data_obj, reload_preprocess=False)
style_eval_data = hf.upload_w_labels(
    is_for_style_classifier=False,
    force_reload=False,
)

ModuleNotFoundError: No module named 'iesta.data'

In [1]:
import iesta

dotenv was True


In [2]:
import iesta.data

ModuleNotFoundError: No module named 'iesta.data'

In [2]:
from datasets.dataset_dict import Dataset, DatasetDict


# notaphoenix/debateorg_w_effect_for_liberal

In [5]:
from datasets import load_dataset
# Load all splits of the full liberal dataset, authenticating via
# `use_auth_token=True`.
# NOTE(review): newer `datasets` releases appear to replace `use_auth_token`
# with `token` -- confirm against the installed version before upgrading.
ds = load_dataset("notaphoenix/debateorg_w_effect_for_liberal", use_auth_token=True)

Found cached dataset parquet (C:/Users/elba_ro/.cache/huggingface/datasets/notaphoenix___parquet/notaphoenix--debateorg_w_effect_for_liberal-d0ad1689381f171f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
ds

DatasetDict({
    validation: Dataset({
        features: ['text', 'label', 'author', 'original_text', 'category', 'round', 'debate_id', 'idx'],
        num_rows: 7704
    })
    test: Dataset({
        features: ['text', 'label', 'author', 'original_text', 'category', 'round', 'debate_id', 'idx'],
        num_rows: 3886
    })
    training: Dataset({
        features: ['text', 'label', 'author', 'original_text', 'category', 'round', 'debate_id', 'idx'],
        num_rows: 27135
    })
})

In [10]:
# Inspect the label column of the validation split.
# (The dangling `for split in []` header was removed: it had no colon or body,
# which made the whole cell a SyntaxError, and it iterated over nothing.)
dd = ds["validation"]
dd.to_pandas()['label']

Unnamed: 0,text,label,author,original_text,category,round,debate_id,idx
0,Con Now I know full good and well that most of...,0,jmlandf,\n \r\nCon \r\nNow I know full good and well...,Science,0,.999...-is-equal-to-1/2/,18
1,Thank you for taking the Debate. 1. I plan on ...,0,jmlandf,\n \r\nThank you for taking the Debate. \n ...,Science,1,.999...-is-equal-to-1/2/,19
2,"I have full BoP, .9r will represent .9 repeati...",0,SeventhProfessor,"\n \n I have full BoP, .9r will represent .9...",Miscellaneous,0,.999...-is-equal-to-one./1/,20
3,"For my first proof, I will take ,9r, and put i...",0,SeventhProfessor,"\n \n For my first proof, I will take ,9r, a...",Miscellaneous,1,.999...-is-equal-to-one./1/,21
4,I'm pretty bored so I was wondering to post a ...,0,Biowza,\n \r\nI'm pretty bored so I was wondering to...,Science,0,.999...-is-exactly-equal-to-1/1/,22
...,...,...,...,...,...,...,...,...
7699,gmail is nothin' in front of yahoo...yahoo is ...,0,harshita123,\n \r\ngmail is nothin' in front of yahoo...y...,Technology,0,yahoo-is-better-than-gmail.../1/,55283
7700,As a portal Yahoo is better than gmail. The re...,0,harshita123,\n \r\nAs a portal Yahoo is better than gmail...,Technology,1,yahoo-is-better-than-gmail.../1/,55284
7701,The features I mentioned are'nt provided by gm...,0,harshita123,\n \r\nThe features I mentioned are'nt provid...,Technology,2,yahoo-is-better-than-gmail.../1/,55285
7702,My temper is shorter than the average citizen ...,0,MassiveDump,\n \n My temper is shorter than the average ...,Miscellaneous,0,you-are-a-nice-person/1/,55300
