In [1]:
import pandas as pd
import sqlite3
from datasets import Dataset, DatasetDict

## Convert SQLite database to DataFrame and label

In [2]:
def sql_to_df(database_filename: str) -> pd.DataFrame:
    """Load the annotated ``posts`` table from a SQLite file and add a binary ``label`` column.

    A post is labelled 1 when its ``sarcasm_irony`` value is not 2 AND at least
    one of the following holds:

    * ``hatred == 1``
    * ``threat == 1``
    * ``us_vs_them == 1`` and ``aggressiveness > 0``
    * ``foreigner_attitude == 2`` and ``aggressiveness > 1``
    * ``male_preference == 1`` and ``aggressiveness > 1``

    Every other post is labelled 0.

    Args:
        database_filename: Path to a SQLite database containing a ``posts``
            table with an ``id`` column and the annotation columns listed above.

    Returns:
        A DataFrame indexed by ``id`` holding every column of ``posts`` plus an
        integer ``label`` column (0 or 1).
    """
    # `with conn:` on a sqlite3 connection only wraps a transaction; it does not
    # close the connection. Close it explicitly so the file handle is released
    # even if the query fails.
    conn = sqlite3.connect(database_filename)
    try:
        df = pd.read_sql_query("SELECT * FROM posts;", conn, index_col="id")
    finally:
        conn.close()

    # Build the labelling rule as vectorized boolean masks instead of iterating
    # row by row; comparisons against missing values evaluate to False for
    # eq/gt and True for ne, matching plain ==, > and != on scalars.
    aggressiveness = df["aggressiveness"]
    not_sarcastic = df["sarcasm_irony"].ne(2)
    flagged = (
        df["hatred"].eq(1)
        | df["threat"].eq(1)
        | (df["us_vs_them"].eq(1) & aggressiveness.gt(0))
        | (df["foreigner_attitude"].eq(2) & aggressiveness.gt(1))
        | (df["male_preference"].eq(1) & aggressiveness.gt(1))
    )

    # Rows that match get 1, everything else defaults to 0.
    df["label"] = (not_sarcastic & flagged).astype("int64")
    return df

In [4]:
# Load the annotated posts and keep only the columns the test set needs.
df = sql_to_df("data/familjeliv_annotated_data.db").loc[:, ["text", "label"]]

# Discard the `id` index before conversion, then expose the data as a single "test" split.
test_split = Dataset.from_pandas(df.reset_index(drop=True))
familjeliv_dataset = DatasetDict({"test": test_split})
familjeliv_dataset

DatasetDict({
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 978
    })
})

In [5]:
# Persist the DatasetDict (single "test" split) to a local directory for later reuse.
familjeliv_dataset.save_to_disk("datasets/familjeliv_testset")