# Notebook: explore_datasets.ipynb

This notebook is a simple exploration of the two versions of the BAN-PL dataset. It is a good starting point for understanding the structure of the data.


In [1]:
import numpy
import pandas
from lib import filepaths


## Load the data


In [2]:
# df1 <- BAN-PL_1.csv, the first version of the dataset (BAN-PL_1.zip)
# - Upload Date: 16.08.2023
# - Rows: 24,000
# - Classes: 0 – non-harmful, 1 – harmful
df1: pandas.DataFrame = pandas.read_csv(
    filepath_or_buffer=filepaths.datasets / "BAN-PL_1.csv",
)

# df2 <- BAN-PL_2.csv, the second version of the dataset (BAN-PL_2.zip)
# - Upload Date: 05.04.2023
#   NOTE(review): this date is earlier than the first version's upload date;
#   possibly a typo in the year — confirm against the dataset's source page.
# - Rows: 24,000
# - Classes: 0 – non-harmful, 1 – harmful
# - Moderation reasons: 4 pseudonymized classes representing moderation reasons
df2: pandas.DataFrame = pandas.read_csv(
    filepath_or_buffer=filepaths.datasets / "BAN-PL_2.csv",
)


## Inspect the data

Columns, rows, data types, missing values.

In [3]:
def get_column_min_max(
    df: pandas.DataFrame,
) -> dict[str, tuple[float, float]]:
    """
    Get min and max values for each numerical column in the DataFrame.

    Non-numerical columns are skipped. NumPy scalars produced by pandas are
    converted to built-in Python numbers, so the result matches the annotated
    return type and prints as plain numbers (e.g. ``(0, 1)``) regardless of
    how the installed NumPy version formats its scalar types.

    Args:
        df (pandas.DataFrame): DataFrame to get min and max values from.

    Returns:
        dict[str, tuple[float, float]]: Dictionary with column names as keys and tuples with min and max values as values.
            Empty if the DataFrame has no numerical columns.
    """

    def _to_builtin(value):
        # Only NumPy scalars are unwrapped; anything else (for example
        # pandas.NA or a plain Python number) is passed through untouched.
        return value.item() if isinstance(value, numpy.generic) else value

    numeric_columns = df.select_dtypes(include=[numpy.number]).columns
    return {
        column: (_to_builtin(df[column].min()), _to_builtin(df[column].max()))
        for column in numeric_columns
    }


In [4]:
# Side-by-side structural summary of both dataset versions.
# Each section is printed as a header line followed by one line per DataFrame;
# every section after the first starts with two blank lines.
print(
    "(1) Number of rows:",
    f"  df1 = {len(df1):,}",
    f"  df2 = {len(df2):,}",
    sep="\n",
)

print(
    "\n\n(2) Number of columns:",
    f"  df1 = {len(df1.columns):,}",
    f"  df2 = {len(df2.columns):,}",
    sep="\n",
)

print(
    "\n\n(3) Column names:",
    f"  df1 = {df1.columns.tolist()}",
    f"  df2 = {df2.columns.tolist()}",
    sep="\n",
)

# Only numerical columns appear here (see get_column_min_max).
print(
    "\n\n(4) Column range (min, max):",
    f"  df1 = {get_column_min_max(df1)}",
    f"  df2 = {get_column_min_max(df2)}",
    sep="\n",
)

print(
    "\n\n(5) Column types:",
    f"  df1 = {df1.dtypes.to_dict()}",
    f"  df2 = {df2.dtypes.to_dict()}",
    sep="\n",
)

print(
    "\n\n(6) Number of unique values in each column:",
    f"  df1 = {df1.nunique().to_dict()}",
    f"  df2 = {df2.nunique().to_dict()}",
    sep="\n",
)

print(
    "\n\n(7) Number of NA values in each column:",
    f"  df1 = {df1.isna().sum().to_dict()}",
    f"  df2 = {df2.isna().sum().to_dict()}",
    sep="\n",
)


(1) Number of rows:
  df1 = 24,000
  df2 = 24,000


(2) Number of columns:
  df1 = 2
  df2 = 3


(3) Column names:
  df1 = ['Text', 'Class']
  df2 = ['Text', 'Class', 'Reason']


(4) Column range (min, max):
  df1 = {'Class': (0, 1)}
  df2 = {'Class': (0, 1), 'Reason': (1, 4)}


(5) Column types:
  df1 = {'Text': dtype('O'), 'Class': dtype('int64')}
  df2 = {'Text': dtype('O'), 'Class': dtype('int64'), 'Reason': dtype('int64')}


(6) Number of unique values in each column:
  df1 = {'Text': 23985, 'Class': 2}
  df2 = {'Text': 23985, 'Class': 2, 'Reason': 4}


(7) Number of NA values in each column:
  df1 = {'Text': 0, 'Class': 0}
  df2 = {'Text': 0, 'Class': 0, 'Reason': 0}


First rows, random rows, last rows.

In [5]:
# Seed shared by every sampling call in this cell, so that re-running the
# notebook shows the same "random" rows instead of a fresh draw on each run.
RANDOM_SEED = 42

print(
    f"(1) First row:\n  df1 = {df1.head(1).to_dict(orient='records')}\n  df2 = {df2.head(1).to_dict(orient='records')}",
)


print(
    f"\n\n(2) Last row:\n  df1 = {df1.tail(1).to_dict(orient='records')}\n  df2 = {df2.tail(1).to_dict(orient='records')}",
)

print(
    f"\n\n(3) Random row:\n  df1 = {df1.sample(1, random_state=RANDOM_SEED).to_dict(orient='records')}\n  df2 = {df2.sample(1, random_state=RANDOM_SEED).to_dict(orient='records')}",
)

print(
    f"\n\n(4) Random 10 rows (text only):\n  df1 = {df1.sample(10, random_state=RANDOM_SEED)['Text'].tolist()}\n  df2 = {df2.sample(10, random_state=RANDOM_SEED)['Text'].tolist()}",
)

# Whitespace-split every text and count the tokens. `explode` flattens the
# per-row token lists directly, avoiding the wide intermediate DataFrame that
# `str.split(expand=True)` would build (one column per token position).
print(
    f"\n\n(5) Top 25 most common words:\n  df1 = {df1['Text'].str.split().explode().value_counts().head(25).to_dict()}\n  df2 = {df2['Text'].str.split().explode().value_counts().head(25).to_dict()}"
)


(1) First row:
  df1 = [{'Text': 'Polska wtedy oficjalnie powinna przyznać, że oddadzą (oczywiście zgodnie z prawem po wyrokach sądów polskich prawowitym spadkobiercom) jak tylko Niemcy oddadzą za zniszczenia jakich dokonali w II WŚ na terenie Polski (⌐ ͡■ ͜ʖ ͡■)', 'Class': 0}]
  df2 = [{'Text': 'Polska wtedy oficjalnie powinna przyznać, że oddadzą (oczywiście zgodnie z prawem po wyrokach sądów polskich prawowitym spadkobiercom) jak tylko Niemcy oddadzą za zniszczenia jakich dokonali w II WŚ na terenie Polski (⌐ ͡■ ͜ʖ ͡■)', 'Class': 0, 'Reason': 1}]


(2) Last row:
  df1 = [{'Text': 'Przecież murzynów nie da się szanować ani tolerować. Zawsze to samo. Jeden asfalt zostanie odstrzelony - reszta małp rozpie**** miasto. Gdzie tu logika?', 'Class': 1}]
  df2 = [{'Text': 'Przecież murzynów nie da się szanować ani tolerować. Zawsze to samo. Jeden asfalt zostanie odstrzelony - reszta małp rozpie**** miasto. Gdzie tu logika?', 'Class': 1, 'Reason': 3}]


(3) Random row:
  df1 = [{'Text': '{USE