In [1]:
# ================================================================
# Bachelor Thesis — Fairness in Toxic Comment Classification
# ---------------------------------------------------------------
# Notebook: data_preprocessing_and_preparation.ipynb
# Author: Philipp Stocker
# Created: 02.11.2025
# Purpose: This notebook handles the preprocessing and preparation of the dataset for model training.
# It includes data cleaning, label creation, text vectorization, and train–validation–test splitting,
# ensuring that the data is ready for subsequent model development and bias evaluation stages.
# ================================================================

# --- Basic setup ---
import os
import sys
import warnings
warnings.filterwarnings("ignore")  # keep output clean for reports

# Resolve the parent of the current working directory and register it on
# sys.path (once) so that the project's src/ package can be imported.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# --- Standard imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import src.data_preprocessing_and_preparation as data_prep

# --- Notebook-wide constants ---
# Raw input and processed output folders, both located under <project_root>/data
DATA_RAW, DATA_PROCESSED = (
    os.path.join(project_root, "data", subfolder)
    for subfolder in ("raw", "processed")
)

# Identity annotation columns passed to the split helper and used for the has_identity flag
IDENTITY_COLUMNS = [
    "male",
    "female",
    "heterosexual",
    "homosexual_gay_or_lesbian",
    "bisexual",
    "transgender",
    "other_gender",
    "other_sexual_orientation",
]

print("✅ Environment ready")


✅ Environment ready


DATASET SPLITTING (Creation of training, test, and validation sets)

In [2]:
# Load the raw dataset
df = pd.read_csv(os.path.join(DATA_RAW, "train.csv"))

# Split dataset into train, test and validation set
train_prepared, test_prepared, validation_prepared = data_prep.split_dataset(df, IDENTITY_COLUMNS)

# Add binary labels for toxicity and presence of identity (for analysis purposes).
# The labelled frames are collected and re-bound to the split names explicitly:
# assigning to the loop variable alone only affects the named splits if
# binarize_labels happens to modify its argument in place.
labelled_splits = []
for split in (train_prepared, test_prepared, validation_prepared):
    split = data_prep.binarize_labels(split, target_col="target", new_col_name="labelled_as_toxic", threshold=0.5)
    # A comment counts as identity-related if any identity score exceeds 0.5;
    # NaN scores compare as False and therefore yield has_identity = 0.
    split["has_identity"] = (split[IDENTITY_COLUMNS] > 0.5).any(axis=1).astype(int)
    labelled_splits.append(split)
train_prepared, test_prepared, validation_prepared = labelled_splits

# Print shapes of the prepared datasets
print(train_prepared.shape, test_prepared.shape, validation_prepared.shape)

(1263411, 47) (270731, 47) (270732, 47)


In [3]:
train_path = os.path.join(DATA_PROCESSED, "train.csv")
test_path = os.path.join(DATA_PROCESSED, "test.csv")
validation_path = os.path.join(DATA_PROCESSED, "validation.csv")

# Make sure the output folder exists (e.g. on a fresh checkout); to_csv does not create it
os.makedirs(DATA_PROCESSED, exist_ok=True)

# Save prepared datasets to processed data folder
train_prepared.to_csv(train_path, index=False)
test_prepared.to_csv(test_path, index=False)
validation_prepared.to_csv(validation_path, index=False)

print("Saved prepared datasets successfully:"
      f"\n- Train: {train_path}"
      f"\n- Test: {test_path}"
      f"\n- Validation: {validation_path}")

Saved prepared datasets successfully:
- Train: c:\Users\phili\OneDrive\Dokumente\Uni\Kurse\7. Semester\Bachelorarbeit\BA_Arbeitsmappe\BachelorThesis_BiasInToxicCommentClassification\data\processed\train.csv
- Test: c:\Users\phili\OneDrive\Dokumente\Uni\Kurse\7. Semester\Bachelorarbeit\BA_Arbeitsmappe\BachelorThesis_BiasInToxicCommentClassification\data\processed\test.csv
- Validation: c:\Users\phili\OneDrive\Dokumente\Uni\Kurse\7. Semester\Bachelorarbeit\BA_Arbeitsmappe\BachelorThesis_BiasInToxicCommentClassification\data\processed\validation.csv


DATA PREPROCESSING (Comment text cleaning)

In [4]:
# Read the three prepared splits back from the processed data folder
# (order: train, validation, test)
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(DATA_PROCESSED, filename))
    for filename in ("train.csv", "validation.csv", "test.csv")
)

In [5]:
import swifter  # registers the .swifter accessor used below for faster apply operations

# Run the project's text cleaning function over every comment of each split
# and store the result in a new column next to the raw text.
for frame in (train_df, val_df, test_df):
    frame["comment_text_processed"] = frame["comment_text"].swifter.apply(data_prep.clean_comment)

Pandas Apply:   0%|          | 0/1263411 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/270732 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/270731 [00:00<?, ?it/s]

Peek at processing results:

In [6]:
# Raw vs. cleaned comment text, first rows of the training split
preview_columns = ["comment_text", "comment_text_processed"]
train_df[preview_columns].head()


Unnamed: 0,comment_text,comment_text_processed
0,Ain't it amazing ? Them too-----and Trump stil...,ain't it amazing them too and trump still won ...
1,The owners can discipline the players involved...,the owners can discipline the players involved...
2,comedy. Obama has dealt with single digit unem...,comedy obama has dealt with single digit unemp...
3,"Likely. ""His prior convictions include DUI a...",likely his prior convictions include dui and f...
4,Blaming the tourists for the poor quality of o...,blaming the tourists for the poor quality of o...


In [7]:
# Raw vs. cleaned comment text, first rows of the test split
preview_columns = ["comment_text", "comment_text_processed"]
test_df[preview_columns].head()

Unnamed: 0,comment_text,comment_text_processed
0,I listened to Scheer re Omar Khadr; he sounded...,i listened to scheer re omar khadr he sounded ...
1,For more on Pre check out; prespeople.com,for more on pre check out prespeople com
2,They also have to go to places where people sh...,they also have to go to places where people sh...
3,This is why many experts and policy makers are...,this is why many experts and policy makers are...
4,"Yeah, I know...many regular miles as well.",yeah i know many regular miles as well


In [8]:
# Raw vs. cleaned comment text, first rows of the validation split
preview_columns = ["comment_text", "comment_text_processed"]
val_df[preview_columns].head()

Unnamed: 0,comment_text,comment_text_processed
0,"""The rumour at the time was that the RCMP has ...",the rumour at the time was that the rcmp has a...
1,"You may jest, but Amazon offering to allow pur...",you may jest but amazon offering to allow purv...
2,You are completely out to lunch! There is abso...,you are completely out to lunch there is absol...
3,Good decision. This decision is probably savin...,good decision this decision is probably saving...
4,"I would suggest REITs, especially those invest...",i would suggest reits especially those investi...


Save processed dataframes (to Parquet and CSV):

In [9]:
# Persist each processed split as a parquet file (more efficient storage format)
parquet_targets = (
    (train_df, "train_processed.parquet"),
    (val_df, "validation_processed.parquet"),
    (test_df, "test_processed.parquet"),
)
for frame, filename in parquet_targets:
    frame.to_parquet(os.path.join(DATA_PROCESSED, filename), index=False)

In [10]:
# Additionally write each processed split as CSV (for easier inspection)
csv_targets = (
    (train_df, "train_processed.csv"),
    (val_df, "validation_processed.csv"),
    (test_df, "test_processed.csv"),
)
for frame, filename in csv_targets:
    frame.to_csv(os.path.join(DATA_PROCESSED, filename), index=False)

DATA VECTORIZATION

In [11]:
# Load processed dataframes from parquet files
train_df = pd.read_parquet(os.path.join(DATA_PROCESSED, "train_processed.parquet"))
val_df = pd.read_parquet(os.path.join(DATA_PROCESSED, "validation_processed.parquet"))
test_df = pd.read_parquet(os.path.join(DATA_PROCESSED, "test_processed.parquet"))

# Ensure that processed comment texts are strings and handle any missing values.
# Missing values must be filled BEFORE the cast: astype(str) can turn them into
# the literal text "nan"/"None", after which fillna("") has nothing left to fill
# and those tokens would end up in the TF-IDF vocabulary.
for frame in (train_df, val_df, test_df):
    frame["comment_text_processed"] = frame["comment_text_processed"].fillna("").astype(str)

In [12]:
# Show the first five rows of the processed training dataframe before vectorization
train_df.head(n=5)


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,labelled_as_toxic,has_identity,comment_text_processed
0,812993,0.0,Ain't it amazing ? Them too-----and Trump stil...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,4,1,0.0,0,4,0,0,ain't it amazing them too and trump still won ...
1,6027970,0.0,The owners can discipline the players involved...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,6,1,0.0,0,4,0,0,the owners can discipline the players involved...
2,348458,0.2,comedy. Obama has dealt with single digit unem...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,0,0,1,0,0.0,4,5,0,0,comedy obama has dealt with single digit unemp...
3,5368848,0.0,"Likely. ""His prior convictions include DUI a...",0.0,0.0,0.0,0.0,0.0,,,...,0,1,0,0,0.0,0,4,0,0,likely his prior convictions include dui and f...
4,5862887,0.0,Blaming the tourists for the poor quality of o...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,12,0,0.0,0,4,0,0,blaming the tourists for the poor quality of o...


In [13]:
# Create the TF-IDF vectorizer through the project helper (called without
# arguments, i.e. with the helper's default parameters)
vectorizer = data_prep.build_tfidf_vectorizer()

# Per split: cleaned comment text as model input (x) and binary toxicity label as target (y)
x_train, y_train = train_df["comment_text_processed"], train_df["labelled_as_toxic"]
x_val, y_val = val_df["comment_text_processed"], val_df["labelled_as_toxic"]
x_test, y_test = test_df["comment_text_processed"], test_df["labelled_as_toxic"]

Apply data transformation:

In [14]:
# Fit on the training texts only (learns the vocabulary and IDF weights) to avoid
# data leakage, then reuse the fitted vectorizer for validation and test texts.
x_train_vec = vectorizer.fit_transform(x_train)
x_val_vec, x_test_vec = (vectorizer.transform(texts) for texts in (x_val, x_test))

In [15]:
# Report the dimensions of the vectorized train, validation and test matrices
split_shapes = (x_train_vec.shape, x_val_vec.shape, x_test_vec.shape)
print("Shapes (train/val/test):", *split_shapes)

Shapes (train/val/test): (1263411, 1039908) (270732, 1039908) (270731, 1039908)


Save the vectorizer and bundles of vectorized data with their labels:

In [16]:
import joblib # for saving models and data bundles efficiently

# Build all output paths with os.path.join so separators are consistent on every
# platform, and make sure the target folders exist before writing to them.
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)
os.makedirs(DATA_PROCESSED, exist_ok=True)

# Save the vectorizer for later use
joblib.dump(vectorizer, os.path.join(models_dir, "tfidf_vectorizer.joblib"))

# Save bundles of vectorized data + its label
joblib.dump(
    {"x": x_train_vec, "y": y_train},
    os.path.join(DATA_PROCESSED, "train_tfidf_bundle.joblib")
)

joblib.dump(
    {"x": x_val_vec, "y": y_val},
    os.path.join(DATA_PROCESSED, "val_tfidf_bundle.joblib")
)

# Last expression of the cell: its return value (list of written file names) is displayed
joblib.dump(
    {"x": x_test_vec, "y": y_test},
    os.path.join(DATA_PROCESSED, "test_tfidf_bundle.joblib")
)



['c:\\Users\\phili\\OneDrive\\Dokumente\\Uni\\Kurse\\7. Semester\\Bachelorarbeit\\BA_Arbeitsmappe\\BachelorThesis_BiasInToxicCommentClassification/data/processed/test_tfidf_bundle.joblib']