In [5]:
# Import necessary libraries
import sys
import os

# Make the parent directory importable so the project modules imported below
# (data_loader, text_vectorization, ...) can be resolved.
# NOTE(review): the original comment called this the "src directory"; that is
# only true if this notebook lives one level below src/ — confirm.
# '..' is resolved against the current working directory at execution time.
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from data_loader import DataLoader
from text_vectorization import TextVectorizer
from dataset_preparation import DatasetPreparation
from toxicity_pipeline import ToxicityAnalysisPipeline

In [7]:
# Step 1: Load Data
# Relative path to the zipped training CSV; it is resolved against the
# notebook's current working directory.
data_file = '../data/train.csv.zip'  # Path to your dataset
loader = DataLoader(data_file)
# load_data() returns two values, unpacked here as X and y. Later cells read
# X.values as the text to vectorize and pass y to DatasetPreparation;
# presumably X is the comment text and y the labels — confirm in data_loader.
X, y = loader.load_data()

In [8]:
# Step 2: Vectorize Text Data
# NOTE(review): the fit below sees every row of X, before the train/val/test
# split made in Step 4, so whatever the vectorizer learns also covers the
# validation and test text — confirm this is intended.
vectorizer = TextVectorizer()
text_array = X.values  # raw values behind X, used as the text to fit on
vectorizer.fit(text_array)

In [9]:
# Run the fitted vectorizer over the same X.values it was fitted on; the
# result is handed to DatasetPreparation in Step 3.
vectorized_text = vectorizer.transform(X.values)

In [10]:
# Step 3: Prepare Dataset for Training, Validation, and Testing
# Pair the vectorized text with y from Step 1.
prep = DatasetPreparation(vectorized_text, y)
# Build the single dataset that Step 4 splits. Per the original note this is
# a TensorFlow Dataset — confirm in dataset_preparation.
dataset = prep.create_dataset()  # Create TensorFlow Dataset

In [11]:
# Step 4: Split Dataset into train/val/test sets
# No split ratios are passed here, so DatasetPreparation's own settings
# apply. The three returned pieces are taken in train, validation, test order.
splits = prep.split_dataset(dataset)
train_ds, val_ds, test_ds = splits

In [12]:
# Optional sanity check: report len() of each split (sizes, not tensor shapes).
# NOTE(review): if these datasets are batched, len() counts batches rather
# than individual samples — confirm against dataset_preparation.
print(
    f'Train Dataset Size: {len(train_ds)}',
    f'Validation Dataset Size: {len(val_ds)}',
    f'Test Dataset Size: {len(test_ds)}',
    sep='\n',
)

Train Dataset Size: 6981
Validation Dataset Size: 1994
Test Dataset Size: 999
