# Text Preprocessing - Dataset 1

**Objective:** Prepare the multi-labeled toxic comments dataset for BERT training

**Tasks:**
1. Load and clean Bengali text data
2. Handle Bengali-specific preprocessing needs
3. Prepare data for tokenization
4. Create train/validation/test splits
5. Basic text analysis for BERT compatibility

**Dataset:** Multi-labeled toxic comments (16,073 samples)

**Labels:** vulgar, hate, religious, threat, troll, Insult

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
from collections import Counter

# Text processing
import string
from sklearn.model_selection import train_test_split

# Suppress every warning category so the cell output stays uncluttered
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so NumPy-based randomness is repeatable
np.random.seed(42)

print("Starting Bengali text preprocessing...")

Starting Bengali text preprocessing...


In [2]:
# Read the raw multi-label CSV (path is relative to the notebook's directory)
df = pd.read_csv('../data/Multi_labeled_toxic_comments.csv')

# Report what was loaded: (rows, columns) and the column names
print(f"Dataset loaded: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Eyeball the first three raw comments as a sanity check
print("\nText column sample:")
print(df['text'].iloc[:3].tolist())

# Names of the binary label columns used throughout the notebook
label_columns = [
    'vulgar',
    'hate',
    'religious',
    'threat',
    'troll',
    'Insult',
]
print(f"\nLabel columns: {label_columns}")

Dataset loaded: (16073, 7)
Columns: ['text', 'vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']

Text column sample:
['প্রধানমন্ত্রী হক সাহেবের ক্ষতি হলে জাতির স্বার্থে কেনো কোনো বাম পক্ষ কে ছাড় দেয়ার উচিত না', 'আমি বললাম, ‘দেন’', 'অসাধারণ তানজিন তিশা আমার বালো লাগার একজনকাতার থেকে']

Label columns: ['vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']


In [3]:
# Profile the raw comment sizes before preparing them for BERT
print("Text Analysis for BERT Compatibility")
print("=" * 40)

# Per-row character count and whitespace-delimited word count
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

char_lengths = df['text_length']
word_counts = df['word_count']

print("Text length statistics:")
print(f"  Mean: {char_lengths.mean():.1f} characters")
print(f"  Median: {char_lengths.median():.1f} characters")
print(f"  Max: {char_lengths.max()} characters")
print(f"  95th percentile: {char_lengths.quantile(0.95):.1f} characters")

print("\nWord count statistics:")
print(f"  Mean: {word_counts.mean():.1f} words")
print(f"  Median: {word_counts.median():.1f} words")
print(f"  Max: {word_counts.max()} words")

# Count the near-empty comments (fewer than 5 characters)
short_texts = char_lengths.lt(5).sum()
print(f"\nTexts with less than 5 characters: {short_texts}")

# Count the very long comments; BERT has a 512 token limit, and the
# character count is used here only as a rough stand-in for token count
long_texts = char_lengths.gt(500).sum()
print(f"Texts with more than 500 characters: {long_texts}")

Text Analysis for BERT Compatibility
Text length statistics:
  Mean: 80.0 characters
  Median: 59.0 characters
  Max: 1402 characters
  95th percentile: 205.0 characters

Word count statistics:
  Mean: 14.2 words
  Median: 11.0 words
  Max: 238 words

Texts with less than 5 characters: 25
Texts with more than 500 characters: 49


In [4]:
def clean_bengali_text(text):
    """
    Clean a single raw comment for BERT preprocessing.

    Steps, in order: collapse whitespace, remove URLs and e-mail addresses,
    squeeze repeated '!', '?' and '.' punctuation, then collapse whitespace
    again to close the gaps left by the removals.

    Args:
        text: The raw comment. Missing values (None / NaN) are accepted;
            any other non-string value is converted with str().

    Returns:
        str: The cleaned text, or "" when the input is missing.
    """
    if pd.isna(text):
        return ""

    # Convert to string so numeric or other scalar cells are handled too
    text = str(text)

    # Collapse all whitespace runs (including newlines/tabs) and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove URLs (common in social media). A token only counts as a URL when
    # it carries an http(s):// scheme or a "www." prefix that does not sit
    # inside a longer ASCII word; a bare "http"/"www" substring is not enough,
    # otherwise ordinary tokens such as "awwwsome" or "httpd" would be eaten.
    text = re.sub(r'(?:https?://|(?<![A-Za-z0-9])www\.)\S+', '', text)

    # Remove email addresses (any whitespace-free token containing '@')
    text = re.sub(r'\S+@\S+', '', text)

    # Squeeze excessive punctuation: runs of '!' or '?' become one character,
    # runs of three or more dots become a single ellipsis "..."
    text = re.sub(r'[!]{2,}', '!', text)
    text = re.sub(r'[?]{2,}', '?', text)
    text = re.sub(r'[.]{3,}', '...', text)

    # The removals above can leave double spaces or dangling edge spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Smoke-test the cleaner on hand-written samples: repeated punctuation,
# stray whitespace, a URL and an e-mail address
print("Testing text cleaning function:")
test_texts = [
    "আমি খুব খুশি!!!",
    "এটা কি???   অনেক স্পেস   ",
    "http://example.com এই লিংক দেখো",
    "test@email.com এই ইমেইল"
]

for sample in test_texts:
    cleaned = clean_bengali_text(sample)
    # One print per sample; the trailing "\n" yields the blank separator line
    print(f"Original: {sample}\nCleaned:  {cleaned}\n")

Testing text cleaning function:
Original: আমি খুব খুশি!!!
Cleaned:  আমি খুব খুশি!

Original: এটা কি???   অনেক স্পেস   
Cleaned:  এটা কি? অনেক স্পেস

Original: http://example.com এই লিংক দেখো
Cleaned:  এই লিংক দেখো

Original: test@email.com এই ইমেইল
Cleaned:  এই ইমেইল



In [5]:
# Run the cleaner over every comment, keeping the raw column for comparison
print("Applying text cleaning to dataset...")

# Work on a copy so the originally loaded frame is left as it was
df_clean = df.copy()
df_clean['text_cleaned'] = df_clean['text'].map(clean_bengali_text)

# Show the first three rows before and after cleaning
print("Cleaning comparison examples:")
for example_no in (1, 2, 3):
    row_pos = example_no - 1
    original = df['text'].iloc[row_pos]
    cleaned = df_clean['text_cleaned'].iloc[row_pos]
    print(f"\nExample {example_no}:")
    print(f"Original: {original}")
    print(f"Cleaned:  {cleaned}")

# Character length of each cleaned comment
df_clean['cleaned_length'] = df_clean['text_cleaned'].str.len()
cleaned_lengths = df_clean['cleaned_length']

# A length of zero means the cleaner stripped the comment down to nothing
empty_after_cleaning = cleaned_lengths.eq(0).sum()
print(f"\nTexts that became empty after cleaning: {empty_after_cleaning}")

# Length statistics after cleaning
print("\nCleaned text length statistics:")
print(f"  Mean: {cleaned_lengths.mean():.1f} characters")
print(f"  Median: {cleaned_lengths.median():.1f} characters")
print(f"  Max: {cleaned_lengths.max()} characters")

Applying text cleaning to dataset...
Cleaning comparison examples:

Example 1:
Original: প্রধানমন্ত্রী হক সাহেবের ক্ষতি হলে জাতির স্বার্থে কেনো কোনো বাম পক্ষ কে ছাড় দেয়ার উচিত না
Cleaned:  প্রধানমন্ত্রী হক সাহেবের ক্ষতি হলে জাতির স্বার্থে কেনো কোনো বাম পক্ষ কে ছাড় দেয়ার উচিত না

Example 2:
Original: আমি বললাম, ‘দেন’
Cleaned:  আমি বললাম, ‘দেন’

Example 3:
Original: অসাধারণ তানজিন তিশা আমার বালো লাগার একজনকাতার থেকে
Cleaned:  অসাধারণ তানজিন তিশা আমার বালো লাগার একজনকাতার থেকে

Texts that became empty after cleaning: 0

Cleaned text length statistics:
  Mean: 79.8 characters
  Median: 59.0 characters
  Max: 1395 characters


In [6]:
# Drop comments that are too short to carry meaning after cleaning
print("Handling problematic texts...")

# Anything under this many characters is treated as not meaningful
min_length = 3
short_mask = df_clean['cleaned_length'].lt(min_length)
n_short = short_mask.sum()

print(f"Texts shorter than {min_length} characters: {n_short}")

if n_short == 0:
    print("No problematic short texts found.")
else:
    # Show up to five offenders before dropping them
    print("Examples of short texts:")
    short_examples = df_clean.loc[short_mask, ['text', 'text_cleaned', 'cleaned_length']].head()
    print(short_examples)

    # Project decision: discard rows below the minimum length rather than
    # falling back to their original text
    df_clean = df_clean.loc[~short_mask].copy()

    print(f"Dataset size after removing short texts: {len(df_clean)}")

# Renumber rows 0..n-1 so positions and index labels agree again
df_clean = df_clean.reset_index(drop=True)

Handling problematic texts...
Texts shorter than 3 characters: 5
Examples of short texts:
      text text_cleaned  cleaned_length
857     মর           মর               2
3466    হু           হু               2
8898    --           --               2
9816    --           --               2
14385    -            -               1
Dataset size after removing short texts: 16068


In [7]:
# Summarise how often each label occurs in the cleaned dataset
print("Label Distribution Analysis")
print("=" * 30)

label_columns = ['vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']

# Per-label positive count and its share of all remaining rows
n_samples = len(df_clean)
label_stats = {}
for label in label_columns:
    count = df_clean[label].sum()
    percentage = (count / n_samples) * 100
    label_stats[label] = {'count': count, 'percentage': percentage}
    print(f"{label}: {count:,} samples ({percentage:.1f}%)")

# Number of labels set on each row (row-wise sum over the label columns)
df_clean['total_labels'] = df_clean[label_columns].sum(axis=1)
labels_per_sample = df_clean['total_labels']

print("\nMulti-label statistics:")
print(f"Samples with 0 labels: {labels_per_sample.eq(0).sum():,}")
print(f"Samples with 1+ labels: {labels_per_sample.gt(0).sum():,}")
print(f"Average labels per sample: {labels_per_sample.mean():.2f}")

# Histogram of label counts per row, ordered by the number of labels
label_distribution = labels_per_sample.value_counts().sort_index()
print("\nDistribution of labels per sample:")
for num_labels, count in label_distribution.items():
    print(f"  {num_labels} labels: {count:,} samples")

Label Distribution Analysis
vulgar: 2,505 samples (15.6%)
hate: 1,898 samples (11.8%)
religious: 1,418 samples (8.8%)
threat: 1,418 samples (8.8%)
troll: 1,643 samples (10.2%)
Insult: 2,719 samples (16.9%)

Multi-label statistics:
Samples with 0 labels: 7,581
Samples with 1+ labels: 8,487
Average labels per sample: 0.72

Distribution of labels per sample:
  0 labels: 7,581 samples
  1 labels: 5,836 samples
  2 labels: 2,209 samples
  3 labels: 421 samples
  4 labels: 21 samples


In [8]:
# Split the cleaned data into train / validation / test partitions
print("Creating train/validation/test splits...")

# Stratification key: the number of labels per row is used as a proxy,
# so label density is distributed similarly across the partitions

X = df_clean['text_cleaned']
y = df_clean[label_columns]

# Stage 1: hold out 15% of all rows as the test set
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    stratify=df_clean['total_labels'],
    test_size=0.15,
    random_state=42,
)

# Stage 2: carve validation out of the remaining 85%.
# 0.15/0.85 ≈ 0.176, so validation is about 15% of the original data
# and training about 70%.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp.sum(axis=1),  # label count per row, as in stage 1
    test_size=0.176,
    random_state=42,
)

# Report each partition's size and its share of the cleaned dataset
total_rows = len(df_clean)
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)

print("Dataset splits:")
print(f"  Training set: {n_train:,} samples ({n_train/total_rows*100:.1f}%)")
print(f"  Validation set: {n_val:,} samples ({n_val/total_rows*100:.1f}%)")
print(f"  Test set: {n_test:,} samples ({n_test/total_rows*100:.1f}%)")

# Compare the positive rate of every label across the three partitions
print("\nLabel distribution across splits:")
for label in label_columns:
    train_pct, val_pct, test_pct = (
        (part[label].sum() / len(part)) * 100
        for part in (y_train, y_val, y_test)
    )
    print(f"  {label}: Train {train_pct:.1f}% | Val {val_pct:.1f}% | Test {test_pct:.1f}%")

Creating train/validation/test splits...
Dataset splits:
  Training set: 11,253 samples (70.0%)
  Validation set: 2,404 samples (15.0%)
  Test set: 2,411 samples (15.0%)

Label distribution across splits:
  vulgar: Train 15.6% | Val 16.1% | Test 15.3%
  hate: Train 11.9% | Val 11.6% | Test 11.4%
  religious: Train 8.9% | Val 9.4% | Test 8.0%
  threat: Train 8.7% | Val 8.9% | Test 9.2%
  troll: Train 10.3% | Val 9.1% | Test 11.0%
  Insult: Train 16.8% | Val 17.2% | Test 17.3%


In [9]:
# Persist the splits and the full cleaned dataset for model training
print("Saving preprocessed data...")


def _build_split_frame(texts, labels):
    """Return a frame with a 'text' column followed by the label columns."""
    columns = {'text': texts}
    for col in label_columns:
        columns[col] = labels[col]
    return pd.DataFrame(columns)


# One frame per partition: cleaned text plus its labels
train_df = _build_split_frame(X_train, y_train)
val_df = _build_split_frame(X_val, y_val)
test_df = _build_split_frame(X_test, y_test)

# Write each partition to CSV without the index column
for split_frame, csv_path in (
    (train_df, '../data/train_dataset1.csv'),
    (val_df, '../data/val_dataset1.csv'),
    (test_df, '../data/test_dataset1.csv'),
):
    split_frame.to_csv(csv_path, index=False)

# Also write the whole cleaned dataset (cleaned text + labels only)
cleaned_export = df_clean[['text_cleaned'] + label_columns]
cleaned_export.to_csv('../data/dataset1_cleaned.csv', index=False)

print("Saved files:")
for saved_line in (
    "  train_dataset1.csv",
    "  val_dataset1.csv",
    "  test_dataset1.csv",
    "  dataset1_cleaned.csv",
):
    print(saved_line)

# Headline numbers for the whole preprocessing run
summary_stats = {
    'original_samples': len(df),
    'cleaned_samples': len(df_clean),
    'train_samples': len(train_df),
    'val_samples': len(val_df),
    'test_samples': len(test_df),
    'label_columns': label_columns,
    'avg_text_length': df_clean['cleaned_length'].mean(),
    # Rows carrying at least one label
    'total_toxic_samples': (df_clean['total_labels'] > 0).sum()
}

print("\nPreprocessing Summary:")
for stat_name, stat_value in summary_stats.items():
    print(f"  {stat_name}: {stat_value}")

Saving preprocessed data...
Saved files:
  train_dataset1.csv
  val_dataset1.csv
  test_dataset1.csv
  dataset1_cleaned.csv

Preprocessing Summary:
  original_samples: 16073
  cleaned_samples: 16068
  train_samples: 11253
  val_samples: 2404
  test_samples: 2411
  label_columns: ['vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']
  avg_text_length: 79.8660069703759
  total_toxic_samples: 8487
