In [None]:
# Notebook 1: Data Exploration and Preprocessing

In [None]:
## 1. Setup and Data Loading


In [1]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project's `src/` directory importable before the local import below.
sys.path.append('../src')

# Import our custom preprocessing functions
from preprocess import preprocess_pipeline, clean_text, normalize_emojis

sns.set_style('whitegrid')

# Load the dataset.
# `df` is used by the later cells of this notebook, so a missing file must stop
# the run here: print the friendly hint, then re-raise so execution does not
# continue with `df` undefined and fail later with a confusing NameError.
try:
    df = pd.read_csv('../data/cyberbullying_data.csv')
except FileNotFoundError:
    print("Error: Dataset not found. Please place `cyberbullying_data.csv` in the `data/` directory.")
    raise
else:
    # Only reached when the CSV was read successfully.
    print("Dataset loaded successfully!")
    print(f"Shape: {df.shape}")


Dataset loaded successfully!
Shape: (677, 2)


In [None]:
# Preview the first rows (head() defaults to 5) as a quick sanity check of the loaded data.
df.head()


## 2. Exploratory Data Analysis (EDA)


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so call it directly; wrapping it in print() appends a stray "None" line.
df.info()


In [None]:
# Count the rows per label to see whether the classes are imbalanced.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='bullying_label', ax=ax)
ax.set_title('Distribution of Cyberbullying Labels')
ax.set_xlabel('Label (1 = Bullying, 0 = Not Bullying)')
ax.set_ylabel('Count')
plt.show()


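The plot above shows the class distribution. If one class heavily outnumbers the other, we should compensate for the imbalance when modeling — for example with `class_weight='balanced'` in our models, or with a resampling method such as SMOTE (applied to the training split only, so the evaluation data keeps its real distribution).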


## 3. Text Preprocessing


Here, we'll use the functions defined in `src/preprocess.py` to clean our text data. This ensures consistency between our exploration and the final training script.


In [None]:
# Run one hand-written message through the two cleaning helpers
# so the raw and processed text can be compared side by side.
sample_text = "I'm literally dying of laughter at this meme you sent me! You're killing me with these jokes! 😂💀 @somefriend"

# Step 1: emoji normalization; step 2: general text cleaning on that result.
normalized_emojis = normalize_emojis(sample_text)
cleaned_final = clean_text(normalized_emojis)

# One print call, one line per version (same output as two separate prints).
print(
    f"Original:      {sample_text}",
    f"Processed:     {cleaned_final}",
    sep="\n",
)


In [None]:
# Apply the full pipeline to the dataframe, reading raw text from the 'text' column.
df_processed = preprocess_pipeline(df, 'text')

# Save the processed data for the training script.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (a fresh checkout may not contain `data/processed/`).
processed_path = Path('../data/processed/cleaned_data.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_processed.to_csv(processed_path, index=False)

print("Processed data saved to `data/processed/cleaned_data.csv`")
# Show raw vs. processed text side by side; this relies on the pipeline output
# containing a `processed_text` column next to the original `text`.
df_processed[['text', 'processed_text']].head()
