In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install wordcloud
!pip install nltk
import warnings
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import re

# Build the English stop-word set used by clean_text.
# Try the locally installed corpus first and only download when NLTK reports
# it missing (LookupError), so a re-run neither contacts the network nor prints
# the download log when the corpus is already present.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))

# Load data
# The path is relative to the notebook's working directory; the sections below
# read `comment_text` and the label columns from this frame.
df = pd.read_csv("../data/jigsaw_toxic_full.csv")

# --- 1. Label Distribution
# Sum each label column to get the number of rows flagged with that label,
# ordered from most to least frequent.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_counts = df[labels].sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
# seaborn deprecates `palette=` without `hue=`; assigning the categorical
# variable to `hue` gives one colour per bar. `dodge=False` keeps each bar
# full-width and centred on its tick on seaborn versions where hue would
# otherwise shrink/offset the bars.
sns.barplot(
    x=label_counts.index,
    y=label_counts.values,
    hue=label_counts.index,
    palette="magma",
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the x tick labels, so drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title("Distribution of Toxic Comment Labels")
ax.set_ylabel("Number of Samples")
ax.set_xlabel("Toxicity Type")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

# --- 2. WordClouds
def clean_text(text, stopword_set=None):
    """Normalise a comment to lowercase alphabetic tokens with stop words removed.

    Args:
        text: The raw comment. Non-string values are converted with ``str``;
            ``None`` and float NaN (a missing cell) yield an empty string
            instead of the literal token "nan".
        stopword_set: Optional collection of lowercase stop words to drop.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        A single space-separated string of the remaining tokens, containing
        only the characters a-z.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Missing values: `nan != nan` is the NaN check that needs no extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Treat the typographic apostrophe like the ASCII one so curly-quoted
    # contractions are recognised below.
    lowered = str(text).lower().replace("\u2019", "'")

    # Drop everything except letters, whitespace and apostrophes. Apostrophes
    # survive this pass so contractions can still match stop-word entries
    # that are spelled with one (e.g. "don't").
    lowered = re.sub(r"[^a-z\s']", "", lowered)

    kept = []
    for token in lowered.split():
        if token in stopword_set:
            continue
        # Now remove the apostrophes and re-check: catches quoted stop words
        # such as 'the' and discards tokens that were only apostrophes.
        bare = token.replace("'", "")
        if bare and bare not in stopword_set:
            kept.append(bare)
    return " ".join(kept)

# Concatenate the cleaned comments of each class into one string per class.
toxic_comments = df.loc[df['toxic'] == 1, 'comment_text']
non_toxic_comments = df.loc[df['toxic'] == 0, 'comment_text']
toxic_text = " ".join(toxic_comments.map(clean_text))
non_toxic_text = " ".join(non_toxic_comments.map(clean_text))

# One cloud per class: black background for toxic, white for non-toxic.
wc1 = WordCloud(width=800, height=400, background_color='black').generate(toxic_text)
wc2 = WordCloud(width=800, height=400, background_color='white').generate(non_toxic_text)

# Draw both clouds side by side, toxic on the left.
fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 6))
panels = (
    (left_ax, wc1, "WordCloud - Toxic Comments"),
    (right_ax, wc2, "WordCloud - Non-Toxic Comments"),
)
for panel_ax, cloud, panel_title in panels:
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.axis("off")
fig.tight_layout()
plt.show()

# --- 3. Comment Length Distribution
# Word count per comment: split on whitespace and count the pieces.
# `astype(str)` means a missing comment is counted as the one-word string "nan".
df["text_length"] = df["comment_text"].astype(str).str.split().str.len()

# Map the 0/1 flag to readable labels and let seaborn build the legend from
# them. Overriding the legend afterwards with plt.legend([...]) pairs the given
# strings with artists in draw order, which does not guarantee that each label
# ends up next to the colour of its own class.
toxicity_label = df["toxic"].map({0: "Non-Toxic", 1: "Toxic"}).rename("Toxicity")

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=df,
    x="text_length",
    hue=toxicity_label,
    hue_order=["Non-Toxic", "Toxic"],
    bins=50,
    palette="coolwarm",
    kde=True,
    ax=ax,
)
ax.set_title("Comment Length by Toxicity")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

# --- 4. Top Identity Terms (from identity_hate = 1)
from collections import Counter

# Cleaned text of every comment flagged identity_hate (missing comments dropped).
identity_df = df[df["identity_hate"] == 1]["comment_text"].dropna().apply(clean_text)

# Count tokens comment by comment rather than joining everything into one
# large string first; the resulting counts and tie order are the same.
identity_words = Counter(word for comment in identity_df for word in comment.split())
top_identity_terms = dict(identity_words.most_common(20))

term_names = list(top_identity_terms.keys())
term_counts = list(top_identity_terms.values())

fig, ax = plt.subplots(figsize=(10, 5))
# seaborn deprecates `palette=` without `hue=`; assigning the categorical
# (y) variable to `hue` gives one colour per bar. `dodge=False` keeps each bar
# full-height and centred on its tick on seaborn versions where hue would
# otherwise shrink/offset the bars.
sns.barplot(
    x=term_counts,
    y=term_names,
    hue=term_names,
    palette="rocket",
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the y tick labels, so drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title("Top Identity Terms in Identity Hate Comments")
ax.set_xlabel("Frequency")
fig.tight_layout()
plt.show()


Collecting wordcloud
  Downloading wordcloud-1.9.4-cp310-cp310-macosx_11_0_arm64.whl (167 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/167.7 kB[0m [31m?[0m eta [36m-:--:--[0m  Downloading wordcloud-1.9.4-cp310-cp310-macosx_11_0_arm64.whl (167 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.7/167.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.7/167.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4

[1m[[0m[34;49mnotice[0m

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajkumarmyakala/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'data/jigsaw_toxic_full.csv'