# Vectorizing the NL Data

In [None]:
! pip install -U scikit-learn
! pip install pandas
! pip install numpy

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl (8.6 MB)
Using cached scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl (20.9 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed scikit-learn-1.7.2 scipy-1.16.3 threadpoolctl-3.6.0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import re
from collections import Counter
import pandas as pd
import numpy as np

In [None]:
# Load the training data from disk.
df = pd.read_csv("DATA/train_updated.csv")

# The label columns are selected by position: columns 3 through 8 (inclusive).
# NOTE(review): positional selection silently breaks if the CSV column order
# changes — confirm against the file's header.
label_cols = df.columns[3:9]

In [11]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split reproducible across runs. Inputs are the cleaned text column;
# targets are the label columns.
X_train, X_test, y_train, y_test = train_test_split(
    df["text_clean"], df[label_cols], test_size=0.2, random_state=42
)

## Vectorizers

In [None]:
# TF-IDF vectorization.
# Vectorizer comparison: https://mljourney.com/tf-idf-vectorizer-vs-countvectorizer-the-key-differences-for-text-analysis/
# API reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# Both vectorizers are built in this notebook so that models can optionally be
# compared depending on which one they are trained on.
# TF-IDF is usually the better representation of the data, but it is also the
# more complicated of the two.
print("TF-IDF Vectorization:")
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),  # use unigrams and bigrams
    max_features=5000,
    min_df=5,            # drop terms that appear in fewer than 5 documents
    max_df=0.8,          # drop terms that appear in more than 80% of documents
)

# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Training shape: {X_train_tfidf.shape}")
print(f"Testing shape: {X_test_tfidf.shape}")
print(f"Vocab size: {len(tfidf.vocabulary_)}")

TF-IDF Vectorization:
Training shape: (127656, 5000)
Testing shape: (31915, 5000)
Vocab size: 5000


In [13]:
# Count vectorization.
# API reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# A much simpler vectorizer than TF-IDF, since it is based purely on term counts.
print("Count Vectorization")
count_vec = CountVectorizer(
    ngram_range=(1, 2),  # use unigrams and bigrams
    max_features=5000,
    min_df=5,            # drop terms that appear in fewer than 5 documents
)

# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)

print(f"Training shape: {X_train_count.shape}")
print(f"Testing shape: {X_test_count.shape}")

Count Vectorization
Training shape: (127656, 5000)
Testing shape: (31915, 5000)


## Feature Importance

In [16]:
# Compare which words carry the most average TF-IDF weight in toxic versus
# non-toxic comments.

# Partition the cleaned text by the 'toxic' label.
toxic_texts = df.loc[df['toxic'] == 1, 'text_clean']
non_toxic_texts = df.loc[df['toxic'] == 0, 'text_clean']

# Fit a single small vocabulary (100 terms, English stop words removed) on
# both groups together so the two sets of scores are directly comparable.
tfidf_analysis = TfidfVectorizer(max_features=100, stop_words='english')
tfidf_analysis.fit(pd.concat([toxic_texts, non_toxic_texts]))

toxic_vectors = tfidf_analysis.transform(toxic_texts)
non_toxic_vectors = tfidf_analysis.transform(non_toxic_texts)

# Average each term's TF-IDF score over the documents of each group and
# flatten the result to a 1-D array with one entry per term.
toxic_means = np.asarray(toxic_vectors.mean(axis=0)).ravel()
non_toxic_means = np.asarray(non_toxic_vectors.mean(axis=0)).ravel()

feature_names = tfidf_analysis.get_feature_names_out()
toxic_importance = pd.DataFrame(
    {
        'word': feature_names,
        'toxic_score': toxic_means,
        'non_toxic_score': non_toxic_means,
        'difference': toxic_means - non_toxic_means,
    }
)

print("Top 15 words in toxic comments:")
print(toxic_importance[['word', 'toxic_score']].nlargest(15, 'toxic_score'))

print("Top 15 words in non-toxic comments:")
print(toxic_importance[['word', 'non_toxic_score']].nlargest(15, 'non_toxic_score'))

Top 15 words in toxic comments:
         word  toxic_score
32       fuck     0.120268
45       like     0.057798
24        don     0.050216
96  wikipedia     0.049664
42       just     0.048543
75       stop     0.040768
55       page     0.036231
57     people     0.035855
43       know     0.035074
83      think     0.026563
92       want     0.023648
19        did     0.022887
3     article     0.022641
78       talk     0.022412
9       block     0.022211
Top 15 words in non-toxic comments:
         word  non_toxic_score
3     article         0.075722
78       talk         0.064025
55       page         0.057674
96  wikipedia         0.052549
42       just         0.043774
45       like         0.039543
24        don         0.035662
83      think         0.034837
80     thanks         0.033272
43       know         0.031585
25       edit         0.029388
57     people         0.027950
19        did         0.027721
84       time         0.026802
4    articles         0.024652


## Save Our Data

In [None]:
# TODO: save the vectorized data produced above.