# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import gzip
import os

2025-07-29 20:57:53.309051: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-29 20:57:53.636946: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-29 20:57:53.897117: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753833474.125815   17210 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753833474.197771   17210 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753833474.702713   17210 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

Baixe o GloVe em http://nlp.stanford.edu/data/glove.840B.300d.zip e coloque o arquivo `glove.840B.300d.zip` neste diretório.

In [2]:
!unzip -q glove.840B.300d.zip

In [8]:
# Seed passed as `random_state` to the shuffles, resampling and train/val/test
# splits in this notebook that reference it.
RANDOM_STATE = 42

In [4]:
def load_glove_embeddings(glove_path="glove.840B.300d.txt", embedding_dim=None):
    """Parse a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-empty line is expected to look like ``<token> <v1> ... <vD>``
    with single-space separators. The vector is read from the *last* ``D``
    fields, so a token that itself contains spaces is kept intact instead of
    leaking into the float conversion.
    NOTE(review): the 840B Common Crawl release is reported to contain such
    tokens (e.g. ". . ."); confirm against the downloaded file.

    Args:
        glove_path: Path to the UTF-8 encoded GloVe ``.txt`` file.
        embedding_dim: Number of vector components per line. When ``None`` it
            is inferred from the first non-empty line as "number of
            space-separated fields minus one".

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        ValueError: If a line has fewer than ``embedding_dim + 1`` fields, or
            a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.rstrip()
            if not line:
                # Blank lines carry no token/vector pair.
                continue
            if embedding_dim is None:
                embedding_dim = line.count(' ')
            # Split from the right: everything left of the last `embedding_dim`
            # fields is the token, spaces included.
            fields = line.rsplit(' ', embedding_dim)
            if len(fields) != embedding_dim + 1:
                raise ValueError(
                    f"{glove_path}:{line_no}: expected a token followed by "
                    f"{embedding_dim} values, got {len(fields) - 1} values; "
                    "pass embedding_dim explicitly if it was inferred wrongly"
                )
            embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings

In [None]:
# glove_embeddings = load_glove_embeddings()
# with gzip.open("glove.840B.300d.pkl.gz", 'wb') as f:
#         pickle.dump(glove_embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Cache guard: the parsed GloVe dictionary is stored as a gzip-compressed
# pickle and rebuilt from the raw text file only when that cache is missing,
# so the notebook also runs from a fresh checkout.
GLOVE_CACHE_PATH = "glove.840B.300d.pkl.gz"

if os.path.exists(GLOVE_CACHE_PATH):
    print("Loading cached GloVe embeddings...")
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that was produced locally by this notebook.
    with gzip.open(GLOVE_CACHE_PATH, 'rb') as f:
        GLOVE_EMBEDDINGS = pickle.load(f)
else:
    print("Building GloVe cache from the raw text file...")
    GLOVE_EMBEDDINGS = load_glove_embeddings()
    # Write to a temporary name first so an interrupted dump never leaves a
    # truncated cache behind under the final name.
    _tmp_cache_path = GLOVE_CACHE_PATH + ".tmp"
    with gzip.open(_tmp_cache_path, 'wb') as f:
        pickle.dump(GLOVE_EMBEDDINGS, f, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(_tmp_cache_path, GLOVE_CACHE_PATH)

Loading cached GloVe embeddings...


In [26]:
def embed_dataset(folder, X_train, X_val, X_test, glove_embeddings, max_len=100, embedding_dim=300):
    """Tokenize three text splits and write the model-ready artefacts to ``folder``.

    The tokenizer is fitted on the training texts only; words it has not seen
    are mapped to the ``"<OOV>"`` token. Each split is converted to integer id
    sequences that are post-padded / post-truncated to ``max_len``.

    Files written inside ``folder`` (created if it does not exist):
        * ``tokenizer.pkl``        - the pickled fitted tokenizer
        * ``X_train_pad.npy``, ``X_val_pad.npy``, ``X_test_pad.npy``
        * ``embedding_matrix.npy`` - ``(vocab_size, embedding_dim)`` matrix whose
          row ``i`` holds the GloVe vector of the word with tokenizer index
          ``i``; rows of words missing from ``glove_embeddings`` stay zero.

    NOTE(review): Keras' Tokenizer lower-cases text by default, so lookups use
    the lower-cased word only; confirm this is intended if the GloVe
    vocabulary is case-sensitive.

    Args:
        folder: Output directory.
        X_train, X_val, X_test: Iterables of texts. Every entry is coerced
            with ``str()`` so that non-string values (e.g. NaN produced by
            ``pd.read_csv``) do not crash the tokenizer.
        glove_embeddings: Mapping ``word -> vector`` of length ``embedding_dim``.
        max_len: Length every id sequence is padded / truncated to.
        embedding_dim: Number of columns of the embedding matrix.

    Returns:
        None. All results are written to disk.
    """
    os.makedirs(folder, exist_ok=True)

    # Same coercion the rtmr / hate cells apply with `.astype(str)`.
    train_texts, val_texts, test_texts = (
        [str(text) for text in split] for split in (X_train, X_val, X_test)
    )

    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(train_texts)

    with open(f"{folder}/tokenizer.pkl", "wb") as f:
        pickle.dump(tokenizer, f)

    # Convert each split to padded id sequences and persist it.
    for split_name, texts in (("train", train_texts), ("val", val_texts), ("test", test_texts)):
        padded = pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=max_len,
            padding='post',
            truncating='post',
        )
        np.save(f"{folder}/X_{split_name}_pad.npy", padded)

    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1  # +1 for padding token

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        vector = glove_embeddings.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

    np.save(f"{folder}/embedding_matrix.npy", embedding_matrix)

# Rotten Tomatoes Movie Reviews

In [10]:
# Locations of the negative / positive polarity files.
neg_file_path = 'rtmr/rt-polarity.neg'
pos_file_path = 'rtmr/rt-polarity.pos'

# Pull every raw line of each file into memory (decoded as latin-1).
with open(neg_file_path, 'r', encoding='latin-1') as f:
    neg_lines = list(f)

with open(pos_file_path, 'r', encoding='latin-1') as f:
    pos_lines = list(f)

# Each line becomes one row: whitespace-stripped text plus a constant label
# (0 for the .neg file, 1 for the .pos file).
df_neg = pd.DataFrame({'text': list(map(str.strip, neg_lines)), 'target': 0})
df_pos = pd.DataFrame({'text': list(map(str.strip, pos_lines)), 'target': 1})

# Stack both frames, shuffle reproducibly and renumber the rows.
df = (
    pd.concat([df_neg, df_pos], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,text,target
0,"this film seems thirsty for reflection , itsel...",1
1,the movie's thesis -- elegant technology for t...,1
2,tries too hard to be funny in a way that's too...,0
3,disturbingly superficial in its approach to th...,0
4,"an ugly , pointless , stupid movie .",0


In [11]:
# Stratified 80/10/10 split: hold out 20% first, then cut that hold-out in
# half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    df["text"],
    df["target"],
    test_size=0.2,
    stratify=df["target"],
    random_state=RANDOM_STATE,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

In [None]:
# X_train.to_csv("rtmr/X_train.csv", index=False)
# y_train.to_csv("rtmr/y_train.csv", index=False)

# X_val.to_csv("rtmr/X_val.csv", index=False)
# y_val.to_csv("rtmr/y_val.csv", index=False)

# X_test.to_csv("rtmr/X_test.csv", index=False)
# y_test.to_csv("rtmr/y_test.csv", index=False)

In [3]:
# Read the three persisted text splits back from disk.
X_train_df, X_val_df, X_test_df = map(
    pd.read_csv,
    ("rtmr/X_train.csv", "rtmr/X_val.csv", "rtmr/X_test.csv"),
)

# The first (only) column holds the text; force every entry to str.
X_train, X_val, X_test = (
    frame.iloc[:, 0].astype(str) for frame in (X_train_df, X_val_df, X_test_df)
)

# Report split sizes.
for caption, split in (
    ("Num train samples:", X_train),
    ("Num val samples  :", X_val),
    ("Num test samples :", X_test),
):
    print(caption, len(split))

Num train samples: 8529
Num val samples  : 1066
Num test samples : 1067


In [4]:
# `load_glove_embeddings` is already defined near the top of this notebook, so
# it is not redefined here (a second definition would silently shadow the first).
# The cached GLOVE_EMBEDDINGS dictionary was pickled from that same loader, so
# reuse it instead of parsing the raw text file again and keeping a second copy
# of every vector in memory.
glove_embeddings = GLOVE_EMBEDDINGS
embedding_dim = 300

In [18]:
# Build the vocabulary from the training texts only; words not seen there are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Keep the fitted tokenizer on disk.
with open("rtmr/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Texts -> lists of integer word ids.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

# Bring every sequence to a common length: zeros appended at the end, overlong
# sequences cut at the end.
max_len = 100  # or compute dynamically via np.percentile or np.max
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    for seq in (X_train_seq, X_val_seq, X_test_seq)
)

In [16]:
# Write the padded id matrices of all three splits to the rtmr folder.
for npy_path, padded in (
    ("rtmr/X_train_pad.npy", X_train_pad),
    ("rtmr/X_val_pad.npy", X_val_pad),
    ("rtmr/X_test_pad.npy", X_test_pad),
):
    np.save(npy_path, padded)

In [9]:
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)  # one extra row for the padding id

# Row i receives the GloVe vector of the word whose tokenizer index is i;
# words without a GloVe entry keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in word_index.items():
    glove_vector = glove_embeddings.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [19]:
# Persist the embedding matrix built for the rtmr vocabulary.
np.save("rtmr/embedding_matrix.npy", embedding_matrix)

# Hate Speech and Offensive Language

Disponível em https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/data/labeled_data.csv

In [25]:
# Load the labelled tweets; the preview below shows the annotator vote counts
# (hate_speech, offensive_language, neither), the resulting class and the tweet text.
data = pd.read_csv("hate/labeled_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [26]:
# Collapse the raw 3-way label into a binary target: raw classes 0 and 1 -> 1,
# everything else -> 0.
# NOTE(review): presumably 0 = hate speech, 1 = offensive language, 2 = neither
# (the preview rows agree for classes 1 and 2); confirm against the dataset README.
#
# The raw label is preserved in `class_raw` and the binary label is always
# derived from it, so re-running this cell is idempotent. Remapping `class`
# in place would turn every label into 1 on a second run.
if 'class_raw' not in data.columns:
    data['class_raw'] = data['class']
data['class'] = data['class_raw'].isin([0, 1]).astype('int64')

In [29]:
from sklearn.utils import resample

# Partition the rows by binary label.
class_1 = data.loc[data['class'] == 1]
class_0 = data.loc[data['class'] == 0]

# Draw, without replacement, as many class-1 rows as there are class-0 rows.
class_1_downsampled = resample(
    class_1,
    n_samples=len(class_0),
    replace=False,
    random_state=RANDOM_STATE,
)

# Recombine both classes, shuffle reproducibly and renumber the rows.
balanced_data = (
    pd.concat([class_0, class_1_downsampled])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# Sanity check: both classes should now have the same number of rows.
print(balanced_data['class'].value_counts())

class
1    4163
0    4163
Name: count, dtype: int64


In [31]:
# Keep only the tweet text and the binary target column.
balanced_data = balanced_data[["tweet", "class"]]

In [36]:
# Stratified 80/10/10 split: 20% is held out first and then divided evenly
# into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    balanced_data["tweet"],
    balanced_data["class"],
    test_size=0.2,
    stratify=balanced_data["class"],
    random_state=RANDOM_STATE,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

In [None]:
# X_train.to_csv("hate/X_train.csv", index=False)
# y_train.to_csv("hate/y_train.csv", index=False)

# X_val.to_csv("hate/X_val.csv", index=False)
# y_val.to_csv("hate/y_val.csv", index=False)

# X_test.to_csv("hate/X_test.csv", index=False)
# y_test.to_csv("hate/y_test.csv", index=False)

In [38]:
# Read the three persisted tweet splits back from disk.
X_train_df, X_val_df, X_test_df = map(
    pd.read_csv,
    ("hate/X_train.csv", "hate/X_val.csv", "hate/X_test.csv"),
)

# The first (only) column holds the text; force every entry to str.
X_train, X_val, X_test = (
    frame.iloc[:, 0].astype(str) for frame in (X_train_df, X_val_df, X_test_df)
)

# Report split sizes.
for caption, split in (
    ("Num train samples:", X_train),
    ("Num val samples  :", X_val),
    ("Num test samples :", X_test),
):
    print(caption, len(split))

Num train samples: 6660
Num val samples  : 833
Num test samples : 833


In [43]:
# Use the cached GLOVE_EMBEDDINGS dictionary, as the IMDB and Kaggle sections do,
# so this cell does not depend on the `glove_embeddings` variable that is only
# created in the Rotten Tomatoes section.
embed_dataset("hate", X_train, X_val, X_test, GLOVE_EMBEDDINGS)

# IMDB

In [1]:
from datasets import load_dataset

# Fetch the "imdb" dataset; the progress output below lists train, test and
# unsupervised splits.
dataset = load_dataset("imdb")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [12]:
full_train = dataset["train"]
test = dataset["test"]

texts = list(full_train["text"])
labels = list(map(int, full_train["label"]))

# Split training set into train and validation (80/20), stratified by label
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.2, random_state=RANDOM_STATE, stratify=labels
)

# Extract test set. The text column is materialised as a plain list, exactly
# like `texts` above, so all three splits reach pandas as the same type
# regardless of what container `dataset[...]["text"]` returns.
X_test = list(test["text"])
y_test = list(map(int, test["label"]))

# Make sure the output folder exists before writing into it.
os.makedirs("IMDB", exist_ok=True)

# Save each split as CSV
pd.DataFrame({"text": X_train, "label": y_train}).to_csv("IMDB/train.csv", index=False)
pd.DataFrame({"text": X_val,   "label": y_val}).to_csv("IMDB/val.csv", index=False)
pd.DataFrame({"text": X_test,  "label": y_test}).to_csv("IMDB/test.csv", index=False)

print("✅ CSV files saved to folder IMDB")

✅ CSV files saved to folder IMDB


In [18]:
# Reload the IMDB splits from their CSV files.
train = pd.read_csv("IMDB/train.csv")
val = pd.read_csv("IMDB/val.csv")
test = pd.read_csv("IMDB/test.csv")

# Build tokenizer, padded sequences and embedding matrix under IMDB/;
# max_len=300 replaces embed_dataset's default of 100.
embed_dataset("IMDB", train["text"], val["text"], test["text"], GLOVE_EMBEDDINGS, max_len=300)

# Kaggle

In [6]:
# Raw Kaggle training file; the following cells read its `comment_text` column
# and six toxicity flag columns.
df = pd.read_csv("kaggle/train_og.csv")

In [7]:
# The six per-category flags; a comment gets label 1 when at least one of them
# is truthy, otherwise 0.
toxic_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
is_toxic = df.loc[:, toxic_columns].any(axis="columns")
df["label"] = is_toxic.astype(int)

In [12]:
# Use the same text column name as the other datasets.
df = df.rename(columns={"comment_text": "text"})

# Keep only comments with at most 200 whitespace-separated words, then
# renumber the remaining rows.
within_limit = [len(str(comment).split()) <= 200 for comment in df["text"]]
df = df[within_limit].reset_index(drop=True)

In [15]:
df_0 = df[df["label"] == 0]
df_1 = df[df["label"] == 1]

# Undersample class 0 to match class 1. The shared RANDOM_STATE constant is
# used instead of a literal seed so every dataset in this notebook is
# controlled by the same setting.
df_0_downsampled = df_0.sample(n=len(df_1), random_state=RANDOM_STATE)

# Combine balanced data
df_balanced = pd.concat([df_0_downsampled, df_1], ignore_index=True)

# Shuffle the result
df_balanced = df_balanced.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Sanity check: both labels should now have the same number of rows.
print(df_balanced["label"].value_counts())

label
1    15559
0    15559
Name: count, dtype: int64


In [17]:
# Keep only the comment text and the binary label.
df_balanced = df_balanced[["text", "label"]]

In [14]:
# Label distribution of the unbalanced frame `df`, for comparison with the
# balanced counts printed for `df_balanced`.
df["label"].value_counts()

label
0    133925
1     15559
Name: count, dtype: int64

In [22]:
# Stratified 80/10/10 split, seeded with the shared RANDOM_STATE constant
# (rather than a literal) so it stays in sync with the other datasets.
train_df, temp_df = train_test_split(
    df_balanced, test_size=0.2, stratify=df_balanced["label"], random_state=RANDOM_STATE
)

val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label"], random_state=RANDOM_STATE
)

# Check sizes
print("Train:", len(train_df))
print("Validation:", len(val_df))
print("Test:", len(test_df))

# Save to CSV
train_df.to_csv("kaggle/train.csv", index=False)
val_df.to_csv("kaggle/val.csv", index=False)
test_df.to_csv("kaggle/test.csv", index=False)

print("✅ Saved train.csv, val.csv, and test.csv")

Train: 24894
Validation: 3112
Test: 3112
✅ Saved train.csv, val.csv, and test.csv


In [27]:
# Reload the Kaggle splits from their CSV files.
train = pd.read_csv("kaggle/train.csv")
val = pd.read_csv("kaggle/val.csv")
test = pd.read_csv("kaggle/test.csv")

# Build tokenizer, padded sequences and embedding matrix under kaggle/
# (embed_dataset's default max_len is used).
embed_dataset("kaggle", train["text"], val["text"], test["text"], GLOVE_EMBEDDINGS)