# Imports

In [17]:
import pickle
import re
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

Baixar o GloVe em http://nlp.stanford.edu/data/glove.840B.300d.zip e adicionar o arquivo neste diretório.

In [15]:
# Extract the GloVe archive with the standard library rather than the `unzip`
# shell tool, so the cell works on any platform. Extraction is skipped when the
# text file is already present: re-running the notebook then neither repeats the
# (large) extraction nor stops on an "overwrite?" prompt.
glove_zip_path = Path("glove.840B.300d.zip")
glove_txt_path = Path("glove.840B.300d.txt")

if not glove_txt_path.exists():
    with zipfile.ZipFile(glove_zip_path) as glove_archive:
        # Extract into the current working directory, as `unzip` did.
        glove_archive.extractall()

In [3]:
# Single seed reused by the dataset shuffle and by both train/val/test splits,
# so repeated runs produce the same partitions.
RANDOM_STATE = 42

# Rotten Tomatoes Movie Reviews

In [10]:
# Locations of the negative and positive polarity files
neg_file_path = 'rtmr/rt-polarity.neg'
pos_file_path = 'rtmr/rt-polarity.pos'


def _polarity_frame(path, label):
    """Read a latin-1 text file and return (raw lines, labelled DataFrame).

    Each line becomes one row: the stripped line goes in ``text`` and
    ``label`` is broadcast into ``target``.
    """
    with open(path, 'r', encoding='latin-1') as handle:
        raw_lines = handle.readlines()
    frame = pd.DataFrame({'text': list(map(str.strip, raw_lines)), 'target': label})
    return raw_lines, frame


# Negative reviews are labelled 0, positive reviews 1
neg_lines, df_neg = _polarity_frame(neg_file_path, 0)
pos_lines, df_pos = _polarity_frame(pos_file_path, 1)

# Stack both classes, then shuffle all rows with the shared seed and renumber the index
df = (
    pd.concat([df_neg, df_pos], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,text,target
0,"this film seems thirsty for reflection , itsel...",1
1,the movie's thesis -- elegant technology for t...,1
2,tries too hard to be funny in a way that's too...,0
3,disturbingly superficial in its approach to th...,0
4,"an ugly , pointless , stupid movie .",0


In [11]:
# First cut: hold out 20% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    df["text"],
    df["target"],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=df["target"],
)

# Second cut: halve the held-out rows into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=y_temp,
)

In [None]:
# Persist every split to disk the first time the notebook runs.
# Later cells read these CSV files, so a fresh "Restart & Run All" needs them to
# exist. Files that are already present are left untouched, so previously saved
# splits are never silently overwritten.
split_outputs = {
    "rtmr/X_train.csv": X_train,
    "rtmr/y_train.csv": y_train,
    "rtmr/X_val.csv": X_val,
    "rtmr/y_val.csv": y_val,
    "rtmr/X_test.csv": X_test,
    "rtmr/y_test.csv": y_test,
}

for csv_path, split_series in split_outputs.items():
    if not Path(csv_path).exists():
        split_series.to_csv(csv_path, index=False)

In [3]:
# Read the saved text splits back from disk (train, then val, then test)
X_train_df, X_val_df, X_test_df = map(
    pd.read_csv,
    ("rtmr/X_train.csv", "rtmr/X_val.csv", "rtmr/X_test.csv"),
)


def _first_column_as_str(frame):
    """Return the first column of ``frame`` as a Series cast to ``str``."""
    return frame.iloc[:, 0].astype(str)


# Reduce each single-column frame to a Series of strings
X_train = _first_column_as_str(X_train_df)
X_val = _first_column_as_str(X_val_df)
X_test = _first_column_as_str(X_test_df)

# Report how many samples each split holds
for caption, texts in (
    ("Num train samples:", X_train),
    ("Num val samples  :", X_val),
    ("Num test samples :", X_test),
):
    print(caption, len(texts))

Num train samples: 8529
Num val samples  : 1066
Num test samples : 1067


In [4]:
def load_glove_embeddings(glove_path="glove.840B.300d.txt", embedding_dim=None):
    """Load GloVe-style word vectors from a text file into a dictionary.

    Each non-blank line is expected to look like ``<token> v1 v2 ... vN``,
    with fields separated by single spaces. The vector is taken from the
    *last* ``embedding_dim`` fields and everything before it is re-joined as
    the token. This keeps tokens that themselves contain spaces (e.g.
    ``". . ."``) intact; taking only the first field as the word would push
    the rest of such a token into the float conversion and raise.

    Args:
        glove_path: Path to the UTF-8 encoded vectors file.
        embedding_dim: Number of vector components per line. When ``None``
            (default) it is inferred from the first non-blank line as
            ``number_of_fields - 1``, which assumes that line's token has no
            spaces. Pass it explicitly if that may not hold.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        ValueError: If a line has too few fields for ``embedding_dim``, if
            ``embedding_dim`` is not positive, or if a vector component is
            not a valid float.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.rstrip().split(' ')

            # A blank (or whitespace-only) line carries no entry.
            if values == ['']:
                continue

            if embedding_dim is None:
                embedding_dim = len(values) - 1

            if embedding_dim < 1 or len(values) <= embedding_dim:
                raise ValueError(
                    f"{glove_path}, line {line_number}: expected a token followed by "
                    f"{embedding_dim} vector components, found {len(values)} field(s)"
                )

            # Split from the right so spaces inside the token are preserved.
            word = ' '.join(values[:-embedding_dim])
            vector = np.asarray(values[-embedding_dim:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load all vectors into memory from the function's default path ("glove.840B.300d.txt").
glove_embeddings = load_glove_embeddings()
# Width of each word vector; must match the vectors loaded above (300 for this file name).
embedding_dim = 300

In [18]:
# Build the vocabulary from the training texts only; "<OOV>" is the placeholder
# token for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Store the fitted tokenizer on disk so it can be reloaded later
with open("rtmr/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Fixed sequence length; could instead be derived via np.percentile or np.max
max_len = 100


def _pad_to_max_len(sequences):
    """Pad/truncate at the end of each sequence so all have length ``max_len``."""
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')


# Texts -> integer id sequences
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

# Integer id sequences -> equal-length arrays
X_train_pad, X_val_pad, X_test_pad = (
    _pad_to_max_len(sequences) for sequences in (X_train_seq, X_val_seq, X_test_seq)
)

In [16]:
# Write each padded split to its own .npy file
for npy_path, padded_split in (
    ("rtmr/X_train_pad.npy", X_train_pad),
    ("rtmr/X_val_pad.npy", X_val_pad),
    ("rtmr/X_test_pad.npy", X_test_pad),
):
    np.save(npy_path, padded_split)

In [9]:
word_index = tokenizer.word_index
# One extra row on top of the vocabulary for the padding id
vocab_size = len(word_index) + 1

# Rows start as zeros; a row stays zero when its word has no GloVe vector
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Pair each row id with its GloVe vector (None when the word is not in GloVe)
row_vectors = ((row, glove_embeddings.get(token)) for token, row in word_index.items())
for row, glove_vector in row_vectors:
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [19]:
# Persist the embedding matrix as a .npy file alongside the other rtmr artifacts.
np.save("rtmr/embedding_matrix.npy", embedding_matrix)