In [None]:
# Data_Preprocessing.ipynb

# Import necessary libraries
import random

import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator

# Read the raw sentiment dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/sentiment_dataset.csv')

# English stop words from NLTK, kept in a set for fast membership checks
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Lower-case, tokenize, and filter a piece of text.

    Args:
        text: The raw text to process. Values that are not strings (for
            example ``None`` or the float NaN that pandas uses for empty
            cells) are treated as empty text.

    Returns:
        A list of lower-cased tokens that are purely alphabetic and are
        not members of the module-level ``stop_words`` set. The list is
        empty when ``text`` is not a string.
    """
    # Missing cells arrive as NaN (a float); calling .lower() on them would
    # raise AttributeError and abort the whole column transformation.
    if not isinstance(text, str):
        return []
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha() and token not in stop_words]

# Tokenize every entry of the 'text' column and keep the result alongside it
df['tokens'] = df['text'].map(preprocess_text)

# Persist the augmented DataFrame (without the index) for the TorchText loader
df.to_csv(path_or_buf='data/preprocessed_sentiment_dataset.csv', index=False)

# Device the iterators place batches on: GPU when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define TorchText fields
# NOTE(review): TEXT tokenizes the raw text column with spaCy, so the NLTK
# token lists computed earlier in this file are not what gets numericalized
# here -- confirm that this is intended.
TEXT = Field(tokenize='spacy', include_lengths=True)
LABEL = LabelField(dtype=torch.float)

# Load the preprocessed CSV once. Fields passed as a list are matched to the
# CSV columns by position, so this assumes the file's first two columns are
# the text and the label -- TODO confirm against the dataset.
data_fields = [('text', TEXT), ('label', LABEL)]
full_data = TabularDataset(
    path='data/preprocessed_sentiment_dataset.csv',
    format='csv',
    fields=data_fields,
    # DataFrame.to_csv wrote a header row; without skipping it the column
    # names would be loaded as an example and pollute the label vocabulary.
    skip_header=True,
)

# Split into disjoint train/validation/test subsets. Loading the same file
# for all three splits would evaluate the model on its own training data.
# split_ratio is given as [train, test, valid], while the returned tuple is
# ordered (train, valid, test). The 70/15/15 ratio is a conventional default.
# A private Random instance supplies the seed state so the split is
# reproducible without touching the global random state.
train_data, valid_data, test_data = full_data.split(
    split_ratio=[0.7, 0.15, 0.15],
    random_state=random.Random(1234).getstate(),
)

# Build vocabulary (from the training split only) and create iterators
TEXT.build_vocab(train_data, max_size=25000, vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

# sort_within_batch together with the length-based sort_key orders each batch
# by text length, which goes with include_lengths=True on TEXT.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    batch_size=64,
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    device=device
)
