## NLP Assignment: Binary Sentiment Classification

### Downloading Dataset using Kaggle API

- Upload `kaggle.json` to Google Colab
- Install the Kaggle CLI
- Download the dataset from Kaggle

In [None]:
from google.colab import files

# files.upload() returns a dict mapping each uploaded filename to its raw bytes.
# Left as the cell's last expression, that dict -- i.e. the contents of
# kaggle.json, including the API key -- is echoed into the saved cell output.
# The trailing semicolon suppresses that echo; the file itself is still written
# to the working directory for the next cell to copy.
files.upload();

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install kaggle CLI if not already
!pip install -q kaggle

# Download the dataset
!kaggle datasets download -d kritanjalijain/amazon-reviews

# Unzip it
!unzip amazon-reviews.zip -d data/

### Importing necessary libraries

In [3]:
import pandas as pd
import re
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from sklearn.model_selection import train_test_split

### Loading Dataset

In [4]:
# The CSV has no header row, so the three column names are supplied here.
train_df = pd.read_csv(
    'data/train.csv',
    header=None,
    names=['polarity', 'title', 'text'],
)
# Shift every label down by one.
# NOTE(review): presumably the raw labels are 1/2 and this yields 0/1 for a
# binary classifier -- confirm against the dataset description.
train_df['polarity'] -= 1

### Text Preprocessing

1. Combining `title` and `text` columns to create a `content` column

In [5]:
# Empty CSV fields are read as NaN, and astype(str) alone would turn them into
# the literal word "nan", which would then be tokenized as if the reviewer had
# written it. Fill missing titles/bodies with an empty string first.
train_df['content'] = (
    train_df['title'].fillna('').astype(str)
    + " "
    + train_df['text'].fillna('').astype(str)
)

2. Defining preprocessing function

In [6]:
def clean_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order: lowercase; replace ``<...>`` spans with a space; delete
    tokens starting with ``http`` or ``www``; delete ASCII punctuation
    (``string.punctuation``); delete runs of digits; collapse whitespace and
    strip both ends.

    Args:
        text: Raw text. ``None`` and float NaN are treated as empty text; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string (may be empty).
    """
    # `text != text` is true only for NaN, so this catches missing cells
    # without requiring pandas/numpy helpers.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'<.*?>', ' ', text)
    # `http\S+` already matches every https URL, so no separate alternative
    # for `https` is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # words are joined ("non-gamer" -> "nongamer").
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return re.sub(r'\s+', ' ', text).strip()

3. Applying this function to the `content` column

In [7]:
# Run every combined review through clean_text and store the result in a new column.
train_df['clean_content'] = train_df['content'].map(clean_text)
# Show the raw and cleaned text side by side for the first rows.
train_df[['content', 'clean_content']].head()

Unnamed: 0,content,clean_content
0,Stuning even for the non-gamer This sound trac...,stuning even for the nongamer this sound track...
1,The best soundtrack ever to anything. I'm read...,the best soundtrack ever to anything im readin...
2,Amazing! This soundtrack is my favorite music ...,amazing this soundtrack is my favorite music o...
3,Excellent Soundtrack I truly like this soundtr...,excellent soundtrack i truly like this soundtr...
4,"Remember, Pull Your Jaw Off The Floor After He...",remember pull your jaw off the floor after hea...


4. Tokenization

In [8]:
# Settings handed to the Keras Tokenizer: the `num_words` cap and the
# placeholder token for out-of-vocabulary words.
oov_token = '<OOV>'
vocab_size = 50_000

tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
# NOTE(review): the vocabulary is fitted on every row of train_df, i.e. before
# the train/validation split made later in the notebook.
tokenizer.fit_on_texts(train_df['clean_content'].to_list())

5. Padding (performed per batch inside a data generator) and train/validation split

In [9]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that tokenizes and pads raw texts one batch at a time.

    Only the raw strings and labels are held in memory; each batch is converted
    to integer sequences and padded when it is requested.

    Args:
        texts: Sized iterable of strings, one per sample.
        labels: Sized iterable of labels, same length as ``texts``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        maxlen: Length every sequence is padded/truncated to (at the end).
        batch_size: Number of samples per batch; must be positive.
        shuffle: If true, the sample order is shuffled at construction and
            again at the end of every epoch.
        **kwargs: Forwarded to the ``Sequence`` base-class constructor.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length or
            ``batch_size`` is not positive.
    """

    def __init__(self, texts, labels, tokenizer, maxlen=200, batch_size=128, shuffle=True, **kwargs):
        # Keras 3's PyDataset-based Sequence expects subclasses to call the base
        # constructor; with no extra kwargs this is also harmless on older Keras.
        super().__init__(**kwargs)

        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        # __getitem__ indexes by position; materializing non-list inputs (e.g. a
        # pandas Series whose index is no longer 0..n-1) keeps that lookup valid.
        self.texts      = texts if isinstance(texts, list) else list(texts)
        self.labels     = labels if isinstance(labels, list) else list(labels)
        self.tokenizer  = tokenizer
        self.maxlen     = maxlen
        self.batch_size = batch_size
        self.shuffle    = shuffle
        self.indices    = np.arange(len(self.texts))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.texts) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(padded_sequences, labels)`` arrays."""
        start       = index * self.batch_size
        batch_ids   = self.indices[start : start + self.batch_size]
        batch_texts = [self.texts[i] for i in batch_ids]
        batch_labels= [self.labels[i] for i in batch_ids]

        # Tokenize + pad only this batch
        sequences = self.tokenizer.texts_to_sequences(batch_texts)
        # pad_sequences already returns a NumPy array, so no extra copy is made.
        padded    = pad_sequences(sequences, maxlen=self.maxlen, padding='post', truncating='post')
        return padded, np.array(batch_labels)

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [10]:
# Batch size and sequence length shared by both generators.
maxlen     = 200
batch_size = 512

# Cleaned texts and their labels as plain Python lists.
texts  = train_df['clean_content'].to_list()
labels = train_df['polarity'].to_list()

# 80/20 train/validation split, stratified on the labels, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, stratify=labels, random_state=42, test_size=0.2
)

# The training generator shuffles its samples; the validation generator keeps
# them in their original order.
train_generator = DataGenerator(
    X_train, y_train, tokenizer,
    shuffle=True, batch_size=batch_size, maxlen=maxlen,
)
val_generator = DataGenerator(
    X_val, y_val, tokenizer,
    shuffle=False, batch_size=batch_size, maxlen=maxlen,
)

6. Saving the cleaned training dataset and the tokenizer

In [None]:
import pickle

# Serialize the fitted tokenizer to disk.
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Write the DataFrame (every column, without the index) to CSV.
train_df.to_csv('cleaned_train.csv', index=False)