## NLP Assignment: Binary Sentiment Classification

### Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

### Loading the cleaned dataset and the tokenizer built during data preprocessing

1. Load Dataset

In [None]:
# Later cells select columns by name ('clean_content', 'polarity'), so the first
# row of the file must be parsed as the header. With header=None pandas assigns
# integer column labels (0, 1, ...) and those name-based lookups raise KeyError.
train_df = pd.read_csv('cleaned_train')

2. Tokenization

In [None]:
import pickle

# Restore the tokenizer that was pickled to disk earlier.
# NOTE: unpickling executes arbitrary code from the file, so only load a
# tokenizer.pkl that you produced yourself or otherwise trust.
with open('/content/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Shared hyperparameters:
#   vocab_size - number of rows in the embedding matrix / Embedding input_dim
#   maxlen     - length every sequence is padded or truncated to
#   batch_size - number of samples per generator batch
# NOTE(review): vocab_size presumably matches the num_words the tokenizer was
# fitted with - confirm against the preprocessing notebook.
vocab_size, maxlen, batch_size = 50000, 200, 512

3. Padding

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that tokenizes and pads raw texts one batch at a time.

    Only the current batch is converted to integer sequences, so the full
    padded matrix for the whole dataset is never materialised in memory.

    Args:
        texts: Indexable collection of raw text strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing ``texts_to_sequences`` (a fitted Keras
            ``Tokenizer``).
        maxlen: Length every sequence is padded/truncated to.
        batch_size: Number of samples per batch; the last batch may be smaller.
        shuffle: If True, the sample order is shuffled on construction and
            again at the end of every epoch.
        **kwargs: Forwarded to the ``Sequence`` base class (on Keras 3 these
            are options such as ``workers``, ``use_multiprocessing`` and
            ``max_queue_size``).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, maxlen=200, batch_size=128, shuffle=True, **kwargs):
        # Keras 3 expects subclasses to initialise the base class; skipping
        # this triggers a runtime warning and leaves worker settings unset.
        super().__init__(**kwargs)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts      = texts
        self.labels     = labels
        self.tokenizer  = tokenizer
        self.maxlen     = maxlen
        self.batch_size = batch_size
        self.shuffle    = shuffle
        # Batches are drawn through this index array so that shuffling never
        # has to reorder the underlying texts/labels.
        self.indices    = np.arange(len(self.texts))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        """Return the number of batches per epoch (last partial batch included)."""
        return int(np.ceil(len(self.texts) / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as ``(padded_sequences, labels)`` arrays."""
        batch_ids    = self.indices[index * self.batch_size : (index + 1) * self.batch_size]
        batch_texts  = [self.texts[i] for i in batch_ids]
        batch_labels = [self.labels[i] for i in batch_ids]

        # Tokenize + pad only this batch. Padding and truncation both happen
        # at the end of the sequence ('post').
        sequences = self.tokenizer.texts_to_sequences(batch_texts)
        # pad_sequences already returns a NumPy array, so no extra copy is needed.
        padded    = pad_sequences(sequences, maxlen=self.maxlen, padding='post', truncating='post')
        return padded, np.array(batch_labels)

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
# read_csv parses empty fields as NaN (a float), which happens when cleaning
# leaves a row with no text. Coerce those to '' so every entry handed to the
# tokenizer is a real string; rows that already hold strings are unchanged.
texts  = train_df['clean_content'].fillna('').astype(str).tolist()
labels = train_df['polarity'].tolist()

# Re-stated here so this cell can be re-run on its own.
batch_size = 512
maxlen     = 200

# 80/20 train/validation split, stratified so both parts keep the same label
# ratio; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

# Training batches are reshuffled every epoch.
train_generator = DataGenerator(
    texts=X_train,
    labels=y_train,
    tokenizer=tokenizer,
    maxlen=maxlen,
    batch_size=batch_size,
    shuffle=True
)

# Validation batches keep their original order so that predictions made from
# this generator line up with y_val.
val_generator = DataGenerator(
    texts=X_val,
    labels=y_val,
    tokenizer=tokenizer,
    maxlen=maxlen,
    batch_size=batch_size,
    shuffle=False
)

### Model Definition and Training

1. Embedding layer using GloVe

In [None]:
embedding_dim = 100
glove_path = '/content/glove.6B.100d.txt'  # pre-trained GloVe vectors must exist at this path
embedding_index = {}

# Each line is parsed as "<token> <v1> ... <vN>": the first whitespace-separated
# field is the word, the remaining fields are its vector components.
with open(glove_path, encoding='utf-8') as glove_file:
    for entry in glove_file:
        fields = entry.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows for words without a GloVe vector stay all-zero.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    if i >= vocab_size:
        # Index falls outside the matrix; skip it.
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector


2. Defining the Model

In [None]:
from tensorflow.keras.layers import Input

# Frozen GloVe embeddings -> bidirectional LSTM -> small dense head with a
# single sigmoid unit for the binary sentiment label.
model = Sequential([
    # An explicit Input layer replaces Embedding's `input_length` argument,
    # which is deprecated (and ignored) in Keras 3. It also keeps the model
    # built, so model.summary() can report shapes and parameter counts.
    Input(shape=(maxlen,)),
    # mask_zero=True: sequences are post-padded with index 0, which the Keras
    # Tokenizer never assigns to a word. Masking lets the LSTM skip those
    # padded timesteps instead of running over a long tail of padding.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim,
              weights=[embedding_matrix], mask_zero=True, trainable=False),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])



3. Compiling the model

In [None]:
from tensorflow.keras.optimizers import Adam

# Binary cross-entropy matches the model's single sigmoid output unit;
# accuracy is reported alongside the loss during training.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)
model.summary()

4. Training the model on training dataset

In [None]:
# Train for five epochs on the batch generator, evaluating on the validation
# generator after each epoch; the returned History object is kept in `history`.
history = model.fit(train_generator, epochs=5, validation_data=val_generator)


In [None]:
from sklearn.metrics import f1_score, confusion_matrix

# val_generator was built with shuffle=False, so the predictions come back in
# the same order as y_val and can be compared element by element.
val_scores = model.predict(val_generator)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = (val_scores > 0.5).astype("int")
y_true = y_val

print("F1 Score:", f1_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))