In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout,Input,Bidirectional,GRU,LayerNormalization
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2


In [None]:
# Load the sentiment CSV from the attached Kaggle dataset.
DATA_PATH = '/kaggle/input/sentiment-cleaned/sentipreprocessed.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Give the two columns the short lowercase names the rest of the notebook uses.
COLUMN_ALIASES = {"Review_body": "review", "Label": "label"}
df = df.rename(columns=COLUMN_ALIASES)

In [None]:
# Display (rows, columns) of the frame as loaded and renamed.
df.shape

In [None]:
# Discard every row that contains a missing value, then renumber rows from 0.
df = df.dropna()
df = df.reset_index(drop=True)

In [None]:
# Keep at most the first 3.5 million rows.
MAX_ROWS = 3_500_000
df = df.iloc[:MAX_ROWS]

In [None]:
# Class balance of the labels as stored in the CSV (this runs before the
# labels are shifted down by one further below).
df["label"].value_counts()

In [None]:
# Vocabulary cap: only the most frequent words keep their own index; rarer
# words are encoded as the '<OOV>' token. Per the author's manual check, the
# word ranked 33250th occurs about 50 times in the whole corpus.
tokenizer = Tokenizer(
    num_words=33250,
    oov_token='<OOV>',
)

In [None]:
# Build the tokenizer's word index from the review text column.
tokenizer.fit_on_texts(df['review'])

In [None]:
# Display the size of the tokenizer's word index after fitting.
len(tokenizer.word_index)

In [None]:
# Encode every review as a list of integer word indices.
review_sequences=tokenizer.texts_to_sequences(df['review'])

In [None]:
# Longest tokenised review (0 if there are no reviews); kept for reference
# when judging how much the fixed padded length truncates.
max_len = max(map(len, review_sequences), default=0)

# The dataset cell slices `padded_seq`, but no cell in the notebook ever
# created it, so a fresh "Restart & Run All" failed with a NameError.
# 208 is the sequence length the model cell declares for its input.
MAX_SEQUENCE_LEN = 208

# Right-pad ("post"): the model's Embedding uses mask_zero=True, and the fast
# cuDNN LSTM kernel only supports masked inputs that are strictly right-padded.
padded_seq = pad_sequences(
    review_sequences,
    maxlen=MAX_SEQUENCE_LEN,
    padding="post",
)

In [None]:
# Shift the stored labels down by one to obtain the 0/1 targets required by
# the sigmoid output and binary cross-entropy loss used by the model
# (presumably the CSV stores the classes as 1/2 -- confirm against the data).
#
# The shift is applied to a fresh array instead of writing back into `df`:
# the original in-place `df["label"] = df["label"] - 1` subtracted one again
# every time the cell was re-run and assigned into a sliced DataFrame.
review_labels = df["label"].to_numpy() - 1

# Fail early if the targets are not binary, rather than training on bad labels.
assert np.isin(review_labels, (0, 1)).all(), "labels must be 0/1 after the shift"

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Row offsets of the split: [0, TRAIN_END) trains, [TRAIN_END, VAL_END) validates.
TRAIN_END = 3_000_000
VAL_END = 3_400_000
BATCH_SIZE = 256
SHUFFLE_BUFFER = 10_000

# Slicing past the end of an array silently returns fewer (or zero) rows, so
# make sure features and labels line up and the validation slice is not empty.
assert len(padded_seq) == len(review_labels), "features and labels differ in length"
assert len(padded_seq) > TRAIN_END, "not enough rows to leave a validation split"

# Training dataset: the first TRAIN_END rows.
# No .cache() here: the source arrays already live in memory, so caching the
# unbatched elements would only store a second copy of the training set.
train_ds = (
    tf.data.Dataset.from_tensor_slices((padded_seq[:TRAIN_END], review_labels[:TRAIN_END]))
    .shuffle(buffer_size=SHUFFLE_BUFFER)      # Shuffles within a sliding buffer only
    .batch(BATCH_SIZE, drop_remainder=True)   # Drop last batch for shape consistency
    .prefetch(buffer_size=AUTOTUNE)           # Overlap input preparation with training
)

# Validation dataset: the rows between TRAIN_END and VAL_END.
val_ds = (
    tf.data.Dataset.from_tensor_slices((padded_seq[TRAIN_END:VAL_END], review_labels[TRAIN_END:VAL_END]))
    .batch(BATCH_SIZE)
    .cache()                                  # Reuse the already-batched validation set each epoch
    .prefetch(buffer_size=AUTOTUNE)
)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    BatchNormalization,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    Input,
    LSTM,
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Hyperparameters
VOCAB_SIZE       = 33250   # same value as the tokenizer's num_words
EMBED_DIM        = 128
MAX_SEQUENCE_LEN = 208     # width of the padded input sequences
LSTM_UNITS       = 128
DROPOUT_RATE     = 0.4

# Build model: embedding -> three stacked BiLSTM blocks -> dense head -> sigmoid.
model = Sequential([
    # Declare the input shape with an explicit Input layer. The Embedding
    # argument `input_length` is deprecated in current Keras, and without a
    # declared input shape the summary below cannot report layer shapes.
    Input(shape=(MAX_SEQUENCE_LEN,), dtype="int32", name="tokens"),

    Embedding(input_dim=VOCAB_SIZE,
              output_dim=EMBED_DIM,
              mask_zero=True,          # index 0 is padding and is masked downstream
              name="embedding"),

    Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, dropout=0.2), name="bilstm_1"),
    BatchNormalization(name="bn_1"),
    Dropout(DROPOUT_RATE, name="dropout_1"),

    Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, dropout=0.2), name="bilstm_2"),
    BatchNormalization(name="bn_2"),
    Dropout(DROPOUT_RATE, name="dropout_2"),

    # Last recurrent block returns only the final state for classification.
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=False, dropout=0.2), name="bilstm_3"),
    BatchNormalization(name="bn_3"),
    Dropout(DROPOUT_RATE, name="dropout_3"),

    Dense(64, activation="relu", name="fc1"),
    BatchNormalization(name="bn_fc1"),
    Dropout(0.3, name="dropout_fc1"),

    # Single sigmoid unit: probability of the positive class.
    Dense(1, activation="sigmoid", name="classifier")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)

# Summary (optional)
model.summary()

# --- Callbacks ---
# ReduceLROnPlateau must be less patient than EarlyStopping. With equal
# patience the learning-rate cut would normally land on the very epoch at
# which early stopping ends training, so the reduced rate was never used.
callbacks = [
    EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-5)
]

# --- Training ---
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=callbacks
)


In [None]:
# Persist the trained model to the working directory; the .h5 suffix selects
# Keras' HDF5 file format.
model.save("sentiment_bilstm_model.h5")

In [None]:
# `pickle` was never imported anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel.
import pickle

# Persist the fitted tokenizer next to the model so inference code can encode
# text with the same word index.
# NOTE: only unpickle this file from a trusted source -- pickle.load executes
# arbitrary code.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Decision threshold used to turn probabilities into class labels.
THRESHOLD = 0.43

# This cell previously relied on `y_val` and `y_pred_prob` existing already,
# but neither is defined by any earlier cell, so it failed on a fresh kernel.
# Assumes evaluation uses the same rows that feed `val_ds` -- TODO confirm.
X_val = padded_seq[3000000:3400000]
y_val = review_labels[3000000:3400000]

# Predicted probability of the positive class, flattened from (n, 1) to (n,).
y_pred_prob = model.predict(X_val, batch_size=512).ravel()

# Step 1: Convert predicted probabilities to binary labels using the threshold
y_pred_binary = (y_pred_prob >= THRESHOLD).astype(int)

# Step 2: Compute confusion matrix
cm = confusion_matrix(y_val, y_pred_binary)

# Step 3: Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
disp.plot(cmap="Blues", values_format="d")
plt.title(f"Confusion Matrix (Threshold = {THRESHOLD})")
plt.grid(False)
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the thresholded predictions.
report = classification_report(
    y_val,
    y_pred_binary,
    target_names=["Negative", "Positive"],
)
print(report)


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# `X_val` / `y_val` were never defined by any cell, so this cell failed on a
# fresh kernel. Assumes evaluation uses the same rows that feed `val_ds`
# -- TODO confirm.
X_val = padded_seq[3000000:3400000]
y_val = review_labels[3000000:3400000]

# Predict probabilities for the positive class; flatten (n, 1) -> (n,)
y_pred_prob = model.predict(X_val, batch_size=512).ravel()

# Compute false positive rate, true positive rate, thresholds
fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC Curve (AUC = {roc_auc:.4f})")
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
