<a href="https://colab.research.google.com/github/rutuja-patil24/CMPE-258-Deep_Learning/blob/main/Assignment_05/01_Simple_Text_classification_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ✅ Install required libraries
!pip install --quiet tensorflow datasets numpy

import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from datasets import load_dataset

# ✅ Load IMDb Dataset
dataset = load_dataset("imdb")

# ✅ Convert dataset to Pandas DataFrame
df_train = pd.DataFrame(dataset["train"])
df_test = pd.DataFrame(dataset["test"])

# ✅ Preprocess Data
train_texts = df_train["text"].values
train_labels = df_train["label"].values
test_texts = df_test["text"].values
test_labels = df_test["label"].values

# ✅ Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# ✅ Padding Sequences
MAX_LENGTH = 200
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")

# ✅ Convert labels to NumPy
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# ✅ Define Simple LSTM Model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=MAX_LENGTH),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(16, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")  # Binary Classification
])

# ✅ Compile Model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# ✅ Train Model
model.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), epochs=3, batch_size=64)

# ✅ Function to Predict Sentiment
def predict_sentiment(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")
    predictions = model.predict(padded_sequences)

    sentiment_labels = ["Negative", "Positive"]
    for text, pred in zip(texts, predictions):
        print(f"🎬 Review: {text[:100]}... → **Sentiment: {sentiment_labels[int(pred > 0.5)]}**")

# ✅ Example Predictions
sample_reviews = [
    "This movie was amazing! The story and visuals were fantastic!",
    "Absolutely terrible. One of the worst movies I've ever seen.",
    "It was decent, but nothing too special."
]

predict_sentiment(sample_reviews)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/487.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]



Epoch 1/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 27ms/step - accuracy: 0.5090 - loss: 0.6924 - val_accuracy: 0.5378 - val_loss: 0.6868
Epoch 2/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.5829 - loss: 0.6658 - val_accuracy: 0.5605 - val_loss: 0.6765
Epoch 3/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.5680 - loss: 0.6729 - val_accuracy: 0.6047 - val_loss: 0.6386
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step
🎬 Review: This movie was amazing! The story and visuals were fantastic!... → **Sentiment: Positive**
🎬 Review: Absolutely terrible. One of the worst movies I've ever seen.... → **Sentiment: Negative**
🎬 Review: It was decent, but nothing too special.... → **Sentiment: Negative**


  print(f"🎬 Review: {text[:100]}... → **Sentiment: {sentiment_labels[int(pred > 0.5)]}**")
