<a href="https://colab.research.google.com/github/samw8/aiie2025-/blob/main/Sam_AIIE_2025_Text_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**NLP Sentiment Analysis**

   ### Objective: Train → Save → Evaluate → Load → Predict

In [None]:
!pip install tensorflow nltk

Import libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import string
import os
import pickle

    Downloads the tokenizer (punkt) and list of English stopwords.

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the missing resource

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# 1. Sample raw text and labels
raw_sentences = [
    "I love machine learning!",
    "This is a great course.",
    "NLP is fascinating.",
    "I hate this subject.",
    "This is boring.",
    "I do not like winters."
]
labels = [1, 1, 1, 0, 0, 0]  # 1 = positive, 0 = negative

In [4]:
# 2. Preprocessing: lowercase, remove punctuation, remove stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(sentence):
    tokens = word_tokenize(sentence.lower())  # lowercase + tokenize
    tokens = [word for word in tokens if word.isalpha()]  # remove punctuation
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return ' '.join(tokens)

cleaned_sentences = [preprocess_text(sent) for sent in raw_sentences]

In [5]:
print(cleaned_sentences)

['love machine learning', 'great course', 'nlp fascinating', 'hate subject', 'boring', 'like winters']


In [6]:
# Tokenize texts
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(cleaned_sentences)
sequences = tokenizer.texts_to_sequences(cleaned_sentences)
padded = pad_sequences(sequences, padding='post')



In [7]:
print("Word Index (Tokenized Vocabulary):")
print(tokenizer.word_index)

Word Index (Tokenized Vocabulary):
{'<OOV>': 1, 'love': 2, 'machine': 3, 'learning': 4, 'great': 5, 'course': 6, 'nlp': 7, 'fascinating': 8, 'hate': 9, 'subject': 10, 'boring': 11, 'like': 12, 'winters': 13}


In [8]:
print("\nTokenized Sentences (as sequences):")
for i, seq in enumerate(sequences):
    print(f"{raw_sentences[i]} → {seq}")


Tokenized Sentences (as sequences):
I love machine learning! → [2, 3, 4]
This is a great course. → [5, 6]
NLP is fascinating. → [7, 8]
I hate this subject. → [9, 10]
This is boring. → [11]
I do not like winters. → [12, 13]


In [9]:
  print(tokenizer)

FileNotFoundError: [Errno 2] No such file or directory: 'tokenizer.pkl'

In [10]:
# Build model with input_shape explicitly declared in build()
model = Sequential([
    Embedding(input_dim=100, output_dim=16, input_length=padded.shape[1]),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Explicitly build the model before summary
model.build(input_shape=(None, padded.shape[1]))

# Now summary will display shapes and parameters
model.summary()




In [11]:
# Compile and train
import numpy as np

# Convert labels list to numpy array
labels = np.array(labels)

# Compile and train
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded, labels, epochs=5, verbose=2)



Epoch 1/5
1/1 - 2s - 2s/step - accuracy: 0.3333 - loss: 0.6967
Epoch 2/5
1/1 - 0s - 330ms/step - accuracy: 0.3333 - loss: 0.6952
Epoch 3/5
1/1 - 0s - 37ms/step - accuracy: 0.6667 - loss: 0.6937
Epoch 4/5
1/1 - 0s - 58ms/step - accuracy: 0.6667 - loss: 0.6921
Epoch 5/5
1/1 - 0s - 59ms/step - accuracy: 0.6667 - loss: 0.6905


<keras.src.callbacks.history.History at 0x7e281a6a0b90>

In [12]:
# 9. Testing on new sentences
test_sentences = ["I love this subject!", "This is terrible."]
cleaned_test = [preprocess_text(sent) for sent in test_sentences]
test_seq = tokenizer.texts_to_sequences(cleaned_test)
test_pad = pad_sequences(test_seq, padding='post', maxlen=padded.shape[1])

predictions = model.predict(test_pad)

for i, sentence in enumerate(test_sentences):
    sentiment = "Positive" if predictions[i][0] > 0.5 else "Negative"
    print(f"'{sentence}' → {sentiment} ({predictions[i][0]:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step
'I love this subject!' → Negative (0.50)
'This is terrible.' → Negative (0.50)
