# Import necessary packages

In [54]:
# Load the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
from sklearn.model_selection import train_test_split

from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# PART A

**CONTEXT**: The objective of this project is to build a text classification model that analyses the customer's sentiments based on their reviews in the IMDB database. The model uses a complex deep learning model to build an embedding layer followed by a classification algorithm to analyse the sentiment of the customers.

**PROJECT OBJECTIVE**: To Build a sequential NLP classifier which can use input text parameters to determine the customer sentiments.

In [4]:
# Q1 - Import and analyse the data set

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
print("Train label shape:", train_labels.shape)
print("Test label shape:", test_labels.shape)

Train data shape: (25000,)
Test data shape: (25000,)
Train label shape: (25000,)
Test label shape: (25000,)


In [5]:
# Q2 Perform relevant sequence adding on the data

max_sequence_length = 500
train_data = pad_sequences(train_data, maxlen=max_sequence_length)
test_data = pad_sequences(test_data, maxlen=max_sequence_length)


In [7]:
#Q3 Perform following data analysis:
        # Print shape of features and labels
        # Print value of any one feature and it's label

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
print("Train label shape:", train_labels.shape)
print("Test label shape:", test_labels.shape)

# Print the first feature and its label
print("Example Feature:", train_data[10])
print("Example Label:", train_labels[10])


Train data shape: (25000, 500)
Test data shape: (25000, 500)
Train label shape: (25000,)
Test label shape: (25000,)
Example Feature: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    1  785  189  438   47  110
  142    7    6 7475  120    4  236  378    7  153   19   87  108  141
   17 1004    5    2  883    2   23    8    4  136    2    2    4 7475
   43 1076   21 1407  419    5 5202  120   91  682  189 2818    5    9
 1348   31    7    4  118  785  189  108  126   93    2   16  540  324
   23    6  364  352   21   14    9   93   56   18   11  230   53  771
   74   31   34    4 2834    7    4   22    5   14   11  471    9    2
   34    4  321  487    5  116   15 6584    4   22    9    6 2286    4
  114 2679   23  107  293 1008 1172    5  328 1236    4 1375  109    9
    6  132  773

In [14]:
# Q4 Decode the feature value to get original sentence

feature_index = 1
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}
decoded_review = ' '.join([reverse_word_index.get(i - 3, '') for i in train_data[feature_index]])

print("Decoded Review:", decoded_review.strip())
print("Label for the Example Review:", train_labels[feature_index])


feature_index = 10
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}
decoded_review = ' '.join([reverse_word_index.get(i - 3, '') for i in train_data[feature_index]])

print("Decoded Review:", decoded_review.strip())
print("Label for the Example Review:", train_labels[feature_index])


Decoded Review: big hair big boobs bad music and a giant safety pin these are the words to best describe this terrible movie i love cheesy horror movies and i've seen hundreds but this had got to be on of the worst ever made the plot is paper thin and ridiculous the acting is an abomination the script is completely laughable the best is the end showdown with the cop and how he worked out who the killer is it's just so damn terribly written the clothes are sickening and funny in equal  the hair is big lots of boobs  men wear those cut  shirts that show off their  sickening that men actually wore them and the music is just  trash that plays over and over again in almost every scene there is trashy music boobs and  taking away bodies and the gym still doesn't close for  all joking aside this is a truly bad film whose only charm is to look back on the disaster that was the 80's and have a good old laugh at how bad everything was back then
Label for the Example Review: 0
Decoded Review: fre

In [15]:
#Q5 Design, train, tune and test a sequential model.

model = Sequential()
model.add(Embedding(10000, 32, input_length=max_sequence_length))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with train_data and train_labels
model.fit(train_data, train_labels, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x783ea41d1120>

In [16]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(test_data, test_labels)


print("Test Accuracy:", accuracy)

Test Accuracy: 0.8558800220489502


In [38]:
# Q6 Use the designed model to print the prediction on any one sample

sample_index = 20
sample_data = train_data[sample_index]
sample_label = train_labels[sample_index]

print("sample_label", sample_label)
sample_data = sample_data.reshape(1, -1)

accuracy = model.evaluate(sample_data, np.array([sample_label]))
print("Sample Accuracy:", accuracy)

prediction = model.predict(sample_data)
print("prediction:", prediction)

sample_label 0
Sample Accuracy: [0.0002179304137825966, 1.0]
prediction: [[0.00021791]]


In [39]:
if prediction >= 0.7:
    sentiment = "positive"
    confidence = prediction
else:
    sentiment = "negative"
    confidence = 1 - prediction

print("Sentiment:", sentiment)
print("Confidence:", confidence)

sentiment_label = 1 if prediction >= 0.5 else 0
print("Prediction Label:", sentiment_label)

Sentiment: negative
Confidence: [[0.9997821]]
Prediction Label: 0


In [40]:
# Q6 Use the designed model to print the prediction on any one sample

sample_review = "a giant safety pin these are the words to best describe this terrible movie i love cheesy horror movies and i've seen hundreds but this had got to be on of the worst ever made the plot is paper thin and ridiculous the acting is an abomination the script is completely laughable the best is the end showdown with the cop and how he worked out who the killer is it's just so damn terribly written the clothes are sickening and funny in equal  the hair is big lots of boobs  men wear those cut  shirts that show off their  sickening that men actually wore them and the music is just  trash that plays over and over again in almost every scene there is trashy music boobs and  taking away bodies and the gym still doesn't close for  all joking aside this is a truly bad film whose only charm is to look back on the disaster that was the 80's and have a good old laugh at how bad everything was back then"
sample_review = [word_index.get(word, 0) for word in sample_review.split()]
sample_review = pad_sequences([sample_review], maxlen=max_sequence_length)

prediction = model.predict(np.array(sample_review))
print("Sentiment Prediction:", prediction)

Sentiment Prediction: [[0.0002642]]


In [41]:
if prediction >= 0.7:
    sentiment = "positive"
    confidence = prediction
else:
    sentiment = "negative"
    confidence = 1 - prediction

print("Sentiment:", sentiment)
print("Confidence:", confidence)

Sentiment: negative
Confidence: [[0.9997358]]


**CONTEXT**: Past studies in Sarcasm Detection mostly make use of Twitter datasets collected using hashtag based supervision but such datasets are noisy in terms of labels and language. Furthermore, many tweets are replies to
other tweets and detecting sarcasm in these requires the availability of contextual tweets.In this hands-on project, the goal is to build a model to detect whether a sentence is sarcastic or not, using Bidirectional LSTMs.

**PROJECT OBJECTIVE**: Build a sequential NLP classifier which can use input text parameters to determine the customer sentiments.

In [42]:
# Q1 - Read and explore the data
data = pd.read_json("/content/drive/My Drive/AIML/NLP/project/Sarcasm_Headlines_Dataset.json", lines=True)
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [43]:
# Q2 Retain relevant columns
data = data[["is_sarcastic", "headline"]]

In [45]:
# Q3 Get length of each sentence
data['headline_length'] = data['headline'].apply(lambda x: len(x.split()))
data.head()

Unnamed: 0,is_sarcastic,headline,headline_length
0,1,thirtysomething scientists unveil doomsday clo...,8
1,0,dem rep. totally nails why congress is falling...,13
2,0,eat your veggies: 9 deliciously different recipes,7
3,1,inclement weather prevents liar from getting t...,8
4,1,mother comes pretty close to using word 'strea...,9


In [47]:
#Q4 Define parameters

#max_features: Number of words to take from tokenizer
#max_sequence_length: Maximum length of each sentence
#embedding_dim: Size of embedding vector

max_features = 10000
max_sequence_length = 20
embedding_dim = 100

In [48]:
# Q5 Get indices for words
# Q6 Create features and labels

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data['headline'])
sequences = tokenizer.texts_to_sequences(data['headline'])

X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = data['is_sarcastic'].values


In [59]:
print(X)
print(y)

[[ 354 3166 7473 ...    0    0    0]
 [7474 1774  757 ...    0    0    0]
 [ 862   32  261 ...    0    0    0]
 ...
 [   3   99  628 ...    0    0    0]
 [1869 1312 3316 ...    0    0    0]
 [ 216 3282   20 ...    0    0    0]]
[1 0 0 ... 0 1 1]


In [50]:
# Q7 Get vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size" , vocab_size)

vocab_size 30885


In [51]:
# Q8 Create a weight matrix using GloVe embeddings

glove_zip_path = '/content/drive/My Drive/AIML/NLP/project/glove.6B.zip'

with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
    zip_ref.extractall('glove')

glove_file = 'glove/glove.6B.100d.txt'

embeddings_index = {}
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [52]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [53]:
# Q9 Define and compile a Bidirectional LSTM model.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [55]:
# Q10 Fit the model and check the validation accuracy

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=64)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [56]:
# Check validation accuracy
val_accuracy = model.evaluate(X_val, y_val)[1]

print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


Validation Accuracy: 85.81%
