PS: Classification using Deep neural network - Binary classification using Deep Neural Networks Example: Classify movie reviews into positive" reviews and "negative" reviews, just based on the text content of the reviews. Use IMDB dataset

# CSV Version

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-04-29 18:41:30.053199: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the IMDb dataset
df = pd.read_csv("IMDB Dataset.csv")

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
max_words = 1000      # Number of words to be extracted from 1 particular row while importing the dataset
max_len = 150         # For each sentence in 1 row, consider only 1st 100 common words [Cut texts after this number of words (among top max_features most common words)]

tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(df['review'])
X = tokenizer.texts_to_sequences(df['review'])

X = pad_sequences(X, maxlen=max_len)
y = np.array(df['sentiment'].map({'positive': 1, 'negative': 0}))

In [5]:
X

array([[433,  14,  12, ...,  16, 125, 486],
       [  0,   0,   0, ...,  23,  69, 221],
       [  0,   0,   0, ...,  63,  16, 350],
       ...,
       [262, 764, 177, ...,  16,   2,   2],
       [  0,   0,   0, ...,  67, 739,  42],
       [  0,   0,   0, ..., 794,  11,  17]], dtype=int32)

In [6]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [8]:
# Build the neural network model
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    Flatten(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])



In [9]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2024-04-29 18:41:42.068982: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-29 18:41:42.076418: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [10]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=128,validation_split=0.2)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.6730 - loss: 0.5636 - val_accuracy: 0.8485 - val_loss: 0.3441
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.8950 - loss: 0.2651 - val_accuracy: 0.8338 - val_loss: 0.3781
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9547 - loss: 0.1448 - val_accuracy: 0.8219 - val_loss: 0.4692
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9856 - loss: 0.0565 - val_accuracy: 0.8205 - val_loss: 0.5389
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9971 - loss: 0.0184 - val_accuracy: 0.8216 - val_loss: 0.6331


In [11]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8294 - loss: 0.6005
Test Loss: 0.6001037359237671
Test Accuracy: 0.8282999992370605
