# Classification using Deep neural network (Any One from the following)
## 1. Multiclass classification using Deep Neural Networks: Example: Use the OCR letter recognition dataset https://archive.ics.uci.edu/ml/datasets/letter+recognition
## 2. Binary classification using Deep Neural Networks Example: Classify movie reviews into "positive" reviews and "negative" reviews, just based on the text content of the reviews. Use IMDB dataset

In [15]:
import tensorflow as tf
import os
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
from keras.datasets import imdb


raw_data_df = pd.read_csv('IMDB_Dataset.csv')
raw_data_df['sentiment'] = raw_data_df['sentiment'].apply(lambda row : 1 if row == 'positive' else 0)
raw_data_df['review'] = raw_data_df['review'].apply(lambda row : row.lower())

raw_data_df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. <br /><br />the...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [16]:
features = raw_data_df['review'].to_numpy()
labels = raw_data_df['sentiment'].to_numpy()

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.4, random_state = 0)
features_valid, features_test, labels_valid, labels_test = train_test_split(features_test, labels_test, test_size=0.5, random_state=0)
features_train = tf.convert_to_tensor(features_train)
labels_train = tf.convert_to_tensor(labels_train)
features_valid = tf.convert_to_tensor(features_valid)
labels_valid = tf.convert_to_tensor(labels_valid)
features_test = tf.convert_to_tensor(features_test)
labels_test = tf.convert_to_tensor(labels_test)

In [17]:
valid_ds = tf.data.Dataset.from_tensor_slices((features_valid, labels_valid))
next(iter(valid_ds))

valid_ds = tf.data.Dataset.from_tensor_slices((features_valid, labels_valid))
next(iter(valid_ds))

test_ds = tf.data.Dataset.from_tensor_slices((features_test, labels_test))
next(iter(test_ds))

train_ds = tf.data.Dataset.from_tensor_slices((features_train, labels_train))
next(iter(train_ds))

(<tf.Tensor: shape=(), dtype=string, numpy=b'liked stanley & iris very much. acting was very good. story had a unique and interesting arrangement. the absence of violence and porno sex was refreshing. characters were very convincing and felt like you could understand their feelings. very enjoyable movie.'>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>)

In [18]:
BATCH_SIZE = 64

train_ds = train_ds.batch(batch_size=BATCH_SIZE)
train_ds.cardinality()

train_ds = train_ds.batch(batch_size=BATCH_SIZE)
train_ds.cardinality()
valid_ds = valid_ds.batch(batch_size=BATCH_SIZE)
train_ds.cardinality()
test_ds = test_ds.batch(batch_size=BATCH_SIZE)
test_ds.cardinality()
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
valid_ds = valid_ds.prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

def custom_standardization(input_data):
  std_text = tf.strings.lower(input_data)#remove any urls from the text
  std_text = tf.strings.regex_replace(std_text, r"https:\/\/.*[\r\n]*", '')
  std_text = tf.strings.regex_replace(std_text, r"www\.\w*\.\w\w\w", '')
  std_text = tf.strings.regex_replace(std_text, r"<[\w]*[\s]*/>", '')
  std_text = tf.strings.regex_replace(std_text, '[%s]' % re.escape(string.punctuation), '')
  std_text = tf.strings.regex_replace(std_text, '\s{2}', '')
  std_text = tf.strings.strip(std_text)
  return std_text

In [19]:

VOCAB_SIZE = 1000
vectorizer_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE,
                                                                                standardize=custom_standardization)
                                                                                #output_mode='tf_idf')

vectorizer_layer.adapt(train_ds.map(lambda text, label: text))

vocab = np.array(vectorizer_layer.get_vocabulary())

examples, labels = next(iter(train_ds.take(1)))

ValueError: in user code:

    File "c:\Users\rahul-al\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\engine\base_preprocessing_layer.py", line 123, in adapt_step  *
        self.update_state(data)
    File "c:\Users\rahul-al\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\layers\preprocessing\text_vectorization.py", line 475, in update_state  **
        self._lookup_layer.update_state(self._preprocess(data))
    File "c:\Users\rahul-al\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\layers\preprocessing\text_vectorization.py", line 573, in _preprocess
        raise ValueError(

    ValueError: When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, None) with rank=2


In [20]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the dataset
raw_data_df = pd.read_csv('IMDB_Dataset.csv')

# Convert sentiment labels to binary values
raw_data_df['sentiment'] = raw_data_df['sentiment'].apply(lambda row: 1 if row == 'positive' else 0)

# Convert reviews to lowercase
raw_data_df['review'] = raw_data_df['review'].apply(lambda row: row.lower())

# Split the dataset into training and testing sets
train_data = raw_data_df.sample(frac=0.8, random_state=42)
test_data = raw_data_df.drop(train_data.index)

# Tokenize the reviews
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['review'])

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

# Pad the sequences to ensure equal length
max_length = 200  # or choose an appropriate maximum sequence length
train_padded = pad_sequences(train_sequences, maxlen=max_length, truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, truncating='post')

# Create the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=16, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
num_epochs = 10  # adjust as needed
model.fit(train_padded, train_data['sentiment'], epochs=num_epochs, validation_data=(test_padded, test_data['sentiment']))

# Evaluate the model
model.evaluate(test_padded, test_data['sentiment'])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.4229661226272583, 0.859000027179718]

In [27]:
# sample_text = "it was bad movie nothing to watch"
sample_text = "such great movie loved must watch"

tokenizer_text = tokenizer.texts_to_sequences([sample_text])

tokenizer_text = pad_sequences(tokenizer_text, maxlen=max_length, truncating='post')

predictions = model.predict(tokenizer_text)

if predictions[0][0] > 0.5:
    print("Positive")
else:
    print("Negative")

Positive
