<a href="https://colab.research.google.com/github/smf-9000/Text-Intent-Classification/blob/main/Text_Classification_CNN_for_NLP_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
# This is formatted as code
```



In [18]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence

In [2]:
# Download the IMDB review archive. -nc (no-clobber) skips the download when the
# file is already present, so re-running the notebook does not fetch a duplicate copy.
!wget -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2021-07-08 15:39:40--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2021-07-08 15:39:43 (28.7 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
# Unpack the downloaded archive into the working directory (later cells read from aclImdb/).
!tar -xf aclImdb_v1.tar.gz

In [4]:
# Vocabulary cap handed to the tokenizer, and the fixed length every
# tokenized review is padded/truncated to.
num_words, seq_max_len = 30_000, 500

In [21]:
from pathlib import Path

def read_imdb_split(split_dir, encoding="utf-8"):
    """Load the review texts and binary labels found under one dataset split.

    Args:
        split_dir: Directory containing a ``pos`` and a ``neg`` sub-directory,
            each holding one review per file.
        encoding: Text encoding used to decode every review file.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. ``labels[i]`` is 1
        when ``texts[i]`` was read from ``pos`` and 0 when it was read from
        ``neg``. All ``pos`` reviews come first, each group sorted by path.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    # Pair each directory with its label explicitly instead of comparing the
    # directory name by identity (`is`), which only worked through interning.
    for label_dir, label in (("pos", 1), ("neg", 0)):
        # Sort so the result does not depend on the filesystem's listing order.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            texts.append(text_file.read_text(encoding=encoding))
            labels.append(label)

    return texts, labels

# Read both splits from the extracted archive in one pass ...
(train_texts, train_labels), (test_texts, test_labels) = (
    read_imdb_split(split_path)
    for split_path in ('aclImdb/train', 'aclImdb/test'))

# ... then convert the label lists to NumPy arrays.
train_labels, test_labels = (
    np.asarray(split_labels) for split_labels in (train_labels, test_labels))

In [22]:
# Build the vocabulary from the training reviews only: fitting on the test
# reviews as well would leak test-set word statistics into preprocessing.
# Passing the Python list directly also avoids materialising a large
# fixed-width NumPy string array (np.concatenate on lists of long strings)
# just to iterate over it.
tokenizer = text.Tokenizer(num_words)
tokenizer.fit_on_texts(train_texts)

# Map each review to a sequence of word indices, then pad/truncate every
# sequence to exactly `seq_max_len` entries.
train_tokenized = tokenizer.texts_to_sequences(train_texts)
train_inputs = sequence.pad_sequences(train_tokenized, maxlen=seq_max_len)

# The test reviews go through the same (train-fitted) tokenizer.
test_tokenized = tokenizer.texts_to_sequences(test_texts)
test_inputs = sequence.pad_sequences(test_tokenized, maxlen=seq_max_len)

In [24]:
# Fetch and unpack the pre-trained GloVe 6B vectors.
# wget -nc : skip the (822M) download when the archive is already present.
# unzip -n : never overwrite already-extracted files, so a re-run does not
#            stop at unzip's interactive "replace?" prompt.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove.6B.zip

--2021-07-08 15:54:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-07-08 15:54:23--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-07-08 15:54:23--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [25]:
glove_dim = 200
# Relative path: the archive is unzipped into the working directory, the same
# place the IMDB data is read from.
glove_file = 'glove.6B.' + str(glove_dim) + 'd.txt'

# Maps each GloVe word to its vector (float32 array built from the numbers
# that follow the word on its line).
emb_dict = {}
# `with` guarantees the file is closed even if a line fails to parse; the
# encoding is explicit so decoding does not depend on the machine's locale.
with open(glove_file, encoding='utf-8') as glove:
  for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector

In [26]:
# Row `idx` receives the GloVe vector of the word the tokenizer mapped to
# `idx`; rows for words without a GloVe entry (and row 0) stay all-zero.
emb_matrix = np.zeros((num_words, glove_dim))
for token, idx in tokenizer.word_index.items():
  if idx >= num_words:
    # Stop at the first out-of-range index.
    # NOTE(review): this relies on word_index being iterated in ascending
    # index order - confirm against the tokenizer implementation.
    break
  glove_vec = emb_dict.get(token)
  if glove_vec is None:
    continue
  emb_matrix[idx] = glove_vec

## DCNN: convolutional text classifier

In [37]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier over frozen, pre-initialised embeddings.

    Token ids are embedded, passed through dropout, and fed to two kinds of
    convolutional branches whose globally max-pooled outputs are concatenated:

    * a stacked branch: Conv1D(kernel 3) -> MaxPooling1D -> Conv1D(kernel 5);
    * one Conv1D branch per entry of ``ngrams`` (kernel size = that entry).

    The concatenated features go through a ReLU dense layer, dropout, and the
    output layer: a single sigmoid unit when ``nb_classes == 2``, otherwise a
    softmax over ``nb_classes`` units.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_dim: Width of each embedding vector.
        nb_filters: Number of filters in every convolution.
        FFN_units: Number of units in the hidden dense layer.
        nb_classes: Number of target classes.
        dropout_rate: Rate shared by both dropout layers.
        training: Unused by the constructor; kept so existing callers that
            pass it keep working.
        name: Keras model name.
        ngrams: Iterable of kernel sizes, one parallel convolution branch each.
        embedding_matrix: Optional array used as the constant initial value of
            the (non-trainable) embedding table. When omitted, the
            notebook-level ``emb_matrix`` is used, as before.
    """

    def __init__(
            self,
            vocab_size,
            emb_dim=128,
            nb_filters=50,
            FFN_units=512,
            nb_classes=2,
            dropout_rate=0.1,
            training=False,
            name="dcnn",
            ngrams=(2, 3, 4),
            embedding_matrix=None):
        super().__init__(name=name)

        if embedding_matrix is None:
            # Backward-compatible default: fall back to the matrix built
            # earlier in the notebook.
            embedding_matrix = emb_matrix

        self.embedding = layers.Embedding(
            vocab_size,
            emb_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False)

        # One convolution per requested n-gram width; all of them read the
        # embedded sequence directly.
        self.conv1d_list = [
            layers.Conv1D(
                filters=nb_filters,
                kernel_size=n,
                padding="valid",
                activation="relu")
            for n in ngrams
        ]

        # Stacked branch: conv1 -> pool_1 -> conv2.
        self.conv1 = layers.Conv1D(
            filters=nb_filters,
            kernel_size=3,
            padding="valid",
            activation="relu")

        self.conv2 = layers.Conv1D(
            filters=nb_filters,
            kernel_size=5,
            padding="valid",
            activation="relu")

        self.pool_1 = layers.MaxPooling1D()
        # Shared by every branch to collapse the sequence axis.
        self.pool_2 = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(
            units=FFN_units,
            activation="relu")

        # dropout_e follows the embedding, dropout_d follows the dense layer.
        self.dropout_e = layers.Dropout(rate=dropout_rate)
        self.dropout_d = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            self.last_dense = layers.Dense(
                units=1,
                activation="sigmoid")
        else:
            self.last_dense = layers.Dense(
                units=nb_classes,
                activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
            training: Forwarded to both dropout layers. Defaults to ``None``
                so the model can also be called without the argument.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)
        x = self.dropout_e(x, training=training)

        # Stacked branch.
        stacked = self.conv1(x)
        stacked = self.pool_1(stacked)
        stacked = self.conv2(stacked)
        stacked = self.pool_2(stacked)

        # Parallel n-gram branches, each globally max-pooled.
        features = [stacked]
        for conv in self.conv1d_list:
            features.append(self.pool_2(conv(x)))

        output = tf.concat(features, axis=-1)
        output = self.dense_1(output)
        output = self.dropout_d(output, training=training)
        output = self.last_dense(output)

        return output


In [38]:
# print(len(set(train_labels)))

In [80]:
# Model dimensions, tied to the tokenizer and GloVe settings chosen earlier.
VOCAB_SIZE, EMB_DIM = num_words, glove_dim
NB_FILTERS = FFN_UNITS = 256
# Number of distinct label values present in the training labels.
NB_CLASSES = len(np.unique(train_labels))

# Regularisation and training schedule.
DROPOUT_RATE = 0.2
BATCH_SIZE, NB_EPOCHS = 256, 5

In [81]:
# Instantiate the classifier with the hyperparameters defined above; the parallel
# convolution branches use kernel sizes 3, 4 and 5 instead of the class default.
Dcnn = DCNN(
          vocab_size=VOCAB_SIZE,
          emb_dim=EMB_DIM,
          nb_filters=NB_FILTERS,
          FFN_units=FFN_UNITS,
          nb_classes=NB_CLASSES,
          dropout_rate=DROPOUT_RATE,
          ngrams=[3,4,5])

In [82]:
# Pick the loss/metric pair that matches the model's output layer: the
# binary pair for two classes, the sparse-categorical pair otherwise.
Dcnn.compile(
    optimizer="adam",
    loss=("binary_crossentropy" if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    metrics=["accuracy" if NB_CLASSES == 2
             else "sparse_categorical_accuracy"])

In [83]:
# Directory handed to the checkpoint manager below.
checkpoint_path = "/content/CNN_for_NLP/ckpt/"

# Checkpoint object that tracks the model under the key "Dcnn".
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# Manager for checkpoints in `checkpoint_path`, configured with max_to_keep=5.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the most recent checkpoint when the manager reports one.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored")

In [84]:
# Train on the training split; the test split is passed as validation data.
Dcnn.fit(
    x=train_inputs,
    y=train_labels,
    validation_data=(test_inputs, test_labels),
    epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True)
# ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe1521e5290>

In [None]:
# Evaluate the trained model on the test split and show the returned values.
results = Dcnn.evaluate(
    x=test_inputs,
    y=test_labels,
    batch_size=BATCH_SIZE)
print(results)

In [None]:
# Print the Keras summary of the model.
Dcnn.summary()