<a href="https://colab.research.google.com/github/smf-9000/Text-Intent-Classification/blob/main/Text_Classification_CNN_for_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
Starting point (course this notebook is based on):
https://www.udemy.com/course/modern-nlp

[TODO] Try external pre-trained word embeddings.
```



In [None]:
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

In [None]:
!mkdir /content/CNN_for_NLP
!mkdir /content/CNN_for_NLP/data
!mkdir /content/CNN_for_NLP/ckpt
!unzip /content/trainingandtestdata.zip -d /content/CNN_for_NLP/data

In [4]:
import numpy as np
import pandas as pd
import math
import re
from bs4 import BeautifulSoup
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [5]:
# Column names assigned to the header-less CSV files when they are loaded.
cols = ["sentiment","id","date","query","user","text"]

In [6]:
# Load both splits with identical parser settings: the files carry no header row, so the
# column names come from `cols`, and the text is decoded as latin1.
train_data, test_data = (
    pd.read_csv(
        csv_path,
        header=None,
        names=cols,
        engine="python",
        encoding="latin1")
    for csv_path in (
        "/content/CNN_for_NLP/data/training.1600000.processed.noemoticon.csv",
        "/content/CNN_for_NLP/data/testdata.manual.2009.06.14.csv",
    )
)

In [7]:
# Randomly down-sample the training set.
SEED = 42          # fixed seed: the same rows are kept on every run
KEEP_N = 300_000   # number of training rows to keep

# Derive the number of rows to drop from the current frame size instead of hard-coding it:
# for a 1,600,000-row input this removes 1,300,000 rows, and re-running the cell on the
# already reduced frame drops 0 rows instead of raising.
remove_n = max(len(train_data) - KEEP_N, 0)
rng = np.random.default_rng(SEED)
drop_indices = rng.choice(train_data.index, remove_n, replace=False)
train_data = train_data.drop(drop_indices).reset_index(drop=True)

In [None]:
# Number of training rows whose sentiment code is 4.
train_data["sentiment"].eq(4).sum()

In [None]:
# Preview the first rows of the down-sampled training frame.
train_data.head()

In [None]:
# Number of rows left in the training frame.
len(train_data)

In [11]:
# Keep only the `sentiment` and `text` columns.
# Re-assigning the result (instead of inplace=True) and passing errors="ignore" makes the
# cell safe to re-run: dropping columns that are already gone no longer raises KeyError.
train_data = train_data.drop(
    columns=["id","date","query","user"],
    errors="ignore")
test_data = test_data.drop(
    columns=["id","date","query","user"],
    errors="ignore")

In [None]:
# Preview the training frame after the unused columns were removed.
train_data.head()

In [13]:
# The test set contains some rows with sentiment code 2; drop them so only the
# remaining codes are evaluated.
test_data = test_data.loc[test_data["sentiment"] != 2]

In [14]:
# Renumber the rows from 0 after filtering; drop=True discards the old index.
test_data = test_data.reset_index(drop=True)

In [15]:
# Build the target arrays: sentiment code 4 becomes 1, every other code is kept unchanged.
# np.where returns fresh arrays, so the DataFrames are not modified through the `.values`
# view. Writing into `.values` in place silently rewrote the frame's column and fails
# under pandas copy-on-write, where the returned array is read-only.
train_data_labels = np.where(
    train_data.sentiment.values == 4, 1, train_data.sentiment.values)
test_data_labels = np.where(
    test_data.sentiment.values == 4, 1, test_data.sentiment.values)

In [16]:
def clean_tweet(tweet):
  """Reduce a raw tweet to letters and basic punctuation.

  Args:
    tweet: raw tweet text; may contain HTML markup, @mentions and URLs.

  Returns:
    The text with markup, mentions and URLs removed, every character other than
    ASCII letters and . ! ? ' replaced by a space, and runs of spaces collapsed
    to a single space. Leading/trailing spaces are not stripped.
  """
  # Extract the text content, dropping any HTML markup.
  tweet = BeautifulSoup(tweet,"lxml").get_text()
  # Remove @mentions. The underscore is part of the character class because handles may
  # contain it; without it "@some_user" left "_user" behind, which then became a word.
  tweet = re.sub(r"@[A-Za-z0-9_]+",' ', tweet)
  # Remove URLs, including common path/query characters (- _ ~ % ? = & # +) so that no
  # URL fragments survive as pseudo-words.
  tweet = re.sub(r"https?://[A-Za-z0-9./_~%?=&#+-]+",' ', tweet)
  # Keep only letters and the punctuation . ! ? '
  tweet = re.sub(r"[^a-zA-Z.!?']",' ', tweet)
  # Collapse the runs of spaces introduced by the substitutions above.
  tweet = re.sub(r" +", " ", tweet)
  return tweet

In [17]:
# Run every tweet of both splits through clean_tweet, preserving row order.
train_data_clean = list(map(clean_tweet, train_data.text))
test_data_clean = list(map(clean_tweet, test_data.text))

In [18]:
# set(test_data_labels)

In [19]:
# Build a subword tokenizer from the cleaned tweets.
# NOTE(review): the corpus includes the test tweets, so the vocabulary is fitted on
# test text as well - confirm this is intended.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    train_data_clean + test_data_clean,
    target_vocab_size=500,
    max_subword_length=10,
)

In [20]:
# Encode every cleaned tweet with the tokenizer, one encoded sequence per tweet.
train_inputs = list(map(tokenizer.encode, train_data_clean))
test_inputs = list(map(tokenizer.encode, test_data_clean))

In [21]:
# Alias the label arrays under the names used by the training and evaluation cells.
train_labels = train_data_labels
test_labels = test_data_labels

In [22]:
# Length of the longest encoded sequence across both splits.
# The maximum is taken per split instead of concatenating the two collections with `+`:
# that avoids copying the full training list and keeps the cell re-runnable (once the
# inputs are padded NumPy arrays, `+` would attempt element-wise addition and fail).
MAX_LEN = max(
    max(map(len, train_inputs), default=0),
    max(map(len, test_inputs), default=0))

# Right-pad ("post") every sequence with 0 up to MAX_LEN so both splits share one width.
train_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    train_inputs,
    value=0,
    padding="post",
    maxlen=MAX_LEN)
test_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    test_inputs,
    value=0,
    padding="post",
    maxlen=MAX_LEN)

In [23]:
class DCNN(tf.keras.Model):
    """CNN text classifier with parallel n-gram convolutions.

    Pipeline: embedding -> one Conv1D per n-gram width -> global max pooling of each
    branch -> concatenation -> dense layer -> dropout -> output layer.

    Args:
        vocab_size: number of token ids handled by the embedding layer.
        emb_dim: size of each embedding vector.
        nb_filters: number of filters in every convolution branch.
        FFN_units: units of the hidden dense layer.
        nb_classes: number of target classes. For 2 the output is a single sigmoid
            unit, otherwise a softmax over `nb_classes` units.
        dropout_rate: dropout rate applied after the hidden dense layer.
        training: not used by the constructor; kept so existing callers keep working.
        name: model name passed to `tf.keras.Model`.
        ngrams: kernel sizes; one convolution branch is created per entry.
    """

    def __init__(
            self,
            vocab_size,
            emb_dim=128,
            nb_filters=50,
            FFN_units=512,
            nb_classes=2,
            dropout_rate=0.1,
            training=False,
            name="dcnn",
            ngrams=(2, 3, 4)):  # immutable default instead of a shared mutable list
        super().__init__(name=name)

        self.embedding = layers.Embedding(
                            vocab_size,
                            emb_dim)

        # One convolution branch per n-gram width.
        self.conv1_list = [
            layers.Conv1D(
                filters=nb_filters,
                kernel_size=n,
                padding="valid",
                activation="relu")
            for n in ngrams
        ]

        # A single pooling layer is reused by every branch.
        self.pool_1 = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(
                            units=FFN_units,
                            activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            # Binary case: one probability.
            self.last_dense = layers.Dense(
                            units=1,
                            activation="sigmoid")
        else:
            # Multi-class case: one probability per class.
            self.last_dense = layers.Dense(
                            units=nb_classes,
                            activation="softmax")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: batch of padded token-id sequences.
            training: whether dropout is active. Defaults to False so the model can be
                called directly for inference without passing the flag.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)

        # Convolve and max-pool each n-gram branch over the sequence.
        pooled = [self.pool_1(conv(x)) for conv in self.conv1_list]

        merged = tf.concat(pooled, axis=-1)  # (batch_size, len(ngrams) * nb_filters)
        merged = self.dense_1(merged)
        # Pass the flag by keyword so it cannot be mistaken for another positional argument.
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

In [83]:
# print(len(set(train_labels)))

In [29]:
# Model and training configuration.
# Size of the tokenizer vocabulary; passed to the model as `vocab_size`.
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 100        # embedding vector size
NB_FILTERS = 200     # filters per convolution branch
FFN_UNITS = 128      # units of the hidden dense layer
# Number of distinct label values found in the training labels.
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 256     # used for both fit and evaluate
NB_EPOCHS = 5

In [30]:
# Instantiate the classifier with the configured hyper-parameters
# (name and ngrams keep their defaults).
Dcnn = DCNN(
          vocab_size=VOCAB_SIZE,
          emb_dim=EMB_DIM,
          nb_filters=NB_FILTERS,
          FFN_units=FFN_UNITS,
          nb_classes=NB_CLASSES,
          dropout_rate=DROPOUT_RATE)

In [31]:
# Compile with the loss/metric pair matching the output layer: binary cross-entropy for
# the two-class case, sparse categorical cross-entropy otherwise.
Dcnn.compile(
    loss=(
        "binary_crossentropy"
        if NB_CLASSES == 2
        else "sparse_categorical_crossentropy"),
    optimizer="adam",
    metrics=(
        ["accuracy"]
        if NB_CLASSES == 2
        else ["sparse_categorical_accuracy"]))

In [32]:
# Directory that holds the model checkpoints.
checkpoint_path = "/content/CNN_for_NLP/ckpt/"

# Track the model in a checkpoint object under the key "Dcnn".
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# Manage checkpoint files in checkpoint_path, keeping at most the 5 most recent ones.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the newest checkpoint when one exists in the directory.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored")

In [33]:
# Train the model, shuffling the training data each epoch.
# NOTE(review): the test split is passed as validation data, so the validation metrics
# reported during training are computed on the same data as the final evaluation.
Dcnn.fit(
    train_inputs,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    shuffle=True,
    validation_data=(test_inputs, test_labels))
# NOTE(review): saving is disabled here, so the visible cells never write a checkpoint.
# ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f16787d41d0>

In [None]:
# Evaluate the trained model on the test split and print the returned values
# (the loss and the metrics configured in compile).
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)