<a href="https://colab.research.google.com/github/smf-9000/Text-Intent-Classification/blob/main/Text_Classification_CNN_for_NLP_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
Starting point for this notebook:
https://www.udemy.com/course/modern-nlp
```



In [None]:
# Download the training/test archive into /content, the path the unzip cell reads.
# -nc skips the download when the file is already there, so re-running the cell
# does not create trainingandtestdata.zip.1; the https URL avoids the 301 redirect.
!wget -nc -P /content https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

--2021-07-08 10:44:38--  http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip [following]
--2021-07-08 10:44:38--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip’


2021-07-08 10:44:41 (31.7 MB/s) - ‘trainingandtestdata.zip’ saved [81363704/81363704]



In [None]:
# Create the working directories and unpack the archive.
# -p lets mkdir succeed when the directories already exist and -o lets unzip
# overwrite without prompting, so the cell can be re-run on a live runtime.
!mkdir -p /content/CNN_for_NLP/data /content/CNN_for_NLP/ckpt
!unzip -o /content/trainingandtestdata.zip -d /content/CNN_for_NLP/data

Archive:  /content/trainingandtestdata.zip
  inflating: /content/CNN_for_NLP/data/testdata.manual.2009.06.14.csv  
  inflating: /content/CNN_for_NLP/data/training.1600000.processed.noemoticon.csv  


In [None]:
import numpy as np
import pandas as pd
import math
import re
from bs4 import BeautifulSoup
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [None]:
# Column names for the header-less training CSV, in file order.
cols = [
    "sentiment",
    "id",
    "date",
    "query",
    "user",
    "text",
]

In [None]:
# Load the raw training tweets. The file has no header row, so the column
# names come from `cols`; the bytes are decoded as latin1.
data_from_csv = pd.read_csv(
    "/content/CNN_for_NLP/data/training.1600000.processed.noemoticon.csv",
    names=cols,
    header=None,
    encoding="latin1",
    engine="python",
)

In [None]:
# Randomly discard `remove_n` rows, then split what is left into a 400000-row
# training set and a test set holding the remaining rows.
# A seeded generator makes both the sample and the split reproducible; the
# unseeded global NumPy RNG used before produced different data on every run.
SEED = 42
rng = np.random.default_rng(SEED)

remove_n = 1000000
drop_indices = rng.choice(data_from_csv.index, remove_n, replace=False)
data = data_from_csv.drop(drop_indices).reset_index(drop=True)

# After reset_index the labels are 0..len(data)-1, so the sampled values are
# valid both as positions for .iloc and as labels for .drop.
train_indecies = rng.choice(data.index, 400000, replace=False)
train_data = data.iloc[train_indecies].reset_index(drop=True)

test_data = data.drop(train_indecies).reset_index(drop=True)

# The two splits must partition the down-sampled frame exactly.
assert len(train_data) + len(test_data) == len(data)


In [None]:
# Sanity check: number of training rows whose raw sentiment label is 0
# (presumably the negative class; labels have not been remapped yet at this point).
(train_data['sentiment'] == 0).sum()

In [None]:
# Preview the first rows of the sampled training frame.
train_data.head()

In [None]:
# Number of rows in the training split.
train_data.shape[0]

In [None]:
# Remove the metadata columns from both splits.
# Reassigning the result of drop(..., errors="ignore") replaces the former
# inplace=True calls, which raised KeyError when the cell was run a second
# time because the columns were already gone.
unused_columns = ["id", "date", "query", "user"]
train_data = train_data.drop(columns=unused_columns, errors="ignore")
test_data = test_data.drop(columns=unused_columns, errors="ignore")

In [None]:
# Preview the training frame again to check which columns it now holds.
train_data.head()

In [None]:
# Targets: the raw label 4 is remapped to 1, every other value is kept as is.
# np.where builds new arrays instead of writing through `.values`, so the
# DataFrames are not silently modified and the cell still works when pandas
# hands out read-only arrays (Copy-on-Write).
train_sentiment = train_data["sentiment"].to_numpy()
train_data_labels = np.where(train_sentiment == 4, 1, train_sentiment)
test_sentiment = test_data["sentiment"].to_numpy()
test_data_labels = np.where(test_sentiment == 4, 1, test_sentiment)

In [None]:
def clean_tweet(tweet):
  """Normalise one raw tweet into plain text for tokenisation.

  Steps, in order: extract the text from any HTML markup with BeautifulSoup,
  blank out @mentions and URLs, replace every character other than ASCII
  letters and . ! ? ' with a space, then collapse runs of spaces.

  Args:
    tweet: Raw tweet text.

  Returns:
    The cleaned string. A single leading or trailing space is not stripped.
  """
  tweet = BeautifulSoup(tweet, "lxml").get_text()
  # Handles may contain underscores; without "_" in the class the tail of
  # "@some_user" ("user") survived as a spurious word.
  tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
  # Consume the link up to the next whitespace so query strings, dashes and
  # similar characters go with it instead of leaving fragments behind.
  tweet = re.sub(r"https?://\S+", ' ', tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
  tweet = re.sub(r" +", " ", tweet)
  return tweet

In [None]:
# Run the cleaner over every tweet of both splits, preserving row order.
train_data_clean = list(map(clean_tweet, train_data.text))
test_data_clean = list(map(clean_tweet, test_data.text))

In [None]:
# set(test_data_labels)

In [None]:
# Learn the subword vocabulary from the training tweets only. Feeding
# test_data_clean into the corpus as well let test-set text shape the
# preprocessing (data leakage). The encoder falls back to smaller pieces for
# unseen strings, so the test tweets can still be encoded afterwards.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    train_data_clean,
    target_vocab_size=5000,
    max_subword_length=7)

In [None]:
# Turn each cleaned tweet into its sequence of subword ids.
train_inputs = list(map(tokenizer.encode, train_data_clean))
test_inputs = list(map(tokenizer.encode, test_data_clean))

In [None]:
# Expose the label arrays under the names used by the training cells.
train_labels, test_labels = train_data_labels, test_data_labels

In [None]:
# Pad every id sequence with trailing zeros up to the length of the longest
# sequence found in either split.
MAX_LEN = max(len(token_ids) for token_ids in train_inputs + test_inputs)
pad_to_max_len = dict(value=0, padding="post", maxlen=MAX_LEN)
train_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    train_inputs, **pad_to_max_len)
test_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    test_inputs, **pad_to_max_len)

In [None]:
class DCNN(tf.keras.Model):
    """Text classifier: embedding -> parallel Conv1D branches -> dense head.

    One Conv1D branch is created per entry of ``ngrams``. Each branch output
    is globally max-pooled, the pooled vectors are concatenated, and the
    result goes through a hidden dense layer and an output layer.

    Args:
        vocab_size: First argument of the Embedding layer (number of ids).
        emb_dim: Embedding width.
        nb_filters: Number of filters in each convolution branch.
        FFN_units: Units of the hidden dense layer.
        nb_classes: 2 gives a single sigmoid unit; any other value gives a
            softmax layer with ``nb_classes`` units.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        training: Unused; kept so callers that pass it keep working.
        name: Keras model name.
        ngrams: Kernel sizes, one convolution branch per entry.
        emb_dropout_rate: Dropout rate applied to the embeddings
            (0.4, the value that used to be hard-coded).
    """

    def __init__(
            self,
            vocab_size,
            emb_dim=128,
            nb_filters=50,
            FFN_units=512,
            nb_classes=2,
            dropout_rate=0.1,
            training=False,
            name="dcnn",
            ngrams=(2, 3, 4),
            emb_dropout_rate=0.4):
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(
                            vocab_size,
                            emb_dim)
        # An immutable default avoids sharing one list object between instances.
        self.conv1_list = [
            layers.Conv1D(
                filters=nb_filters,
                kernel_size=n,
                padding="valid",
                activation="relu")
            for n in ngrams
        ]

        # The pooling layer has no weights, so one instance serves all branches.
        self.pool_1 = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(
                            units=FFN_units,
                            activation="relu")
        self.dropout_e = layers.Dropout(rate=emb_dropout_rate)
        self.dropout_d = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            self.last_dense = layers.Dense(
                            units=1,
                            activation="sigmoid")
        else:
            self.last_dense = layers.Dense(
                            units=nb_classes,
                            activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
            training: Forwarded to both dropout layers. Defaults to None so
                the method can also be called without the flag.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)
        # Pass the flag by keyword so it cannot be mistaken for another
        # positional argument of the layer.
        x = self.dropout_e(x, training=training)
        pooled = [self.pool_1(conv(x)) for conv in self.conv1_list]

        merged = tf.concat(pooled, axis=-1)  # (batch_size, len(ngrams) * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout_d(merged, training=training)
        return self.last_dense(merged)

In [None]:
# print(len(set(train_labels)))

In [None]:
# --- Values derived from the data ---
VOCAB_SIZE = tokenizer.vocab_size            # number of ids the encoder can emit
NB_CLASSES = len(np.unique(train_labels))    # distinct label values in the training set

# --- Model size ---
EMB_DIM = 128
NB_FILTERS = 64
FFN_UNITS = 64
DROPOUT_RATE = 0.2

# --- Training schedule ---
BATCH_SIZE = 256
NB_EPOCHS = 5

In [None]:
# Instantiate the classifier from the hyper-parameter constants; the
# convolution branches use kernel sizes 2, 3 and 5.
Dcnn = DCNN(
    VOCAB_SIZE,
    emb_dim=EMB_DIM, nb_filters=NB_FILTERS, FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE,
    ngrams=[2, 3, 5],
)

In [None]:
# Two classes: binary cross-entropy with plain accuracy. Any other class
# count: the sparse categorical loss/metric pair. Adam is used in both cases.
binary_task = NB_CLASSES == 2
Dcnn.compile(
    optimizer="adam",
    loss="binary_crossentropy" if binary_task else "sparse_categorical_crossentropy",
    metrics=["accuracy"] if binary_task else ["sparse_categorical_accuracy"],
)

In [None]:
# Checkpoint bookkeeping: track the model under the key "Dcnn", keep at most
# five checkpoints in the ckpt directory, and resume from the newest one if
# the directory already contains any.
checkpoint_path = "/content/CNN_for_NLP/ckpt/"
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt, directory=checkpoint_path, max_to_keep=5)

latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print("Latest checkpoint restored")

In [None]:
# Train for NB_EPOCHS epochs with shuffling enabled, scoring the test split
# as validation data after each epoch. The fit call stays the last statement
# so its History object is shown as the cell output.
# ckpt_manager.save()  # still disabled: no checkpoint is written by this cell
Dcnn.fit(
    x=train_inputs,
    y=train_labels,
    validation_data=(test_inputs, test_labels),
    epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f71cf524e10>

In [None]:
# Score the trained model on the test split and print what evaluate returns
# (the loss followed by the compiled metric values).
results = Dcnn.evaluate(x=test_inputs, y=test_labels, batch_size=BATCH_SIZE)
print(results)