<a href="https://colab.research.google.com/github/rsekhar-vai/nlptc/blob/master/TextClassification_Jupyter_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup the Environment

In [0]:
!rm -r nlptc
!git clone https://github.com/rsekhar-vai/nlptc.git

Cloning into 'nlptc'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects:   1% (1/65)[Kremote: Counting objects:   3% (2/65)[Kremote: Counting objects:   4% (3/65)[Kremote: Counting objects:   6% (4/65)[Kremote: Counting objects:   7% (5/65)[Kremote: Counting objects:   9% (6/65)[Kremote: Counting objects:  10% (7/65)[Kremote: Counting objects:  12% (8/65)[Kremote: Counting objects:  13% (9/65)[Kremote: Counting objects:  15% (10/65)[Kremote: Counting objects:  16% (11/65)[Kremote: Counting objects:  18% (12/65)[Kremote: Counting objects:  20% (13/65)[Kremote: Counting objects:  21% (14/65)[Kremote: Counting objects:  23% (15/65)[Kremote: Counting objects:  24% (16/65)[Kremote: Counting objects:  26% (17/65)[Kremote: Counting objects:  27% (18/65)[Kremote: Counting objects:  29% (19/65)[Kremote: Counting objects:  30% (20/65)[Kremote: Counting objects:  32% (21/65)[Kremote: Counting objects:  33% (22/65)[Kremote: Counting ob

In [0]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sys
import spacy
import torchtext
from torchtext import data
from torchtext.data import Field, BucketIterator
from torchtext import vocab


print('Python version:',sys.version)
print('Pandas version:',pd.__version__)
print('Pytorch version:', torch.__version__)
print('Torch Text version:', torchtext.__version__)
print('Spacy version:', spacy.__version__)


Python version: 3.6.9 (default, Nov  7 2019, 10:44:02) 
[GCC 8.3.0]
Pandas version: 1.0.3
Pytorch version: 1.4.0
Torch Text version: 0.3.1
Spacy version: 2.2.4


In [0]:
  args = Namespace(
      text_csv="nlptc/data/news.csv",
      train_csv="nlptc/data/train.csv",
      val_csv="nlptc/data/val.csv",
      test_csv="nlptc/data/test.csv",
      vectorizer_file="vectorizer.json",
      model_state_file="model.pth",
      save_dir="model_storage/Clf",
      glove_filepath='D:\\Projects\\Text Analytics\\Glove\\glove.6B.100d.txt',
      hidden_dim=100,
      num_channels=100,
      seed=1337,
      learning_rate=0.001,
      dropout_p=0.1,
      batch_size=64,
      num_epochs=20,
      early_stopping_criteria=5,
      cuda=True,
      catch_keyboard_interrupt=True,
      reload_from_files=False,
      expand_filepaths_to_save_dir=True,
      token_type = 'w',
      max_text_length = 256,
      pretrained_embeddings= 'Glove',
      embedding_size=100,
      build_simple_char_cnn = False,
      build_simple_word_cnn = True,
      build_convrec_bilstm = False,
      build_vdcnn = False,

  )

In [0]:
from nlptc.classes import *
from nlptc.functions import *

setup_environment(args)


Expanded filepaths: 
	model_storage/Clf/vectorizer.json
	model_storage/Clf/model.pth
Using CUDA: True


# Read Data file and split into Train, Validation and Test
Text data file should have columns named as 'text' and 'category'. If the names are different, they should be renamed before processing further

In [0]:
dataframe_data = pd.read_csv(args.text_csv)
        
max_text_length = 256,
args.token_type = 'w'
#text_df = pd.read_csv(args.text_csv)
text_df_orig = pd.read_csv(args.text_csv)
print(text_df_orig.columns)

text_df_orig.rename(columns={'title':'text'},inplace=True)
text_df = text_df_orig[['text','category']] 
print(text_df.columns)

dataset = NLPDatasets(text_df,args)
train_df, val_df, test_df = dataset.get_splits()

train_df.to_csv(args.train_csv, index=False)
val_df.to_csv(args.val_csv, index=False)
test_df.to_csv(args.test_csv, index=False)


Index(['title', 'category'], dtype='object')
Index(['text', 'category'], dtype='object')


In [0]:
#dataframe_data = pd.read_csv('nlptc/data/train.csv')

#encoder = preprocessing.LabelEncoder()
#dataframe_data['sentiment'] = encoder.fit_transform(dataframe_data['sentiment'])

#dataframe_train, dataframe_val = train_test_split(dataframe_data)
#dataframe_train.to_csv("file_train.csv", index=False)
#dataframe_val.to_csv("file_val.csv", index=False)

train_df.head()


Unnamed: 0,text,category
83622,Iran Says Preliminary Nuclear Deal Reached wit...,World
71641,"Powell, Japan #39;s Machimura Discuss Iraq, Af...",World
34791,China grabs software research deals despite risks,Sci/Tech
9430,Williams-Sonoma Profit Up; Keeps Forecast (Reu...,Business
116739,HIH inquiry claims its biggest scalp,Business


# Define TorchText Fields Variables and map them to Data file Columns
TEXT variable maps to the text we will process. LABEL variable maps to Category associated with the TEXT.

TEXT and LABEL are of type FIELD which is part of TorchText package. FIELD comes with many built in functions that help in simplifying proprocessing. For example we can pass tokenizer as the argument to TEXT. Using this, TorchText does automatic tokenization while parsing the Text File Columns




In [0]:
nlp = spacy.load('en_core_web_sm')

def tokenizer(sentence):
    tokens = [w.text.lower() for w in nlp(clean_text(sentence))]
    return tokens

In [0]:
Field_TEXT = data.Field(tokenize=tokenizer, sequential=True, 
                        use_vocab=True,batch_first=True,fix_length=args.max_text_length)
Field_LABEL = data.LabelField(sequential=False)
mapping_with_file_columns = [('text', Field_TEXT), ('category', Field_LABEL)]
Dataset_train, Dataset_val = data.TabularDataset.splits(
    path='',
    train=args.train_csv,
    validation=args.val_csv,
    format='csv',
    fields=mapping_with_file_columns,
    skip_header=True
)

# Create TorchText Dataset wrappers around Train and Validation Data files
TorchText Dataset is a wrapper around a normal data file. It comes with many build in functions that help simplify processing. Dataset wrapper needs the mapping between TorchText Fields and data file columns (as defined in the previous step) 

In [0]:
 Dataset_train, Dataset_val, Dataset_test = data.TabularDataset.splits(
                                         path = '',
                                         train = args.train_csv,
                                         validation = args.val_csv,
                                         test = args.test_csv,
                                           format = 'csv',
                                         fields = mapping_with_file_columns,
                                         skip_header = True
     )
print(vars(Dataset_train[0]))


{'text': ['iran', 'says', 'preliminary', 'nuclear', 'deal', 'reached', 'with', 'eu'], 'category': 'World'}


# Build Vocabulary and Word Vectors using the Datasets Created

Volcabulary is list of unique Tokens in the text data. We will map each of the unique tokens to Word Vectors (or embeddings) using Glove Database

In [0]:
 %%time
MAX_VOCAB_SIZE = 25000
#vec = vocab.Vectors('glove.6B.100d.txt', 'D:/qBots/nlptc/glove_embedding/')
Field_TEXT.build_vocab(Dataset_train,Dataset_val,
                  max_size = MAX_VOCAB_SIZE, 
                  vectors = "glove.6B.100d", 
##                vectors = vec,
                  unk_init = torch.Tensor.normal_)
Field_LABEL.build_vocab(Dataset_train)


CPU times: user 891 ms, sys: 130 ms, total: 1.02 s
Wall time: 1.02 s


In [0]:
Field_TEXT.vocab.vectors.shape

torch.Size([25002, 100])

In [0]:
Field_TEXT.vocab.vectors[Field_TEXT.vocab.stoi['testing']]

tensor([ 0.1258,  0.2037, -0.1123,  0.3045, -0.9769, -0.5542,  0.4286,  0.7928,
        -0.7157,  0.7587, -0.1077, -0.3529,  0.0333, -0.0764,  0.3469,  0.2451,
         0.5385,  0.7100,  0.0522, -0.1323, -0.4734, -0.1800,  0.1982, -0.0651,
        -0.4349,  0.4343, -0.0951, -0.3966, -0.4798,  0.4940, -0.3679,  0.2031,
        -0.3497,  0.2970,  1.0122,  0.0933, -0.3492, -0.4592, -0.8168,  0.0374,
        -0.8714, -0.1617, -0.0595, -0.4740, -0.3519,  0.2339,  0.4807, -0.6236,
        -0.3462, -0.7426,  0.8620,  0.0581, -0.8875,  0.8843,  0.0444, -1.3311,
        -0.7529,  0.0039,  1.9239, -0.0039,  0.2040,  0.3463,  0.9224,  0.5198,
         0.4460,  0.6641, -0.2858, -0.2286,  0.1590,  0.1543, -0.0106,  0.5443,
        -0.1647,  0.2509, -0.1672,  0.4843,  0.4626, -0.4024, -1.2572, -0.2214,
         0.6287, -0.2696, -0.5828,  0.2838, -1.7209,  0.6726,  0.8789,  0.3482,
        -0.9895,  0.4533, -0.5951,  0.0075,  0.0057,  0.5793,  0.6368,  0.5939,
        -0.1434, -0.2136,  0.4768,  0.39

In [0]:
print(Field_LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fb95ed2ca60>, {'Sci/Tech': 0, 'World': 1, 'Business': 2, 'Sports': 3})


In [0]:
embeddings = Field_TEXT.vocab.vectors.numpy()


# Create the Torchtext Batches as wrapper around Dataset_train and Dataset_val for iterating over during Training/Validation

In [0]:
BATCH_SIZE = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Batches_train, Batches_val, Batches_test = data.BucketIterator.splits(
    (Dataset_train, Dataset_val,Dataset_test), 
    batch_size = args.batch_size,
    sort_key=lambda x: len(x.text), 
    sort_within_batch=True, 
    device = device)

print("***** Number of Train and Validation batches are :",len(Batches_train), len(Batches_val))


***** Number of Train and Validation batches are : 1055 352


In [0]:
batch = next(iter(Batches_train))

In [0]:
batch.category.shape

torch.Size([64])

In [0]:
batch.text.shape

torch.Size([64, 256])

In [0]:
def idxtosent(batch, idx):
    return ' '.join([Field_TEXT.vocab.itos[i] for i in batch.text[idx,:].cpu().data.numpy()])

In [0]:
for i in range(2):
  print(idxtosent(batch,i))

In [0]:
class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)


# Define a Simpe Neural Network Model

In [0]:

class WordCNN_Simple(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_channels,
                 hidden_dim, num_classes, dropout_p,
                 pretrained_embeddings=None, padding_idx=0):
        """
        Args:
            embedding_size (int): size of the embedding vectors
            num_embeddings (int): number of embedding vectors
            filter_width (int): width of the convolutional kernels
            num_channels (int): number of convolutional kernels per layer
            hidden_dim (int): the size of the hidden dimension
            num_classes (int): the number of classes in classification
            dropout_p (float): a dropout parameter
            pretrained_embeddings (numpy.array): previously trained word embeddings
                default is None. If provided,
            padding_idx (int): an index representing a null position
        """
        super(WordCNN_Simple, self).__init__()

        if pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx,
                                    _weight=pretrained_embeddings)

        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size,
                      out_channels=num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                      kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                      kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                      kernel_size=3),
            nn.ELU()
        )

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the classifier

        Args:
            x_in (torch.Tensor): an input data tensor.
                x_in.shape should be (batch, dataset._max_seq_length)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, num_classes)
        """

        # embed and permute so features are channels
        x_embedded = self.emb(x_in).permute(0, 2, 1)

        features = self.convnet(x_embedded)

        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self._dropout_p)

        # mlp classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

In [0]:
embeddings = Field_TEXT.vocab.vectors.numpy()
type(embeddings)

numpy.ndarray

In [0]:
classifier = WordCNN_Simple(embedding_size=args.embedding_size,
                                    num_embeddings=len(Field_TEXT.vocab),
                                    num_channels=args.num_channels,
                                    hidden_dim=args.hidden_dim,
                                    num_classes=len(Field_LABEL.vocab),
                                    dropout_p=args.dropout_p,
                                    pretrained_embeddings=embeddings,
                                    padding_idx=0)

In [0]:
classifier = classifier.to(device)
#dataset.class_weights = dataset.class_weights.to(device)
#loss_func = nn.CrossEntropyLoss(dataset.class_weights)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                  mode='min', factor=0.5,
                                                  patience=1)


In [0]:
text_df.rename(columns={'title':'text'},inplace=True)
text_df.columns

Index(['text', 'category'], dtype='object')

In [0]:
vectorizer = Vectorizer(text_df)
dataset._vectorizer = vectorizer


In [0]:
classifier = classifier.to(args.device)
#dataset.class_weights = dataset.class_weights.to(args.device)
#loss_func = nn.CrossEntropyLoss(dataset.class_weights)
loss_func = nn.CrossEntropyLoss()

optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                  mode='min', factor=0.5,
                                                  patience=1)
print("------- # of Parameters ---->: ", sum(p.numel() for p in classifier.parameters() if p.requires_grad))


------- # of Parameters ---->:  2631104


In [0]:
dataset.text_df.rename(columns={'title':'text'},inplace=True)
dataset.text_df.columns

Index(['text', 'category'], dtype='object')

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(classifier):,} trainable parameters')


The model has 2,631,104 trainable parameters


In [0]:
train_state = make_train_state(args)


In [0]:
train_batch_it = BatchGenerator(Batches_train, 'text', 'category')
X,y = next(iter(train_batch_it))

In [0]:
print(X.shape)
y_pred = classifier(X)
print(y_pred.shape)


torch.Size([64, 256])
torch.Size([64, 4])


In [0]:
y_pred.shape

torch.Size([64, 4])

In [0]:

def build_model(args,dataset,classifier,loss_func,optimizer,scheduler):

    train_state = make_train_state(args)

    try:
        for epoch_index in range(args.num_epochs):
            train_state['epoch_index'] = epoch_index
            print("--------------------- @epoch ",epoch_index,"---------------------")

            # Iterate over training dataset

            # setup: batch generator, set loss and acc to 0, set train mode on

            dataset.set_split('train')
            batch_generator = generate_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)
            running_loss = 0.0
            running_acc = 0.0
            classifier.train()
            batches = BatchGenerator(Batches_train, 'text', 'category')

            for batch_index, batch_dict in enumerate(batches):
              optimizer.zero_grad()
              y_pred = classifier(batch_dict[0])
              loss = loss_func(y_pred, batch_dict[1])
              loss_t = loss.item()
              running_loss += (loss_t - running_loss) / (batch_index + 1)
              loss.backward()
              optimizer.step()
              acc_t = compute_accuracy(y_pred, batch_dict[1])
              running_acc += (acc_t - running_acc) / (batch_index + 1)

            train_state['train_loss'].append(running_loss)
            train_state['train_acc'].append(running_acc)
            print('  training loss/accuracy {:.5f} / {:.2f}'.format(running_loss, running_acc))

            dataset.set_split('val')

            batches = BatchGenerator(Batches_val, 'text', 'category')
            running_loss = 0.
            running_acc = 0.
            classifier.eval()

            for batch_index, batch_dict in enumerate(batches):
              optimizer.zero_grad()
              y_pred = classifier(batch_dict[0])
              loss = loss_func(y_pred, batch_dict[1])
              loss_t = loss.item()
              running_loss += (loss_t - running_loss) / (batch_index + 1)
              loss.backward()
              optimizer.step()
              acc_t = compute_accuracy(y_pred, batch_dict[1])
              running_acc += (acc_t - running_acc) / (batch_index + 1)

            train_state['val_loss'].append(running_loss)
            train_state['val_acc'].append(running_acc)
            print('validation loss/accuracy {:.5f} / {:.2f}'.format(running_loss, running_acc))

            train_state = update_train_state(args=args, model=classifier,
                                             train_state=train_state)

            scheduler.step(train_state['val_loss'][-1])

            if train_state['stop_early']:
                break

    except KeyboardInterrupt:
        print("Exiting loop")

    # compute the loss & accuracy on the test set using the best available model

    classifier.load_state_dict(torch.load(train_state['model_filename']))
    classifier = classifier.to(args.device)
    loss_func = nn.CrossEntropyLoss()
    dataset.set_split('test')
    batches = BatchGenerator(Batches_test, 'text', 'category')
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batches):
      optimizer.zero_grad()
      y_pred = classifier(batch_dict[0])
      loss = loss_func(y_pred, batch_dict[1])
      loss_t = loss.item()
      running_loss += (loss_t - running_loss) / (batch_index + 1)
      loss.backward()
      optimizer.step()
      acc_t = compute_accuracy(y_pred, batch_dict[1])
      running_acc += (acc_t - running_acc) / (batch_index + 1)

    train_state['test_loss'] = running_loss
    train_state['test_acc'] = running_acc

    print("Test loss: {:.3f}".format(running_loss))
    print("Test Accuracy: {:.2f}".format(running_acc))

    return train_state


In [0]:
results = build_model(args,dataset,classifier,loss_func,optimizer,scheduler)


--------------------- @epoch  0 ---------------------
  training loss/accuracy 1.30919 / 31.06
validation loss/accuracy 0.89407 / 54.15
--------------------- @epoch  1 ---------------------
  training loss/accuracy 0.50743 / 82.18
validation loss/accuracy 0.38762 / 86.94
--------------------- @epoch  2 ---------------------
  training loss/accuracy 0.35369 / 88.30
validation loss/accuracy 0.31346 / 89.59
--------------------- @epoch  3 ---------------------
  training loss/accuracy 0.28862 / 90.58
validation loss/accuracy 0.26521 / 91.26
--------------------- @epoch  4 ---------------------
  training loss/accuracy 0.24929 / 91.86
validation loss/accuracy 0.23265 / 92.58
--------------------- @epoch  5 ---------------------
  training loss/accuracy 0.22109 / 92.85
validation loss/accuracy 0.20563 / 93.24
--------------------- @epoch  6 ---------------------
  training loss/accuracy 0.19406 / 93.72
validation loss/accuracy 0.18329 / 94.26
--------------------- @epoch  7 ----------------

In [0]:
print(results)

{'stop_early': False, 'early_stopping_step': 0, 'early_stopping_best_val': 100000000.0, 'learning_rate': 0.001, 'epoch_index': 19, 'train_loss': [1.3091858076258294, 0.5074347156483979, 0.35368603939693766, 0.2886233720734219, 0.2492928947494214, 0.22108668155297276, 0.1940635639220761, 0.1678905941775483, 0.1491452649244555, 0.12753541925789616, 0.1109703586423567, 0.09641759916963455, 0.08256678381202052, 0.07361107674290504, 0.06352113793831386, 0.05546512596410703, 0.04954216448740234, 0.04512659004002246, 0.040942304286558436, 0.037469739202957285], 'train_acc': [31.062715424386024, 82.17982550624733, 88.29707022834991, 90.57922231796634, 91.8618052563551, 92.85060318828086, 93.72037914691938, 94.60092632485994, 95.21178909952613, 95.90868698836714, 96.32041684618682, 96.74547608789317, 97.13714454976306, 97.45704976303335, 97.75622037914701, 97.9828199052133, 98.19689788884097, 98.25385071090042, 98.47896919431274, 98.62491921585524], 'val_loss': [0.8940749955786901, 0.3876240466