In [1]:
#@title Case Study Walkthrough
#@markdown  NLP with CNNs
from IPython.display import HTML

HTML("""<video width="320" height="240" controls>
  <source src="https://cdn.talentsprint.com/talentsprint/archives/sc/aiml/aiml_2018_b7_hyd/preview_videos/nlp_with_cnns.mp4">
</video>
""")

The objective of this experiment is to see the application of Convolutional Neural Networks in NLP.

####Note that this case study based on this [paper.](http://www.aclweb.org/anthology/D14-1181)

In [2]:
! wget https://cdn.talentsprint.com/aiml/Experiment_related_data/week9/Exp2/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin&sa=D&source=hangouts&ust=1550651743825000&usg=AFQjCNHh2LSwNi9czsqAAuBLvx_vDeUE_Q

--2022-06-06 15:36:34--  https://cdn.talentsprint.com/aiml/Experiment_related_data/week9/Exp2/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin
Resolving cdn.talentsprint.com (cdn.talentsprint.com)... 172.105.34.236
Connecting to cdn.talentsprint.com (cdn.talentsprint.com)|172.105.34.236|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://cdn.ap-south-1.linodeobjects.com/public_html/aiml/Experiment_related_data/week9/Exp2/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin [following]
--2022-06-06 15:36:34--  https://cdn.ap-south-1.linodeobjects.com/public_html/aiml/Experiment_related_data/week9/Exp2/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin
Resolving cdn.ap-south-1.linodeobjects.com (cdn.ap-south-1.linodeobjects.com)... 172.104.36.102, 103.3.61.236, 139.162.42.49, ...
Connecting to cdn.ap-south-1.linodeobjects.com (cdn.ap-south-1.linodeobjects.com)|172.104.36.102|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36

In [3]:
ls

AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin  [0m[01;34msample_data[0m/


##Importing required packages

In [4]:
import re
import nltk
import random
import numpy as np
from copy import deepcopy
from collections import Counter, OrderedDict

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

## Code for accessing CUDA

In [5]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

## Function to split the data in to batches

In [6]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

## Function to add the padding to batches if required

In [7]:
def pad_to_batch(batch):
    x,y = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            x_p.append(torch.cat([x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1)

## Function to prepare the sequence

In [8]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    #print(idxs)
    return Variable(LongTensor(idxs))

## Data load & Preprocessing

### TREC question dataset(http://cogcomp.org/Data/QA/QC/)

The following command gets the required TREC question dataset.

In [9]:
! wget http://cogcomp.org/Data/QA/QC/train_5500.label

--2022-06-06 15:44:53--  http://cogcomp.org/Data/QA/QC/train_5500.label
Resolving cogcomp.org (cogcomp.org)... 173.236.182.118
Connecting to cogcomp.org (cogcomp.org)|173.236.182.118|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://www.cogcomp.org/Data/QA/QC/train_5500.label [following]
--2022-06-06 15:44:53--  http://www.cogcomp.org/Data/QA/QC/train_5500.label
Resolving www.cogcomp.org (www.cogcomp.org)... 173.236.182.118
Reusing existing connection to cogcomp.org:80.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label [following]
--2022-06-06 15:44:54--  https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label
Resolving cogcomp.seas.upenn.edu (cogcomp.seas.upenn.edu)... 158.130.57.77
Connecting to cogcomp.seas.upenn.edu (cogcomp.seas.upenn.edu)|158.130.57.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 335858 (328K)
Saving to: ‘

In [10]:
! ls

AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin  sample_data  train_5500.label


Task involves
classifying a question into 6 question
types (whether the question is about person,
location, numeric information, etc.)

## Load the data

In [11]:
data = open('train_5500.label', 'r', encoding='latin-1').readlines()

In [12]:
data[:5]

['DESC:manner How did serfdom develop in and then leave Russia ?\n',
 'ENTY:cremat What films featured the character Popeye Doyle ?\n',
 "DESC:manner How can I find a list of celebrities ' real names ?\n",
 'ENTY:animal What fowl grabs the spotlight after the Chinese Year of the Monkey ?\n',
 'ABBR:exp What is the full form of .com ?\n']

## Split the data by seperating the labels

In [13]:
data = [[d.split(':')[1][:-1], d.split(':')[0]] for d in data]

In [14]:
data[:5]

[['manner How did serfdom develop in and then leave Russia ?', 'DESC'],
 ['cremat What films featured the character Popeye Doyle ?', 'ENTY'],
 ["manner How can I find a list of celebrities ' real names ?", 'DESC'],
 ['animal What fowl grabs the spotlight after the Chinese Year of the Monkey ?',
  'ENTY'],
 ['exp What is the full form of .com ?', 'ABBR']]

In [15]:
X, y = list(zip(*data))
X = list(X)

In [17]:
print(X[:5])

['manner How did serfdom develop in and then leave Russia ?', 'cremat What films featured the character Popeye Doyle ?', "manner How can I find a list of celebrities ' real names ?", 'animal What fowl grabs the spotlight after the Chinese Year of the Monkey ?', 'exp What is the full form of .com ?']


In [18]:
print(y[:5])

('DESC', 'ENTY', 'DESC', 'ENTY', 'ABBR')


## Print the labels in the data

In [19]:
set(y)

{'ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'}

## Number masking 

In [20]:
for i, x in enumerate(X):
    X[i] = re.sub('\d', '#', x).split()

Replacing the numbers with # (hash)

It reduces the search space. 

For example, 

my birthday is 12.22 ==> my birthday is ##.##

In [21]:
X[:2]

[['manner',
  'How',
  'did',
  'serfdom',
  'develop',
  'in',
  'and',
  'then',
  'leave',
  'Russia',
  '?'],
 ['cremat',
  'What',
  'films',
  'featured',
  'the',
  'character',
  'Popeye',
  'Doyle',
  '?']]

## Building the Vocabulary

In [22]:
vocab = list(set(flatten(X)))
print(len(vocab))
print(vocab)

9117
['visiting', 'typhoid', 'cabinetmaker', 'Needle', 'Oh', 'III', 'Johnston', 'Tulip', 'pearls', 'o', 'invent', 'folic', 'idle', 'shadow', 'convicted', 'Contract', 'cents', 'microprocessors', 'Magoo', 'streak', 'jockey', 'least-populated', 'ripening', 'Wile', 'move', 'function', 'What', 'copy', 'rise', 'cognac', 'North', 'says', 'Future', 'support', 'witch', 'assent', 'registration', 'Fatman', 'her', 'neurosurgeon', 'cry', 'co-educational', 'wrestling', 'Sistine', 'years', 'supports', 'being', 'comedian', 'Lewis', 'Peabody', 'Page', 'scythe', 'Wide', 'R-Rated', 'made', 'Dane', 'call', 'Pastorius', 'debts', 'weapon', 'planned', 'soundtrack', 'Strange', 'span', 'intractable', 'boil', 'Lebanon', 'heir', 'traffic', 'kidnaping', 'Stallone', 'pass', 'weft', 'example', 'proceed', 'method', 'Sultan', 'locomotive', 'Gulliver', 'safety', 'co-founder', 'describe', 'own', 'Jews', 'Butler', 'expelled', 'UOL', 'HIV', 'premiered', 'Krypton', 'Olestra', 'Protestant', 'clockwise', 'multiplexer', 'Hum

## Check for number of classes

In [23]:
len(set(y)) # num of class

6

## Create the index to words in the vocabulary

In [24]:
word2index={'<PAD>': 0, '<UNK>': 1}
print(len(word2index))

2


In [25]:
print(word2index.get('<PAD>'))
print(word2index.get('<UNK>'))
print(word2index.get(vocab[1]))

0
1
None


In [26]:
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
#print(word2index)
index2word = {v:k for k, v in word2index.items()}
#print(index2word)

## Create the index to target

In [27]:
target2index = {}

for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)

index2target = {v:k for k, v in target2index.items()}

## Preparing the data in tensor format

In [28]:
X_p, y_p = [], []
for pair in zip(X,y):
    ## Create the indexes for the list of split words of questions present in X and changing to tensor format
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1)) 
    ## Changes the format of labels to tensor format
    y_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))

## Zipping both the data and labels and shuffle randomly

In [29]:
data_p = list(zip(X_p, y_p))
random.shuffle(data_p)

## Split the data into train and test

In [30]:
train_data = data_p[: int(len(data_p) * 0.9)]
test_data = data_p[int(len(data_p) * 0.9):]

## Load Pretrained word vector

In [31]:
import gensim

model = gensim.models.KeyedVectors.load_word2vec_format('AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin', binary=True)
len(model.index2word)

3000000

In [32]:
model.index2word

['</s>',
 'in',
 'for',
 'that',
 'is',
 'on',
 '##',
 'The',
 'with',
 'said',
 'was',
 'the',
 'at',
 'not',
 'as',
 'it',
 'be',
 'from',
 'by',
 'are',
 'I',
 'have',
 'he',
 'will',
 'has',
 '####',
 'his',
 'an',
 'this',
 'or',
 'their',
 'who',
 'they',
 'but',
 '$',
 'had',
 'year',
 'were',
 'we',
 'more',
 '###',
 'up',
 'been',
 'you',
 'its',
 'one',
 'about',
 'would',
 'which',
 'out',
 'can',
 'It',
 'all',
 'also',
 'two',
 'after',
 'first',
 'He',
 'do',
 'time',
 'than',
 'when',
 'We',
 'over',
 'last',
 'new',
 'other',
 'her',
 'people',
 'into',
 'In',
 'our',
 'there',
 'A',
 'she',
 'could',
 'just',
 'years',
 'some',
 'U.S.',
 'three',
 'million',
 'them',
 'what',
 'But',
 'so',
 'no',
 'like',
 'if',
 'only',
 'percent',
 'get',
 'did',
 'him',
 'game',
 'back',
 'because',
 'now',
 '#.#',
 'before',
 'company',
 'any',
 'team',
 'against',
 'off',
 'This',
 'most',
 'made',
 'through',
 'make',
 'second',
 'state',
 'well',
 'day',
 'season',
 'says',
 'w

In [33]:
word2index.keys()

dict_keys(['<PAD>', '<UNK>', 'visiting', 'typhoid', 'cabinetmaker', 'Needle', 'Oh', 'III', 'Johnston', 'Tulip', 'pearls', 'o', 'invent', 'folic', 'idle', 'shadow', 'convicted', 'Contract', 'cents', 'microprocessors', 'Magoo', 'streak', 'jockey', 'least-populated', 'ripening', 'Wile', 'move', 'function', 'What', 'copy', 'rise', 'cognac', 'North', 'says', 'Future', 'support', 'witch', 'assent', 'registration', 'Fatman', 'her', 'neurosurgeon', 'cry', 'co-educational', 'wrestling', 'Sistine', 'years', 'supports', 'being', 'comedian', 'Lewis', 'Peabody', 'Page', 'scythe', 'Wide', 'R-Rated', 'made', 'Dane', 'call', 'Pastorius', 'debts', 'weapon', 'planned', 'soundtrack', 'Strange', 'span', 'intractable', 'boil', 'Lebanon', 'heir', 'traffic', 'kidnaping', 'Stallone', 'pass', 'weft', 'example', 'proceed', 'method', 'Sultan', 'locomotive', 'Gulliver', 'safety', 'co-founder', 'describe', 'own', 'Jews', 'Butler', 'expelled', 'UOL', 'HIV', 'premiered', 'Krypton', 'Olestra', 'Protestant', 'clockwis

In [34]:
print(model['pail'].shape)
print(np.random.randn(300).shape)

(300,)
(300,)


## Get the vector corresponding to the word using the pretrained model

In [35]:
pretrained = []

for index, key in enumerate(word2index.keys()):
    try:
        pretrained.append(model[key])
    except:
        #print(index, key)
        pretrained.append(np.random.randn(300))
        
pretrained_vectors = np.vstack(pretrained)
#print(pretrained)

## Modeling 


![alttxt](https://cdn.talentsprint.com/aiml/Casestudies_slides/NLP_with_CNN/NLP_with_CNN.png)





The above image is borrowed from this [paper.](http://www.aclweb.org/anthology/D14-1181)

## Define CNN classifier architecture for classification as per the paper 

In [36]:
class  CNNClassifier(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier,self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        # kernal_size = (K,D) 
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)
    
    
    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False


    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs]
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs]

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated)
        out = self.fc(concated) 
        return F.log_softmax(out,1)

## Training the model 

##It takes for a while if you use just cpu.

## Set the parameters

In [None]:
EPOCH = 5
BATCH_SIZE = 50
KERNEL_SIZES = [2,2,2]
KERNEL_DIM = 100
LR = 0.001

## Set up the defined CNN model and  Initialize embedding matrix using pretrained vectors

In [None]:
model = CNNClassifier(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)
model.init_weights(pretrained_vectors) # initialize embedding matrix using pretrained vectors

## Switch on the CUDA

In [None]:
if USE_CUDA:
    model = model.cuda()

## Define loss function and optimizer

In [None]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

## Train the data batch wise

In [None]:
for epoch in range(EPOCH):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs,targets = pad_to_batch(batch)
        
        model.zero_grad()
        preds = model(inputs, True)
        
        loss = loss_function(preds, targets)
        losses.append(loss.data.item())
        loss.backward()
        
        #for param in model.parameters():
        #    param.grad.data.clamp_(-3, 3)
        
        optimizer.step()
        
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

## Predict the test data with the trained model and calculate the test accuracy 

In [None]:
accuracy = 0
for test in test_data:
    pred = model(test[0].cuda()).max(1)[1]
    pred = pred.data.tolist()[0]
    target = test[1].data.tolist()[0][0]
    if pred == target:
        accuracy += 1

print(accuracy/len(test_data) * 100)