# CNN Classifier

In [1]:
import numpy as np
import pandas as pd

In [2]:
'''df = pd.read_csv('/content/IMDB_Dataset.csv')'''

"df = pd.read_csv('/content/IMDB_Dataset.csv')"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
with open('/content/drive/MyDrive/reviews.txt', 'r') as f:
    reviews = f.read()
with open('/content/drive/MyDrive/labels.txt', 'r') as f:
    labels = f.read()

In [5]:
print(reviews[:500])
print()
print(labels[:20])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which

positive
negative
po


---
## Data Pre-processing


In [6]:
'''from google.colab import drive
drive.mount('/content/drive')'''

"from google.colab import drive\ndrive.mount('/content/drive')"

In [7]:

from string import punctuation

# remove punctuation
reviews = reviews.lower() # lowercase, standardize
all_text = ''.join([c for c in reviews if c not in punctuation])

# split by new lines and spaces
reviews_split = all_text.split('\n')

all_text = ' '.join(reviews_split)

# create a list of all words
all_words = all_text.split()


### Encoding the Labels

Convert "positive" or "negative" labels to numerical values, 1 (positive) and 0 (negative).

In [8]:
# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == 'positive' else 0 for label in labels_split])

### Removing Outliers

Remove outliers in following steps
1. Getting rid of extremely long or short reviews; the outliers
2. Padding/truncating the remaining data so that all reviews are of the same length.

In [9]:
from collections import Counter

# Build a dictionary that maps indices to review lengths
counts = Counter(all_words)

review_lens = Counter([len(x.split()) for x in reviews_split])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [10]:
print('Number of reviews before removing outliers: ', len(reviews_split))

non_zero_idx = [ii for ii, review in enumerate(reviews_split) if len(review.split()) != 0]

reviews_split = [reviews_split[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_split))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


---
## Using a Pre-Trained Embedding Layer

In [11]:
# Load Word2Vec
from gensim.models import KeyedVectors

# Creating the model
embed_lookup = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300-SLIM.bin',
                                                 binary=True)


In [12]:
pretrained_words = []
for index, word in enumerate(embed_lookup.index_to_key):
    pretrained_words.append(word)

In [13]:
row_idx = 1

# get word/embedding in that row
word = pretrained_words[row_idx]
embedding = embed_lookup[word]

# vocab and embedding info
print("Size of Vocab: {}\n".format(len(pretrained_words)))
print('Word in vocab: {}\n'.format(word))
print('Length of embedding: {}\n'.format(len(embedding)))
#print('Associated embedding: \n', embedding)

Size of Vocab: 299567

Word in vocab: for

Length of embedding: 300



In [14]:
# print a few common words
for i in range(5):
    print(pretrained_words[i])

in
for
that
is
on


### Cosine Similarity


In [15]:
find_similar_to = 'fabulous'

print('Similar words to '+find_similar_to+': \n')

# Find similar words, using cosine similarity
for similar_word in embed_lookup.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.3f}".format(
        similar_word[0], similar_word[1]
    ))


Similar words to fabulous: 

Word: wonderful, Similarity: 0.761
Word: fantastic, Similarity: 0.761
Word: marvelous, Similarity: 0.730
Word: gorgeous, Similarity: 0.714
Word: lovely, Similarity: 0.713
Word: terrific, Similarity: 0.694
Word: amazing, Similarity: 0.693
Word: beautiful, Similarity: 0.670
Word: magnificent, Similarity: 0.667
Word: splendid, Similarity: 0.645


## Tokenize reviews


In [16]:
def tokenize_all_reviews(embed_lookup, reviews_split):
    reviews_words = [review.split() for review in reviews_split]

    tokenized_reviews = []
    for review in reviews_words:
        ints = []
        for word in review:
            try:
                idx = embed_lookup.key_to_index[word]
            except:
                idx = 0
            ints.append(idx)
        tokenized_reviews.append(ints)

    return tokenized_reviews


In [17]:
tokenized_reviews = tokenize_all_reviews(embed_lookup, reviews_split)

In [18]:
print(tokenized_reviews[0])

[0, 137, 3, 0, 11620, 3799, 13, 1215, 10, 9, 194, 54, 12, 73, 61, 685, 41, 183, 243, 129, 12, 1663, 119, 72, 0, 9, 2989, 7334, 242, 159, 0, 453, 2, 0, 137, 1239, 19951, 3, 141, 1980, 0, 1898, 55, 3, 1663, 9, 11124, 0, 3857, 6663, 9, 20401, 295, 28, 45, 148, 157, 102, 27, 15452, 1663, 30714, 9, 65172, 0, 9, 844, 737, 47, 6585, 159, 0, 9, 668, 4365, 1003, 0, 27, 295, 56, 4365, 622, 9, 3832, 0, 43, 0, 897, 3187, 907, 0, 5396, 113, 9, 183, 4365, 1009, 3165, 10, 137, 0, 3288, 296, 10314, 4365, 6638, 213, 0, 8810, 40, 0, 116, 1663, 897, 2059, 0, 0, 137, 4365, 830, 2, 124, 2216, 0, 119, 782, 144, 2, 0, 137, 3, 330, 23046, 78, 0, 16915, 2, 13, 85275, 7451]


---
## Padding sequences


In [19]:
def pad_features(tokenized_reviews, seq_length):
    features = np.zeros((len(tokenized_reviews), seq_length), dtype=int)
    for i, row in enumerate(tokenized_reviews):
        features[i, -len(row):] = np.array(row)[:seq_length]

    return features

In [20]:
seq_length = 200

features = pad_features(tokenized_reviews, seq_length=seq_length)
assert len(features)==len(tokenized_reviews), "Features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

print(features[:10,:20])

[[     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0]
 [ 16483     26      0     12 106210      0   1698     22     37     24
     432      1     72     30    275      0    303      0    162    126]
 [  1935   1326     12      0   1403     60   3921   2019      3   4809
      36      6   3172   7184    129   7951      0   2180   6098 166268]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0 

---
## Training, Validation, and Test Data


In [21]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


## DataLoaders and Batching


In [22]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# shuffling and batching data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

---
# Sentiment Network with PyTorch


In [23]:
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [24]:
import torch.nn as nn
import torch.nn.functional as F

class SentimentCNN(nn.Module):


    def __init__(self, embed_model, vocab_size, output_size, embedding_dim,
                 num_filters=100, kernel_sizes=[3, 4, 5], freeze_embeddings=True, drop_prob=0.5):

        super(SentimentCNN, self).__init__()

        self.num_filters = num_filters
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.from_numpy(embed_model.vectors)) # all vectors
        if freeze_embeddings:
            self.embedding.requires_grad = False

        self.convs_1d = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embedding_dim), padding=(k-2,0))
            for k in kernel_sizes])

        self.fc = nn.Linear(len(kernel_sizes) * num_filters, output_size)

        self.dropout = nn.Dropout(drop_prob)
        self.sig = nn.Sigmoid()


    def conv_and_pool(self, x, conv):

        x = F.relu(conv(x)).squeeze(3)

        x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x_max

    def forward(self, x):
        embeds = self.embedding(x)
        embeds = embeds.unsqueeze(1)
        conv_results = [self.conv_and_pool(embeds, conv) for conv in self.convs_1d]
        x = torch.cat(conv_results, 1)
        x = self.dropout(x)
        logit = self.fc(x)
        return self.sig(logit)


## Instantiate the network


In [25]:
vocab_size = len(pretrained_words)
output_size = 1
embedding_dim = len(embed_lookup[pretrained_words[0]])
num_filters = 100
kernel_sizes = [3, 4, 5]

net3 = SentimentCNN(embed_lookup, vocab_size, output_size, embedding_dim,
                   num_filters, kernel_sizes)

print(net3)

SentimentCNN(
  (embedding): Embedding(299567, 300)
  (convs_1d): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1), padding=(3, 0))
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sig): Sigmoid()
)


---
## Training


In [26]:
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net3.parameters(), lr=lr)


In [27]:
def train(net, train_loader, epochs, print_every=100):

    if(train_on_gpu):
        net.cuda()

    counter = 0

    net.train()
    for e in range(epochs):

        for inputs, labels in train_loader:
            counter += 1

            if(train_on_gpu):
                inputs, labels = inputs.cuda(), labels.cuda()

            net.zero_grad()

            output = net(inputs)

            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            optimizer.step()

            if counter % print_every == 0:
                val_losses = []
                net.eval()
                for inputs, labels in valid_loader:

                    if(train_on_gpu):
                        inputs, labels = inputs.cuda(), labels.cuda()

                    output = net(inputs)
                    val_loss = criterion(output.squeeze(), labels.float())

                    val_losses.append(val_loss.item())

                net.train()
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))

In [28]:


epochs = 2
print_every = 100

train(net3, train_loader, epochs, print_every=print_every)

Epoch: 1/2... Step: 100... Loss: 0.404961... Val Loss: 0.456944
Epoch: 1/2... Step: 200... Loss: 0.478754... Val Loss: 0.371079
Epoch: 1/2... Step: 300... Loss: 0.331406... Val Loss: 0.342721
Epoch: 1/2... Step: 400... Loss: 0.302353... Val Loss: 0.330672
Epoch: 2/2... Step: 500... Loss: 0.204000... Val Loss: 0.344079
Epoch: 2/2... Step: 600... Loss: 0.239894... Val Loss: 0.371066
Epoch: 2/2... Step: 700... Loss: 0.246180... Val Loss: 0.364074
Epoch: 2/2... Step: 800... Loss: 0.155306... Val Loss: 0.360754


---
## Testing

In [29]:
test_losses = []
num_correct = 0


net3.eval()
for inputs, labels in test_loader:

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()

    output = net3(inputs)

    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    pred = torch.round(output.squeeze())

    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


print("Test loss: {:.3f}".format(np.mean(test_losses)))

test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.404
Test accuracy: 0.831


### Inference on a test review



In [30]:
from string import punctuation

def tokenize_review(embed_lookup, test_review):
    test_review = test_review.lower()
    test_text = ''.join([c for c in test_review if c not in punctuation])

    test_words = test_text.split()

    tokenized_review = []
    for word in test_words:
        try:
            idx = embed_lookup.vocab[word].index
        except:
            idx = 0
        tokenized_review.append(idx)

    return tokenized_review


In [31]:
def predict(embed_lookup, net3, test_review, sequence_length=200):


    net3.eval()
    print(test_review)
    test_ints = tokenize_review(embed_lookup, test_review)

    seq_length=sequence_length
    features = pad_features([test_ints], seq_length)

    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)

    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()

    output = net3(feature_tensor)

    pred = torch.round(output.squeeze())
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")


### Test on pos/neg reviews


In [32]:
seq_length=200

In [33]:
test_review_pos = 'This movie had the best acting and the dialogue was so great. I loved it.'

predict(embed_lookup, net3, test_review_pos, seq_length)

This movie had the best acting and the dialogue was so great. I loved it.
Prediction value, pre-rounding: 0.517462
Positive review detected!
