In [1]:
import numpy as np
import pandas as pd
import nltk
import re

# Read dataset

In [2]:
df = pd.read_csv("~/Data/IMDB/IMDB_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


# Load BERT

In [6]:
import torch

In [7]:
from transformers import BertTokenizer, BertModel

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Text Preprocessing

In [9]:
def clean_html(text):
    cleaner_regex = re.compile('<.*?>')
    clean_text = re.sub(cleaner_regex, '', text)
    return clean_text
    

In [10]:
def sentiment_mapper(sent):
    if sent == "positive":
        return 1
    else:
        return 0

In [11]:
def bert_formatting(text):
    sent_text = nltk.sent_tokenize(text)
    sent = "[CLS] "
    for j in sent_text:
        sent = sent + j + " [SEP]"
        
    return sent

In [12]:
# the first element of output is the hidden state of the last layer of the bert model
def bert_encoder(text):
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    if len(indexed_tokens) > 512:
        indexed_tokens = indexed_tokens[:512]
        
    segment_ids = [1] * len(indexed_tokens)
    
    assert len(indexed_tokens) == len(segment_ids)
    
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segment_ids])
    
    assert tokens_tensor.shape == segments_tensors.shape
    
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        
    return outputs[0].numpy()

In [13]:
df['Clean_Text'] = df['review'].apply(clean_html)

In [14]:
df.head()

Unnamed: 0,review,sentiment,Clean_Text
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."


In [15]:
df['y'] = df['sentiment'].apply(sentiment_mapper)

In [16]:
df.head()

Unnamed: 0,review,sentiment,Clean_Text,y
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [17]:
df['Bert_Ready_Text'] = df['Clean_Text'].apply(bert_formatting)

In [18]:
df['Bert_Ready_Text'].head()

0    [CLS] One of the other reviewers has mentioned...
1    [CLS] A wonderful little production. [SEP]The ...
2    [CLS] I thought this was a wonderful way to sp...
3    [CLS] Basically there's a family where a littl...
4    [CLS] Petter Mattei's "Love in the Time of Mon...
Name: Bert_Ready_Text, dtype: object

# Train Test Split

In [43]:
test_size = int(0.15*df.shape[0])

In [44]:
train_df = df[:-test_size]
test_df = df[-test_size:]

In [45]:
train_df.shape

(34, 5)

In [46]:
test_df.shape

(6, 5)

In [19]:
%%time
train_data = {}
for j in range(train_df.shape[0]):
    
    x = bert_encoder(train_df['Bert_Ready_Text'][j])
    z = np.random.uniform(-0.5, 0.5, size=(1,512,768))
    
    z[:,:x.shape[1],:] = x

    if (j+1)%200 == 0:
        print(f'{j+1}/{train_df.shape[0]}')
        
    train_data[j] = z

(1, 402, 768)
(1, 512, 768)
(1, 206, 768)
(1, 512, 768)
(1, 208, 768)
(1, 512, 768)
(1, 174, 768)
(1, 512, 768)
(1, 289, 768)
(1, 512, 768)
(1, 167, 768)
(1, 512, 768)
(1, 182, 768)
(1, 512, 768)
(1, 223, 768)
(1, 512, 768)
(1, 166, 768)
(1, 512, 768)
(1, 43, 768)
(1, 512, 768)
(1, 113, 768)
(1, 512, 768)
(1, 218, 768)
(1, 512, 768)
(1, 512, 768)
(1, 512, 768)
(1, 136, 768)
(1, 512, 768)
(1, 70, 768)
(1, 512, 768)
(1, 192, 768)
(1, 512, 768)
(1, 175, 768)
(1, 512, 768)
(1, 315, 768)
(1, 512, 768)
(1, 146, 768)
(1, 512, 768)
(1, 174, 768)
(1, 512, 768)
(1, 442, 768)
(1, 512, 768)
(1, 248, 768)
(1, 512, 768)
(1, 95, 768)
(1, 512, 768)
(1, 455, 768)
(1, 512, 768)
(1, 194, 768)
(1, 512, 768)
(1, 139, 768)
(1, 512, 768)
(1, 512, 768)
(1, 512, 768)
(1, 183, 768)
(1, 512, 768)
(1, 179, 768)
(1, 512, 768)
(1, 512, 768)
(1, 512, 768)
(1, 512, 768)
(1, 512, 768)
(1, 465, 768)
(1, 512, 768)
(1, 140, 768)
(1, 512, 768)
(1, 512, 768)
(1, 512, 768)
(1, 312, 768)
(1, 512, 768)
(1, 494, 768)
(1, 512, 

In [None]:
%%time
test_data = {}
for j in range(test_df.shape[0]):
    
    x = bert_encoder(test_df['Bert_Ready_Text'][j])
    z = np.random.uniform(-0.5, 0.5, size=(1,512,768))
    
    z[:,:x.shape[1],:] = x

    if (j+1)%200 == 0:
        print(f'{j+1}/{test_df.shape[0]}')
        
    test_data[j] = z

In [20]:
import pickle
with open('/Users/sachin/Data/IMDB/imdb_encoded_train_data.pickle', 'wb') as handle:
    pickle.dump(train_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('/Users/sachin/Data/IMDB/imdb_encoded_test_data.pickle', 'wb') as handle:
    pickle.dump(test_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [47]:
train_df.to_csv('/Users/sachin/Data/IMDB/train_imdb.csv', index=False)

In [48]:
test_df.to_csv('/Users/sachin/Data/IMDB/test_imdb.csv', index=False)