In [10]:
# !conda install scikit-learn -y

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/sachin/anaconda3/envs/torch

  added / updated specs:
    - scikit-learn


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    joblib-0.14.1              |             py_0         201 KB
    ------------------------------------------------------------
                                           Total:         201 KB

The following NEW packages will be INSTALLED:

  joblib             pkgs/main/noarch::joblib-0.14.1-py_0
  llvm-openmp        pkgs/main/osx-64::llvm-openmp-4.0.1-hcfea43d_1
  scikit-learn       pkgs/main/osx-64::scikit-learn-0.21.3-py37h27c97d8_0
  scipy              pkgs/main/osx-64::scipy-1.3.1-py37h1410ff5_0



Downloading and Extracting Packages
joblib-0.14.1        | 201 KB    | ##################################### | 100% 
Preparing transa

In [4]:
import numpy as np
import pandas as pd
import nltk
import re

# Read dataset

In [5]:
# Load the IMDB reviews CSV into a DataFrame; the path is rooted at the
# current user's home directory ("~").
df = pd.read_csv("~/Data/IMDB/IMDB_Dataset.csv")

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Load BERT

In [7]:
import torch

In [8]:
from transformers import BertTokenizer, BertModel

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [11]:
# Load the pretrained 'bert-base-uncased' tokenizer and the matching encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Put the model in evaluation (inference) mode. Because this call is the
# cell's last expression, the module it returns is what the notebook displays.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Text Preprocessing

In [12]:
def clean_html(text):
    """Return ``text`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each tag is stripped on its own and the text
    between tags is kept. Tags are replaced by the empty string (not by a
    space), and ``.`` does not match newlines, so a tag broken across lines
    is left untouched.
    """
    return re.sub(r'<.*?>', '', text)
    

In [14]:
def sentiment_mapper(sent):
    """Encode a sentiment label as an integer.

    Returns 1 when ``sent`` equals the string "positive" and 0 for any other
    value (the comparison is exact and case-sensitive).
    """
    return 1 if sent == "positive" else 0

In [15]:
def bert_formatting(text):
    """Wrap ``text`` in BERT special-token markers.

    The text is split into sentences with ``nltk.sent_tokenize``. The result
    starts with "[CLS] " and each sentence is followed by " [SEP]"; nothing
    is inserted between a "[SEP]" and the sentence that follows it.
    """
    marked_sentences = [sentence + " [SEP]" for sentence in nltk.sent_tokenize(text)]
    return "[CLS] " + "".join(marked_sentences)

In [27]:
def bert_encoder(text, max_len=512):
    """Encode ``text`` with the notebook-level BERT ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        Input text. It is tokenized as-is; this function does not add the
        "[CLS]"/"[SEP]" markers itself.
    max_len : int, default 512
        Maximum number of token ids passed to the model. The default matches
        the 512-entry position embedding table of the loaded model.

    Returns
    -------
    numpy.ndarray
        The first element of the model output (the hidden state of the last
        layer of the BERT model) converted to a NumPy array.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

    if len(token_ids) > max_len:
        # A plain slice would cut off the closing separator, so the last
        # retained position is given to the separator token instead.
        sep_id = tokenizer.convert_tokens_to_ids([tokenizer.sep_token])[0]
        token_ids = token_ids[:max_len - 1] + [sep_id]

    # Single-sequence input: every token belongs to the first segment, whose
    # token type id is 0 (1 is reserved for the second sentence of a pair).
    segment_ids = [0] * len(token_ids)

    tokens_tensor = torch.tensor([token_ids])
    segments_tensors = torch.tensor([segment_ids])

    # Both tensors carry a leading batch dimension of 1 and must line up.
    assert tokens_tensor.shape == segments_tensors.shape

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)

    return outputs[0].numpy()

In [17]:
# Strip HTML tags from each raw review, element by element.
df['Clean_Text'] = df['review'].map(clean_html)

In [18]:
df.head()

Unnamed: 0,review,sentiment,Clean_Text
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."


In [19]:
# Integer target column derived from the textual sentiment label.
df['y'] = df['sentiment'].map(sentiment_mapper)

In [20]:
df.head()

Unnamed: 0,review,sentiment,Clean_Text,y
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [21]:
# Add the [CLS]/[SEP] markers to each cleaned review.
df['Bert_Ready_Text'] = df['Clean_Text'].map(bert_formatting)

In [23]:
df['Bert_Ready_Text'].head()

0    [CLS] One of the other reviewers has mentioned...
1    [CLS] A wonderful little production. [SEP]The ...
2    [CLS] I thought this was a wonderful way to sp...
3    [CLS] Basically there's a family where a littl...
4    [CLS] Petter Mattei's "Love in the Time of Mon...
Name: Bert_Ready_Text, dtype: object

In [28]:
%%time
# Encode every prepared review with BERT, keyed by row number, and print a
# progress line after each batch of 200 reviews.
n_reviews = df.shape[0]
bert_texts = df['Bert_Ready_Text']

data = {}
for j in range(n_reviews):
    x = bert_encoder(bert_texts[j])
    data[j] = x

    if (j + 1) % 200 == 0:
        print(f'{j+1}/{n_reviews}')
        

KeyboardInterrupt: 

In [None]:
import pickle

# Persist the encoded reviews to disk using the highest pickle protocol
# available in this interpreter.
with open('imdb_encoded_data.pickle', 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(data)