In [None]:
# Install keras-bert
# Shell commands run through IPython's `!` escape: install the keras-bert and
# tqdm packages that later cells import.
# NOTE(review): neither package is version-pinned, so a re-run may pull newer
# releases than the ones this notebook was written against -- TODO pin versions.
!pip install -q keras-bert
!pip install tqdm

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory



import keras_bert
import pandas as pd
import numpy as np
import re
import os
from IPython.display import HTML

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.decomposition import PCA

from tensorflow.python.keras.models import Sequential, load_model
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras import optimizers


import warnings
# Install a blanket "ignore" filter: every warning category is suppressed for
# the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported in this notebook -- confirm that is intended.
warnings.filterwarnings('ignore')
import os


from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
from datetime import datetime

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


In [4]:
# Constants

# Token sequence length: passed as `seq_len` when the BERT checkpoint is loaded
# and as `max_len` when each review is encoded.
SEQ_LEN = 512
# Mini-batch size: used for the training-step calculation and for model.fit.
BATCH_SIZE = 2
# Number of training epochs.
EPOCHS = 5
# Learning rate handed to the AdamWarmup optimizer.
LR = 5e-6

In [5]:
# Environment
import os

# Directory that contains the pretrained BERT checkpoint files.
pretrained_path = '../input/bertpretrained/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

# Resolve the config, checkpoint and vocabulary files inside that directory.
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, filename)
    for filename in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# TF_KERAS must be added to environment variables in order to use TPU
# NOTE(review): keras_bert is already imported in the first code cell; if the
# library reads TF_KERAS at import time, this assignment comes too late to have
# any effect -- confirm.
os.environ['TF_KERAS'] = '1'

## Import the Model

* Load the model and the tokenizer vocabulary from the pretrained checkpoint
* Download the data

In [6]:
# Load Basic Model
import codecs
from keras_bert import load_trained_model_from_checkpoint

# Build the token -> id vocabulary: each line of the vocab file is one token,
# and the id stored for it is the number of entries in the dict at that moment.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as vocab_file:
    for vocab_line in vocab_file:
        token_dict[vocab_line.strip()] = len(token_dict)

# Restore the pretrained network with trainable weights and a fixed input
# length of SEQ_LEN.
# NOTE(review): training=True presumably keeps the pretraining heads, which a
# later cell relies on when it looks up the 'NSP-Dense' layer -- confirm
# against the keras-bert documentation.
bert_options = dict(training=True, trainable=True, seq_len=SEQ_LEN)
model = load_trained_model_from_checkpoint(config_path, checkpoint_path, **bert_options)

In [7]:

# Download IMDB Data
import tensorflow as tf

# NOTE(review): the origin below is the Kaggle dataset landing page rather than
# an archive URL, and the recorded output of this cell shows only ~254 KB being
# fetched, so this most likely stores an HTML page instead of the reviews.
# The reviews used further down are read from the ../input/ directory, and
# `dataset` is not referenced again in the cells visible here -- confirm
# whether this download is still needed.
dataset = tf.keras.utils.get_file(
    fname="IMDB Movie Reviews Dataset",
    # The stray leading blank was removed from the URL so the request no longer
    # depends on lenient URL parsing.
    origin="https://www.kaggle.com/iarunava/imdb-movie-reviews-dataset",
)

Downloading data from  https://www.kaggle.com/iarunava/imdb-movie-reviews-dataset
 253952/Unknown - 1s 3us/step



## Convert the data into arrays 
* Convert the data into arrays of indices and sentiments
* Load the data

In [9]:
# Convert Data to Array
import os
import numpy as np
from tqdm import tqdm
from keras_bert import Tokenizer

# Module-level tokenizer built from the token -> id mapping read from the
# vocabulary file; load_data() below uses it to encode every review.
tokenizer = Tokenizer(token_dict)


def load_data(path):
    """Tokenize every review under ``path`` and return shuffled model inputs.

    ``path`` must contain a ``neg`` and a ``pos`` sub-directory. Each file in
    them is read as one review and labelled 0 (``neg``) or 1 (``pos``).

    Args:
        path: Directory holding the ``neg`` and ``pos`` folders.

    Returns:
        A pair ``([indices, segments], sentiments)``. ``indices`` is the array
        of token ids produced by the module-level ``tokenizer`` with
        ``max_len=SEQ_LEN``, ``segments`` is an all-zero array of the same
        shape, and ``sentiments`` is the array of 0/1 labels. All rows are in
        the same randomly shuffled order.

    Raises:
        ValueError: If the two folders contain no files at all.
    """
    indices, sentiments = [], []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        folder = os.path.join(path, folder)
        for name in tqdm(os.listdir(folder)):
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # default encoding (assumes the review files are UTF-8 -- confirm).
            with open(os.path.join(folder, name), 'r', encoding='utf-8') as reader:
                review = reader.read()
            # encode() also returns segment ids; they are not kept because the
            # segment input is rebuilt as zeros in the return statement.
            ids, _ = tokenizer.encode(review, max_len=SEQ_LEN)
            indices.append(ids)
            sentiments.append(sentiment)
    if not indices:
        # Without this guard the unpacking of zip(*items) below fails with an
        # opaque "not enough values to unpack" ValueError.
        raise ValueError('no review files found under {!r}'.format(path))
    # Shuffle ids and labels together so that each pair stays aligned.
    items = list(zip(indices, sentiments))
    np.random.shuffle(items)
    indices, sentiments = zip(*items)
    indices = np.array(indices)
    return [indices, np.zeros_like(indices)], np.array(sentiments)
  
  
 #load test and train data 
# Directories of the two dataset splits; load_data() reads the "neg" and "pos"
# sub-folders of each one.
train_path = "../input/imdb-movie-reviews-dataset/aclimdb/aclImdb/train/"
test_path = "../input/imdb-movie-reviews-dataset/aclimdb/aclImdb/test/"


# Each call returns ([token_ids, zero_segment_ids], labels).
train_x, train_y = load_data(train_path)
test_x, test_y = load_data(test_path)

100%|██████████| 12500/12500 [00:48<00:00, 257.50it/s]
100%|██████████| 12500/12500 [00:50<00:00, 246.34it/s]
100%|██████████| 12500/12500 [00:48<00:00, 257.69it/s]
100%|██████████| 12500/12500 [00:48<00:00, 257.43it/s]


In [10]:
# Print the layer-by-layer summary of the model currently bound to `model`.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 512)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 512, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 512, 768)     1536        Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

# Model Building
* The model is a `keras.models.Model` that feeds the output of BERT's `NSP-Dense` layer into a new 2-unit dense layer with a softmax activation. `AdamWarmup` is used as the optimizer.
* Sparse categorical cross-entropy is used as the loss.

In [11]:
# Build Custom Model
import keras
from keras_bert import AdamWarmup, calc_train_steps
from keras.layers import Dense

# Reuse the first two input tensors of the loaded model and attach a 2-unit
# softmax classification head to the output of its 'NSP-Dense' layer.
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
outputs = Dense(units=2, activation='softmax')(dense)

# Step counts for the optimizer schedule. They are derived from the shared
# EPOCHS constant (previously a literal 5), so the schedule follows the
# configured epoch count.
decay_steps, warmup_steps = calc_train_steps(
    train_y.shape[0],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

# `model` is rebound here: from this point on it is the classifier, not the
# raw pretrained network.
model = keras.models.Model(inputs, outputs)
model.compile(
    AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [12]:
# Initialize Variables
import tensorflow as tf
import keras.backend as K

sess = K.get_session()

# Names (decoded from bytes) of the variables the session reports as not yet
# initialised.
uninitialized_variables = {
    name.decode('ascii')
    for name in sess.run(tf.report_uninitialized_variables())
}

# Build an initializer for just those variables; all others are left untouched.
# v.name looks like "<name>:<index>", so the part before ':' is compared.
# (This statement was previously indented, which is an IndentationError.)
init_op = tf.variables_initializer(
    [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables]
)
sess.run(init_op)

In [13]:
 # with tf.keras.utils.custom_object_scope(get_custom_objects()):
        
# Train the classifier. The epoch count comes from the shared EPOCHS constant
# (previously a literal 5) so that it is configured in a single place.
model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/5

In [14]:

# Score the test inputs, then take the index of the largest value along the
# last axis of the model output as the predicted label.
test_scores = model.predict(test_x, verbose=True)
predicts = test_scores.argmax(axis=-1)



In [15]:
# Accuracy

# Fraction of test examples whose predicted label equals the true label.
n_correct = np.sum(test_y == predicts)
n_total = test_y.shape[0]
print(n_correct / n_total)

0.93912
