In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import bert
from tqdm import tqdm
from collections import namedtuple

In [2]:
# Read the dataset
df = pd.read_csv("D:\\Study\\DataScience\\Projects\\AI-Advertising\\SentimentAnalysis\\IMDBDataset.csv")

# Only the last expression of a cell is displayed, so a bare expression in the
# middle of the cell is evaluated and thrown away. Print the missing-value
# check explicitly so its result is actually visible.
print("Any missing values:", df.isnull().values.any())
df.shape

(50000, 2)

In [31]:
# Preview the first ten rows of the loaded DataFrame.
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [5]:
# Optional: shuffle the rows first with `df = df.sample(frac=1)`.

# Review texts as a NumPy array; missing reviews are replaced by the
# placeholder string "CVxTz".
train_sentences = df["review"].fillna("CVxTz").values

# Binary labels: 1 where the sentiment is exactly "positive", 0 otherwise.
train_y = np.array([1 if label == "positive" else 0 for label in df["sentiment"]])

In [7]:
# Load the uncased BERT-Base encoder from TF Hub as a Keras layer.
# trainable=True marks the layer's weights as trainable.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,
)

In [8]:
# Fixed length (in tokens) of every encoded sequence fed to the model.
MAX_SEQ_LEN = 128

# The three int32 model inputs share the same shape and differ only by name,
# so build them in one pass (creation order is unchanged).
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

In [9]:
def get_masks(tokens, max_seq_length):
    """Build the attention mask for one tokenized sequence.

    Args:
        tokens: Sequence of tokens; only its length is used.
        max_seq_length: Length of the returned mask.

    Returns:
        A list of ``max_seq_length`` ints: 1 for every token position,
        followed by 0 for every padding position.

    Raises:
        ValueError: If ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        # A negative pad count multiplies to an empty list, which would
        # silently return a mask longer than max_seq_length.
        raise ValueError(
            "Token sequence of length %d exceeds max_seq_length %d"
            % (n_tokens, max_seq_length)
        )
    return [1] * n_tokens + [0] * (max_seq_length - n_tokens)
 
def get_segments(tokens, max_seq_length):
    """Build the segment ids for one tokenized sequence.

    Tokens up to and including the first "[SEP]" get segment 0; every token
    after a "[SEP]" gets segment 1. Padding positions are filled with 0.

    Args:
        tokens: Sequence of token strings.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints (0 or 1).

    Raises:
        ValueError: If ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        # Without this check the negative pad count would be ignored and an
        # over-long list returned silently.
        raise ValueError(
            "Token sequence of length %d exceeds max_seq_length %d"
            % (n_tokens, max_seq_length)
        )

    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        # The separator itself belongs to the segment it closes; the switch
        # only affects the tokens that follow it.
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - n_tokens)

In [10]:
# Run the BERT layer on the three symbolic inputs and unpack its two outputs.
# NOTE(review): the (pooled, sequence) ordering is taken from how the results
# are named here — confirm against the TF Hub model documentation.
pooled_output, sequence_output = bert_layer(
    [input_word_ids, input_mask, segment_ids]
)

In [11]:
# Build the tokenizer from the vocabulary file and casing flag that the loaded
# BERT layer exposes, so tokenization uses the layer's own assets.
FullTokenizer = bert.bert_tokenization.FullTokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)
 
def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and zero-pad to a fixed length.

    Args:
        tokens: Sequence of token strings.
        tokenizer: Object providing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints: the token ids followed by 0 for
        every padding position.

    Raises:
        ValueError: If the converted ids are longer than ``max_seq_length``.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    n_ids = len(token_ids)
    if n_ids > max_seq_length:
        # A negative pad count would be ignored and an over-long id list
        # returned silently.
        raise ValueError(
            "Token sequence of length %d exceeds max_seq_length %d"
            % (n_ids, max_seq_length)
        )
    return token_ids + [0] * (max_seq_length - n_ids)

In [13]:
def create_single_input(sentence, MAX_LEN):
    """Encode one sentence into BERT ids, mask and segment ids.

    The sentence is tokenized with the module-level ``tokenizer``, truncated,
    wrapped in "[CLS]" ... "[SEP]" and padded to ``MAX_SEQ_LEN``.

    Args:
        sentence: Text to encode.
        MAX_LEN: Maximum number of word-piece tokens to keep, not counting the
            two special tokens. Values above ``MAX_SEQ_LEN - 2`` are clamped
            to ``MAX_SEQ_LEN - 2``; negative values are treated as 0.

    Returns:
        A tuple ``(ids, masks, segments)`` of three lists, each of length
        ``MAX_SEQ_LEN``.
    """
    # The rows are always padded to MAX_SEQ_LEN, so never keep more word
    # pieces than fit next to [CLS] and [SEP]. Previously a larger MAX_LEN
    # produced rows longer than MAX_SEQ_LEN.
    limit = max(0, min(MAX_LEN, MAX_SEQ_LEN - 2))

    stokens = tokenizer.tokenize(sentence)[:limit]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    ids = get_ids(stokens, tokenizer, MAX_SEQ_LEN)
    masks = get_masks(stokens, MAX_SEQ_LEN)
    segments = get_segments(stokens, MAX_SEQ_LEN)

    return ids, masks, segments
 
def create_input_array(sentences):
    """Encode a collection of sentences into the three BERT input arrays.

    Each sentence is encoded with ``create_single_input`` (keeping at most
    ``MAX_SEQ_LEN - 2`` word pieces) while a tqdm progress bar is shown.

    Args:
        sentences: Iterable of texts to encode.

    Returns:
        A list ``[ids, masks, segments]`` of three int32 NumPy arrays, one row
        per sentence.
    """
    encoded = [
        create_single_input(text, MAX_SEQ_LEN - 2)
        for text in tqdm(sentences, position=0, leave=True)
    ]

    # Split the per-sentence (ids, masks, segments) triples into three columns.
    id_rows = [triple[0] for triple in encoded]
    mask_rows = [triple[1] for triple in encoded]
    segment_rows = [triple[2] for triple in encoded]

    return [
        np.asarray(rows, dtype=np.int32)
        for rows in (id_rows, mask_rows, segment_rows)
    ]

In [14]:
# Classification head: average the per-token BERT outputs over the sequence
# axis, apply dropout, and map to a single sigmoid unit.
# NOTE(review): no mask is passed to the pooling layer, so padded positions
# appear to be included in the average — confirm whether that is intended.
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(1, activation="sigmoid", name="dense_output")(x)

model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

# The BERT layer is loaded with trainable=True, so the whole encoder is
# fine-tuned. The 'adam' string shortcut uses Adam's default learning rate of
# 1e-3, which is far above the 2e-5 to 5e-5 range recommended for BERT
# fine-tuning and tends to destroy the pretrained weights. Use an explicit
# small learning rate instead.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=['accuracy'],
)

In [15]:
# Encode every training review into the [ids, masks, segments] arrays
# expected by the model.
inputs = create_input_array(train_sentences)

100%|██████████| 50000/50000 [04:02<00:00, 206.05it/s]


In [None]:
# `inputs` must already hold the encoded training reviews, i.e. the result of
# create_input_array(train_sentences).

# Train for one epoch; Keras sets aside 20% of the samples for validation.
model.fit(
    inputs,
    train_y,
    epochs=1,
    batch_size=32,
    validation_split=0.2,
    shuffle=True,
)

100%|██████████| 50000/50000 [03:56<00:00, 211.63it/s]


Train on 40000 samples, validate on 10000 samples


In [None]:
# Read the test CSV and take its "comment_text" column as model input,
# replacing missing texts with the placeholder string "CVxTz".
test_df = pd.read_csv("D:\\Study\\DataScience\\Projects\\AI-Advertising\\SentimentAnalysis\\comments_test.csv")
test_sentences = test_df["comment_text"].fillna("CVxTz").values

# Encode only rows 110..149 and print the model's sigmoid outputs for them.
test_inputs = create_input_array(test_sentences[110:150])
print(model.predict(test_inputs))