In [None]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Loading the CSV files

In [None]:
# Read the train and test splits. Both files are tab-separated and are read
# directly from their .zip archives.
train_df = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/train.tsv.zip', sep='\t')
test_df = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/test.tsv.zip', sep='\t')

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Bar chart of how many training rows carry each Sentiment value.
# NOTE(review): value_counts() orders bars by frequency, not by label; consider
# .sort_index() plus axis labels if the label order matters to the reader.
train_df['Sentiment'].value_counts().plot(kind='bar')

In [None]:
# Fixed token length; used as max_length when tokenizing the training phrases.
# NOTE(review): 512 is presumably much longer than most phrases need, which
# inflates the token arrays — TODO confirm before changing (other cells hard-code 512).
seq_len = 512
# Number of training rows; used to size the one-hot label matrix.
num_samples = len(train_df)

# Show both values as the cell output.
num_samples, seq_len

# Tokenizing all Phrases using BertTokenizer

In [None]:
from transformers import BertTokenizer

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Tokenize every training phrase into fixed-length NumPy arrays:
# each phrase is truncated / padded to exactly `seq_len` tokens.
tokens = tokenizer(
    train_df['Phrase'].tolist(),
    add_special_tokens=True,
    max_length=seq_len,
    padding='max_length',
    truncation=True,
    return_tensors='np',
)

In [None]:
# List which arrays the tokenizer returned.
tokens.keys()

In [None]:
# Inspect the token-id array.
tokens['input_ids']

In [None]:
# Inspect the attention-mask array.
tokens['attention_mask']

# Saving input IDs and attention masks in NumPy binary files

In [None]:
# Save as numpy binary files
with open('movie-xids.npy','wb') as f:
    np.save(f,tokens['input_ids'])
with open('movie-xmask.npy','wb') as f:
    np.save(f,tokens['attention_mask'])

**One hot encoding the target values**

In [None]:
# Sentiment label of every training row as a NumPy array.
arr = train_df['Sentiment'].values
print(arr.shape)
print(arr)
# Largest label + 1 is used below as the one-hot width (assumes labels start at 0).
print(f"one hot encoding - {arr.max() + 1}")

In [None]:
# Allocate an all-zero matrix with one row per sample and one column per
# label value (0 .. arr.max()); the next cell sets the 1s.
num_classes = arr.max() + 1
labels = np.zeros(shape=(num_samples, num_classes))
print(f"shape of labels - {labels.shape}")

In [None]:
# One-hot encode: for row i, set the column given by that row's label to 1.
row_idx = np.arange(num_samples)
labels[row_idx, arr] = 1
print(labels)

In [None]:
# Persist the one-hot label matrix as a NumPy binary file.
np.save('movie-labels.npy', labels)

In [None]:
# Spot-check the one-hot row of the sample at index 9.
labels[9]

**Loading the input IDs, masks and labels from NumPy binary files**

In [None]:
# Reload the arrays written earlier in this notebook.
# allow_pickle is left at its default (False): these are plain numeric arrays,
# so pickle support is not needed, and enabling it would let a tampered .npy
# file execute arbitrary code on load.
with open('movie-xids.npy','rb') as f:
    Xids = np.load(f)
with open('movie-xmask.npy','rb') as f:
    Xmask = np.load(f)
with open('movie-labels.npy','rb') as f:
    labels = np.load(f)

In [None]:
# Check the shape of the reloaded token-id array.
Xids.shape

**Building the TensorFlow dataset: shuffling and batching**

In [None]:
# Build a dataset whose elements are (input_ids, attention_mask, label) triples,
# sliced from the three arrays.
dataset = tf.data.Dataset.from_tensor_slices((Xids,Xmask,labels))

In [None]:
# Display a one-element view of the dataset to inspect its structure.
dataset.take(1)

In [None]:
def map_func(input_ids,masks,labels):
    """Repack a flat (input_ids, masks, labels) triple as (features, labels).

    The features dict uses the keys 'input_ids' and 'attention_mask'; the
    labels are passed through unchanged.
    """
    features = {
        'input_ids': input_ids,
        'attention_mask': masks,
    }
    return features, labels

In [None]:
# Apply map_func to every element: (ids, mask, labels) -> ({...}, labels).
dataset = dataset.map(map_func)

In [None]:
# Display a one-element view to confirm the new (features dict, labels) structure.
dataset.take(1)

In [None]:
batch_size = 16
# reshuffle_each_iteration=False is required for the take()/skip() split below:
# with the default (True) the dataset is re-shuffled every time it is iterated,
# so the "train" and "validation" views would be drawn from different orderings
# and could overlap (validation leakage) or miss samples entirely.
# drop_remainder=True keeps every batch at exactly `batch_size` rows.
dataset = (
    dataset
    .shuffle(10000, reshuffle_each_iteration=False)
    .batch(batch_size, drop_remainder=True)
)
dataset.take(1)

In [None]:
# Fraction of the batches assigned to the training split.
split = 0.9

In [None]:
# Number of batches that go to the training split: the `split` fraction of
# (samples / batch_size), truncated to an integer.
size = int((Xids.shape[0]/batch_size) * split)
print(size)

In [None]:
# First `size` batches form the training split; everything after them is validation.
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [None]:
# Write both splits to disk under the 'train' and 'val' paths.
tf.data.experimental.save(train_ds,'train')
tf.data.experimental.save(val_ds,'val')

In [None]:
# Show the structure (shapes and dtypes) of one training element.
train_ds.element_spec

In [None]:
# Check that both splits share the same element structure.
train_ds.element_spec == val_ds.element_spec

In [None]:
# Reload the saved training split using the in-memory element spec, then
# display a one-element view as a sanity check.
ds = tf.data.experimental.load('train',element_spec=train_ds.element_spec)
ds.take(1)

In [None]:
from transformers import TFAutoModel

In [None]:
# Load the pretrained 'bert-base-cased' checkpoint as a TensorFlow model.
bert = TFAutoModel.from_pretrained('bert-base-cased')

In [None]:
# Print the layer/parameter summary of the pretrained model.
bert.summary()

**Defining the BERT Model**

In [None]:
# Two integer inputs of length 512, named to match the keys of the dataset's
# feature dict ('input_ids', 'attention_mask').
# NOTE(review): 512 duplicates `seq_len` and must stay equal to it.
input_ids = tf.keras.layers.Input(shape=(512,),
                                 name="input_ids",dtype='int32')
mask = tf.keras.layers.Input(shape=(512,),name="attention_mask",dtype='int32')

# transformer
# Element [1] of the BERT main layer's output is used as the sentence
# representation (presumably the pooled output — TODO confirm).
embeddings = bert.bert(input_ids,attention_mask=mask)[1]
# classifier head
x = tf.keras.layers.Dense(1024,activation='relu')(embeddings)
# 5 softmax outputs; NOTE(review): must match the width of the one-hot labels.
y = tf.keras.layers.Dense(5,activation='softmax',name='outputs')(x)
model = tf.keras.Model(inputs=[input_ids,mask],outputs=y)

In [None]:
# Freeze the BERT encoder so only the dense classifier head is trained.
# The layer is referenced directly (it is the object called when building the
# model) instead of by position in model.layers, which silently breaks if the
# model's layer order ever changes.
bert.bert.trainable = False

In [None]:
# Print the summary of the assembled classifier.
model.summary()

In [None]:
# Adam optimizer with a small learning rate.
# NOTE(review): the `decay` argument may not be accepted by newer Keras
# optimizer implementations — verify against the installed TF version.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5,decay=1e-6)
# Categorical loss/accuracy, matching the one-hot encoded labels.
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer,loss=loss, metrics=[acc,])

In [None]:
# Hand-written element spec used to reload the saved datasets below.
# NOTE(review): the literals 16, 512 and 5 duplicate batch_size, seq_len and the
# one-hot label width; they must be kept in sync with those values by hand.
element_spec = ({'input_ids': tf.TensorSpec(shape=(16, 512), dtype=tf.int64, name=None),
  'attention_mask': tf.TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)},
 tf.TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))

In [None]:
# Reload the training split from the 'train' path using the explicit spec.
train_ds = tf.data.experimental.load('train',element_spec=element_spec)

In [None]:
# Reload the validation split from the 'val' path using the explicit spec.
val_ds = tf.data.experimental.load('val',element_spec=element_spec)

In [None]:
# Display a one-element view of the reloaded training split.
train_ds.take(1)

**Training the model**

In [None]:
# Train for 3 epochs, evaluating on the validation split after each epoch.
# The returned object is kept in `history`.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3
)

In [None]:
# Save the trained model under the 'sentiment_model' path.
model.save('sentiment_model')

In [None]:
# Reload the saved model; `model` now refers to the reloaded copy.
model = tf.keras.models.load_model('sentiment_model')

# Making Predictions

In [None]:
def prep_data(text, max_length=512):
    """Tokenize `text` into the feature dict the model takes as input.

    Args:
        text: Value passed straight to the module-level `tokenizer`
            (e.g. a single phrase).
        max_length: Length every sequence is truncated / padded to.
            Defaults to 512, the input length the model above is built with;
            it must match the model's input shape.

    Returns:
        A dict with the keys 'input_ids' and 'attention_mask', holding the
        TensorFlow tensors produced by the tokenizer.
    """
    tokens = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf',
    )
    # Return only the two tensors used as model inputs.
    return {"input_ids": tokens['input_ids'],
            "attention_mask": tokens["attention_mask"]}

In [None]:
# Class probabilities for one phrase; [0] takes the first row of the prediction output.
probs = model.predict(prep_data('hello world'))[0]

In [None]:
# Index of the highest-probability class.
np.argmax(probs)

In [None]:
# Same check on a phrase with clearly positive wording.
probs = model.predict(prep_data('this movie was amazing'))[0]

In [None]:
# Index of the highest-probability class.
np.argmax(probs)

In [None]:
# Same check on a phrase with clearly negative wording; show the predicted class index.
probs = model.predict(prep_data('this movie was terrible'))[0]
np.argmax(probs)

In [None]:
# Remove the column-width limit so full phrases are shown when frames are displayed.
pd.set_option('display.max_colwidth',None)

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Keep only the first row of every SentenceId.
# .copy() makes `df` an independent frame, so adding/filling the 'Sentiment'
# column later does not write to a slice of test_df (SettingWithCopyWarning).
df = test_df.drop_duplicates(subset=['SentenceId'],keep='first').copy()
df.head()

In [None]:
# Add an empty placeholder column that the prediction step fills in.
df['Sentiment'] = None

In [None]:
# Predict every phrase in batched calls instead of one model.predict call per
# row: the per-row loop pays the full predict overhead for every single phrase.
phrases = df['Phrase'].tolist()
if phrases:  # nothing to tokenize or predict for an empty frame
    tokens = prep_data(phrases)
    probs = model.predict(tokens, batch_size=32)
    # One row of class probabilities per phrase -> index of the most likely class.
    df['Sentiment'] = np.argmax(probs, axis=1)

In [None]:
# Preview the first predicted rows.
df.head()

In [None]:
# Preview the last predicted rows.
df.tail()