In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
# Switch every visible GPU to on-demand memory allocation so TensorFlow does
# not reserve all device memory up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured identically on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Raised when the GPUs were already initialised before this cell ran.
        print(err)

In [None]:
from tensorflow.python.client import device_lib
# Ask TensorFlow for the local devices it can see; as the last expression of
# the cell, the returned list is shown as the cell output.
device_lib.list_local_devices()

# Import Necessary Libraries

In [None]:
import zipfile
import os
# Unpack the train and test archives into one local folder in the working directory.
for archive_path in ('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
                     '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip'):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall("./sentiment-analysis-on-movie-reviews/")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# Show up to 500 characters per cell so long phrases are not truncated in DataFrame output.
pd.set_option('display.max_colwidth', 500)

# Get Rotten Tomatoes Movie Reviews Dataset

Rotten Tomatoes Movie Reviews Dataset is taken from https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data

The dataset is comprised of tab-separated files with phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for the purposes of benchmarking, but the sentences have been shuffled from their original order. Each Sentence has been parsed into many phrases by the Stanford parser. Each phrase has a PhraseId. Each sentence has a SentenceId. Phrases that are repeated (such as short/common words) are only included once in the data.



*   "train.tsv" contains the phrases and their associated sentiment labels. We have additionally provided a SentenceId so that you can track which phrases belong to a single sentence.
*   "test.tsv" contains just phrases. You must assign a sentiment label to each phrase.

The sentiment labels are:


*   0: negative
*   1: somewhat negative
*   2: neutral
*   3: somewhat positive
*   4: positive

In [None]:
# Read the labelled phrases from the folder the archives were extracted into.
# The extraction target is relative to the working directory, so the same
# relative path is used here instead of an absolute /kaggle/working path;
# this keeps the two cells consistent wherever the notebook is run from.
data = pd.read_table("./sentiment-analysis-on-movie-reviews/train.tsv", sep='\t')

# Only the phrase text and its sentiment label are used from here on.
data = data[['Phrase','Sentiment']].copy()
data

In [None]:
# Print the number of phrases for each of the five sentiment labels (0-4).
for label in range(5):
    label_count = len(data[data['Sentiment'] == label])
    print(str(label) + ": " + str(label_count))

# Data Pre-processing

In [None]:
import re
import spacy

# Load spaCy's small English pipeline; in this cell it is only used to obtain
# its default stop-word set.
sp = spacy.load('en_core_web_sm')
stopwords = sp.Defaults.stop_words
# Disabled experiments kept for reference: extending the stop-word set with
# domain-specific words, inspecting it, and removing 'not' from it.
# stopwords |= {"movie","movies","film","story","character","characters","comedy"}
# print(stopwords)
# stopwords.remove('not')

def remove_stopwords(text, stop_words=None):
    """Drop stop words from a space-separated string.

    Args:
        text: Input string; tokens are obtained by splitting on single spaces.
        stop_words: Optional collection of words to remove. When omitted, the
            module-level ``stopwords`` set is used.

    Returns:
        The remaining tokens joined back together with single spaces.
    """
    if stop_words is None:
        # The original body referenced an undefined name (``all_stopwords``),
        # which raised NameError on every call; the set defined in this
        # notebook is called ``stopwords``.
        stop_words = stopwords
    kept_tokens = [word for word in text.split(" ") if word not in stop_words]
    return " ".join(kept_tokens)

def clean(text):
    """Normalise a phrase for the downstream models.

    Steps: lower-case the text, strip '@' and '#' symbols, remove http(s)
    URLs, then keep only runs of word characters joined by single spaces.

    Args:
        text: Raw phrase string.

    Returns:
        The cleaned, lower-cased phrase; an empty string if nothing remains.
    """
    # Stop-word removal (remove_stopwords) was tried and decreased accuracy,
    # so it is intentionally not applied here.
    text = text.lower()
    text = re.sub(r'@|#', r'', text)
    # Remove only the URL itself. The previous pattern 'http.*' also deleted
    # every word that followed the URL up to the end of the line.
    text = re.sub(r'https?://\S+', r'', text)
    # In Python 3, \w matches Unicode word characters (letters, digits,
    # underscore), not just [a-zA-Z0-9_]; punctuation is dropped.
    return ' '.join(re.findall(r'\w+', text))

# Replace every raw phrase with its cleaned form, then display the frame.
data['Phrase'] = data['Phrase'].apply(clean)
data

The labelled "train.tsv" data is split into train, validation, and test sets:

*   Train + validation: 70% of the data (of which 85% is used for training and 15% for validation)
*   Test: 30% of the data

In [None]:
# Hold out 30% of the rows (identified by index label) as the test set,
# stratified by sentiment.
X_train, X_test, y_train, y_test = train_test_split(
    data.index.values,
    data.Sentiment.values,
    test_size=0.3,
    random_state=42,
    stratify=data.Sentiment,
)

# Split 15% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=42,
    stratify=y_train,
)

# Tag each row with the split it was assigned to.
data['data_type'] = 'not_set'
for split_name, split_index in (('train', X_train), ('val', X_val), ('test', X_test)):
    data.loc[split_index, 'data_type'] = split_name

data = data.dropna()

# Drop duplicate rows separately inside each split. Duplicates are only
# removed within a split, so an identical cleaned phrase can still appear in
# more than one split.
train_set, val_set, test_set = (
    data[data['data_type'] == split_name].drop_duplicates(ignore_index=True)
    for split_name in ('train', 'val', 'test')
)

# Recombine the de-duplicated splits and shuffle the rows reproducibly.
data = pd.concat([train_set, val_set, test_set], ignore_index=True)
data = data.sample(frac=1, random_state=1).reset_index(drop=True)
data

In [None]:
# Bar chart of how many phrases carry each sentiment label.
sentiment_counts = data.groupby("Sentiment").Sentiment.count()
sentiment_counts.plot.bar(ylim=0)

In [None]:
# Word cloud built from all phrases labelled 0 (negative).
negative = data[data['Sentiment'] == 0]
negative_text = ' '.join(negative.Phrase)
wordCloud = WordCloud(background_color="white", width=1600, height=800).generate(negative_text)
plt.figure(figsize=(20,10), facecolor='k')
plt.imshow(wordCloud)

In [None]:
# Word cloud built from all phrases labelled 4 (positive).
positive = data[data['Sentiment'] == 4]
positive_text = ' '.join(positive.Phrase)
wordCloud = WordCloud(background_color="white", width=1600, height=800).generate(positive_text)
plt.figure(figsize=(20,10), facecolor='k')
plt.imshow(wordCloud)

# Training with Machine Learning Algorithms

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Inputs for the classical models: cleaned phrase strings and integer labels.
# Note that X_train / X_test are rebound here; in the split cell they held
# row index labels, from this point on they hold the phrase text.
X_train, Y_train = train_set.Phrase.values, train_set.Sentiment.values
X_test, Y_test = test_set.Phrase.values, test_set.Sentiment.values

# Display names for the five sentiment classes in classification reports.
target_categories = [str(label) for label in range(5)]

## Naive Bayes Classifier

In [None]:
# NOTE(review): this standalone vectorizer and matrix are not used by the
# pipeline below, which fits its own TfidfVectorizer; kept in case another
# cell reads them.
vectorizer = TfidfVectorizer()
tfidf_text = vectorizer.fit_transform(X_train)

# TF-IDF features feeding a multinomial Naive Bayes classifier.
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB()),
              ])

nb.fit(X_train, Y_train)
test_predict = nb.predict(X_test)

train_accuracy = round(nb.score(X_train, Y_train)*100)
test_accuracy = round(accuracy_score(Y_test, test_predict)*100)

print("Naive Bayes Train Accuracy Score : {}% ".format(train_accuracy))
print("Naive Bayes Test Accuracy Score  : {}% ".format(test_accuracy))
print()
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# per-class precision and recall columns of the report were swapped.
print(classification_report(Y_test, test_predict, target_names=target_categories))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(Y_test, test_predict, labels=[0,1,2,3,4])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1,2,3,4])
disp.plot()

## Support Vector Machine (SGD Classifier)

In [None]:
# TF-IDF features feeding a linear classifier trained with SGD.
# random_state is fixed because SGDClassifier shuffles the training data, so
# results would otherwise differ between runs.
sgd = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', SGDClassifier(random_state=42)),
               ])

sgd.fit(X_train, Y_train)
test_predict = sgd.predict(X_test)

train_accuracy = round(sgd.score(X_train, Y_train)*100)
test_accuracy = round(accuracy_score(Y_test, test_predict)*100)

print("SVM Train Accuracy Score : {}% ".format(train_accuracy))
print("SVM Test Accuracy Score  : {}% ".format(test_accuracy))
print()
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# per-class precision and recall columns of the report were swapped.
print(classification_report(Y_test, test_predict, target_names=target_categories))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(Y_test, test_predict, labels=[0,1,2,3,4])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1,2,3,4])
disp.plot()

## Decision Tree Classifier

In [None]:
# TF-IDF features feeding a decision tree.
# random_state is fixed so the fitted tree is reproducible between runs.
dt = Pipeline([('tfidf', TfidfVectorizer()),
                ('dt', DecisionTreeClassifier(random_state=42)),
               ])

dt.fit(X_train, Y_train)
test_predict = dt.predict(X_test)

train_accuracy = round(dt.score(X_train, Y_train)*100)
test_accuracy = round(accuracy_score(Y_test, test_predict)*100)

print("Decision Tree Train Accuracy Score : {}% ".format(train_accuracy))
print("Decision Tree Test Accuracy Score  : {}% ".format(test_accuracy))
print()
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# per-class precision and recall columns of the report were swapped.
print(classification_report(Y_test, test_predict, target_names=target_categories))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(Y_test, test_predict, labels=[0,1,2,3,4])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1,2,3,4])
disp.plot()

## K-Nearest Neighbour Classifier

In [None]:
# TF-IDF features feeding a 5-nearest-neighbour classifier (Euclidean distance).
knn = Pipeline([('tfidf', TfidfVectorizer()),
                ('knn', KNeighborsClassifier(n_neighbors=5, metric='euclidean')),
               ])

knn.fit(X_train, Y_train)
test_predict = knn.predict(X_test)

train_accuracy = round(knn.score(X_train, Y_train)*100)
test_accuracy = round(accuracy_score(Y_test, test_predict)*100)

print("K-Nearest Neighbour Train Accuracy Score : {}% ".format(train_accuracy))
print("K-Nearest Neighbour Test Accuracy Score  : {}% ".format(test_accuracy))
print()
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# per-class precision and recall columns of the report were swapped.
print(classification_report(Y_test, test_predict, target_names=target_categories))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(Y_test, test_predict, labels=[0,1,2,3,4])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1,2,3,4])
disp.plot()

## Logistic Regression Classifier

In [None]:
# TF-IDF features feeding a logistic regression classifier (liblinear solver).
lr = Pipeline([('tfidf', TfidfVectorizer()),
                ('lr', LogisticRegression(random_state=42, solver='liblinear')),
               ])

lr.fit(X_train, Y_train)
test_predict = lr.predict(X_test)

train_accuracy = round(lr.score(X_train, Y_train)*100)
test_accuracy = round(accuracy_score(Y_test, test_predict)*100)

print("Logistic Regression Train Accuracy Score : {}% ".format(train_accuracy))
print("Logistic Regression Test Accuracy Score  : {}% ".format(test_accuracy))
print()
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the
# per-class precision and recall columns of the report were swapped.
print(classification_report(Y_test, test_predict, target_names=target_categories))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(Y_test, test_predict, labels=[0,1,2,3,4])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1,2,3,4])
disp.plot()

# Training with BERT

In [None]:
# Load Huggingface transformers
from transformers import TFBertModel, BertConfig, BertTokenizerFast, TFAutoModel

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense, Flatten, SpatialDropout1D, Conv1D, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow_addons.optimizers import LAMB, AdamW
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

## Get Pretrained BERT Model and Prepare Data with Tokenizer

In [None]:
# Checkpoint used for both the tokenizer and the configuration.
# NOTE(review): the phrases are lower-cased by clean() while this checkpoint
# is case-sensitive - confirm that a cased model is really intended.
model_name = 'bert-base-cased'

# Sequence length limit: the largest number of space-separated words in any
# phrase, plus 3. This counts words, not tokenizer sub-word tokens.
length = len(data.Phrase)
dff = data.Phrase[:length].map(lambda phrase: len(phrase.split(" "))).tolist()
max_length = max(dff)+3

# Fetch the checkpoint configuration and switch off hidden-state outputs.
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Fast tokenizer matching the checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)

In [None]:
# Rows assigned to the training split.
train_rows = data[data.data_type=='train']

# One-hot labels. num_classes is given explicitly so the width always matches
# the 5-unit output layer, even if a class were missing from this split.
y_senti = to_categorical(train_rows.Sentiment, num_classes=5)

# Tokenize the input.
# padding='max_length' pads every sequence to exactly max_length, which is the
# length the model's Input layers are declared with. padding=True only pads to
# the longest sequence in this call, which can differ between the splits.
x = tokenizer(
    text=train_rows.Phrase.to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# One dataset element per phrase: (input_ids, attention_mask, one-hot label).
train = tf.data.Dataset.from_tensor_slices((x['input_ids'], x['attention_mask'], y_senti))
def map_func(input_ids, masks, labels):
    """Repack a (input_ids, masks, labels) triple as (features dict, labels).

    The features dict is keyed by 'input_ids' and 'attention_mask'.
    """
    features = {'input_ids': input_ids, 'attention_mask': masks}
    return features, labels

batch_size = 32

# Convert each element to the (features dict, labels) form expected by Keras.
train = train.map(map_func)

# Shuffle with a 100-element buffer, then batch; the final incomplete batch
# (fewer than batch_size samples) is dropped.
train = train.shuffle(100)
train = train.batch(batch_size, drop_remainder=True)

train.take(1)

In [None]:
# Rows assigned to the validation split.
val_rows = data[data.data_type=='val']

# One-hot labels. num_classes is given explicitly so the width always matches
# the 5-unit output layer, even if a class were missing from this split.
y_senti = to_categorical(val_rows.Sentiment, num_classes=5)

# Tokenize the input.
# padding='max_length' pads every sequence to exactly max_length, which is the
# length the model's Input layers are declared with. padding=True only pads to
# the longest sequence in this call, which can differ between the splits.
x = tokenizer(
    text=val_rows.Phrase.to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

val = tf.data.Dataset.from_tensor_slices((x['input_ids'], x['attention_mask'], y_senti))
val = val.map(map_func)
# Validation data is neither shuffled nor truncated: order does not affect
# the metrics, and dropping the last partial batch would silently exclude
# samples from the validation score.
val = val.batch(batch_size)

## Build the Model with Transfer Learning

In [None]:
# Build model input: token ids and attention mask, both of length max_length.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32') 
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# Load the encoder from the same checkpoint name and configuration as the
# tokenizer, instead of repeating the checkpoint string, so the two cannot
# drift apart.
bert = TFAutoModel.from_pretrained(model_name, config=config)
# Element 1 of the main layer's output is used as the sentence embedding.
# NOTE(review): presumably the pooled [CLS] output - confirm for the installed
# transformers version.
embeddings = bert.bert(inputs)[1]

# Convert the BERT embeddings into 5 output classes.
output = Flatten()(embeddings)
output = Dense(256, activation='relu')(output)
output = Dense(128, activation='relu')(output)

output = Dense(5, activation='softmax', name='outputs')(output)

model = Model(inputs=inputs, outputs=output)

# Take a look at the model
model.summary()

In [None]:
# Categorical cross-entropy and accuracy operate on the one-hot labels.
loss = CategoricalCrossentropy()
acc = CategoricalAccuracy('accuracy')

# AdamW with a small learning rate and weight decay for fine-tuning.
optimizer = AdamW(learning_rate=1e-5, weight_decay=1e-6)

model.compile(loss=loss, optimizer=optimizer, metrics=[acc])

## Train the BERT Model with Train and Validation Data

In [None]:
# Fit the model
history = model.fit(
    train,
    validation_data=val,
    epochs=3)

In [None]:
# Save the trained weights to an .h5 file inside the folder the dataset
# archives were extracted into.
model.save_weights('./sentiment-analysis-on-movie-reviews/bert_weights.h5')

In [None]:
# Restore the weights from the .h5 file; `model` must already have been built
# by the model-definition cell before this runs.
model.load_weights('./sentiment-analysis-on-movie-reviews/bert_weights.h5')

## Plot Confusion Matrix for Test Data

In [None]:
def map_func(input_ids, masks, labels=None):
    """Pack tokenizer outputs into the feature dict the model expects.

    This definition replaces the three-argument ``map_func`` from the training
    cell. ``labels`` is optional so that the labelled train/validation
    datasets can still be mapped with it if their cells are re-run after this
    one; previously that failed with a TypeError.

    Args:
        input_ids: Token ids for one example.
        masks: Attention mask for one example.
        labels: Optional label; omitted for the unlabelled test dataset.

    Returns:
        The feature dict alone when ``labels`` is None, otherwise a
        ``(features, labels)`` tuple.
    """
    features = {'input_ids': input_ids, 'attention_mask': masks}
    if labels is None:
        return features
    return features, labels

# Tokenize the input.
# padding='max_length' pads every sequence to exactly max_length, which is the
# length the model's Input layers are declared with. padding=True only pads to
# the longest sequence in this call, which can differ between the splits.
x = tokenizer(
    text=data[data.data_type=='test'].Phrase.to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Unlabelled dataset. It is batched without shuffling so that the predictions
# stay in the same row order as the test rows of `data`.
test = tf.data.Dataset.from_tensor_slices((x['input_ids'], x['attention_mask']))
test = test.map(map_func)
test = test.batch(32)

In [None]:
# True labels of the test rows, in the same order they were tokenized.
y_test = data.loc[data.data_type=='test', 'Sentiment']

# Class probabilities per phrase; the predicted label is the most probable class.
test_probabilities = model.predict(test)
y_pred = test_probabilities.argmax(axis=-1)

In [None]:
# Plot Confusion Matrix for Test Data
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report

# Train and validation accuracy are taken from the last training epoch; test
# accuracy is computed from the predictions made above.
final_train_accuracy = history.history['accuracy'][-1]*100
final_val_accuracy = history.history['val_accuracy'][-1]*100
bert_test_accuracy = accuracy_score(y_test, y_pred)*100

print("BERT Train Accuracy Score :      {:.0f}% ".format(final_train_accuracy))
print("BERT Validation Accuracy Score : {:.0f}% ".format(final_val_accuracy))
print("BERT Test Accuracy Score  :      {:.0f}% ".format(bert_test_accuracy))
print()

# Confusion matrix for the test data: rows are true labels, columns are predictions.
sentiment_labels = [0,1,2,3,4]
cm = confusion_matrix(y_test, y_pred, labels=sentiment_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=sentiment_labels)
disp.plot()

# Get Classification Report for Test Data
print(classification_report(y_test, y_pred, target_names=target_categories))