# <center> IMDB Movie Reviews

## Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding,LSTM
from tensorflow.python.keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.layers import Dropout
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import load_model
from sklearn.model_selection import train_test_split
import re
import nltk 
nltk.download("stopwords")
from nltk.corpus import stopwords
nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

In [None]:
data = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
data.head()

In [None]:
# Take an explicit copy: new columns are added to `df` later, and writing into a
# slice of `data` triggers pandas' SettingWithCopy behaviour.
df = data[["review","sentiment"]].copy()

In [None]:
# we are looking at the size of our data.
df.shape

In [None]:
# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column: calling replace(inplace=True) on the
# attribute-accessed Series is chained mutation and is not guaranteed to update `df`.
# replace() (rather than map()) keeps the cell safe to re-run on already-encoded labels.
df["sentiment"] = df["sentiment"].replace({"positive":1,"negative":0}).astype("int64")

In [None]:
df.sample(10)

## Exploratory Data Analysis

In [None]:
df.info()

- we see that there is no null value in our data. 

In [None]:
# statistical summary of our data
data.describe()

In [None]:
# Name the variable explicitly: recent seaborn releases interpret the first
# positional argument as `data`, so a bare Series is not read as the x variable.
sns.countplot(x="sentiment", data=df, palette = ["green","red"])
plt.show()
print(df.sentiment.value_counts())

<ul>
    <li  style = "color:green" > <p style = "color:black;font-weight:bold" > We see that the number of positive and negative comments is equal. </p> </li>
</ul>

### Generating word frequencies

Let's first generate a frequency table of all the words present in all the reviews combined.

In [None]:
def gen_freq(text):
    """Build a word-frequency table.

    Parameters
    ----------
    text : str or pandas ``.str`` accessor
        Any object whose ``split()`` yields either individual words (a plain
        string) or one list of words per row (``Series.str``).

    Returns
    -------
    pandas.Series
        Occurrence count per word, as produced by ``value_counts()``.
    """
    words_list = []

    for item in text.split():
        if isinstance(item, str):
            # Plain-string input: split() already yields whole words, and
            # extend() on a str would add its characters one by one.
            words_list.append(item)
        elif isinstance(item, (list, tuple)):
            # Series.str input: each row is a list of words.
            words_list.extend(item)
        # Anything else (None/NaN from a missing row) contributes no words.

    return pd.Series(words_list).value_counts()

In [None]:
freq = gen_freq(df.review.str)
freq

## Create Word clouds

### Word cloud using word frequencies

In [None]:
#Import library WordCloud
from wordcloud import WordCloud

#Generate word cloud from the word-count table `freq`; at most 100 words are drawn
wc = WordCloud(width=400, height=330, max_words=100, background_color='white').generate_from_frequencies(freq)

# Show the cloud as an image, with the axes hidden
plt.figure(figsize=(14,8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

### Word cloud using positive reviews

In [None]:
# Select rows through `df`: only `df` has its labels re-encoded to 0/1, so
# comparing `data.sentiment` with 1 would select no reviews at all.
plt.figure(figsize = (20,20))
Wc = WordCloud(max_words = 500 , width = 1600 , height = 800,
               min_word_length=5).generate(" ".join(df[df.sentiment == 1].review))

plt.axis("off")
plt.imshow(Wc , interpolation = 'bilinear')

### Word cloud using negative reviews

In [None]:
# Select rows through `df`: only `df` has its labels re-encoded to 0/1, so
# comparing `data.sentiment` with 0 would select no reviews at all.
plt.figure(figsize = (20,20))
Wc = WordCloud(max_words = 500 , width = 1600 , height = 800,colormap="YlOrBr",
               min_word_length=5).generate(" ".join(df[df.sentiment == 0].review))

plt.axis("off")
plt.imshow(Wc , interpolation = 'bilinear')

### Removing Stopwords

In [None]:
from nltk.corpus import stopwords
stop_word_list = stopwords.words('english')

In [None]:
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

In [None]:
#Tokenization of text
tokenizer=ToktokTokenizer()

In [None]:
# Dedicated tokenizer and stop-word set for this step. The notebook rebinds the
# global name `tokenizer` to other tokenizer objects further down, so relying on
# that name would break this function when it is called again later; a set also
# makes each membership test constant-time instead of a list scan.
_toktok = ToktokTokenizer()
_stop_words = set(stop_word_list)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with English stop words removed.

    Parameters
    ----------
    text : str
        Review text to filter.
    is_lower_case : bool, default False
        When True the tokens are compared to the stop-word list as they are;
        otherwise each token is lower-cased for the comparison only (the
        surviving tokens keep their original casing).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    tokens = [token.strip() for token in _toktok.tokenize(text)]
    if is_lower_case:
        kept = [token for token in tokens if token not in _stop_words]
    else:
        kept = [token for token in tokens if token.lower() not in _stop_words]
    return ' '.join(kept)

#Apply function on review column
df['clean_review']= df['review'].apply(remove_stopwords)

## Text Cleaning

- process of clearing punctuation marks in data
- cleaning unnecessary marks in data.
- capitalization to lowercase.
- cleaning extra spaces.
- removal of stopwords in sentences.


In [None]:
import re
#clearing punctuation & unnecessary marks
# Raw string literals keep regex escapes such as \. from being parsed as
# (invalid) string escapes.
df['clean_review'] = df['clean_review'].apply(lambda x: re.sub(r'[,\.!?:()"]', '', x))
# Every remaining character that is not a letter (or a double quote) becomes a space.
df['clean_review'] = df['clean_review'].apply(lambda x: re.sub(r'[^a-zA-Z"]', ' ', x))

#capitalization to lowercase
df['clean_review'] = df['clean_review'].apply(lambda x: x.lower())

#cleaning extra spaces
# The substitution above leaves runs of spaces inside the text, so collapse
# them to a single space before trimming the ends.
df['clean_review'] = df['clean_review'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())


### Removing html strips and noise text


In [None]:
from bs4 import BeautifulSoup

In [None]:
#Removing the html strips
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from ``text``."""
    # Raw string: in a normal literal "\[" and "\]" are invalid string escapes.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Strip HTML markup and square-bracketed spans from ``text``.

    The cleaning calls now run before the ``return``; previously the function
    returned its input untouched and the calls below it were unreachable.

    NOTE(review): the regex cleaning cell that runs before this one replaces
    every non-letter character with a space, so '<', '>', '[' and ']' are
    already gone from `clean_review` by the time this is applied. Consider
    running this step on the raw review text first.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

#Apply function on review column
df['clean_review']=df['clean_review'].apply(denoise_text)

## Generating Word clouds after text cleaning :

In [None]:
# POSITIVE REVIEWS
# Built from the cleaned text in `df.clean_review`, filtered on the 0/1 labels
# held in `df` (the raw `data.review` column would show the uncleaned text).
plt.figure(figsize = (20,20))
Wc = WordCloud(max_words = 500 , width = 1600 , height = 800,
               min_word_length=5).generate(" ".join(df[df.sentiment == 1].clean_review))

plt.axis("off")
plt.title('Positive reviews')
plt.imshow(Wc , interpolation = 'bilinear')

In [None]:
# Negative Reviews
# Built from the cleaned text in `df.clean_review`, filtered on the 0/1 labels
# held in `df` (the raw `data.review` column would show the uncleaned text).
plt.figure(figsize = (20,20))
Wc = WordCloud(max_words = 500 , width = 1600 , height = 800,colormap="YlOrBr",
               min_word_length=5).generate(" ".join(df[df.sentiment == 0].clean_review))

plt.axis("off")
plt.title('Negative reviews')
plt.imshow(Wc , interpolation = 'bilinear')

## Train - Test Split 

In [None]:
sentiment = df['sentiment'].values
sentiment

In [None]:
data = df['clean_review']

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data,sentiment,test_size = 0.25, random_state = 42)

### Creating a Dictionary 
We build a vocabulary from the 15,000 most frequent words in the reviews; each word is mapped to an integer index for later use.

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

# Keep the 15000 most frequent words. The vocabulary is fitted on the training
# split only, so the test reviews have no influence on which words are kept.
tokenizer = Tokenizer(num_words = 15000)
tokenizer.fit_on_texts(x_train)
#tokenizer.word_index

#### Bring the comments to the same size
The model expects inputs of a fixed length, so reviews of different lengths cannot be fed to it directly. We therefore pad or truncate every review to the same number of tokens.

In [None]:
x_train_tokens = tokenizer.texts_to_sequences(x_train)
x_test_tokens = tokenizer.texts_to_sequences(x_test)

- Each word in a review is replaced by its index in the vocabulary created above.

In [None]:
# Number of tokens in every review (training reviews first, then test reviews),
# collected straight into a NumPy array.
num_tokens = np.array([len(sequence) for sequence in x_train_tokens + x_test_tokens])

In [None]:
# Sequence length cut-off: the mean token count plus two standard deviations,
# truncated to an integer.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens

`max_tokens`: the common length used for every review. Reviews longer than this are truncated and shorter ones are padded, so a few unusually long reviews do not determine the input size.

In [None]:
#It is checked what percentage of the data this determined number covers.
np.sum(num_tokens < max_tokens) / len(num_tokens)

In [None]:
#data is adjusted according to the number of tokens specified
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens)

In [None]:
x_train_pad.shape

##  LSTM Model 

In [None]:
# construct model
BATCH_SIZE = 32
embedding_size=50
# Bidirectional comes from the same Keras implementation as the Sequential,
# Embedding, LSTM and Dense names imported at the top of the notebook, so the
# model is not assembled from classes of two different Keras packages.
# The unused plot_model import (from a module path that newer Keras releases
# no longer provide) has been dropped.
from tensorflow.python.keras.layers import Bidirectional
model = Sequential()
# Token indices -> dense vectors; input_dim matches the tokenizer's 15000-word limit.
model.add(Embedding(input_dim=15000,output_dim=embedding_size,input_length=max_tokens,name='embedding_layer'))
model.add(Bidirectional(LSTM(200, dropout=0,recurrent_dropout=0)))
model.add(Dense(128, activation="relu"))
# Single sigmoid unit: probability of the positive class.
model.add(Dense(1,activation="sigmoid"))

model.compile("adam","binary_crossentropy",metrics=["accuracy","AUC"])
model.summary()

In [None]:
# Train for 5 epochs with 30% of the training rows held out for validation.
# NOTE(review): batch_size is 1000 here, not the BATCH_SIZE = 32 defined in the
# model cell — confirm which value is intended.
history = model.fit(x_train_pad, y_train, validation_split=0.3, epochs=5, batch_size=1000, shuffle=True, verbose = 1)

### Result 

In [None]:
result = model.evaluate(x_test_pad, y_test)

##  Visualization of Accuracy & Loss

In [None]:
plt.figure()
#plt.style.use("fivethirtyeight")
plt.plot(history.history["accuracy"], label = "Training Accuracy")
plt.plot(history.history["val_accuracy"], label = "Validation Accuracy")
plt.title("Accuracy")
plt.ylabel("Acc")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch for the LSTM model.
plt.figure()
plt.plot(history.history["loss"], label = "Train")
plt.plot(history.history["val_loss"], label = "Validation")
plt.title("Loss")
plt.ylabel("Loss")  # this axis shows loss, not accuracy
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
plt.plot(history.history["auc"], label = "Training ROC Score")
plt.plot(history.history["val_auc"], label = "Validation ROC Score")
plt.title("ROC AUC")
plt.ylabel("AUC")
plt.xlabel("epochs")
plt.legend()
plt.show()

## BERT

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Hold out 25% of the rows as the test set (fixed seed for reproducibility).
train, test= train_test_split(df, test_size=0.25, random_state=42)
# Features are the cleaned review text; targets are the sentiment labels.
Xtrain, ytrain = train['clean_review'], train['sentiment']
Xtest, ytest = test['clean_review'], test['sentiment']
#splitting the train set into train and validation
# 20% of the remaining training rows become the validation set.
Xtrain,Xval,ytrain,yval=train_test_split(Xtrain,ytrain,test_size=0.2,random_state=42)

In [None]:
#set up the tokenizer
MAX_VOCAB_SIZE = 10000
tk = Tokenizer(num_words = MAX_VOCAB_SIZE,oov_token="<oov>")
tk.fit_on_texts(Xtrain)
word_index = tk.word_index
#print(word_index)
V = len(word_index)
print("Vocabulary of the dataset is : ",V)

In [None]:
##create sequences of reviews
seq_train = tk.texts_to_sequences(Xtrain)
seq_test =  tk.texts_to_sequences(Xtest)

In [None]:
#choice of maximum length of sequences
seq_len_list = [len(i) for i in seq_train + seq_test]

#if we take the direct maximum then
max_len=max(seq_len_list)
print('Maximum length of sequence in the list: {}'.format(max_len))

In [None]:
# when setting the maximum length of sequence, variability around the average is used.
max_seq_len = np.mean(seq_len_list) + 2 * np.std(seq_len_list)
# This value is passed as `max_length` to the DistilBERT tokenizer further down;
# cap it at 512, the maximum number of positions distilbert-base-uncased supports.
max_seq_len = min(int(max_seq_len), 512)
print('Maximum length of the sequence when considering data only two standard deviations from average: {}'.format(max_seq_len))

In [None]:
import transformers
#Perform tokenization
# automatically download the vocab used during pretraining or fine-tuning a given model,use from_pretrained() method
tokenizer = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
#pass our texts to the tokenizer.
# Settings shared by all three splits: pad/truncate every review to
# max_seq_len tokens, add the special tokens, and return NumPy arrays.
_encode_kwargs = dict(max_length=max_seq_len,
                      truncation=True, padding='max_length',
                      add_special_tokens=True, return_tensors='np')
Xtrain_enc = tokenizer(Xtrain.tolist(), **_encode_kwargs)
Xval_enc = tokenizer(Xval.tolist(), **_encode_kwargs)
Xtest_enc = tokenizer(Xtest.tolist(), **_encode_kwargs)

In [None]:
import tensorflow as tf
#preparing our datasets
# Each dataset pairs the tokenizer output (converted to a plain dict) with the
# matching labels, one element per review.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(Xtrain_enc),
    ytrain
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(Xval_enc),
    yval
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(Xtest_enc),
    ytest
))

In [None]:
# creating BERT Model
from tensorflow.keras.layers import Dense,Input, Embedding,LSTM,Dropout,Conv1D
from tensorflow.keras.models import Model
def bert_model(train_dataset,val_dataset,transformer,max_len,epochs,batch_size=32,learning_rate=2e-5):
    """Build, compile and train a binary classifier on top of ``transformer``.

    Parameters
    ----------
    train_dataset, val_dataset : tf.data.Dataset
        Unbatched datasets; they are batched here with ``batch_size``.
    transformer : callable Keras model
        Called with the token ids and the attention mask; element ``[0]`` of
        its output is used as the per-token representation.
    max_len : int
        Sequence length of the ``input_ids`` / ``attention_mask`` inputs.
    epochs : int
        Number of training epochs.
    batch_size : int, default 32
        Batch size applied to both datasets.
    learning_rate : float, default 2e-5
        Learning rate of the Adam optimizer.

    Returns
    -------
    tuple
        ``(history, model, n_epochs)`` where ``n_epochs`` is the number of
        epochs recorded in ``history.history['loss']``.
    """
    print("----Building the model----")
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_len,),dtype=tf.int32,name = 'attention_mask') #attention mask
    sequence_output = transformer(input_ids, attention_mask=attention_mask)[0]
    # Representation at the first position of every sequence.
    cls_token = sequence_output[:, 0, :]
    x = Dense(512, activation='relu')(cls_token)
    x = Dropout(0.1)(x)
    y = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[input_ids,attention_mask], outputs=y)
    model.summary()
    # The optimizer comes from the same Keras package as Model/Input, and is
    # configured with `learning_rate` (the `lr` alias is deprecated).
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer, loss='binary_crossentropy', metrics=['accuracy','AUC'])
    # The datasets are already batched, so no `batch_size` is passed to fit().
    r = model.fit(train_dataset.batch(batch_size),
                  validation_data = val_dataset.batch(batch_size),epochs = epochs)
    n_epochs = len(r.history['loss'])

    return r,model,n_epochs

In [None]:
transformer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
epochs = 4
max_len = max_seq_len
r,model,n_epochs = bert_model(train_dataset,val_dataset,transformer,max_len,epochs)

In [None]:
#plt.figure()
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")
plt.plot(r.history["accuracy"], label = "Training Accuracy")
plt.plot(r.history["val_accuracy"], label = "Validation Accuracy")
plt.title("Accuracy")
plt.ylabel("Acc")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
plt.plot(r.history["loss"], label = "Training")
plt.plot(r.history["val_loss"], label = "Validation")
plt.title("Loss")
plt.ylabel("Loss")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
plt.figure()
plt.plot(r.history["auc"], label = "Training ROC Score")
plt.plot(r.history["val_auc"], label = "Validation ROC Score")
plt.title("ROC AUC")
plt.ylabel("AUC")
plt.xlabel("epochs")
plt.legend()
plt.show()

# <center> Hybrid Model

In [None]:
from keras.utils.vis_utils import plot_model
from keras import layers
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras.models import Sequential

In [None]:
# construct model
BATCH_SIZE = 32

# CNN + LSTM hybrid: embedding -> three Conv1D stages (max-pooling after the
# first two) -> LSTM -> dense classification head.
hmodel = Sequential([
    Embedding(input_dim=15000,output_dim=embedding_size,input_length=max_tokens,name='embedding_layer'),
    layers.Conv1D(64, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(128, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(256, 8, padding='same', activation='relu'),
    LSTM(100, dropout=0,recurrent_dropout=0),
    Dense(128, activation="relu"),
    Dense(1,activation="sigmoid"),
])

hmodel.compile("adam","binary_crossentropy",metrics=["accuracy","AUC"])
hmodel.summary()

In [None]:
# Train the hybrid model for 6 epochs with 30% of the training rows held out for validation.
# NOTE(review): batch_size is 64 here, not the BATCH_SIZE = 32 defined in the
# model cell — confirm which value is intended.
hybrid_history = hmodel.fit(x_train_pad, y_train, validation_split=0.3, epochs=6, batch_size=64, shuffle=True, verbose = 1)

In [None]:
plt.figure()
plt.plot(hybrid_history.history["auc"], label = "Training")
plt.plot(hybrid_history.history["val_auc"], label = "Validation")
plt.title("ROC AUC")
plt.ylabel("AUC")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
plt.figure()
plt.plot(hybrid_history.history["accuracy"], label = "Training")
plt.plot(hybrid_history.history["val_accuracy"], label = "Validation")
plt.title("Accuracy")
plt.ylabel("Acc")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
plt.figure()
plt.plot(hybrid_history.history["loss"], label = "Training")
plt.plot(hybrid_history.history["val_loss"], label = "Validation")
plt.title("Loss")
plt.ylabel("loss")
plt.xlabel("epochs")
plt.legend()
plt.show()