In [None]:
import pandas as pd
!pip install contractions
!pip install textsearch
!pip install tqdm
import nltk
nltk.download('punkt')

In [None]:
# keep_default_na=False: empty cells are read as '' rather than NaN (see the
# empty Title values in the preview), so the text columns stay plain strings.
df = pd.read_csv('Womens Clothing E-Commerce Reviews - NLP.csv', keep_default_na=False)
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [None]:
# Join title and body with a space so the last word of the title is not fused
# with the first word of the review (previously e.g. "...design flawsI had...").
# .str.strip() removes the separator again when either part is empty, so a row
# with neither title nor body still yields '' for the empty-row filter.
df['full_review'] = (df['Title'] + ' ' + df['Review Text']).str.strip()

In [None]:
# Remove, in place, every row whose combined review text is the empty string.
df.drop(index=df.index[df['full_review'].eq('')], inplace=True)

In [None]:
# Keep only the model input text and the binary target column.
df = df.loc[:, ['full_review', 'Recommended IND']]

In [None]:
df.head()

Unnamed: 0,full_review,Recommended IND
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,Some major design flawsI had such high hopes f...,0
3,"My favorite buy!I love, love, love this jumpsu...",1
4,Flattering shirtThis shirt is very flattering ...,1


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 hold-out split of the review texts and their labels (fixed seed).
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    df['full_review'].values,
    df['Recommended IND'].values,
    test_size=0.2,
    random_state=42,
)

In [None]:
len(train_reviews), len(test_reviews)

(18113, 4529)

In [None]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
from tqdm import tqdm
import unicodedata


def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    ``iframe`` and ``script`` elements are removed together with their
    contents before the text is extracted, and every run of carriage
    returns / line feeds is collapsed into a single newline.

    Args:
        text: the (possibly HTML-containing) string to clean.

    Returns:
        The extracted text as a string.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Plain loop: the calls are made for their side effect only, so there is
    # no reason to build a throw-away list.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # The previous pattern r'[\r|\n|\r\n]+' was a character class that also
    # contained '|', so literal pipe characters were rewritten as newlines.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII.

    The string is NFKD-normalised so accented letters are split into a base
    letter plus combining marks; anything that cannot be encoded as ASCII
    (the marks, and characters with no ASCII base) is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
    """Normalise every document in ``docs`` and return the cleaned strings.

    Steps applied to each document, in order: strip HTML, replace newlines,
    tabs and carriage returns with spaces, lower-case, remove accents,
    expand contractions, delete every character that is not an ASCII letter,
    digit or whitespace, collapse runs of spaces, and trim both ends.

    Args:
        docs: iterable of raw document strings.

    Returns:
        List of normalised strings, one per input document, in input order.
    """
    # Translation table mapping "\n", "\t" and "\r" to a single space each.
    whitespace_to_space = str.maketrans("\n\t\r", "   ")

    def normalise(raw_doc):
        cleaned = strip_html_tags(raw_doc).translate(whitespace_to_space)
        cleaned = remove_accented_chars(cleaned.lower())
        cleaned = contractions.fix(cleaned)
        cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned, flags=re.I|re.A)
        return re.sub(' +', ' ', cleaned).strip()

    return [normalise(doc) for doc in tqdm(docs)]

In [None]:
%%time

norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|██████████| 18113/18113 [00:06<00:00, 2821.79it/s]
100%|██████████| 4529/4529 [00:02<00:00, 2053.77it/s]

CPU times: user 6.32 s, sys: 254 ms, total: 6.57 s
Wall time: 8.66 s





In [None]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder

In [None]:
# Word-level tokenisation of the normalised reviews.
tokenized_train = list(map(nltk.word_tokenize, tqdm(norm_train_reviews)))
tokenized_test = list(map(nltk.word_tokenize, tqdm(norm_test_reviews)))

# Encode the labels; the encoder is fitted on the training labels only and
# then re-used unchanged for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)

100%|██████████| 18113/18113 [00:12<00:00, 1399.68it/s]
100%|██████████| 4529/4529 [00:03<00:00, 1493.63it/s]


In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
%%time
# Dimensionality of the word vectors; reused below as the input width of the
# dense classifier.
w2v_num_features = 300
# NOTE(review): `size=` and `iter=` look like the gensim 3.x keyword names
# (gensim 4 calls them `vector_size` / `epochs`) — confirm the installed version.
# NOTE(review): window=150 is larger than the ~63 tokens per review implied by
# the training log, so presumably the whole review is meant to serve as
# context for every word — TODO confirm this is intentional.
w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,
                                   min_count=2, workers=4, iter=10)    

2021-12-26 22:09:40,277 : INFO : collecting all words and their counts
2021-12-26 22:09:40,283 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-12-26 22:09:40,697 : INFO : PROGRESS: at sentence #10000, processed 634287 words, keeping 16745 word types
2021-12-26 22:09:40,920 : INFO : collected 23315 word types from a corpus of 1146044 raw words and 18113 sentences
2021-12-26 22:09:40,927 : INFO : Loading a fresh vocabulary
2021-12-26 22:09:41,006 : INFO : effective_min_count=2 retains 9845 unique words (42% of original 23315, drops 13470)
2021-12-26 22:09:41,008 : INFO : effective_min_count=2 leaves 1132574 word corpus (98% of original 1146044, drops 13470)
2021-12-26 22:09:41,092 : INFO : deleting the raw counts dictionary of 23315 items
2021-12-26 22:09:41,100 : INFO : sample=0.001 downsamples 57 most-common words
2021-12-26 22:09:41,108 : INFO : downsampling leaves estimated 770433 word corpus (68.0% of prior 1132574)
2021-12-26 22:09:41,232 : INFO : es

CPU times: user 1min 29s, sys: 370 ms, total: 1min 29s
Wall time: 49.7 s


In [None]:
def averaged_doc_vectorizer(corpus, model, num_features):
    """Embed each tokenised document as the mean of its in-vocabulary word vectors.

    Args:
        corpus: iterable of documents, each an iterable of string tokens.
        model: object exposing its word vectors as ``model.wv``; ``model.wv``
            must be indexable by token and list its vocabulary in
            ``index_to_key`` (gensim >= 4) or ``index2word`` (gensim < 4).
        num_features: dimensionality of the word vectors.

    Returns:
        ``np.ndarray`` holding one float64 vector of length ``num_features``
        per document. Documents without any in-vocabulary token are
        represented by an all-zero vector.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed ``index2word`` to ``index_to_key`` and raises on the
    # old attribute, so probe the new name first and fall back to the old one.
    vocab_words = getattr(keyed_vectors, 'index_to_key', None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    vocabulary = set(vocab_words)

    def average_word_vectors(words, model, vocabulary, num_features):
        # Accumulate in float64 and count only the tokens the model knows.
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.

        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        # Guard against division by zero for documents with no known token.
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [None]:
# Document-level feature matrices for both splits, built with the same
# Word2Vec model (train first, then test).
avg_w2v_train_features, avg_w2v_test_features = (
    averaged_doc_vectorizer(corpus=tokens, model=w2v_model, num_features=w2v_num_features)
    for tokens in (tokenized_train, tokenized_test)
)

In [None]:
print('Word2Vec model:> Train features shape:', avg_w2v_train_features.shape, 
      ' Test features shape:', avg_w2v_test_features.shape)

Word2Vec model:> Train features shape: (18113, 300)  Test features shape: (4529, 300)


In [None]:
def construct_deepnn_architecture(num_input_features):
    """Build and compile the dense binary classifier.

    Architecture: three hidden blocks of Dense(512 / 256 / 256) + ReLU +
    Dropout(0.2), followed by a single sigmoid output unit. The model is
    compiled with binary cross-entropy, the Adam optimizer and accuracy.

    Args:
        num_input_features: length of each input feature vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    dnn_model = Sequential()

    for block_index, units in enumerate((512, 256, 256)):
        # Only the first Dense layer declares the input shape.
        if block_index == 0:
            dnn_model.add(Dense(units, input_shape=(num_input_features,)))
        else:
            dnn_model.add(Dense(units))
        dnn_model.add(Activation('relu'))
        dnn_model.add(Dropout(0.2))

    # Single probability output for the binary target.
    dnn_model.add(Dense(1))
    dnn_model.add(Activation('sigmoid'))

    dnn_model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
    return dnn_model

In [None]:
w2v_dnn = construct_deepnn_architecture(num_input_features=w2v_num_features)

In [None]:
w2v_dnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               154112    
                                                                 
 activation (Activation)     (None, 512)               0         
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 activation_1 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 256)               6

In [None]:
batch_size = 64
# Train on the averaged Word2Vec features; validation_split=0.1 reserves 10%
# of the training arrays for per-epoch validation. The History object returned
# by fit() is the cell's displayed output.
w2v_dnn.fit(avg_w2v_train_features, y_train, epochs=15, batch_size=batch_size, 
            shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f3097dd70d0>

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Threshold the sigmoid outputs at 0.5, then map the class indices back to
# the original label values.
predict_probas = w2v_dnn.predict(avg_w2v_test_features).ravel()
predictions = le.inverse_transform((predict_probas > 0.5).astype(int))

In [None]:
labels = le.classes_.tolist()
print(classification_report(test_labels, predictions))
pd.DataFrame(confusion_matrix(test_labels, predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.71      0.73      0.72       819
           1       0.94      0.94      0.94      3710

    accuracy                           0.90      4529
   macro avg       0.83      0.83      0.83      4529
weighted avg       0.90      0.90      0.90      4529



Unnamed: 0,0,1
0,598,221
1,240,3470


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

seed = 42
# Seed TensorFlow as well as NumPy: the Keras models built below rely on
# TensorFlow's own random state, which np.random.seed() does not control.
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Fit the vocabulary on the training reviews only; '<UNK>' is the token used
# for out-of-vocabulary words.
t = Tokenizer(oov_token='<UNK>')
t.fit_on_texts(norm_train_reviews)
# Register an explicit entry for id 0 so that len(t.word_index) also counts
# the padding id — presumably 0 is otherwise unused by the Tokenizer; TODO confirm.
t.word_index['<PAD>'] = 0

In [None]:
train_sequences = t.texts_to_sequences(norm_train_reviews)

In [None]:
test_sequences = t.texts_to_sequences(norm_test_reviews)

In [None]:
print("Vocabulary size={}".format(len(t.word_index)))
print("Number of Documents={}".format(t.document_count))

Vocabulary size=23230
Number of Documents=18113


In [None]:
MAX_SEQUENCE_LENGTH = 250

In [None]:
# Bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries.
# NOTE(review): pad_sequences is called with its defaults, which presumably
# pad and truncate at the start of each sequence — confirm this is intended.
X_train = sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_test = sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_train.shape, X_test.shape

((18113, 250), (4529, 250))

In [None]:
num_classes=2 

In [None]:
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)

In [None]:
VOCAB_SIZE = len(t.word_index)

In [None]:
EMBED_SIZE = 300
EPOCHS=10
BATCH_SIZE=128

In [None]:
# 1D-CNN text classifier: trainable embedding -> three Conv1D + max-pool
# stages with 128 / 64 / 32 filters -> flatten -> dense head -> sigmoid.
model = Sequential([
    Embedding(VOCAB_SIZE, EMBED_SIZE, input_length=MAX_SEQUENCE_LENGTH),
    Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 300)          6969000   
                                                                 
 conv1d (Conv1D)             (None, 250, 128)          153728    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 125, 128)         0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 125, 64)           32832     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 62, 64)           0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 62, 32)           

In [None]:
# Early stopping on validation loss: give up after 2 epochs without
# improvement and restore the weights of the best epoch seen.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=2,
                                      restore_best_weights=True,
                                      verbose=1)

# 10% of the training data is held out for the validation metrics that the
# callback monitors.
model.fit(X_train, y_train, 
          validation_split=0.1,
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE,
          callbacks=[es], 
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 00004: early stopping


<keras.callbacks.History at 0x7f3099eb6410>

In [None]:
# Binarise the CNN's predicted probabilities at 0.5 and preview the first ten.
prediction_probs = model.predict(X_test, verbose=1).ravel()
predictions = (prediction_probs > 0.5).astype(int).tolist()
predictions[:10]



[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

labels = le.classes_.tolist()
print(classification_report(test_labels, predictions))
pd.DataFrame(confusion_matrix(test_labels, predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.72      0.79      0.75       819
           1       0.95      0.93      0.94      3710

    accuracy                           0.91      4529
   macro avg       0.84      0.86      0.85      4529
weighted avg       0.91      0.91      0.91      4529



Unnamed: 0,0,1
0,644,175
1,246,3464


In [None]:
EMBEDDING_DIM = 300
LSTM_DIM = 128

# LSTM text classifier: trainable embedding -> spatial dropout -> a single
# LSTM that returns only its last output -> dense head -> sigmoid.
model2 = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM,
                              input_length=MAX_SEQUENCE_LENGTH),
    tf.keras.layers.SpatialDropout1D(0.1),
    tf.keras.layers.LSTM(LSTM_DIM, return_sequences=False),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model2.compile(loss="binary_crossentropy", optimizer="adam",
               metrics=["accuracy"])
model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 250, 300)          6969000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 300)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 128)               219648    
                                                                 
 dense_6 (Dense)             (None, 256)               33024     
                                                                 
 dense_7 (Dense)             (None, 1)                 257       
                                                                 
Total params: 7,221,929
Trainable params: 7,221,929
Non-trainable params: 0
____________________________________________

In [None]:
# Training settings for the LSTM model (note: this cell reassigns the
# lower-case `batch_size` and the `EPOCHS` constant defined earlier).
batch_size = 128
EPOCHS = 10

# Same early-stopping policy as for the CNN: patience of 2 epochs on val_loss,
# restoring the best weights.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=2,
                                      restore_best_weights=True,
                                      verbose=1)

model2.fit(X_train, y_train, epochs=EPOCHS, batch_size=batch_size, 
          callbacks=[es],
          shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 00004: early stopping


<keras.callbacks.History at 0x7f3094f74bd0>

In [None]:
# Binarise the LSTM's predicted probabilities at 0.5 and preview the first ten.
prediction_probs = model2.predict(X_test, verbose=1).ravel()
predictions = (prediction_probs > 0.5).astype(int).tolist()
predictions[:10]



[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

labels = le.classes_.tolist()
print(classification_report(test_labels, predictions))
pd.DataFrame(confusion_matrix(test_labels, predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.71      0.72      0.71       819
           1       0.94      0.94      0.94      3710

    accuracy                           0.90      4529
   macro avg       0.83      0.83      0.83      4529
weighted avg       0.90      0.90      0.90      4529



Unnamed: 0,0,1
0,586,233
1,236,3474


In [None]:
EMBEDDING_DIM = 300
LSTM_DIM = 128

# Functional-API model: embedding -> two stacked bidirectional LSTMs ->
# two Dense(256) + Dropout(0.2) blocks -> sigmoid output.
inp = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
x = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, trainable=True)(inp)

# The first recurrent layer returns the whole sequence so the second one can
# consume it; the second returns only its last output.
for keep_sequence in (True, False):
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(LSTM_DIM, return_sequences=keep_sequence))(x)

for dense_block in range(2):
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=0.2)(x)

outp = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model3 = tf.keras.models.Model(inputs=inp, outputs=outp)

model3.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model3.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 250)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 250, 300)          6969000   
                                                                 
 bidirectional (Bidirectiona  (None, 250, 256)         439296    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              394240    
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 256)               65792     
                                                                 
 dropout_3 (Dropout)         (None, 256)               0     

In [None]:
batch_size = 100
# Fixed 5-epoch run; unlike the CNN and LSTM cells above, no early-stopping
# callback is passed here.
model3.fit(X_train, y_train, epochs=5, batch_size=batch_size, 
           shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f30915d6d90>

In [None]:
# Binarise the BiLSTM's predicted probabilities at 0.5 and preview the first ten.
prediction_probs = model3.predict(X_test, verbose=1).ravel()
predictions = (prediction_probs > 0.5).astype(int).tolist()
predictions[:10]



[1, 1, 1, 1, 1, 1, 0, 1, 1, 1]

In [None]:
labels = le.classes_.tolist()
print(classification_report(test_labels, predictions))
pd.DataFrame(confusion_matrix(test_labels, predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.68      0.72      0.70       819
           1       0.94      0.92      0.93      3710

    accuracy                           0.89      4529
   macro avg       0.81      0.82      0.81      4529
weighted avg       0.89      0.89      0.89      4529



Unnamed: 0,0,1
0,587,232
1,279,3431


In [None]:
# NOTE(review): this repeats the tokenizer fit done earlier in the notebook on
# the same norm_train_reviews (only the import path differs); it rebinds `t`.
t = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
t.fit_on_texts(norm_train_reviews)
# Explicit entry for the padding id 0, so len(t.word_index) includes it.
t.word_index['<PAD>'] = 0

In [None]:
train_sequences = t.texts_to_sequences(norm_train_reviews)
test_sequences = t.texts_to_sequences(norm_test_reviews)

In [None]:
MAX_SEQUENCE_LENGTH = 250
X_train = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_test = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_train.shape, X_test.shape

((18113, 250), (4529, 250))

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

--2021-12-26 23:32:19--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip.1’


2021-12-26 23:32:49 (21.9 MB/s) - ‘wiki-news-300d-1M.vec.zip.1’ saved [681808098/681808098]



In [None]:
!unzip wiki-news-300d-1M.vec.zip

Archive:  wiki-news-300d-1M.vec.zip
replace wiki-news-300d-1M.vec? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
  inflating: wiki-news-300d-1M.vec   


In [None]:
VOCAB_SIZE = len(t.word_index)
EMBED_SIZE = 300

In [None]:
word2idx = t.word_index
FASTTEXT_INIT_EMBEDDINGS_FILE = './wiki-news-300d-1M.vec'


def load_pretrained_embeddings(word_to_index, max_features, embedding_size, embedding_file_path):
    """
    Build an embedding matrix for ``word_to_index`` from a text embedding file.

    Each line of the file is split on single spaces into a word followed by
    its vector components. Lines of 100 characters or fewer (for example a
    short header line) are skipped.

    Args:
        word_to_index: mapping of word -> integer row index in the matrix.
        max_features: upper bound on the number of rows of the matrix.
        embedding_size: number of columns of the matrix; must match the
            length of the vectors stored in the file.
        embedding_file_path: path of the embedding file (read as UTF-8,
            undecodable bytes ignored).

    Returns:
        ``np.ndarray`` of shape
        ``(min(max_features, len(word_to_index)), embedding_size)``. Rows of
        words found in the file hold the pre-trained vector; all other rows
        are drawn from a normal distribution with the mean and standard
        deviation of all pre-trained values.
    """

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is closed as soon as parsing is done.
    with open(embedding_file_path, encoding="utf8", errors='ignore') as embedding_file:
        embeddings_index = dict(get_coefs(*row.split(" "))
                                for row in embedding_file
                                if len(row) > 100)

    # np.stack requires a real sequence; passing the dict view directly is
    # rejected by current NumPy releases.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    nb_words = min(max_features, len(word_to_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))

    for word, idx in word_to_index.items():
        # Compare against the actual number of rows: when max_features exceeds
        # len(word_to_index), an index could pass a max_features check and
        # still fall outside the matrix.
        if idx >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [None]:
ft_embeddings = load_pretrained_embeddings(word_to_index=word2idx, 
                                           max_features=VOCAB_SIZE, 
                                           embedding_size=EMBED_SIZE, 
                                           embedding_file_path=FASTTEXT_INIT_EMBEDDINGS_FILE)
ft_embeddings.shape

In [None]:
# CNN initialised with the fastText vectors (still trainable): embedding ->
# three Conv1D + max-pool stages with 256 / 128 / 64 filters -> flatten ->
# two Dense(256) + Dropout(0.25) blocks -> sigmoid.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBED_SIZE,
                              weights=[ft_embeddings],
                              trainable=True,
                              input_length=MAX_SEQUENCE_LENGTH),

    tf.keras.layers.Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),

    tf.keras.layers.Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),

    tf.keras.layers.Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),

    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
EPOCHS=5
BATCH_SIZE=128

In [None]:
# Train the fastText-initialised CNN. Only 2% of the training data is held
# out for validation here (the earlier models use 10%), and no early-stopping
# callback is passed.
model.fit(X_train, y_train, 
          validation_split=0.02,
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE, 
          shuffle=True,
          verbose=1)

Epoch 1/5

In [None]:
# Score the fastText-initialised CNN trained in the previous cell (`model`).
# This cell used to call `model3` (the BiLSTM), so the report that follows
# did not describe the model that had just been trained.
prediction_probs = model.predict(X_test, verbose=1).ravel()
predictions = [1 if prob > 0.5 else 0 for prob in prediction_probs]
predictions[:10]

In [None]:
labels = le.classes_.tolist()
print(classification_report(test_labels, predictions))
pd.DataFrame(confusion_matrix(test_labels, predictions), index=labels, columns=labels)

In [None]:
!pip install transformers

In [None]:
import transformers
transformers.__version__

In [None]:
# NOTE: this rebinds the name `tqdm` to the module, replacing the `tqdm`
# function imported earlier via `from tqdm import tqdm`. Code after this cell
# must call `tqdm.tqdm(...)`; earlier cells that call `tqdm(...)` directly
# will fail if re-run until the original import cell is executed again.
import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

%matplotlib inline
# Seed NumPy and TensorFlow for the BERT experiment.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
import tensorflow_hub as hub
# Report the runtime: library versions, execution mode and GPU visibility.
print("TF Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("TF Hub version: ", hub.__version__)
# tf.test.is_gpu_available() is deprecated; an empty list of physical GPU
# devices means no GPU is visible to TensorFlow.
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

In [None]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def create_bert_input_features(tokenizer, docs, max_seq_length):
    """Encode documents as fixed-length id / mask / segment inputs.

    Each document is tokenised with ``tokenizer.tokenize``, cut to at most
    ``max_seq_length - 2`` tokens, wrapped in ``[CLS]`` ... ``[SEP]``,
    converted to ids and right-padded with zeros up to ``max_seq_length``.

    Args:
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        docs: iterable of document strings.
        max_seq_length: target length of every encoded sequence.

    Returns:
        ``np.ndarray`` stacking three per-document matrices in this order:
        token ids, masks (1 for real tokens, 0 for padding) and segment ids
        (all zeros).
    """
    token_budget = max_seq_length - 2  # room left after [CLS] and [SEP]
    id_rows, mask_rows, segment_rows = [], [], []

    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
        pieces = ['[CLS]'] + tokenizer.tokenize(doc)[:token_budget] + ['[SEP]']
        piece_ids = tokenizer.convert_tokens_to_ids(pieces)

        real_count = len(piece_ids)
        pad_count = max(max_seq_length - real_count, 0)

        id_rows.append(piece_ids + [0] * pad_count)
        mask_rows.append([1] * real_count + [0] * pad_count)
        segment_rows.append([0] * max_seq_length)

    return np.array([id_rows, mask_rows, segment_rows])

In [None]:
# Fixed input length used for all three BERT input tensors.
MAX_SEQ_LENGTH = 500

# Three int32 inputs of length MAX_SEQ_LENGTH: token ids, masks, segment ids.
inp_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids")
inp_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks")
inp_segment = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_segment_ids")
inputs = [inp_id, inp_mask, inp_segment]

# Pre-trained 'bert-base-uncased' encoder applied to the three inputs.
hidden_state = transformers.TFBertModel.from_pretrained('bert-base-uncased')(inputs)
# NOTE(review): element 1 of the encoder output is taken to be the pooled
# sequence representation — confirm against the installed transformers version.
pooled_output = hidden_state[1]

# Classification head: two Dense(256) + Dropout(0.25) blocks.
dense1 = tf.keras.layers.Dense(256, activation='relu')(pooled_output)
drop1 = tf.keras.layers.Dropout(0.25)(dense1)
dense2 = tf.keras.layers.Dense(256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(0.25)(dense2)

# Single sigmoid unit for the binary target.
output = tf.keras.layers.Dense(1, activation='sigmoid')(drop2)

# Note: this rebinds `model`, replacing the CNN built earlier.
model = tf.keras.Model(inputs=inputs, outputs=output)

# Adam with a small learning rate (2e-5) for fine-tuning.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=2e-5, 
                                           epsilon=1e-08), 
              loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Encode the raw (not normalised) train_reviews / test_reviews; the returned
# array is unpacked along its first axis into ids, masks and segment ids.
train_features_ids, train_features_masks, train_features_segments = create_bert_input_features(tokenizer, 
                                                                                               train_reviews, 
                                                                                               max_seq_length=MAX_SEQ_LENGTH)

test_features_ids, test_features_masks, test_features_segments = create_bert_input_features(tokenizer, 
                                                                                         test_reviews, 
                                                                                         max_seq_length=MAX_SEQ_LENGTH)

print('Train Features:', train_features_ids.shape, train_features_masks.shape, train_features_segments.shape)
print('Test Features:', test_features_ids.shape, test_features_masks.shape, test_features_segments.shape)

In [None]:
# Fine-tune for 3 epochs with a batch size of 12.
# NOTE(review): the test split is passed as validation_data and is also the
# split scored in the final evaluation, so the validation metrics are not an
# independent check. No callback consumes them here, but consider a
# validation_split of the training data as in the other fit() calls.
model.fit([train_features_ids, 
           train_features_masks, 
           train_features_segments], y_train, 
          validation_data=([test_features_ids, 
                            test_features_masks, 
                            test_features_segments], y_test),
          epochs=3, 
          batch_size=12, 
          shuffle=True,
          verbose=1)

In [None]:
# Predict on the encoded test reviews and binarise the probabilities at 0.5.
predictions = (
    model.predict([test_features_ids,
                   test_features_masks,
                   test_features_segments], verbose=0).ravel() > 0.5
).astype(int).tolist()

In [None]:
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions)*100))
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))