In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/87/df/ab6d927d6162657f30eb0ae3c534c723c28c191a9caf6ee68ec935df3d0b/bert-for-tf2-0.14.5.tar.gz (40kB)
[K     |████████                        | 10kB 24.5MB/s eta 0:00:01[K     |████████████████                | 20kB 2.9MB/s eta 0:00:01[K     |████████████████████████▏       | 30kB 3.9MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 2.4MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import keras.backend as K
import bert

In [None]:
import pandas as pd 
import numpy as np 
import re
import math
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans 
import matplotlib.pyplot as plt

In [None]:
# Load the labelled tweets; later cells read the `tweet` and `label` columns of this frame.
# NOTE(review): hard-coded absolute Colab path — presumably requires Google Drive to be
# mounted at /content/drive first; no mount cell is visible in this notebook. Confirm.
train = pd.read_csv('/content/drive/My Drive/parto tech/sentiment_analysis/twitter/train.csv') 

In [None]:
def remove_at_and_hashtag(text):
    """Blank out ``@user`` mentions and ``#`` signs in a tweet.

    Each occurrence is replaced by a single space. Only the hash character
    itself is dropped, so the word that followed it stays in the text.
    """
    # TODO : use hashtags
    for marker in ('@user', '#'):
        text = text.replace(marker, ' ')
    return text

In [None]:
def preprocess_text(sen):
    """Clean one tweet down to letters separated by single spaces.

    Steps, in order: strip ``@user`` / ``#`` markers, turn every non-letter
    into a space, drop isolated single letters, collapse whitespace runs.

    Args:
        sen: the raw tweet text (a ``str``).

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to at
        most one space but not stripped.
    """
    sentence = remove_at_and_hashtag(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal.
    # Lookarounds are used instead of consuming the surrounding whitespace:
    # a consuming pattern (\s+X\s+) eats the separator that the next single
    # letter needs, so in "x a b c y" only every other letter was removed.
    # A letter at the very start/end of the string (no whitespace on one
    # side) is still kept, as before.
    sentence = re.sub(r"(?<=\s)[a-zA-Z](?=\s)", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [None]:
# Clean every tweet element-wise; the `tweet` column is overwritten with the cleaned text.
train['tweet'] = train['tweet'].map(preprocess_text)

In [None]:
# Plain Python list of the cleaned tweet strings, in row order.
tweets = train['tweet'].tolist()

In [None]:
# Build a tokenizer from the assets bundled with a TF-Hub BERT module.
# Alias for the tokenizer class provided by the `bert` package.
BertTokenizer = bert.bert_tokenization.FullTokenizer
# Load the hub module with frozen weights (trainable=False). In the cells visible
# here the layer is only used as the source of the tokenizer assets below.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Vocabulary file path and lower-casing flag, read from the loaded module.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Module-level tokenizer used by `tokenize_tweets` and for the vocabulary size.
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [None]:
def tokenize_tweets(text_tweets):
    """Turn one tweet string into vocabulary ids via the module-level `tokenizer`.

    The text is first split into tokens, then each token is mapped to its id.
    """
    tokens = tokenizer.tokenize(text_tweets)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

In [None]:
# One list of token ids per tweet, in the same order as `tweets`.
tokenized_tweets = list(map(tokenize_tweets, tweets))

In [None]:
# Length of the longest cleaned tweet, measured in characters (not tokens).
train['tweet'].map(len).max()

138

In [None]:
# Fixed length every id sequence is padded (or truncated) to. 138 is the longest
# cleaned tweet in characters, as reported by the length check on `train.tweet`.
# NOTE(review): presumably this also bounds the token count per tweet — confirm,
# and update the constant if the data or preprocessing changes.
MAX_SEQ_LEN = 138

y = np.array(train.label)
# Each entry is [ids right-padded with 0 to MAX_SEQ_LEN, label, unpadded token count];
# the third field is the sort key used later.
tweets_with_len = [
    [list(review[:MAX_SEQ_LEN]) + [0] * (MAX_SEQ_LEN - len(review)), y[i], len(review)]
    for i, review in enumerate(tokenized_tweets)
]

In [None]:
# Order the entries by their unpadded token count (third field), shortest first.
tweets_with_len = sorted(tweets_with_len, key=lambda entry: entry[2])

In [None]:
# Drop the length field: keep only (padded ids, label) pairs, still in sorted order.
sorted_tweets_labels = [(ids, label) for ids, label, _ in tweets_with_len]

In [None]:
# Wrap the (ids, label) pairs in a tf.data pipeline. The lambda hands the list to
# from_generator as the iterable to draw elements from; both components are int32.
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tweets_labels, output_types=(tf.int32, tf.int32))

In [None]:
# Number of examples per batch.
BATCH_SIZE = 32
# padded_shapes: ids are padded along their single axis to the longest sequence in
# the batch, labels are scalars. The ids were already padded to a fixed length when
# `tweets_with_len` was built, so no extra padding should be added here.
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [None]:
# Hold out the first 10% of the (shuffled) batches for testing, train on the rest.
TOTAL_BATCHES = math.ceil(len(sorted_tweets_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
# Dataset.shuffle returns a NEW dataset; the original call discarded it, so the
# length-sorted batches were never shuffled and the test split was simply the
# shortest tweets. reshuffle_each_iteration=False keeps one fixed order across
# iterations, otherwise take()/skip() would select different batches on every
# pass and test batches would leak into training. The seed makes the split repeatable.
shuffled_dataset = batched_dataset.shuffle(TOTAL_BATCHES,
                                           seed=42,
                                           reshuffle_each_iteration=False)
test_data = shuffled_dataset.take(TEST_BATCHES)
train_data = shuffled_dataset.skip(TEST_BATCHES)

In [None]:
class TEXT_MODEL(tf.keras.Model):
    """Multi-kernel 1D-CNN text classifier over token ids.

    Token ids are embedded, passed through three parallel Conv1D branches
    (kernel sizes 2, 3 and 4), each branch is global-max-pooled, and the pooled
    features are concatenated and fed through two dense+dropout stages to the
    output layer.

    Args:
        vocabulary_size: number of rows in the embedding table.
        embedding_dimensions: width of each embedding vector.
        cnn1_filters: filter count of the kernel-size-2 branch.
        cnn2_filters: filter count of the kernel-size-3 branch.
        cnn3_filters: filter count of the kernel-size-4 branch.
        dnn_units: units in each of the two hidden dense layers.
        model_output_classes: 2 builds a single sigmoid unit; any other value
            builds a softmax layer with that many units.
        dropout_rate: rate shared by both dropout layers.
        training: unused by the constructor; kept only so existing callers
            that pass it keep working.
        name: Keras model name.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn1_filters=16,
                 cnn2_filters=32,
                 cnn3_filters=64,
                 dnn_units=256,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        # Three parallel convolutions with different window widths (2, 3, 4 tokens).
        # NOTE(review): with "valid" padding the input presumably needs at least
        # 4 positions for the widest kernel — confirm against the fed sequences.
        self.cnn_layer1 = layers.Conv1D(filters=cnn1_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn2_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn3_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # One pooling layer instance is shared by all three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation=tf.nn.leaky_relu)
        self.dense_2 = layers.Dense(units=dnn_units, activation=tf.nn.leaky_relu)
        self.dropout_1 = layers.Dropout(rate=dropout_rate)
        self.dropout_2 = layers.Dropout(rate=dropout_rate)

        # Binary task -> one sigmoid probability; otherwise a softmax over classes.
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: integer token ids (assumed shape (batch, sequence) — TODO confirm).
            training: forwarded to both dropout layers. Defaults to None so the
                model can also be called directly without passing it.

        Returns:
            The output of the final dense layer (sigmoid or softmax, see class doc).
        """
        embedded = self.embedding(inputs)

        # Each branch: convolution followed by max over the sequence axis.
        pooled_1 = self.pool(self.cnn_layer1(embedded))
        pooled_2 = self.pool(self.cnn_layer2(embedded))
        pooled_3 = self.pool(self.cnn_layer3(embedded))

        features = tf.concat([pooled_1, pooled_2, pooled_3], axis=-1)
        features = self.dense_1(features)
        # Both dropout layers receive the flag explicitly (previously only the second did).
        features = self.dropout_1(features, training=training)
        features = self.dense_2(features)
        features = self.dropout_2(features, training=training)
        model_output = self.last_dense(features)

        return model_output

In [None]:
# Hyper-parameters passed to TEXT_MODEL and to training.
VOCAB_LENGTH = len(tokenizer.vocab)  # embedding rows = tokenizer vocabulary size
EMB_DIM = 200  # embedding vector width
CNN1_FILTERS = 16  # filters in the kernel-size-2 branch
CNN2_FILTERS = 32  # filters in the kernel-size-3 branch
CNN3_FILTERS = 64  # filters in the kernel-size-4 branch
DNN_UNITS = 256  # units in each hidden dense layer
OUTPUT_CLASSES = 2  # 2 selects the single-unit sigmoid head and the binary loss

DROPOUT_RATE = 0.2  # rate used by both dropout layers

NB_EPOCHS = 5  # epochs passed to fit()

In [None]:
# Instantiate the classifier with the hyper-parameters defined above.
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn1_filters=CNN1_FILTERS,
                        cnn2_filters=CNN2_FILTERS,
                        cnn3_filters=CNN3_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [None]:

def get_f1(y_true, y_pred):
    """F1 score of the positive class over the values passed in.

    Args:
        y_true: ground-truth 0/1 labels, any shape/dtype, with as many
            elements as `y_pred`.
        y_pred: predicted scores; values are clipped to [0, 1] and rounded
            to obtain hard 0/1 predictions.

    Returns:
        A scalar tensor: 2 * precision * recall / (precision + recall), with
        `K.epsilon()` added to each denominator to avoid division by zero.
    """
    # Align both arguments before the elementwise product: without this a
    # rank-1 label vector (N,) against (N, 1) predictions broadcasts to (N, N),
    # and integer labels cannot be multiplied with float predictions.
    y_true = K.flatten(K.cast(y_true, K.floatx()))
    y_pred = K.flatten(K.cast(y_pred, K.floatx()))

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [None]:
# Choose the loss and metric that match the output head: binary cross-entropy
# with F1 for the two-class case, sparse categorical ones otherwise.
if OUTPUT_CLASSES != 2:
    compile_kwargs = dict(loss="sparse_categorical_crossentropy",
                          metrics=["sparse_categorical_accuracy"])
else:
    compile_kwargs = dict(loss="binary_crossentropy",
                          metrics=[get_f1])

text_model.compile(optimizer="adam", **compile_kwargs)

In [None]:
# Train on the training batches for NB_EPOCHS epochs; no validation data is passed.
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe892718a20>

In [None]:
# Print the layer list and parameter counts of the built model.
text_model.summary()

Model: "text_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      multiple                  6104400   
_________________________________________________________________
conv1d_9 (Conv1D)            multiple                  6416      
_________________________________________________________________
conv1d_10 (Conv1D)           multiple                  19232     
_________________________________________________________________
conv1d_11 (Conv1D)           multiple                  51264     
_________________________________________________________________
global_max_pooling1d_3 (Glob multiple                  0         
_________________________________________________________________
dense_9 (Dense)              multiple                  28928     
_________________________________________________________________
dense_10 (Dense)             multiple                  6

In [None]:
# Features-only view of the test batches (labels dropped).
x_test = test_data.map(lambda ids, _label: ids)

In [None]:
# Print every test batch, then the number of batches seen.
count = 0  # stays 0 when test_data yields nothing
for count, tensor in enumerate(test_data, start=1):
    print(tensor)

print(count)

(<tf.Tensor: shape=(32, 138), dtype=int32, numpy=
array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [4566,    0,    0, ...,    0,    0,    0],
       ...,
       [5353,    0,    0, ...,    0,    0,    0],
       [2022,    0,    0, ...,    0,    0,    0],
       [4465,    0,    0, ...,    0,    0,    0]], dtype=int32)>, <tf.Tensor: shape=(32,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=int32)>)
(<tf.Tensor: shape=(32, 138), dtype=int32, numpy=
array([[2204,    0,    0, ...,    0,    0,    0],
       [5353,    0,    0, ...,    0,    0,    0],
       [2200,    0,    0, ...,    0,    0,    0],
       ...,
       [2265, 2154,    0, ...,    0,    0,    0],
       [2467, 2022,    0, ...,    0,    0,    0],
       [2092, 2589,    0, ...,    0,    0,    0]], dtype=int32)>, <tf.Tensor: shape=(32,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Predicted probabilities for all test examples, one row per example.
results = text_model.predict(test_data)
# Materialise the labels eagerly, batch by batch. Passing a Dataset to get_f1 and
# calling list() on a symbolic tensor inside Dataset.map is what raised
# OperatorNotAllowedInGraphError.
y_test = np.concatenate([labels.numpy() for _, labels in test_data])
# get_f1 multiplies y_true and y_pred elementwise, so give the labels the same
# column layout and dtype as the single-unit sigmoid predictions.
y_test = y_test.reshape(-1, 1).astype(results.dtype)
print(get_f1(y_true=y_test, y_pred=results))

OperatorNotAllowedInGraphError: ignored