In [1]:
import tensorflow as tf
import pandas as pd
from transformers import DistilBertTokenizer, BertTokenizer, TFBertForSequenceClassification, BertConfig, TFDistilBertForSequenceClassification
from transformers import InputExample, InputFeatures
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rithesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rithesh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Three sentiment classes. Each review carries exactly one integer label and
# the model is compiled with sparse categorical cross-entropy, so the task is
# single-label classification rather than multi-label.
num_labels = 3
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="single_label_classification",
)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [3]:
# Print the layer-by-layer parameter summary of the loaded model.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,779
Trainable params: 66,955,779
Non-trainable params: 0
_________________________________________________________________


In [4]:
# Keep only the review text and its integer sentiment label. The columns are
# selected by name rather than by position so a change in the CSV layout fails
# loudly instead of silently picking the wrong columns. `.copy()` gives the
# preprocessing steps an independent frame to assign into.
dataset = pd.read_csv('BalancedReviews3Class.csv')
dataset = dataset[['reviews', 'sentiment']].copy()

In [8]:
# Display the loaded frame (pandas elides the middle rows in the rendered output).
dataset

Unnamed: 0,reviews,sentiment
0,"If you have an emergency and drive yourself, b...",2
1,See my tip. While I really like my doctor and...,2
2,very knowledgeable doctor and one who will fix...,1
3,I had question about my surgery. Called them f...,0
4,My mother went to see Dr. Mcgee (podiatry) at ...,2
...,...,...
12054,"As a dentist, I was very please with Dr. Oakes...",2
12055,I've come to this office for three different i...,0
12056,Absolutely horrible experience with this offic...,0
12057,We stopped doing IV therapy because it's very ...,1


In [9]:
def replace_punctuation(df):
    """Replace every character that is not a letter, digit, ``#`` or ``'`` with a space.

    The ``reviews`` column of ``df`` is overwritten in place and the same
    frame is returned so calls can be chained.
    """
    # The pattern is a regular expression, so request regex matching
    # explicitly: pandas 2.0 changed the default of ``regex`` to False, which
    # would make this call look for the literal text "[^a-zA-Z0-9#']" and
    # replace nothing.
    df["reviews"] = df["reviews"].str.replace("[^a-zA-Z0-9#']", " ", regex=True)
    return df

In [10]:
def remove_short_words(df):
    """Drop every whitespace-separated word of one or two characters from ``reviews``.

    The surviving words are re-joined with single spaces. The column is
    overwritten in place and the same frame is returned.
    """
    def _keep_long_words(text):
        kept = [word for word in text.split() if len(word) > 2]
        return ' '.join(kept)

    df['reviews'] = df['reviews'].apply(_keep_long_words)
    return df

In [11]:
def make_lowercase(df):
    """Lower-case every entry of ``reviews`` in place and return the same frame."""
    df['reviews'] = df['reviews'].apply(lambda text: text.lower())
    return df

In [12]:
# NLTK's English stop words plus the extra word 'doctor'. The extra word is
# added once here; extending the list inside remove_stopwords() appended a
# new 'doctor' entry on every call, so the list grew with each review.
stop_words = stopwords.words('english')
stop_words.extend(['doctor'])
# Frozen copy used for constant-time membership tests while filtering tokens.
_stop_word_set = frozenset(stop_words)

def remove_stopwords(rev):
    """Return ``rev`` with every stop-word token removed.

    The text is split with ``word_tokenize`` and the tokens that are not in
    the stop-word set are re-joined with single spaces. Matching is exact
    (case-sensitive), so the text is expected to be lower-cased beforehand.
    """
    review_tokenized = word_tokenize(rev)
    return " ".join(token for token in review_tokenized if token not in _stop_word_set)

def stopwords_processing(df):
    """Apply remove_stopwords to every entry of ``reviews`` in place; return the frame."""
    df['reviews'] = [remove_stopwords(r) for r in df['reviews']]
    return df

In [13]:
def nltk_to_wordnet_tag(nltk_tag):
    """Map a part-of-speech tag string to the matching WordNet constant.

    Tags beginning with ``J``, ``V``, ``N`` and ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields ``None``.
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    # No WordNet counterpart for this tag.
    return None

lemmatizer = WordNetLemmatizer()
def lemmatize_sentence(sentence):
    """Lemmatize ``sentence`` token by token, using POS tags to pick the lemma.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet category are lemmatized with that category; all other tokens
    are kept unchanged. The result is re-joined with single spaces.
    """
    lemmas = []
    for token, pos in nltk.pos_tag(word_tokenize(sentence)):
        wn_tag = nltk_to_wordnet_tag(pos)
        lemmas.append(token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag))
    return " ".join(lemmas)

def lemmatize(df):
    """Run lemmatize_sentence over every entry of ``reviews`` in place; return the frame."""
    df['reviews'] = [lemmatize_sentence(text) for text in df['reviews']]
    return df

In [14]:
def preprocess(df):
    """Run the full text-cleaning pipeline over ``df`` and return the result.

    The steps run in this order: punctuation replacement, short-word removal,
    lower-casing, stop-word removal, lemmatization. Each step receives the
    frame returned by the previous one.
    """
    steps = (
        replace_punctuation,
        remove_short_words,
        make_lowercase,
        stopwords_processing,
        lemmatize,
    )
    for step in steps:
        df = step(df)
    return df

In [15]:
# Clean the review text. The pipeline steps assign into the frame they are
# given, so the frame `dataset` referred to before this call is modified too.
dataset = preprocess(dataset)

  df["reviews"] = df["reviews"].str.replace("[^a-zA-Z0-9#']", " ")


In [16]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split. A fixed random_state makes the shuffle, and so the
# contents of both splits, identical across kernel restarts.
train, test = train_test_split(dataset, test_size = 0.3, shuffle = True, random_state = 42)

In [17]:
# Peek at the first rows of the training split.
train.head()

Unnamed: 0,reviews,sentiment
2031,leave first appointment saw elizabeth bleeker ...,1
10012,note covid pandemic refer endoscopy n't teledo...,2
4195,wait time ridiculous good doctor front dess sc...,0
5309,agree reviewer stay away place go 2010 routine...,0
7824,try fit many eye exam one day always run behin...,0


In [18]:
# Give both splits the generic column names used by the example-building step.
for split_frame in (train, test):
    split_frame.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

In [19]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Wrap every row of the two frames in an ``InputExample``.

    Args:
        train: frame whose rows become the training examples.
        test: frame whose rows become the validation examples.
        DATA_COLUMN: name of the column holding the text (stored as ``text_a``).
        LABEL_COLUMN: name of the column holding the label.

    Returns:
        ``(train_examples, validation_examples)``, each the result of a
        row-wise ``DataFrame.apply`` over the corresponding frame.
    """
    def _row_to_example(row):
        # Single-sentence task: no guid and no second text segment.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    train_examples = train.apply(_row_to_example, axis=1)
    validation_examples = test.apply(_row_to_example, axis=1)
    return train_examples, validation_examples

# Names of the text and label columns in both splits.
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train=train,
    test=test,
    DATA_COLUMN=DATA_COLUMN,
    LABEL_COLUMN=LABEL_COLUMN,
)

In [20]:
# DISTILBERT
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` (the text) and
            ``label`` attributes.
        tokenizer: Hugging Face tokenizer, called once per example.
        max_length: every sequence is truncated or padded to exactly this
            many tokens.

    Returns:
        A dataset whose elements are
        ``({"input_ids": int32[...], "attention_mask": int32[...]}, label)``
        with ``label`` an int64 scalar.
    """
    features = []
    for e in examples:
        # token_type_ids are not requested: they were computed before but
        # never yielded to the model.
        encoded = tokenizer(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=False,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
        )
        features.append(
            InputFeatures(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                label=e.label,
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                },
                f.label,
            )

    # `output_signature` replaces the deprecated positional
    # output_types/output_shapes pair; dtypes and shapes are unchanged.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32),
                "attention_mask": tf.TensorSpec(shape=(None,), dtype=tf.int32),
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )

# Input-pipeline settings.
SHUFFLE_BUFFER = 100  # number of examples held in the shuffle buffer
BATCH_SIZE = 32
TRAIN_REPEATS = 2     # how many times the training set is repeated in train_data

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE).repeat(TRAIN_REPEATS)

# Validation data is only batched: no shuffling or repetition.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(BATCH_SIZE)

In [21]:
# # BERT
# def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
#     features = [] 
#     for e in examples:
#         # Documentation is really strong for this method, so please take a look at it
#         input_dict = tokenizer.encode_plus(
#             e.text_a,
#             add_special_tokens=True,
#             max_length=max_length, # truncates if len(s) > max_length
#             return_token_type_ids=True,
#             return_attention_mask=True,
#             padding='max_length', # pads to the right by default # CHECK THIS for pad_to_max_length
#             truncation=True
#         )

#         input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
#             input_dict["token_type_ids"], input_dict['attention_mask'])
        
#         features.append(
#             InputFeatures(
#                 input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
#             )
#         )

#     def gen():
#         for f in features:
#             yield (
#                 {
#                     "input_ids": f.input_ids,
#                     "attention_mask": f.attention_mask,
#                     "token_type_ids": f.token_type_ids,
#                 },
#                 f.label,
#             )

#     return tf.data.Dataset.from_generator(
#         gen,
#         ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
#         (
#             {
#                 "input_ids": tf.TensorShape([None]),
#                 "attention_mask": tf.TensorShape([None]),
#                 "token_type_ids": tf.TensorShape([None]),
#             },
#             tf.TensorShape([]),
#         ),
#     )

# train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# train_data = train_data.shuffle(100).batch(32).repeat(2)
# print(type(train_data))

# validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
# validation_data = validation_data.batch(32)

In [4]:
# Fine-tuning setup: Adam with a small learning rate and gradient-norm
# clipping. The loss takes integer class labels and raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

In [23]:

# Fine-tune for two Keras epochs, evaluating on validation_data after each.
# train_data was built to repeat the training set twice, so each epoch makes
# two passes over the training examples.
model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 2/2


<keras.callbacks.History at 0x2138cf818b0>

In [24]:
# Persist only the fine-tuned weights to 'my_model_weights.h5'.
model.save_weights('my_model_weights.h5')

In [5]:
# Load the weights stored in 'my_model_weights.h5' into the current model.
model.load_weights('my_model_weights.h5')

In [6]:
# Export the whole model to the 'DistilBert' directory.
model.save('DistilBert')

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089




INFO:tensorflow:Assets written to: DistilBert\assets


INFO:tensorflow:Assets written to: DistilBert\assets


In [10]:
# Hand-written reviews used as a quick sanity check of the classifier.
pred_sentences = [
    'Very good doctor',
    'the doctor was as good as my 6 year old',
    'the clinic was clean but the doctor could be better tbh',
    'can safely say that this is an bad place',
    'is this doctor even for real? I cannot believe this service, so good',
    'poorest service',
]

In [11]:
# NOTE(review): these sentences are tokenized exactly as typed, whereas the
# training text went through preprocess() first. Confirm this mismatch is
# intended.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
print(tf_outputs)

# The first element of the model output is the logits tensor; softmax turns
# each row into per-class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
print(float(tf_predictions[1][2]),"\n")

# Class index -> display name. Presumably this matches the 0/1/2 encoding of
# the training labels; verify against the dataset.
labels = ['Negative','Positive','Neutral']
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_index in zip(pred_sentences, label):
    print(sentence, ": \n", labels[class_index])

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(6, 3), dtype=float32, numpy=
array([[-3.621418  ,  2.4831648 ,  0.74470097],
       [-2.86753   ,  0.46841314,  2.1549695 ],
       [-3.2287614 ,  1.4268769 ,  1.4378314 ],
       [ 4.0951295 , -3.0556383 , -1.0528983 ],
       [-1.3185476 , -0.47535902,  1.4856945 ],
       [ 3.7923725 , -3.4539533 , -0.4716779 ]], dtype=float32)>, hidden_states=None, attentions=None)
0.8391063213348389 

Very good doctor : 
 Positive
the doctor was as good as my 6 year old : 
 Neutral
the clinic was clean but the doctor could be better tbh : 
 Neutral
can safely say that this is an bad place : 
 Negative
is this doctor even for real? I cannot believe this service, so good : 
 Neutral
poorest service : 
 Negative


In [52]:
#sentiment score computation function