In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's 'ggplot' style sheet to figures created after this point.
plt.style.use('ggplot')
import nltk

In [2]:
# Load the reviews CSV (path is relative to the notebook's working directory).
# Later cells read the text from its "Reviews" column.
df = pd.read_csv('../BA_reviews.csv')

In [3]:
# NLTK warm-up on the first review: split it into tokens, then tag each
# token with its part of speech.
example = df.loc[0, "Reviews"]

tokens = nltk.tokenize.word_tokenize(example)
print(f"Tokenizing:\n{tokens}\n\n")

pos_tag = nltk.tag.pos_tag(tokens)
print(f"POS Tagging:\n{pos_tag}")

Tokenizing:
['✅', 'Trip', 'Verified', '|', 'This', 'time', 'British', 'Airways', 'managed', 'to', 'get', 'everything', 'right', '.', 'The', 'price', 'of', 'the', 'tickets', 'was', 'reasonable', 'and', 'the', 'timing', 'was', 'perfect', '.', 'Boarding', 'was', 'smooth', 'and', 'the', 'seats', 'were', 'comfortable', '–', 'easy', 'reline', 'to', 'enable', 'dozing', 'and', 'sufficient', 'pitch', 'to', 'get', 'in', 'and', 'out', '.', 'The', 'table', 'was', 'firm', 'enough', 'to', 'allow', 'the', 'use', 'of', 'a', 'small', 'laptop', 'and', 'mouse', '.', 'A', 'pre-flight', 'soft', 'drink', 'or', 'Prosecco', 'was', 'offered', 'and', 'both', 'the', 'food', 'and', 'drink', 'were', 'more', 'than', 'sufficient', '.', 'The', 'best', 'part', 'was', 'the', 'cabin', 'crew', '–', 'two', 'in', 'PE', 'and', 'they', 'were', 'both', 'extremely', 'efficient', ',', 'pleasant', 'and', 'witty', '.', 'There', 'was', 'a', 'good', 'selection', 'of', 'movies', 'and', 'both', 'the', 'screen', 'and', 'the', 'headset

In [4]:
# One-time setup: uncomment and run once if the NE chunker model is not installed locally.
# nltk.download('maxent_ne_chunker')

In [5]:
# Group the POS-tagged tokens into named-entity chunks and pretty-print the resulting tree.
chunks = nltk.ne_chunk(pos_tag)
chunks.pprint()

(S
  ✅/JJ
  (PERSON Trip/NNP Verified/NNP)
  |/IN
  This/DT
  time/NN
  (GPE British/JJ)
  Airways/NNS
  managed/VBD
  to/TO
  get/VB
  everything/NN
  right/RB
  ./.
  The/DT
  price/NN
  of/IN
  the/DT
  tickets/NNS
  was/VBD
  reasonable/JJ
  and/CC
  the/DT
  timing/NN
  was/VBD
  perfect/JJ
  ./.
  Boarding/NNP
  was/VBD
  smooth/VBN
  and/CC
  the/DT
  seats/NNS
  were/VBD
  comfortable/JJ
  –/JJ
  easy/JJ
  reline/NN
  to/TO
  enable/VB
  dozing/NN
  and/CC
  sufficient/JJ
  pitch/NN
  to/TO
  get/VB
  in/IN
  and/CC
  out/IN
  ./.
  The/DT
  table/NN
  was/VBD
  firm/JJ
  enough/RB
  to/TO
  allow/VB
  the/DT
  use/NN
  of/IN
  a/DT
  small/JJ
  laptop/NN
  and/CC
  mouse/NN
  ./.
  A/DT
  pre-flight/JJ
  soft/JJ
  drink/NN
  or/CC
  (PERSON Prosecco/NNP)
  was/VBD
  offered/VBN
  and/CC
  both/DT
  the/DT
  food/NN
  and/CC
  drink/VB
  were/VBD
  more/JJR
  than/IN
  sufficient/NN
  ./.
  The/DT
  best/JJS
  part/NN
  was/VBD
  the/DT
  cabin/NN
  crew/VBD
  –/JJ
  two/CD
  i

In [6]:
# RoBERTa model
# (a transformer-based model)
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax

# softmax is a mathematical function that converts a vector of raw scores (logits) into probabilities.

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the tokenizer and the TensorFlow sequence-classification model from the
# same checkpoint id, named once so the two cannot drift apart.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME)





All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [8]:
# Score the single example review with RoBERTa.
# Truncate to 512 tokens, the same limit sentiment_score uses, so an over-long
# review is cut down instead of being passed to the model at full length.
encoded_text = tokenizer(example, return_tensors='tf', truncation=True, max_length=512)
output = model(**encoded_text)
# output[0] is the raw score (logits) tensor; [0] selects the only sequence in the batch.
scores = softmax(output[0][0].numpy())
# Index order is presumably 0 = negative, 1 = neutral, 2 = positive — confirm against the model card.
print(scores)

[0.00256089 0.02157878 0.97586036]


In [9]:
# Show the raw review text so the printed scores can be sanity-checked against it.
print(example)

✅ Trip Verified |   This time British Airways managed to get everything right. The price of the tickets was reasonable and the timing was perfect. Boarding was smooth and the seats were comfortable – easy reline to enable dozing and sufficient pitch to get in and out. The table was firm enough to allow the use of a small laptop and mouse. A pre-flight soft drink or Prosecco was offered and both the food and drink were more than sufficient. The best part was the cabin crew – two in PE and they were both extremely efficient, pleasant and witty. There was a good selection of movies and both the screen and the headset worked perfectly.


In [10]:
# Pull the review texts out as a NumPy array; this is the input later passed to sentiment_score.
arr = df['Reviews'].to_numpy()

In [14]:
def sentiment_score(reviews_arr, max_length=512):
    """Return RoBERTa sentiment probabilities for each review.

    Each review is tokenized and scored on its own (one model call per
    review, no batching) using the notebook-level ``tokenizer`` and
    ``model``. The model's raw scores are passed through ``softmax`` so
    every result is a probability vector.

    Parameters
    ----------
    reviews_arr : iterable of str
        Review texts to score, e.g. ``df['Reviews'].to_numpy()``.
    max_length : int, optional
        Maximum number of tokens kept per review; longer reviews are
        truncated. Defaults to 512, the value previously hard-coded here.

    Returns
    -------
    list of numpy.ndarray
        One 1-D probability array per review, in input order. The index
        order is presumably (negative, neutral, positive) — confirm
        against the model card.
    """
    sentiment_scores = []
    for review in reviews_arr:
        # Truncate so over-long reviews are cut to max_length tokens before
        # they reach the model.
        encoded_text = tokenizer(
            review, return_tensors='tf', truncation=True, max_length=max_length
        )
        output = model(**encoded_text)
        # output[0] is the raw score (logits) tensor; [0] selects the only
        # sequence in this batch of one.
        scores = output[0][0].numpy()
        sentiment_scores.append(softmax(scores))

    return sentiment_scores

In [15]:
# Score every review in `arr`; sentiment_score makes one model call per review.
sentiment_arr = sentiment_score(arr)

In [16]:
# Display the per-review probability vectors returned by sentiment_score.
sentiment_arr

[array([0.00256089, 0.02157878, 0.97586036], dtype=float32),
 array([0.03649494, 0.16164699, 0.80185807], dtype=float32),
 array([0.20736884, 0.34803206, 0.44459915], dtype=float32),
 array([0.8511471, 0.1311342, 0.0177187], dtype=float32),
 array([0.52278984, 0.40243828, 0.07477194], dtype=float32),
 array([0.5529072 , 0.3187073 , 0.12838544], dtype=float32),
 array([0.00614226, 0.02144133, 0.97241646], dtype=float32),
 array([0.9622746 , 0.03414048, 0.00358488], dtype=float32),
 array([0.00397399, 0.04117271, 0.95485336], dtype=float32),
 array([0.00261903, 0.02930059, 0.96808034], dtype=float32),
 array([0.03225765, 0.2139566 , 0.75378585], dtype=float32),
 array([0.8803897 , 0.10754035, 0.01206996], dtype=float32),
 array([0.8484601 , 0.13817504, 0.0133648 ], dtype=float32),
 array([0.9380296 , 0.05580938, 0.00616104], dtype=float32),
 array([0.92840797, 0.06528636, 0.00630567], dtype=float32),
 array([0.8656891 , 0.12037521, 0.01393568], dtype=float32),
 array([0.2689894 , 0.38624

In [21]:
# First entry of the second review's probability vector
# (presumably the negative-class probability — confirm the label order).
sentiment_arr[1][0]

0.036494944

In [19]:
# The review text that sentiment_arr[1] was computed from (results are in input order).
arr[1]

'✅ Trip Verified |   The seats were excellent, with a feel of much more room than the official seat pitch (I am 6ft 1in). We could sleep for a couple of hours (daytime flight), with the blinds down as they should be (no point booking a window seat in my view). I focus on seat quality much more than smiley faces and food (the food was middling, and the second meal, before landing, was distinctly sub-par).'