In [None]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from tqdm import tqdm

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer


# Load the hotel reviews and append four empty (all-None) columns that the
# scoring cell fills in: the predicted label plus one probability per class.
data = pd.read_csv(
    "/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv"
).assign(
    sentiment=None,
    sent_positive=None,
    sent_neutral=None,
    sent_negative=None,
)

# Checkpoint name is derived from the task; `labels` is indexed by the
# position of each score in the model output.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
labels = ['negative', 'neutral', 'positive']

# Tokenizer and classifier come from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Upper bound on tokens passed to the model. Longer inputs are truncated so
# that long reviews are scored instead of failing in the forward pass.
# NOTE(review): 512 is the usual limit for RoBERTa-base checkpoints; confirm
# against the model config if the checkpoint changes.
MAX_TOKENS = 512


def score_review(text):
    """Return the class probabilities for a single review.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    numpy.ndarray
        Softmax of the model's output scores for the review; position ``i``
        corresponds to ``labels[i]``.

    Raises
    ------
    Exception
        Whatever the tokenizer or the model raises for an input it cannot
        handle (for example a non-string value).
    """
    encoded_input = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=MAX_TOKENS
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**encoded_input)
    return softmax(output[0][0].numpy())


# Score every review. Results are collected per row and written back in one
# aligned assignment per column, instead of chained `data[col].loc[r] = ...`
# writes, which pandas does not guarantee to propagate to `data`.
scored_rows = {}
failed_rows = {}
for r in tqdm(data.index):
    try:
        scores = score_review(data.loc[r, "Review"])
    except Exception as exc:
        # Best effort: one bad review must not abort the run, but keep a
        # record so the loss is visible (KeyboardInterrupt still propagates).
        failed_rows[r] = repr(exc)
        continue

    d = dict(zip(labels, scores))
    scored_rows[r] = {
        "sentiment": labels[int(np.argmax(scores))],
        "sent_positive": float(d["positive"]),
        "sent_neutral": float(d["neutral"]),
        "sent_negative": float(d["negative"]),
    }

scored = pd.DataFrame.from_dict(
    scored_rows,
    orient="index",
    columns=["sentiment", "sent_positive", "sent_neutral", "sent_negative"],
)
for column in scored.columns:
    # Index-aligned assignment: rows that could not be scored become NaN.
    data[column] = scored[column]

if failed_rows:
    first_index, first_error = next(iter(failed_rows.items()))
    print(
        f"{len(failed_rows)} of {len(data)} reviews could not be scored and will be dropped "
        f"(first failure at index {first_index}: {first_error})"
    )

# Drop rows with any missing value, which includes the unscored reviews.
data = data.dropna()

## Output analysis

Now let's take a look at the results.

In [None]:
import seaborn

# Number of reviews for each (Rating, sentiment) combination, as a flat table
# with columns Rating / sentiment / count.
groupby = (
    data[["Rating", "sentiment"]]
    .groupby(["Rating", "sentiment"])
    .size()
    .reset_index(name="count")
)

# Grouped bars: one cluster per star rating, one bar per predicted sentiment.
seaborn.barplot(data=groupby, x='Rating', y='count', hue='sentiment')

In [None]:
import scipy.stats

# Linear correlation between the star rating and the model's positive /
# negative class probabilities (only the coefficient is reported, not the p-value).
rating_as_int = data["Rating"].astype(int)

positive_r, _ = scipy.stats.pearsonr(rating_as_int, data["sent_positive"].astype(float))
print("Pearson correlation btw Rating and positive sentiment: ", positive_r)

negative_r, _ = scipy.stats.pearsonr(rating_as_int, data["sent_negative"].astype(float))
print("Pearson correlation btw Rating and negative sentiment: ", negative_r)

In [None]:
# Star rating against the positive-class probability, with a fitted regression line.
rating_values = data["Rating"].astype(int)
positive_scores = data["sent_positive"].astype(float)
seaborn.regplot(x=rating_values, y=positive_scores)

In [None]:
# Star rating against the negative-class probability, with a fitted regression line.
rating_values = data["Rating"].astype(int)
negative_scores = data["sent_negative"].astype(float)
seaborn.regplot(x=rating_values, y=negative_scores)