# Using ChatGPT to analyze hotel reviews
(c) Nuno Antonio 2023-2025<br>
Version 1.10 - 2025-05-02

### Import packages
If required, install packages using `%pip install <package name>` (the `%pip` magic installs into the environment of the running kernel).

In [1]:
import ast
import json
import os
import time
from getpass import getpass

import numpy as np
import pandas as pd
import requests
import ydata_profiling as ydp
from tqdm import tqdm

In [2]:
import openai                           # Documentation: https://github.com/openai/openai-python
import tiktoken                         # Documentation: https://github.com/openai/tiktoken
from skllm.config import SKLLMConfig    # !pip install scikit-llm

### OpenAI Setup

In [31]:
# OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable; when that is
# not set, it is requested with a hidden prompt. Either way the secret is never
# written into the notebook source or its saved outputs.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Give scikit-llm the same key
SKLLMConfig.set_openai_key(openai.api_key)

In [4]:
# OpenAI API URL (chat completions REST endpoint)
# NOTE(review): no visible cell references GPT_API_URL; the requests below go
# through the openai client instead — confirm whether this constant is still needed.
GPT_API_URL = "https://api.openai.com/v1/chat/completions"

In [5]:
# Check if API key is valid
def check_openai_api_key():
    """Return True when the configured OpenAI key is accepted, False otherwise.

    Sends a models-list request with the key currently set on the openai
    module. Only an authentication failure is reported as False; any other
    exception raised by the request propagates to the caller.
    """
    try:
        openai.models.list()
        return True
    except openai.AuthenticationError:
        return False

# Run the check once and report the outcome
is_valid = check_openai_api_key()
print("Valid OpenAI API key" if is_valid else "Invalid OpenAI API key")

Valid OpenAI API key


### OpenAI usage

Usage can also be checked online at https://platform.openai.com/usage

In [6]:
# API headers (bearer authentication with the configured key)
headers = {'Authorization': f'Bearer {openai.api_key}'}

# API endpoint
url = 'https://api.openai.com/v1/usage'

# Date for which to get usage data
params = {'date': '2025-05-01'}

# Send API request and get response.
# requests waits indefinitely by default, so an explicit timeout keeps the
# notebook from hanging if the endpoint never answers.
response = requests.get(url, headers=headers, params=params, timeout=30)

# Show the decoded JSON body of the response
print(response.json())

{'object': 'list', 'data': [], 'ft_data': [], 'dalle_api_data': [], 'whisper_api_data': [], 'tts_api_data': [], 'assistant_code_interpreter_data': [], 'retrieval_storage_data': []}


### Tokens counting

To count tokens online use: https://platform.openai.com/tokenizer


In [7]:
# Function to count tokens
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` produces under the tiktoken encoding `encoding_name`."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Example: token count of a short phrase under the cl100k_base encoding
num_tokens_from_string("The ministry of silly walks", "cl100k_base")

5

In [8]:
def compare_encodings(example_string: str) -> None:
    """Print how r50k_base, p50k_base and cl100k_base tokenize `example_string`.

    For each encoding the token count, the token ids and the raw bytes behind
    every token are printed. Nothing is returned.
    """
    print(f'\nExample string: "{example_string}"')

    for name in ("r50k_base", "p50k_base", "cl100k_base"):
        codec = tiktoken.get_encoding(name)
        ids = codec.encode(example_string)
        # Raw bytes behind each token id, in order
        pieces = list(map(codec.decode_single_token_bytes, ids))
        # One blank line, then the three report lines for this encoding
        print(
            "",
            f"{name}: {len(ids)} tokens",
            f"token integers: {ids}",
            f"token bytes: {pieces}",
            sep="\n",
        )
        
# Example: compare how the three encodings tokenize the same phrase
compare_encodings("Monty Python's The Ministry of Silly Walks")


Example string: "Monty Python's The Ministry of Silly Walks"

r50k_base: 11 tokens
token integers: [26031, 88, 11361, 338, 383, 9475, 286, 311, 6548, 6445, 591]
token bytes: [b'Mont', b'y', b' Python', b"'s", b' The', b' Ministry', b' of', b' S', b'illy', b' Wal', b'ks']

p50k_base: 11 tokens
token integers: [26031, 88, 11361, 338, 383, 9475, 286, 311, 6548, 6445, 591]
token bytes: [b'Mont', b'y', b' Python', b"'s", b' The', b' Ministry', b' of', b' S', b'illy', b' Wal', b'ks']

cl100k_base: 11 tokens
token integers: [35515, 88, 13325, 596, 578, 20214, 315, 8211, 398, 12839, 82]
token bytes: [b'Mont', b'y', b' Python', b"'s", b' The', b' Ministry', b' of', b' Sil', b'ly', b' Walk', b's']


### Load data

In [9]:
# Load dataset: read the reviews workbook into a DataFrame
# (the path is relative, so the file must be in the notebook's working directory)
ds = pd.read_excel("HotelsAlgarve2019.xlsx")

### Data understanding

In [10]:
# Describe: summary statistics for every column (numeric and non-numeric),
# transposed so that each dataset column is shown as a row
ds.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
HotelID,17814.0,,,,40.731391,37.557297,1.0,10.0,29.0,61.0,161.0
Language,17814.0,6.0,English,10744.0,,,,,,,
GlobalRating,17814.0,,,,4.186567,0.417821,2.0,4.0,4.0,4.5,5.0
TotalReviewOnSite,17814.0,,,,2085.740036,1516.716905,9.0,863.0,1632.0,3157.0,6361.0
ExtractedReviews,17814.0,,,,173.839901,198.799038,1.0,35.0,90.0,235.0,740.0
PositionInRegionRanking,17814.0,,,,15.514708,22.631497,0.0,4.0,7.0,17.0,152.0
SitesInRegionRanking,17814.0,,,,67.521107,60.573737,0.0,17.0,37.0,154.0,154.0
TripadvisorReviewID,17814.0,,,,680555525.995004,39073160.812104,17695721.0,666456189.25,684433245.0,705523729.0,751120809.0
ReviewRating,17814.0,,,,4.178792,1.070667,1.0,4.0,5.0,5.0,5.0
PublishedDate,17814.0,12.0,2019-08-01 00:00:00.000,2261.0,,,,,,,


In [11]:
# Create a profile report of the whole dataset with ydata_profiling
report = ydp.ProfileReport(ds)

# Show the report embedded in the notebook (rendered as an iframe)
report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 14/14 [00:02<00:00,  5.80it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

### Get a sample of the data
We work with a small sample of the reviews because of the rate limits of the free account and to keep the amount spent on API calls low.

In [12]:
# Calculate the number of samples per language based on the proportion in the dataframe
proportions = ds['Language'].value_counts(normalize=True)

# Get a stratified sample of 50 observations
sample_size = 50
sample_seed = 123   # fixed seed so that every run draws the same sample

# Draw each language's share separately. Iterating over the groups (instead of
# groupby.apply) avoids the pandas warning about apply operating on the
# grouping column, and the same seed is used for every group.
stratified_sample = pd.concat([
    group.sample(int(proportions[language] * sample_size), random_state=sample_seed)
    for language, group in ds.groupby('Language')
])

# To extract exactly 50 samples: int() truncates every share, so the strata can
# only add up to sample_size rows or fewer, never more. Any shortfall is topped
# up from the most represented language, using the same seed and drawing only
# rows that are not in the sample yet.
# (relies on ds having a unique index, e.g. the default one from read_excel)
missing = sample_size - len(stratified_sample)
if missing > 0:
    largest_language = stratified_sample['Language'].value_counts().idxmax()
    not_sampled_yet = ~ds.index.isin(stratified_sample.index)
    candidates = ds[(ds['Language'] == largest_language) & not_sampled_yet]
    additional_samples = candidates.sample(missing, random_state=sample_seed)
    stratified_sample = pd.concat([stratified_sample, additional_samples])

# Use a clean 0..n-1 index whether or not a top-up was needed
stratified_sample = stratified_sample.reset_index(drop=True)

  stratified_sample = ds.groupby('Language', group_keys=False).apply(


### Sentiment analysis

In [13]:
def analyze_review(review, model="gpt-3.5-turbo"):
    """Classify the overall sentiment of a hotel review with an OpenAI chat model.

    Parameters
    ----------
    review : str
        Text of the review to classify.
    model : str, default "gpt-3.5-turbo"
        Name of the chat model to query.

    Returns
    -------
    str
        "POSITIVE", "NEGATIVE" or "NEUTRAL". "NEUTRAL" is also the fallback
        when the model gives no recognizable label in three attempts.

    Notes
    -----
    Each raw reply is printed. The function sleeps 1 second after every
    unusable reply and 4 seconds before returning (rate-limit pacing).
    Exceptions raised by the OpenAI client are not caught.
    """
    valid_labels = ("POSITIVE", "NEGATIVE", "NEUTRAL")
    retries = 3
    sentiment = "NEUTRAL"   # fallback when no attempt yields a valid label

    # The prompt does not change between attempts, so it is built once
    messages = [
        {"role": "system", "content": "You are an AI language model trained to analyze and detect the sentiment of hotel reviews."},
        {"role": "user", "content": f"Analyze the following hotel review and determine if the sentiment is: positive, negative or neutral. Return only a single word, either POSITIVE, NEGATIVE or NEUTRAL: {review}"}
    ]

    while retries > 0:
        completion = openai.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=3,           # Limit the number of tokens in the output
            n=1,                    # Number of outputs
            stop=None,
            temperature=0           # Measure of entropy (temperature = 0 is more factual, temperature = 1 higher degree of creativity)
        )

        response_text = completion.choices[0].message.content
        print(response_text)

        # Normalize before comparing: the content can be None, and replies such
        # as "Positive", " POSITIVE" or "POSITIVE." are still valid answers
        # that an exact match would reject.
        label = (response_text or "").strip().strip(".,;:!\"'`*").upper()
        if label in valid_labels:
            sentiment = label
            break

        retries -= 1
        time.sleep(1)

    # add a delay of 4 seconds between requests to avoid hitting the free tier API call rate limit
    time.sleep(4)

    return sentiment

In [14]:
# Classify every sampled review (one API call each) and keep the labels in order
sentiments = [
    analyze_review(review)
    for review in tqdm(stratified_sample["FullText"], desc="Processing reviews")
]

# Attach the labels to the sample as a new column
stratified_sample["Sentiment"] = sentiments

# Save the labelled sample to a new Excel file, without the index column
output_file = "AnalyzedReviews.xlsx"
stratified_sample.to_excel(output_file, index=False)

Processing reviews:   0%|          | 0/50 [00:00<?, ?it/s]

NEGATIVE


Processing reviews:   2%|▏         | 1/50 [00:04<03:40,  4.50s/it]

POSITIVE


Processing reviews:   4%|▍         | 2/50 [00:08<03:35,  4.48s/it]

POSITIVE


Processing reviews:   6%|▌         | 3/50 [00:13<03:29,  4.46s/it]

NEGATIVE


Processing reviews:   8%|▊         | 4/50 [00:17<03:25,  4.47s/it]

NEUTRAL


Processing reviews:  10%|█         | 5/50 [00:22<03:20,  4.46s/it]

POSITIVE


Processing reviews:  12%|█▏        | 6/50 [00:26<03:15,  4.44s/it]

NEGATIVE


Processing reviews:  14%|█▍        | 7/50 [00:31<03:11,  4.44s/it]

POSITIVE


Processing reviews:  16%|█▌        | 8/50 [00:35<03:07,  4.47s/it]

POSITIVE


Processing reviews:  18%|█▊        | 9/50 [00:40<03:03,  4.48s/it]

POSITIVE


Processing reviews:  20%|██        | 10/50 [00:44<02:59,  4.49s/it]

POSITIVE


Processing reviews:  22%|██▏       | 11/50 [00:49<02:54,  4.47s/it]

POSITIVE


Processing reviews:  24%|██▍       | 12/50 [00:53<02:49,  4.46s/it]

POSITIVE


Processing reviews:  26%|██▌       | 13/50 [00:58<02:46,  4.49s/it]

NEGATIVE


Processing reviews:  28%|██▊       | 14/50 [01:02<02:41,  4.49s/it]

NEGATIVE


Processing reviews:  30%|███       | 15/50 [01:07<02:37,  4.50s/it]

POSITIVE


Processing reviews:  32%|███▏      | 16/50 [01:11<02:31,  4.46s/it]

POSITIVE


Processing reviews:  34%|███▍      | 17/50 [01:16<02:28,  4.51s/it]

POSITIVE


Processing reviews:  36%|███▌      | 18/50 [01:20<02:23,  4.49s/it]

POSITIVE


Processing reviews:  38%|███▊      | 19/50 [01:24<02:18,  4.46s/it]

NEGATIVE


Processing reviews:  40%|████      | 20/50 [01:29<02:13,  4.46s/it]

POSITIVE


Processing reviews:  42%|████▏     | 21/50 [01:33<02:09,  4.48s/it]

NEGATIVE


Processing reviews:  44%|████▍     | 22/50 [01:38<02:04,  4.46s/it]

NEGATIVE


Processing reviews:  46%|████▌     | 23/50 [01:42<02:01,  4.50s/it]

POSITIVE


Processing reviews:  48%|████▊     | 24/50 [01:47<01:57,  4.53s/it]

NEGATIVE


Processing reviews:  50%|█████     | 25/50 [01:52<01:52,  4.51s/it]

POSITIVE


Processing reviews:  52%|█████▏    | 26/50 [01:56<01:47,  4.48s/it]

POSITIVE


Processing reviews:  54%|█████▍    | 27/50 [02:01<01:43,  4.51s/it]

POSITIVE


Processing reviews:  56%|█████▌    | 28/50 [02:05<01:38,  4.50s/it]

NEGATIVE


Processing reviews:  58%|█████▊    | 29/50 [02:09<01:34,  4.48s/it]

POSITIVE


Processing reviews:  60%|██████    | 30/50 [02:14<01:29,  4.46s/it]

POSITIVE


Processing reviews:  62%|██████▏   | 31/50 [02:18<01:25,  4.49s/it]

NEGATIVE


Processing reviews:  64%|██████▍   | 32/50 [02:23<01:20,  4.50s/it]

POSITIVE


Processing reviews:  66%|██████▌   | 33/50 [02:27<01:16,  4.48s/it]

NEGATIVE


Processing reviews:  68%|██████▊   | 34/50 [02:32<01:11,  4.48s/it]

POSITIVE


Processing reviews:  70%|███████   | 35/50 [02:36<01:07,  4.50s/it]

NEGATIVE


Processing reviews:  72%|███████▏  | 36/50 [02:41<01:03,  4.52s/it]

NEGATIVE


Processing reviews:  74%|███████▍  | 37/50 [02:48<01:10,  5.40s/it]

POSITIVE


Processing reviews:  76%|███████▌  | 38/50 [02:53<01:01,  5.11s/it]

POSITIVE


Processing reviews:  78%|███████▊  | 39/50 [02:57<00:54,  4.92s/it]

POSITIVE


Processing reviews:  80%|████████  | 40/50 [03:02<00:48,  4.89s/it]

POSITIVE


Processing reviews:  82%|████████▏ | 41/50 [03:07<00:42,  4.77s/it]

POSITIVE


Processing reviews:  84%|████████▍ | 42/50 [03:11<00:37,  4.66s/it]

POSITIVE


Processing reviews:  86%|████████▌ | 43/50 [03:15<00:32,  4.58s/it]

POSITIVE


Processing reviews:  88%|████████▊ | 44/50 [03:20<00:27,  4.54s/it]

POSITIVE


Processing reviews:  90%|█████████ | 45/50 [03:24<00:22,  4.52s/it]

POSITIVE


Processing reviews:  92%|█████████▏| 46/50 [03:29<00:18,  4.52s/it]

POSITIVE


Processing reviews:  94%|█████████▍| 47/50 [03:33<00:13,  4.51s/it]

POSITIVE


Processing reviews:  96%|█████████▌| 48/50 [03:38<00:08,  4.50s/it]

POSITIVE


Processing reviews:  98%|█████████▊| 49/50 [03:42<00:04,  4.47s/it]

POSITIVE


Processing reviews: 100%|██████████| 50/50 [03:47<00:00,  4.55s/it]


In [15]:
# Let's analyze some examples: the first 10 reviews next to the sentiment assigned to each
stratified_sample[['FullText','Sentiment']].head(10)

Unnamed: 0,FullText,Sentiment
0,"This is in no way a 4 star hotel ,you have to ...",NEGATIVE
1,We've just returned from a fab time at this ho...,POSITIVE
2,A warm welcome awaited us on our arrival and w...,POSITIVE
3,The place was recommended to us a high quality...,NEGATIVE
4,We booked this hotel with Jet2 holidays for a ...,NEUTRAL
5,The recent renovation of the hotel has been do...,POSITIVE
6,We have never been to Portugal before and deci...,NEGATIVE
7,Stayed at the Baia Grande with my 5 year old d...,POSITIVE
8,The Hotel da Rocha ticks all the boxes. Situat...,POSITIVE
9,Great hotel. Great staff. Even if you don't go...,POSITIVE


In [None]:
# Check the text of a negative review
# (assumes row 0 was labeled NEGATIVE, as in the table displayed above —
# re-check the position after re-running the classification)
stratified_sample['FullText'].iloc[0]

'This is in no way a 4 star hotel ,you have to pay for a safe ,there is no tea or coffee making facilities in the room ,we had a dirty shower curtain not a shower screen,a plug that didn’t fit in the bath .The food is poor and not much choice but better at lunchtime than in the evening.There is absolutely nowhere nice to sit and have a drink .During the two weeks we were there the courtesy bus only ran once ,a very disappointing stay .'

In [None]:
# Check the text of a positive review
# (assumes row 5 was labeled POSITIVE, as in the table displayed above —
# re-check the position after re-running the classification)
stratified_sample['FullText'].iloc[5]

'The place was recommended to us a high quality family resort but we were very disappointed by the quality of housing, food and cleanliness overall. <br>The staff was very helpful and friendly which was the best part and helped us find outside activities which made the vacation bearable.'

### Understanding PROS and CONS

The function below reads the text in blocks of 2,200 words (the `block_size` set in the code; as a rule of thumb one token is about 0.75 words, so each block is roughly 2,900 tokens) and asks the model for the pros and cons that users mentioned in that block of text.<br>
The code then moves on to the next block of 2,200 words and extracts the pros and cons from it, repeating as necessary until all of the reviews have been processed.<br>
This chunking is necessary because of the limit on how much input text GPT-3.5 can handle at one time.

The model can be changed to gpt-4o-mini, which has a much larger context window (128,000 tokens). If you want, you can experiment with different values for `block_size` and with other model names.

In [18]:
def generate_proscons_list(text, block_size=2200, model="gpt-3.5-turbo", max_tokens=300):
    """Extract pros and cons from a long text, one block of words at a time.

    The text is split on single spaces into blocks of `block_size` words. Each
    block is sent to the chat model with a request for the most common pros
    and cons, and the answers are concatenated.

    Parameters
    ----------
    text : str
        Review text to analyze (for example, all reviews joined together).
    block_size : int, default 2200
        Number of space-separated words sent per request.
    model : str, default "gpt-3.5-turbo"
        Name of the OpenAI chat model to query.
    max_tokens : int, default 300
        Upper bound on the length of each answer; an answer that reaches the
        limit is cut off, so raise it if the lists come back truncated.

    Returns
    -------
    str
        The answers for all blocks, separated by blank lines. Empty when the
        text contains no words.

    Raises
    ------
    ValueError
        If `block_size` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    word_blocks = text.split(' ')
    blocks = [' '.join(word_blocks[i:i + block_size]) for i in range(0, len(word_blocks), block_size)]
    # Blocks with no content (empty or whitespace-only text) are not worth an API call
    blocks = [block for block in blocks if block.strip()]

    proscons = []

    for block in tqdm(blocks, desc="Processing blocks", unit="block"):
        messages = [
            {"role": "system", "content": "You are an AI language model trained to create a list of the most common pros and cons for hotel users based on online review summaries."},
            {"role": "user", "content": f"Based on the following hotel review summaries, create a list, in English, of the most common pros and cons for the hotels: {block}"}
        ]

        completion = openai.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,      # limits the output size
            n=1,
            stop=None,
            temperature=0.5             # limit the "creativity"
        )

        procon = completion.choices[0].message.content
        # A missing answer (None) would break the join below, so it is skipped
        if procon:
            proscons.append(procon)

    # Combine the pros and cons that chatGPT found into a single text
    return "\n\n".join(proscons)

In [19]:
# Merge the text of every sampled review into one newline-separated string
all_reviews = "\n".join(stratified_sample["FullText"])

# Extract the pros and cons block by block
summary_proscons = generate_proscons_list(all_reviews)

# Print the list of pros and cons (optional step)
print(summary_proscons)

Processing blocks: 100%|██████████| 4/4 [00:10<00:00,  2.74s/block]

Based on the hotel review summaries provided, here are the most common pros and cons for hotel users:

**Pros:**
1. Clean and comfortable rooms with nice views.
2. Friendly and helpful staff.
3. Good facilities for children.
4. Nice pool area.
5. Complimentary tea and cake.
6. Peaceful and scenic location.
7. Free shuttle bus service.
8. Variety of food options for breakfast and lunch.
9. Free WiFi.
10. Lovely lounge area.
11. Free shuttle bus to nearby towns.
12. Great for families with children.
13. Easy access to local attractions.
14. Lovely balcony views.
15. Immaculate cleanliness.
16. Attentive staff.

**Cons:**
1. Not a 4-star hotel standard.
2. Additional charges for amenities like a safe.
3. Lack of tea or coffee making facilities in rooms.
4. Inconsistent quality of food, limited choices.
5. Entertainment and bar facilities could be better.
6. Limited seating at entertainment venues.
7. Inconvenient location for some attractions.
8. Limited variety in the dinner menu.
9. Col




In [20]:
# Summarize the list of pros and cons: ask the model for a ranked top 10 of each
system_prompt = "You are an helpful assistant for text summarization."
user_prompt = f"Based on the list of pros and cons, return the top 10 pros and top 10 cons. Avoid repeating the same topics. Rank the topics based on the numbers of times they appear in the text: {summary_proscons}"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

# Sampling and length settings for this single request
request_options = dict(
    model="gpt-3.5-turbo",
    max_tokens=500,         # answers longer than this are cut off
    stop=None,
    temperature=0.7,
    frequency_penalty=0.5,
    top_p=0.5,
)

completion = openai.chat.completions.create(messages=messages, **request_options)

# Keep the text of the first (only) choice and show it
pros_and_cons = completion.choices[0].message.content

print(pros_and_cons)

**Top 10 Pros:**
1. Great location with easy access to the beach and town.
2. Immaculately clean rooms and public areas.
3. Courteous and attentive staff.
4. Beautiful views of the sea and beach.
5. Relaxing pool area with ample sunbeds.
6. Good selection of drinks and food.
7. Spacious rooms with modern amenities.
8. Buffet breakfast with plenty of choices.
9. Quiet and relaxing atmosphere.
10. Convenient parking facilities.

**Top 10 Cons:**
1. Lack of transparency or visibility of hotel management.
2. Delayed room cleaning services.
3. Limited gym facilities.
4. Inconsistent quality of coffee machines at breakfast.
5. Noisy environment due to road traffic or other disturbances.
6. Maintenance issues in rooms or common areas.
7. Lack of storage space in rooms.
8. Inconvenient or additional charges for amenities like safes.
9. Language barriers with staff for non-English speakers.
10. Issues with breakfast area overcrowding and poor customer service.

These top pros and cons are based

### Identifying factors and sentiment

In [21]:
def analyze_review_factors(review, model="gpt-3.5-turbo"):
    """Get the sentiment of a hotel review for each factor it mentions.

    The model is asked for a Python dictionary mapping factor names (Service
    Quality, Facilities, Value for Money, Rooms, Food and Recreation,
    Security) to POSITIVE, NEGATIVE or NEUTRAL.

    Parameters
    ----------
    review : str
        Text of the review to analyze.
    model : str, default "gpt-3.5-turbo"
        Name of the OpenAI chat model to query.

    Returns
    -------
    str
        The dictionary literal found in the model's reply, as text that
        `ast.literal_eval` can parse. It is "{}" when no attempt out of three
        produced a valid dictionary, so the return type is always a string.

    Notes
    -----
    Each raw reply is printed. The function sleeps 1 second after every
    rejected reply and 4 seconds before returning (rate-limit pacing).
    Factor names are not checked against the six requested ones.
    """
    retries = 3
    lst = ("POSITIVE", "NEGATIVE", "NEUTRAL")
    sentiment = "{}"    # fallback: an empty dictionary literal, same type as a success

    # The prompt does not change between attempts, so it is built once
    messages = [
        {"role": "system", "content": "You are an AI language model trained to analyze and detect the sentiment of hotel reviews in six factors: Service Quality; Facilities; Value for Money; Rooms; Food and Recreation; Security."},
        {"role": "user", "content": f"Analyze the following hotel review and determine which of the six factors are mentioned in the review and if the sentiment is: positive, negative or neutral. Return a Python dictionary where the key is the factor name and the value is a single word, either POSITIVE, NEGATIVE or NEUTRAL. The Python dictionary should only include keys for the factors found in the text. For Python strings use only single quotes. Text of the review: {review}"}
    ]

    while retries > 0:
        completion = openai.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=200,
            n=1,
            stop=None,
            temperature=0.5
        )

        response_text = completion.choices[0].message.content
        print(response_text)

        # Keep only the outermost {...} so that surrounding text or a Markdown
        # code fence around the dictionary does not break the parsing.
        reply = response_text or ""
        start, end = reply.find("{"), reply.rfind("}")
        if start != -1 and end > start:
            literal = reply[start:end + 1]
            try:
                parsed = ast.literal_eval(literal)
            except (ValueError, SyntaxError, TypeError):
                parsed = None
            # Accept the reply only if it really is a dictionary whose values
            # are all valid labels; merely containing a label word is not enough.
            if isinstance(parsed, dict) and all(value in lst for value in parsed.values()):
                sentiment = literal
                break

        retries -= 1
        time.sleep(1)

    # add a delay of 4 seconds between requests to avoid hitting the free tier API call rate limit
    time.sleep(4)

    return sentiment

In [22]:
# Run the per-factor analysis on every sampled review (one API call each)
factors_sentiments = [
    analyze_review_factors(review)
    for review in tqdm(stratified_sample["FullText"], desc="Processing reviews")
]

# Keep the answers next to the reviews as a new column
stratified_sample["Factors_Sentiment"] = factors_sentiments




Processing reviews:   0%|          | 0/50 [00:00<?, ?it/s]

{
    'Service Quality': 'NEGATIVE',
    'Facilities': 'NEGATIVE',
    'Value for Money': 'NEGATIVE',
    'Rooms': 'NEGATIVE',
    'Food and Recreation': 'NEGATIVE'
}


Processing reviews:   2%|▏         | 1/50 [00:04<03:51,  4.72s/it]

{
    'Facilities': 'POSITIVE',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews:   4%|▍         | 2/50 [00:09<03:42,  4.63s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:   6%|▌         | 3/50 [00:14<03:57,  5.05s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEGATIVE',
    'Rooms': 'NEGATIVE',
    'Food and Recreation': 'NEGATIVE'
}


Processing reviews:   8%|▊         | 4/50 [00:19<03:45,  4.91s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'NEUTRAL'
}


Processing reviews:  10%|█         | 5/50 [00:24<03:37,  4.84s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews:  12%|█▏        | 6/50 [00:28<03:30,  4.79s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'NEGATIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  14%|█▍        | 7/50 [00:33<03:27,  4.82s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'NEUTRAL',
    'Value for Money': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  16%|█▌        | 8/50 [00:38<03:22,  4.82s/it]

{
    'Service Quality': 'NEUTRAL',
    'Facilities': 'POSITIVE',
    'Value for Money': 'POSITIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  18%|█▊        | 9/50 [00:43<03:19,  4.86s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  20%|██        | 10/50 [00:48<03:14,  4.86s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE',
    'Value for Money': 'NEUTRAL'
}


Processing reviews:  22%|██▏       | 11/50 [00:53<03:09,  4.85s/it]

{
    'Service Quality': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'NEUTRAL',
    'Value for Money': 'NEUTRAL'
}


Processing reviews:  24%|██▍       | 12/50 [00:57<03:02,  4.80s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'NEUTRAL',
    'Security': 'POSITIVE'
}


Processing reviews:  26%|██▌       | 13/50 [01:02<02:57,  4.79s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Food and Recreation': 'NEGATIVE'
}


Processing reviews:  28%|██▊       | 14/50 [01:07<02:51,  4.75s/it]

{
    'Service Quality': 'NEGATIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEGATIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  30%|███       | 15/50 [01:12<02:51,  4.90s/it]

{
    'Service Quality': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews:  32%|███▏      | 16/50 [01:19<03:06,  5.48s/it]

{
    'Service Quality': 'POSITIVE',
    'Value for Money': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE',
    'Facilities': 'POSITIVE'
}


Processing reviews:  34%|███▍      | 17/50 [01:24<02:53,  5.25s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEGATIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'NEGATIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  36%|███▌      | 18/50 [01:29<02:44,  5.13s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'NEUTRAL',
    'Security': 'NEUTRAL'
}


Processing reviews:  38%|███▊      | 19/50 [01:34<02:41,  5.22s/it]

{
    'Value for Money': 'NEGATIVE',
    'Facilities': 'POSITIVE',
    'Food and Recreation': 'NEGATIVE',
    'Security': 'NEGATIVE'
}


Processing reviews:  40%|████      | 20/50 [01:39<02:31,  5.06s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  42%|████▏     | 21/50 [01:44<02:25,  5.01s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Rooms': 'NEGATIVE'
}


Processing reviews:  44%|████▍     | 22/50 [01:48<02:17,  4.92s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Recreation': 'NEGATIVE'
}


Processing reviews:  46%|████▌     | 23/50 [01:53<02:13,  4.93s/it]

{
    'Value for Money': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Service Quality': 'NEUTRAL'
}


Processing reviews:  48%|████▊     | 24/50 [01:58<02:06,  4.87s/it]

{
    'Service Quality': 'NEGATIVE',
    'Value for Money': 'NEGATIVE',
    'Rooms': 'NEGATIVE',
    'Facilities': 'NEGATIVE',
    'Food and Recreation': 'NEGATIVE'
}


Processing reviews:  50%|█████     | 25/50 [02:03<02:01,  4.88s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEGATIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'NEGATIVE'
}


Processing reviews:  52%|█████▏    | 26/50 [02:08<01:57,  4.88s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'NEUTRAL'
}


Processing reviews:  54%|█████▍    | 27/50 [02:12<01:51,  4.85s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Food and Recreation': 'POSITIVE',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Security': 'NEUTRAL'
}


Processing reviews:  56%|█████▌    | 28/50 [02:17<01:46,  4.85s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEGATIVE',
    'Rooms': 'NEGATIVE'
}


Processing reviews:  58%|█████▊    | 29/50 [02:22<01:41,  4.83s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  60%|██████    | 30/50 [02:27<01:36,  4.84s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews:  62%|██████▏   | 31/50 [02:32<01:30,  4.79s/it]

{
    'Service Quality': 'NEGATIVE'
}


Processing reviews:  64%|██████▍   | 32/50 [02:36<01:24,  4.70s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Value for Money': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews:  66%|██████▌   | 33/50 [02:41<01:21,  4.77s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEUTRAL',
    'Rooms': 'NEGATIVE'
}


Processing reviews:  68%|██████▊   | 34/50 [02:46<01:15,  4.72s/it]

{
    'Service Quality': 'POSITIVE'
}


Processing reviews:  70%|███████   | 35/50 [02:51<01:11,  4.76s/it]

{
    'Service Quality': 'NEGATIVE',
    'Facilities': 'NEUTRAL',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  72%|███████▏  | 36/50 [02:55<01:06,  4.78s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEGATIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Value for Money': 'NEUTRAL'
}


Processing reviews:  74%|███████▍  | 37/50 [03:00<01:02,  4.80s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews:  76%|███████▌  | 38/50 [03:05<00:57,  4.77s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE'
}


Processing reviews:  78%|███████▊  | 39/50 [03:10<00:52,  4.73s/it]

{
    'Service Quality': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE',
    'Value for Money': 'NEGATIVE'
}


Processing reviews:  80%|████████  | 40/50 [03:14<00:47,  4.71s/it]

{
    'Service Quality': 'NEUTRAL',
    'Facilities': 'POSITIVE',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  82%|████████▏ | 41/50 [03:19<00:42,  4.77s/it]

{
    'Service Quality': 'NEGATIVE',
    'Facilities': 'POSITIVE',
    'Value for Money': 'NEGATIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'NEUTRAL',
    'Security': 'NEUTRAL'
}


Processing reviews:  84%|████████▍ | 42/50 [03:24<00:38,  4.82s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Value for Money': 'POSITIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  86%|████████▌ | 43/50 [03:29<00:34,  4.87s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE'
}


Processing reviews:  88%|████████▊ | 44/50 [03:34<00:29,  4.90s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Value for Money': 'POSITIVE',
    'Rooms': 'POSITIVE'
}


Processing reviews:  90%|█████████ | 45/50 [03:39<00:24,  4.89s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews:  92%|█████████▏| 46/50 [03:44<00:19,  4.85s/it]

{
    'Service Quality': 'POSITIVE',
    'Rooms': 'POSITIVE'
}


Processing reviews:  94%|█████████▍| 47/50 [03:48<00:14,  4.80s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Value for Money': 'POSITIVE',
    'Rooms': 'NEUTRAL',
    'Food and Recreation': 'POSITIVE',
    'Security': 'NEUTRAL'
}


Processing reviews:  96%|█████████▌| 48/50 [03:53<00:09,  4.83s/it]

{
    'Service Quality': 'NEUTRAL',
    'Facilities': 'NEGATIVE',
    'Value for Money': 'NEUTRAL',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'NEUTRAL',
    'Security': 'NEUTRAL'
}


Processing reviews:  98%|█████████▊| 49/50 [03:58<00:04,  4.86s/it]

{
    'Service Quality': 'POSITIVE',
    'Facilities': 'POSITIVE',
    'Rooms': 'POSITIVE',
    'Food and Recreation': 'POSITIVE'
}


Processing reviews: 100%|██████████| 50/50 [04:03<00:00,  4.87s/it]


In [23]:
# Show the review at position 7, followed by the factor sentiments stored for it
i = 7
print(
    stratified_sample['FullText'].iloc[i],
    stratified_sample['Factors_Sentiment'].iloc[i],
    sep="\n",
)

Stayed at the Baia Grande with my 5 year old daughter during the May half term holidays. Check-in and out times did not work very well with our flights - but it was easy enough to kill time in the locality. Room was super - lovely bed(s) with everything you needed in the bathroom, large TV, air conditioning and loads of hanging space. The pool area never got particularly busy and the pool itself was beautiful - running from very shallow at one end to cold and deep at the other. Pool bar prices were higher than nearby alternatives, but surely that is to be expected?! We found the staff to be friendly and helpful. The breakfast really did have something for everyone. Sure, the juices and coffee were not the best - but clearly the hotel catering team have a budget to work with! The hotel is not anywhere near Albufeira, but there is enough nearby to ensure you can eat well and keep yourself entertained. The 'Rabbit beach' is only an easy, short walk away. In summary this is a great choice 

In [24]:
# Show the review at position 3, followed by the factor sentiments stored for it
i = 3
print(
    stratified_sample['FullText'].iloc[i],
    stratified_sample['Factors_Sentiment'].iloc[i],
    sep="\n",
)

The place was recommended to us a high quality family resort but we were very disappointed by the quality of housing, food and cleanliness overall. <br>The staff was very helpful and friendly which was the best part and helped us find outside activities which made the vacation bearable.
{
    'Service Quality': 'POSITIVE',
    'Facilities': 'NEGATIVE',
    'Rooms': 'NEGATIVE',
    'Food and Recreation': 'NEGATIVE'
}


In [25]:
# Add these factors to the dataset
# Every entry of factors_sentiments is turned into a dictionary. An entry can
# be the text of a dictionary literal or already a dictionary (the empty
# fallback of analyze_review_factors); text that cannot be parsed becomes an
# empty dictionary, so one bad model answer does not stop the whole cell.
list_of_dicts = []
unparsed_positions = []
for position, raw in enumerate(factors_sentiments):
    if isinstance(raw, dict):
        parsed = raw
    else:
        try:
            parsed = ast.literal_eval(str(raw).replace("\n", ""))
        except (ValueError, SyntaxError, TypeError):
            parsed = None
    if not isinstance(parsed, dict):
        unparsed_positions.append(position)
        parsed = {}
    list_of_dicts.append(parsed)

# Make skipped answers visible instead of dropping them silently
if unparsed_positions:
    print(f"Could not parse the factors of {len(unparsed_positions)} review(s), positions: {unparsed_positions}")

# Turn the list of dictionaries into a DataFrame (one column per factor name found)
new_rows_df = pd.DataFrame(list_of_dicts)

# Concatenate the new DataFrame with the existing one, side by side; both use
# a 0..n-1 index so the rows line up by position
df = pd.concat([stratified_sample.reset_index(drop=True), new_rows_df], axis=1)


In [26]:
# View the header of the resulting dataframe (first five rows, with the per-factor columns at the end)
df.head()

Unnamed: 0,HotelID,Language,GlobalRating,TotalReviewOnSite,ExtractedReviews,PositionInRegionRanking,SitesInRegionRanking,TripadvisorReviewID,ReviewRating,PublishedDate,...,FullText,Sentiment,Factors_Sentiment,Service Quality,Facilities,Value for Money,Rooms,Food and Recreation,Security,Recreation
0,9,English,4.0,3635,535,50,154,715210414,2,2019-10-01 00:00:00.000,...,"This is in no way a 4 star hotel ,you have to ...",NEGATIVE,"{\n 'Service Quality': 'NEGATIVE',\n 'Fa...",NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,
1,48,English,4.5,1147,740,10,154,693473393,4,2019-07-01 00:00:00.000,...,We've just returned from a fab time at this ho...,POSITIVE,"{\n 'Facilities': 'POSITIVE',\n 'Food an...",,POSITIVE,,,POSITIVE,,
2,100,English,4.5,389,30,14,154,692083878,5,2019-08-01 00:00:00.000,...,A warm welcome awaited us on our arrival and w...,POSITIVE,"{\n 'Service Quality': 'POSITIVE',\n 'Fa...",POSITIVE,POSITIVE,NEUTRAL,POSITIVE,POSITIVE,NEUTRAL,
3,23,English,4.5,1925,200,2,10,683615537,1,2019-06-01 00:00:00.000,...,The place was recommended to us a high quality...,NEGATIVE,"{\n 'Service Quality': 'POSITIVE',\n 'Fa...",POSITIVE,NEGATIVE,,NEGATIVE,NEGATIVE,,
4,71,English,4.0,757,85,2,14,665491731,3,2019-04-01 00:00:00.000,...,We booked this hotel with Jet2 holidays for a ...,NEUTRAL,"{\n 'Service Quality': 'POSITIVE',\n 'Fa...",POSITIVE,NEUTRAL,,NEUTRAL,NEUTRAL,,


### Moderation
Check if reviews contain violence, hate, or discrimination

In [27]:
def analyze_review_moderation(review, delay_seconds=4):
    """Moderate a single review with the OpenAI moderation endpoint.

    Parameters
    ----------
    review : str
        Text of the review to check.
    delay_seconds : int or float, optional
        Pause applied after the request. The default of 4 seconds keeps the
        notebook below the free tier API call rate limit; pass 0 to disable
        the pause (e.g., on a paid tier).

    Returns
    -------
    The first entry of ``response.results``: the moderation result for the
    review, exposing ``flagged``, ``categories`` and ``category_scores``.

    Raises
    ------
    Any exception raised by ``openai.moderations.create`` is propagated
    unchanged to the caller.
    """

    response = openai.moderations.create(input=review)

    # A single input was sent, so only the first result is relevant
    output = response.results[0]

    # Throttle consecutive calls; non-positive values mean "no pause"
    # (time.sleep raises ValueError on negative durations)
    if delay_seconds > 0:
        time.sleep(delay_seconds)

    return output

In [28]:
# Run every sampled review through moderation, keeping the results in review order
moderation_output = []
reviews_to_check = stratified_sample["FullText"]

for review_text in tqdm(reviews_to_check, desc="Processing reviews"):
    # Results are appended one at a time, so those already obtained
    # remain available if a later request fails
    moderation_output.append(analyze_review_moderation(review_text))

Processing reviews: 100%|██████████| 50/50 [04:04<00:00,  4.89s/it]


In [29]:
# Show the full moderation output of the first review (per-category flags and scores)
moderation_output[0]

Moderation(categories=Categories(harassment=False, harassment_threatening=False, hate=False, hate_threatening=False, illicit=None, illicit_violent=None, self_harm=False, self_harm_instructions=False, self_harm_intent=False, sexual=False, sexual_minors=False, violence=False, violence_graphic=False, self-harm=False, sexual/minors=False, hate/threatening=False, violence/graphic=False, self-harm/intent=False, self-harm/instructions=False, harassment/threatening=False), category_applied_input_types=None, category_scores=CategoryScores(harassment=0.026297861710190773, harassment_threatening=0.0001697603438515216, hate=0.00417799036949873, hate_threatening=2.3985580810403917e-06, illicit=None, illicit_violent=None, self_harm=1.801067810447421e-06, self_harm_instructions=1.2977936421521008e-05, self_harm_intent=7.901753633632325e-07, sexual=4.79954587717657e-06, sexual_minors=1.4816654356764047e-06, violence=0.00017603016749490052, violence_graphic=0.0001223611325258389, self-harm=1.8010678104

In [30]:
# Check only the overall `flagged` result of the first review
moderation_output[0].flagged

False

## References

- How to get around OpenAI GPT token limits: https://blog.devgenius.io/how-to-get-around-openai-gpt-3-token-limits-b11583691b32
- How Transformers work: https://towardsdatascience.com/illustrated-guide-to-transformers-step-by-step-explanation-f74876522bc0
- Examples of reviews analysis: https://blog.startupstash.com/analyze-customer-product-reviews-using-chatgpt-openai-api-a-step-by-step-guide-to-extracting-5cb599608c8d#829f-1370f316f4aa and https://medium.com/data-and-beyond/sentiment-analysis-with-chatgpt-openai-and-python-use-chatgpt-to-build-a-sentiment-analysis-ai-2b89158a37f6
- Topic modeling using BERTopic: https://towardsdatascience.com/topic-modelling-using-chatgpt-api-8775b0891d16



