In [1]:
import pandas as pd

# Read the review spreadsheet into a DataFrame.
df = pd.read_excel("final.xlsx")

# Report how many rows were loaded and which columns are present.
print(f"Number of rows: {len(df)}")
print(f"\nColumn names: {df.columns.tolist()}")

# Show the first ten rows as the cell's rich output.
df.head(n=10)


Number of rows: 1096

Column names: ['Text', 'Rating', 'Place name']


Unnamed: 0,Text,Rating,Place name
0,Should have read the reviews before wasting ti...,1.0,Kakadu National Park
1,Do not purchase a park pass before going to a ...,1.0,Kakadu National Park
2,Very disappointing.. over hyped.. most attract...,1.0,Kakadu National Park
3,I didn't see any kakadus,1.0,Kakadu National Park
4,"Lots of kaka, but nothing to du.",1.0,Kakadu National Park
5,Only Jim Jim Falls open way to far and dangero...,1.0,Kakadu National Park
6,Disappointed with my experience considering Ka...,1.0,Kakadu National Park
7,Disappointing vs. expectations and with unprof...,1.0,Kakadu National Park
8,Kakadont.\nExpect you pay for an expensive par...,1.0,Kakadu National Park
9,"We will never visit here again, info centre wa...",1.0,Kakadu National Park


In [2]:
import re

def clean_text(text):
    """Normalise a raw review into plain ASCII text.

    Steps, in order: drop code points that cannot be UTF-8 encoded, remove
    URLs, @mentions and #hashtags, map typographic apostrophes to ``'``,
    fold accented letters to their base letter, delete every character
    outside ``A-Za-z0-9.,!?'`` and whitespace, then collapse whitespace runs.

    Args:
        text: The raw review text. Any non-``str`` value (e.g. NaN) is
            treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    import unicodedata  # local import so this cell does not depend on an edit elsewhere

    if not isinstance(text, str):
        return ""
    # Drop anything that cannot be UTF-8 encoded (e.g. lone surrogates).
    text = text.encode('utf-8', 'ignore').decode('utf-8')
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text)             # remove URLs
    text = re.sub(r"@\w+|#\w+", "", text)                  # remove @mentions, #hashtags
    # Curly apostrophes would be deleted by the ASCII filter below, corrupting
    # contractions ("weren’t" -> "werent"); map them to the ASCII apostrophe.
    text = text.translate({0x2018: "'", 0x2019: "'", 0x02BC: "'"})
    # Canonical decomposition splits accented letters into base letter +
    # combining mark, so the filter strips only the mark ("Uluṟu" -> "Uluru").
    text = unicodedata.normalize("NFD", text)
    text = re.sub(r"[^A-Za-z0-9.,!?'\s]", "", text)        # remove emojis/symbols but keep punctuation
    text = re.sub(r"\s+", " ", text).strip()               # collapse whitespace runs
    return text

# Store a cleaned copy of every review next to the raw text.
df["Cleaned_Text"] = df["Text"].map(clean_text)

# Show raw and cleaned text side by side for the first ten reviews.
df.loc[:, ["Text", "Cleaned_Text"]].head(10)


Unnamed: 0,Text,Cleaned_Text
0,Should have read the reviews before wasting ti...,Should have read the reviews before wasting ti...
1,Do not purchase a park pass before going to a ...,Do not purchase a park pass before going to a ...
2,Very disappointing.. over hyped.. most attract...,Very disappointing.. over hyped.. most attract...
3,I didn't see any kakadus,I didn't see any kakadus
4,"Lots of kaka, but nothing to du.","Lots of kaka, but nothing to du."
5,Only Jim Jim Falls open way to far and dangero...,Only Jim Jim Falls open way to far and dangero...
6,Disappointed with my experience considering Ka...,Disappointed with my experience considering Ka...
7,Disappointing vs. expectations and with unprof...,Disappointing vs. expectations and with unprof...
8,Kakadont.\nExpect you pay for an expensive par...,Kakadont. Expect you pay for an expensive park...
9,"We will never visit here again, info centre wa...","We will never visit here again, info centre wa..."


In [3]:
# Alternative sample: one review per place, for a small but diverse set.
# It has its own name because it was previously assigned to `sample_df` and
# then immediately overwritten, so it never had any effect. It is drawn with
# a fixed seed via GroupBy.sample instead of groupby.apply, which triggered a
# warning in the recorded output.
one_per_place_df = (
    df.groupby("Place name")
      .sample(n=1, random_state=42)
      .reset_index(drop=True)
)

# Working sample used by the following cells: 10 random reviews across all
# places (fewer if the dataset is smaller, rather than raising ValueError).
sample_df = df.sample(min(10, len(df)), random_state=42)

# View the sample
sample_df[["Place name", "Rating", "Cleaned_Text"]]


  .apply(lambda x: x.sample(min(len(x), 1)))  # 1 review per place if available


Unnamed: 0,Place name,Rating,Cleaned_Text
44,Mindil Beach Casino Resort,1.0,Visited this casino with my friend. Got refuse...
568,Litchfield National Park,5.0,Amazing National Park and so close to Darwin!
56,Mindil Beach Casino Resort,1.0,If the kitchen took less than the 40 minutes w...
636,Litchfield National Park,5.0,What a stunning part of the world.
486,Uluṟu-Kata Tjuṯa National Park,5.0,One of the most iconic places in Australia and...
96,Kakadu National Park,2.0,Spent there 4 days in November 2023. not much ...
761,Nitmiluk (Katherine) Gorge,5.0,So peaceful and quiet cruise was so good
51,Mindil Beach Casino Resort,1.0,Definitely not worth it for the price you pay....
107,Cooinda Lodge Kakadu,2.0,Rooms are very basic and overpriced. Daily hou...
666,Litchfield National Park,5.0,Amazing place to visit


In [4]:
import nltk
nltk.download("punkt")

from nltk.tokenize import sent_tokenize

# Build one record per sentence, carrying over the place and rating of the
# review the sentence came from.
sentences_data = [
    {
        "Place name": place,
        "Rating": rating,
        "Sentence": sentence.strip(),
    }
    for place, rating, review in zip(
        sample_df["Place name"], sample_df["Rating"], sample_df["Cleaned_Text"]
    )
    for sentence in sent_tokenize(review)
]

sent_df = pd.DataFrame(sentences_data)

# Report the sentence count and show the first rows.
print(f"Total sentences: {len(sent_df)}")
sent_df.head(15)


Total sentences: 34


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Place name,Rating,Sentence
0,Mindil Beach Casino Resort,1.0,Visited this casino with my friend.
1,Mindil Beach Casino Resort,1.0,Got refused to enter.
2,Mindil Beach Casino Resort,1.0,Reason My shirt was too wet.
3,Mindil Beach Casino Resort,1.0,Asked the staff what I'm supposed to do if it'...
4,Mindil Beach Casino Resort,1.0,Got told it's my own fault If i do not have my...
5,Litchfield National Park,5.0,Amazing National Park and so close to Darwin!
6,Mindil Beach Casino Resort,1.0,If the kitchen took less than the 40 minutes w...
7,Mindil Beach Casino Resort,1.0,After kicking the roach away pic attached of h...
8,Mindil Beach Casino Resort,1.0,The meals werent cheap so I expect better hygi...
9,Litchfield National Park,5.0,What a stunning part of the world.


In [5]:
from transformers import pipeline

# Load a pre-trained sentiment analysis model.
# The checkpoint and revision are pinned to the ones the unpinned call
# resolved to (see the recorded warning), so results stay reproducible if the
# library's default model changes.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Classify every sentence in one batched call instead of one pipeline call
# per row. truncation=True keeps over-long sentences from exceeding the
# model's maximum input length.
sentences = sent_df["Sentence"].tolist()
predictions = sentiment_analyzer(sentences, truncation=True) if sentences else []
sent_df["Sentiment"] = [prediction["label"] for prediction in predictions]

# Preview results
sent_df.head(20)


  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


Unnamed: 0,Place name,Rating,Sentence,Sentiment
0,Mindil Beach Casino Resort,1.0,Visited this casino with my friend.,POSITIVE
1,Mindil Beach Casino Resort,1.0,Got refused to enter.,NEGATIVE
2,Mindil Beach Casino Resort,1.0,Reason My shirt was too wet.,NEGATIVE
3,Mindil Beach Casino Resort,1.0,Asked the staff what I'm supposed to do if it'...,NEGATIVE
4,Mindil Beach Casino Resort,1.0,Got told it's my own fault If i do not have my...,NEGATIVE
5,Litchfield National Park,5.0,Amazing National Park and so close to Darwin!,POSITIVE
6,Mindil Beach Casino Resort,1.0,If the kitchen took less than the 40 minutes w...,NEGATIVE
7,Mindil Beach Casino Resort,1.0,After kicking the roach away pic attached of h...,NEGATIVE
8,Mindil Beach Casino Resort,1.0,The meals werent cheap so I expect better hygi...,NEGATIVE
9,Litchfield National Park,5.0,What a stunning part of the world.,POSITIVE


In [13]:
from transformers import pipeline
import pandas as pd

# 1️⃣ Define the tourism-related categories
ASPECTS = [
    # Candidate labels offered to the zero-shot classifier, one per line.
    "service",
    "staff",
    "food",
    "price",
    "facilities",
    "cleanliness",
    "accessibility",
    "experience",
    "safety",
    "value",
]

# 2️⃣ Load zero-shot classification model
aspect_clf = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# 3️⃣ Helper function — allows multiple aspect labels per sentence
def detect_multiple_aspects(sentence, threshold=0.4):
    """Label a sentence with every aspect whose score reaches ``threshold``.

    Args:
        sentence: Text to classify. Non-string or blank input yields two
            empty lists without calling the classifier.
        threshold: Minimum classifier score for an aspect to be kept.

    Returns:
        A two-element ``pd.Series``: the kept aspect labels, and their
        scores rounded to two decimals, in the classifier's output order.
    """
    is_blank = not isinstance(sentence, str) or not sentence.strip()
    if is_blank:
        return pd.Series([[], []])

    prediction = aspect_clf(sentence, candidate_labels=ASPECTS, multi_label=True)

    # Filter once on the raw score, then split into parallel lists.
    kept = [
        (label, score)
        for label, score in zip(prediction["labels"], prediction["scores"])
        if score >= threshold
    ]
    aspects = [label for label, _ in kept]
    confidences = [round(score, 2) for _, score in kept]

    return pd.Series([aspects, confidences])

# 4️⃣ Apply to a small sample first (to verify)
# Cap the preview size so a short `sent_df` (fewer than 10 sentences) does not
# make DataFrame.sample raise ValueError.
n_preview = min(10, len(sent_df))

# NOTE: this rebinds `sample_df` from the review-level sample to a
# sentence-level one. The sentence-splitting cell reads
# sample_df["Cleaned_Text"], so re-run the notebook from the top rather than
# re-running that cell after this one.
sample_df = sent_df.sample(n_preview, random_state=42).copy()
sample_df[["Predicted_Aspects", "Confidence_Scores"]] = sample_df["Sentence"].apply(
    lambda x: detect_multiple_aspects(x, threshold=0.4)
)

# 5️⃣ Preview the result
print("✅ Multi-aspect classification preview:")
display(sample_df[["Place name", "Sentence", "Sentiment", "Predicted_Aspects", "Confidence_Scores"]])


Device set to use cpu


✅ Multi-aspect classification preview:


Unnamed: 0,Place name,Sentence,Sentiment,Predicted_Aspects,Confidence_Scores
15,Kakadu National Park,The service in the Cooinda lodge was rather po...,NEGATIVE,"[service, price, facilities, experience, value...","[1.0, 0.97, 0.94, 0.61, 0.54, 0.43]"
19,Mindil Beach Casino Resort,Definitely not worth it for the price you pay.,NEGATIVE,"[price, facilities, service, staff]","[0.99, 0.82, 0.71, 0.57]"
27,Mindil Beach Casino Resort,The cleaners came in daily but never changed t...,NEGATIVE,"[facilities, service, staff, experience, acces...","[0.91, 0.89, 0.77, 0.73, 0.62, 0.6, 0.6]"
26,Mindil Beach Casino Resort,"The service was average, took a long time to c...",NEGATIVE,"[service, staff, facilities, accessibility, ex...","[0.99, 0.99, 0.92, 0.84, 0.7]"
8,Mindil Beach Casino Resort,The meals werent cheap so I expect better hygi...,NEGATIVE,"[food, facilities, service, cleanliness, value...","[0.94, 0.82, 0.79, 0.72, 0.57, 0.57, 0.43]"
24,Mindil Beach Casino Resort,The rooms are very outdated and do not even ha...,NEGATIVE,"[facilities, price, accessibility, service, va...","[0.98, 0.79, 0.69, 0.55, 0.4]"
21,Mindil Beach Casino Resort,There was mould all through out the room.,NEGATIVE,[facilities],[0.8]
12,Uluṟu-Kata Tjuṯa National Park,From every corner of this rock you can admire ...,POSITIVE,"[value, accessibility, experience]","[0.7, 0.65, 0.64]"
32,Cooinda Lodge Kakadu,It is not sufficient to just empty the trash a...,NEGATIVE,"[facilities, cleanliness, service, value, acce...","[0.97, 0.91, 0.73, 0.7, 0.63, 0.52, 0.4]"
9,Litchfield National Park,What a stunning part of the world.,POSITIVE,"[experience, value]","[0.72, 0.42]"


In [16]:
# Add a column holding only the aspects predicted with high confidence.
def _confident_aspects(row):
    """Return the row's predicted aspects whose stored score is at least 0.6."""
    scored_aspects = zip(row["Predicted_Aspects"], row["Confidence_Scores"])
    return [aspect for aspect, score in scored_aspects if score >= 0.6]

sample_df["Filtered_Aspects"] = sample_df.apply(_confident_aspects, axis=1)


In [17]:
from collections import Counter

def get_aspect_counts(df, sentiment_type):
    """Count how often each filtered aspect occurs for one sentiment label.

    Args:
        df: Frame with a ``"Sentiment"`` column and a ``"Filtered_Aspects"``
            column whose cells are lists of aspect labels.
        sentiment_type: Sentiment label to select, e.g. ``"NEGATIVE"``.

    Returns:
        A ``Counter`` mapping aspect label to its number of occurrences;
        empty when no row has the requested sentiment.
    """
    matching = df.loc[df["Sentiment"] == sentiment_type, "Filtered_Aspects"]
    # Flatten row by row instead of using Series.sum(): summing an empty
    # selection yields the integer 0, which Counter cannot iterate, and
    # summing lists re-copies the accumulated list for every row.
    return Counter(aspect for aspects in matching for aspect in aspects)

# Tally the filtered aspects separately for each sentiment label.
neg_counts, pos_counts = (
    get_aspect_counts(sample_df, label) for label in ("NEGATIVE", "POSITIVE")
)

# Report the five most frequent aspects per sentiment.
print(f"Top Negative Aspects: {neg_counts.most_common(5)}")
print(f"Top Positive Aspects: {pos_counts.most_common(5)}")


Top Negative Aspects: [('facilities', 4), ('accessibility', 4), ('service', 4), ('value', 3), ('experience', 2)]
Top Positive Aspects: [('experience', 2), ('value', 2), ('facilities', 2), ('accessibility', 1)]
