In [1]:
import re
from collections import defaultdict
import pandas as pd
import spacy
import en_core_web_sm

In [2]:
# Topic lexicon. Each topic has multi-word 'phrases' (strong evidence) and
# single 'keywords' (weak evidence). Dict order matters: when two topics tie
# on score, the one listed first wins.
_TOPIC_PATTERNS = {
  'Facility': {
    'phrases': ['new apartments', 'comfortable apartments', 'modern rooms'],
    'keywords': ['apartment', 'room', 'suite', 'building', 'property', 'new', 'modern']
  },
  'Location': {
    'phrases': ['close to airport', 'near beach', 'walking distance', 'city center'],
    'keywords': ['close', 'near', 'airport', 'beach', 'location', 'distance', 'center']
  },
  'Staff': {
    'phrases': ['helpful staff', 'friendly service', 'easy to communicate'],
    'keywords': ['staff', 'service', 'helpful', 'friendly', 'communicate', 'professional']
  },
  'Food': {
    'phrases': ['tasty food', 'delicious breakfast', 'comfortable restaurant'],
    'keywords': ['food', 'restaurant', 'tasty', 'delicious', 'dining', 'meal', 'breakfast']
  },
  'Room': {
    'phrases': ['clean room', 'comfortable bed', 'spacious bathroom'],
    'keywords': ['room', 'bed', 'bathroom', 'clean', 'comfortable', 'spacious']
  },
  'Comfort': {
    'phrases': ['cozy evening', 'calm work', 'comfortable stay'],
    'keywords': ['comfortable', 'cozy', 'calm', 'peaceful', 'relaxing']
  },
  'Facilities': {
    'phrases': ['first floor', 'ground floor', 'escape the heat'],
    'keywords': ['floor', 'floors', 'facilities', 'amenities']
  }
}

# Score contributed by each phrase / keyword hit.
_PHRASE_WEIGHT = 5
_KEYWORD_WEIGHT = 1

# Lazily-built callable mapping text -> list of sentence strings.
# Stays None until the first call; re-running this cell resets it.
_sentence_splitter = None


def _regex_sentences(text):
  """Fallback splitter: break after '.', '!' or '?' when followed by whitespace."""
  return re.split(r'(?<=[.!?])\s+', text)


def _get_sentence_splitter():
  """Return the cached sentence splitter, building it on first use.

  Prefers spaCy's "en_core_web_sm" pipeline. If spaCy cannot be imported or
  the model cannot be loaded, prints "spaCy not available" once and uses the
  regex splitter instead, so callers keep working.
  """
  global _sentence_splitter
  if _sentence_splitter is None:
    try:
      import spacy
      nlp = spacy.load("en_core_web_sm")
    except (ImportError, OSError):
      # ImportError: spaCy itself is missing.
      # OSError: spaCy is installed but the model package is not.
      print("spaCy not available")
      _sentence_splitter = _regex_sentences
    else:
      _sentence_splitter = lambda text: [sent.text for sent in nlp(text).sents]
  return _sentence_splitter


def _score_segment(segment_lower):
  """Return {topic: score} for one lower-cased segment.

  Matching is plain substring containment, so 'apartment' also hits
  'apartments' -- and 'room' also hits 'bathroom'.
  """
  scores = {}
  for topic, patterns in _TOPIC_PATTERNS.items():
    phrase_hits = sum(1 for phrase in patterns['phrases'] if phrase in segment_lower)
    keyword_hits = sum(1 for keyword in patterns['keywords'] if keyword in segment_lower)
    scores[topic] = _PHRASE_WEIGHT * phrase_hits + _KEYWORD_WEIGHT * keyword_hits
  return scores


def extract_topics(sentence):
  """Group the comma-separated fragments of a review by their best-matching topic.

  The text is split into sentences, each sentence is split on commas, and
  every fragment is scored against the topic lexicon (phrase hit = 5,
  keyword hit = 1). A fragment is assigned to its highest-scoring topic;
  fragments that score 0 everywhere are dropped.

  Args:
    sentence: Review text. May contain several sentences.

  Returns:
    dict mapping topic name -> the fragments assigned to it, joined with
    ', ' in order of appearance. Empty dict when the input is not a string,
    is blank, or nothing matches.
  """
  # Guard against blank or non-string values such as None or NaN.
  if not isinstance(sentence, str) or not sentence.strip():
    return {}

  split_sentences = _get_sentence_splitter()
  result = {}

  for sent_text in split_sentences(sentence):
    sent_text = sent_text.strip()
    if not sent_text:
      continue

    # Commas separate the individual claims inside a sentence.
    segments = [seg.strip() for seg in sent_text.split(',') if seg.strip()]

    for segment in segments:
      topic_scores = _score_segment(segment.lower())

      # max() returns the first topic in lexicon order when scores tie.
      best_topic = max(topic_scores, key=topic_scores.get)
      if topic_scores[best_topic] > 0:
        result.setdefault(best_topic, []).append(segment)

  # Combine segments for the same topic into one string.
  return {topic: ', '.join(segments) for topic, segments in result.items()}

In [3]:
def process_hotel_review(sentence):
  """Print a review next to its extracted topics and return the topic mapping.

  Output layout: the original text, a header, one line per topic in the form
  "  '<topic>': '<text>'", then a blank line and a 100-character '=' rule.

  Args:
    sentence: Review text, passed unchanged to extract_topics.

  Returns:
    The dict returned by extract_topics(sentence).
  """
  # Header goes out before extraction so any message printed by
  # extract_topics appears below it.
  print(f"Original:\n  '{sentence}'\n\nExtracted Topic Relevant Text:")

  topics_nlp = extract_topics(sentence)

  report_lines = [f"  '{topic}': '{text}'" for topic, text in topics_nlp.items()]
  report_lines.append("\n" + "="*100)
  print(*report_lines, sep="\n")

  return topics_nlp

In [4]:
# Sample review touching several topics at once; used as a smoke test.
example = (
  "New, comfortable apartments, close to the airport, to very clean beach. "
  "Staff is extremely helpful and easy to communicate with. "
  "Tasty food on the first floor, comfortable restaurant for both cozy evenings "
  "and calm work to escape the heat in the midday"
)

# Left as the last expression so the notebook also displays the returned dict.
process_hotel_review(example)


Original:
  'New, comfortable apartments, close to the airport, to very clean beach. Staff is extremely helpful and easy to communicate with. Tasty food on the first floor, comfortable restaurant for both cozy evenings and calm work to escape the heat in the midday'

Extracted Topic Relevant Text:
  'Facility': 'New, comfortable apartments'
  'Location': 'close to the airport, to very clean beach.'
  'Staff': 'Staff is extremely helpful and easy to communicate with.'
  'Food': 'Tasty food on the first floor'
  'Comfort': 'comfortable restaurant for both cozy evenings and calm work to escape the heat in the midday'



{'Facility': 'New, comfortable apartments',
 'Location': 'close to the airport, to very clean beach.',
 'Staff': 'Staff is extremely helpful and easy to communicate with.',
 'Food': 'Tasty food on the first floor',
 'Comfort': 'comfortable restaurant for both cozy evenings and calm work to escape the heat in the midday'}

In [5]:
# Source data and how many reviews to preview; change these to re-run on other input.
REVIEWS_CSV = "Original ABA Dataset for Version 3 [May 30] - 1. hotel in Larnaca-Cyprus - Topic.csv"
N_PREVIEW = 10

df = pd.read_csv(REVIEWS_CSV)

# Removes duplicates and NaN
positive_reviews = df['PositiveReview'].drop_duplicates().dropna()
print(f"Found {len(positive_reviews)} positive reviews.\n")

# head() caps the preview without a manual counter/break.
for i, review in enumerate(positive_reviews.head(N_PREVIEW), start=1):
  print(f"Review No.{i}")
  process_hotel_review(review)

Found 768 positive reviews.

Review No.1
Original:
  'New, comfortable apartments, close to the airport, to very clean beach.
 Staff is extremely helpful and easy to communicate with 
 Tasty food on the first floor, comfortable restaurant for both cozy evenings and calm work to escape the heat in the midday'

Extracted Topic Relevant Text:
  'Facility': 'New, comfortable apartments'
  'Location': 'close to the airport, to very clean beach.'
  'Staff': 'Staff is extremely helpful and easy to communicate with 
 Tasty food on the first floor'
  'Comfort': 'comfortable restaurant for both cozy evenings and calm work to escape the heat in the midday'

Review No.2
Original:
  'We had a really pleasant stay! The staff was very nice and helpful! The room was very clean, well decorated and modern, although not big. Also, the breakfast was amazing, fresh and handmade! Especially vegetarian version with humus. The cook is great, and all staff was very kind and helpful. I like hotels with a family