In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('final_augmented_cleaned.csv')

In [None]:
df.shape

(10196, 12)

In [None]:
df.columns

Index(['id', 'title', 'rating', 'text', 'user/userLocation/shortName',
       'placeInfo/id', 'placeInfo/name', 'placeInfo/rating',
       'ownerResponse/text', 'reply_length', 'review_length',
       'review_sentiment'],
      dtype='object')

In [None]:
df.drop(columns=['reply_length', 'review_length', 'review_sentiment'], inplace=True)

In [None]:
df.shape

(10196, 9)

In [None]:
df.columns

Index(['id', 'title', 'rating', 'text', 'user/userLocation/shortName',
       'placeInfo/id', 'placeInfo/name', 'placeInfo/rating',
       'ownerResponse/text'],
      dtype='object')

In [None]:
df['id'] = df.index

In [None]:
df.shape

(10196, 9)

In [None]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=5d44e93a0ced9be91cd2e8789efc62e790a8b160e1f15728c94f20152927ed95
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
from langdetect import detect

def get_lang(text):
    """Return the language code langdetect reports for ``text``.

    Parameters
    ----------
    text : any
        Value to classify; normally a review or reply string.

    Returns
    -------
    str
        The code returned by ``detect`` (for example ``'en'``), or
        ``"unknown"`` when ``text`` is not a non-blank string or when
        detection fails.
    """
    # Missing values (None/NaN) and blank strings cannot be classified, so
    # answer directly instead of relying on the detector to raise.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # NOTE(review): langdetect is documented as non-deterministic unless
    # `DetectorFactory.seed` is set; confirm whether seeding is wanted so the
    # language filter is reproducible between runs.
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort: any detector failure maps to "unknown".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long `.apply` can still be interrupted.
        return "unknown"

df['review_lang'] = df['text'].apply(get_lang)
df['reply_lang'] = df['ownerResponse/text'].apply(get_lang)

In [None]:
# Turn the detected language codes into readable english / not_english labels.
lang_flag = {True: 'english', False: 'not_english'}
df['is_english_review'] = (df['review_lang'] == 'en').map(lang_flag)
df['is_english_reply'] = (df['reply_lang'] == 'en').map(lang_flag)

In [None]:
df['is_english_review'].value_counts()

Unnamed: 0_level_0,count
is_english_review,Unnamed: 1_level_1
english,10015
not_english,181


In [None]:
df['is_english_reply'].value_counts()

Unnamed: 0_level_0,count
is_english_reply,Unnamed: 1_level_1
english,10185
not_english,11


In [None]:
result = df.groupby(['is_english_review', 'rating']).size().reset_index(name='count')
print(result)

  is_english_review  rating  count
0           english       1   1260
1           english       2    731
2           english       3   1706
3           english       4   1328
4           english       5   4990
5       not_english       1     32
6       not_english       2     19
7       not_english       3     42
8       not_english       4     34
9       not_english       5     54


In [None]:
df = df[df['review_lang'] == 'en']

In [None]:
df.shape

(10015, 13)

In [None]:
# Keep only rows whose owner reply was detected as English. `.copy()` makes the
# result an independent frame, so the column assignments in later cells write to
# it directly instead of to a slice of the previous frame (which pandas reports
# as SettingWithCopyWarning). The later gibberish filters already do the same.
df = df[df['reply_lang'] == 'en'].copy()

In [None]:
df.shape

(10014, 13)

In [None]:
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,4990
3,1706
4,1328
1,1260
2,730


In [None]:
import re
import unicodedata

def clean_unicode(text):
    """Reduce ``text`` to plain ASCII.

    Steps:
      1. NFKD-normalise, which splits accented letters into a base letter plus
         combining marks and expands compatibility characters.
      2. Drop the combining marks, so "crème" becomes "creme" rather than
         "cre me".
      3. Replace every remaining run of non-ASCII characters with one space.

    Parameters
    ----------
    text : any
        Value to clean. Non-string values (None, NaN, numbers) yield "".

    Returns
    -------
    str
        ASCII-only text. Whitespace is not collapsed here.
    """
    if not isinstance(text, str):
        return ""

    decomposed = unicodedata.normalize('NFKD', text)
    # Combining marks are non-ASCII; removing them before the regex keeps the
    # base letter attached to the rest of its word.
    without_marks = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r'[^\x00-\x7F]+', ' ', without_marks)

In [None]:
# Build the working `reply` column in one chain: fill missing owner responses
# with "" and coerce to str, then clean that same filled column. Previously the
# second statement re-read the raw 'ownerResponse/text' column, which threw the
# fillna/astype result away.
df['reply'] = df['ownerResponse/text'].fillna("").astype(str).apply(clean_unicode)

In [None]:
# Same treatment for both free-text review columns: missing values become "",
# everything is coerced to str, then non-ASCII characters are cleaned.
for text_col in ('title', 'text'):
    df[text_col] = df[text_col].fillna("").astype(str).apply(clean_unicode)

In [None]:
def is_gibberish(text, threshold=0.5):
    """Flag text whose share of alphabetic characters is below ``threshold``.

    Parameters
    ----------
    text : any
        Value to check. Non-strings and blank strings count as gibberish.
    threshold : float, default 0.5
        Minimum fraction of characters (over the full string, including
        whitespace and punctuation) that must be alphabetic.

    Returns
    -------
    bool
        True when the text is missing, blank, or its alphabetic fraction is
        strictly below ``threshold``.
    """
    # This guard also guarantees len(text) > 0 below, so no separate
    # zero-length check is needed before dividing.
    if not isinstance(text, str) or not text.strip():
        return True

    letters = sum(1 for ch in text if ch.isalpha())
    return letters / len(text) < threshold

In [None]:
df['is_gibberish_review'] = df['text'].apply(is_gibberish)
df['is_gibberish_reply'] = df['reply'].apply(is_gibberish)

In [None]:
df = df[df['is_gibberish_review'] == False].copy()

In [None]:
df = df[df['is_gibberish_reply'] == False].copy()

In [None]:
df.shape

(9952, 16)

In [None]:
df.drop(columns=['is_gibberish_review', 'is_gibberish_reply'], inplace=True)
df.shape

(9952, 14)

In [None]:
def replace_entities_review(row):
    """Mask the hotel name and the reviewer's location inside a review.

    Parameters
    ----------
    row : mapping
        Must provide 'text', 'placeInfo/name' and
        'user/userLocation/shortName' (a DataFrame row or a dict).

    Returns
    -------
    The review text with exact, case-sensitive occurrences of the hotel name
    replaced by "<HOTEL_NAME>" and of the user location by "<USER_LOCATION>".
    A non-string review is returned unchanged.
    """
    review = row['text']
    if not isinstance(review, str):
        return review

    replacements = (
        (row['placeInfo/name'], "<HOTEL_NAME>"),        # hotel first, then location
        (row['user/userLocation/shortName'], "<USER_LOCATION>"),
    )
    for value, placeholder in replacements:
        # Only real, non-blank strings are usable as search terms. Converting a
        # missing value with str() yields 'nan', which would then be replaced
        # inside ordinary words ("finance", "banana"); a blank value would
        # replace every space.
        if isinstance(value, str) and value.strip():
            review = review.replace(value, placeholder)

    return review

In [None]:
def replace_entities_reply(row):
    """Mask the hotel name and the reviewer's location inside an owner reply.

    Parameters
    ----------
    row : mapping
        Must provide 'reply', 'placeInfo/name' and
        'user/userLocation/shortName' (a DataFrame row or a dict).

    Returns
    -------
    The reply text with exact, case-sensitive occurrences of the hotel name
    replaced by "<HOTEL_NAME>" and of the user location by "<USER_LOCATION>".
    A non-string reply is returned unchanged.
    """
    reply = row['reply']
    if not isinstance(reply, str):
        return reply

    replacements = (
        (row['placeInfo/name'], "<HOTEL_NAME>"),        # hotel first, then location
        (row['user/userLocation/shortName'], "<USER_LOCATION>"),
    )
    for value, placeholder in replacements:
        # Only real, non-blank strings are usable as search terms. Converting a
        # missing value with str() yields 'nan', which would then be replaced
        # inside ordinary words ("maintenance", "finance"); a blank value would
        # replace every space.
        if isinstance(value, str) and value.strip():
            reply = reply.replace(value, placeholder)

    return reply

In [None]:
df["cleaned_review"] = df.apply(replace_entities_review, axis=1)

In [None]:
df["cleaned_reply"] = df.apply(replace_entities_reply, axis=1)

In [None]:
df.shape

(9952, 16)

In [None]:
df.columns

Index(['id', 'title', 'rating', 'text', 'user/userLocation/shortName',
       'placeInfo/id', 'placeInfo/name', 'placeInfo/rating',
       'ownerResponse/text', 'review_lang', 'reply_lang', 'is_english_review',
       'is_english_reply', 'reply', 'cleaned_review', 'cleaned_reply'],
      dtype='object')

In [None]:
def clean_contact_info(text):
    """Mask contact details in ``text``.

    E-mail addresses become "<EMAIL>", http(s)/www links become "<URL>" and
    long digit runs (optionally starting with '+', with spaces or hyphens in
    between) become "<PHONE>". Substitutions run in that order. Anything that
    is not a string is handed back untouched.
    """
    if isinstance(text, str):
        substitutions = (
            (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>'),   # email
            (r'https?://\S+|www\.\S+', '<URL>'),          # urls
            (r'(\+?\d[\d\-\s]{8,}\d)', '<PHONE>'),        # phone no.
        )
        for pattern, placeholder in substitutions:
            text = re.sub(pattern, placeholder, text)
    return text

In [None]:
# Mask e-mails, URLs and phone numbers on top of the entity-masked columns.
# Both columns chain from their own "cleaned_*" version: reading the reply from
# df["reply"] would overwrite the placeholders written by replace_entities_reply.
df["cleaned_review"] = df["cleaned_review"].apply(clean_contact_info)
df["cleaned_reply"] = df["cleaned_reply"].apply(clean_contact_info)

In [None]:
df.shape

(9952, 16)

In [None]:
df.columns

Index(['id', 'title', 'rating', 'text', 'user/userLocation/shortName',
       'placeInfo/id', 'placeInfo/name', 'placeInfo/rating',
       'ownerResponse/text', 'review_lang', 'reply_lang', 'is_english_review',
       'is_english_reply', 'reply', 'cleaned_review', 'cleaned_reply'],
      dtype='object')

In [None]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [None]:
from rapidfuzz import fuzz

def replace_hotel_fuzzy(text, hotel_name, threshold=80):
    """Replace case-insensitive occurrences of ``hotel_name`` with "<HOTEL_NAME>".

    The rapidfuzz ``partial_ratio`` score between the lower-cased name and
    text is used only as a gate: when it is below ``threshold`` the text is
    returned untouched. The substitution itself matches the literal hotel name
    (case-insensitively), so misspelled or abbreviated variants are NOT
    replaced even if they pass the gate.

    Parameters
    ----------
    text : any
        Text to process; non-strings are returned unchanged.
    hotel_name : any
        Name to mask; if it is not a non-blank string, ``text`` is returned
        unchanged.
    threshold : int or float, default 80
        Minimum ``partial_ratio`` score required before substituting.

    Returns
    -------
    The text with the hotel name masked, or the original value.
    """
    if not isinstance(text, str) or not isinstance(hotel_name, str):
        return text

    # An empty name compiles to a pattern that matches at every position, and a
    # blank one to a pattern that matches every space; neither is a real name.
    if not hotel_name.strip():
        return text

    score = fuzz.partial_ratio(hotel_name.lower(), text.lower())
    if score < threshold:
        return text

    # re.escape keeps punctuation in the name ("&", ".", "(") literal.
    return re.sub(re.escape(hotel_name), "<HOTEL_NAME>", text, flags=re.IGNORECASE)


In [None]:
# Run the hotel-name masking over both cleaned columns, pairing every text with
# the hotel name from the same row.
for column in ("cleaned_review", "cleaned_reply"):
    df[column] = [
        replace_hotel_fuzzy(body, hotel)
        for body, hotel in zip(df[column], df["placeInfo/name"])
    ]

In [None]:
df.shape

(9952, 16)

In [None]:
df.columns

Index(['id', 'title', 'rating', 'text', 'user/userLocation/shortName',
       'placeInfo/id', 'placeInfo/name', 'placeInfo/rating',
       'ownerResponse/text', 'review_lang', 'reply_lang', 'is_english_review',
       'is_english_reply', 'reply', 'cleaned_review', 'cleaned_reply'],
      dtype='object')

In [None]:
df.drop(columns=['review_lang', 'reply_lang', 'is_english_review',
       'is_english_reply'], inplace=True)

In [None]:
df.shape

(9952, 12)

In [None]:
df.drop(columns=['text', 'ownerResponse/text'], inplace=True)

In [None]:
df.shape

(9952, 10)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Honorific followed by a capitalised word, e.g. "Mr. Sharma" or "dr Mehta".
# Case-insensitivity is scoped to the honorific with (?i:...): a global
# IGNORECASE would also let [A-Z] match lowercase letters, so phrases such as
# "miss you" or "sir please" would be masked as if they were names.
title_regex = re.compile(r'\b(?i:Ms|Mr|Mrs|Miss|Dr|Sir|Madam|Prof|Captain)\.?\s+[A-Z][a-zA-Z]+\b')
def mask_person_names(text):
    """Replace person names in ``text`` with "<PERSON_NAME>".

    Two passes are applied:
      1. ``title_regex`` masks "<honorific> <Capitalised word>" pairs.
      2. The module-level spaCy pipeline ``nlp`` is run on the result and every
         entity labelled PERSON is masked wherever its text occurs
         (whole-word, case-insensitive).

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str): return text
    text = title_regex.sub("<PERSON_NAME>", text)
    doc = nlp(text)
    new_text = text
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            # Substitute on the running string so earlier replacements are kept;
            # re.escape keeps punctuation inside the entity text literal.
            new_text = re.sub(r'\b' + re.escape(ent.text) + r'\b', "<PERSON_NAME>", new_text, flags=re.IGNORECASE)
    return new_text

In [None]:
df['cleaned_reply'] = df['cleaned_reply'].apply(mask_person_names)
df['cleaned_review'] = df['cleaned_review'].apply(mask_person_names)

In [None]:
!pip install ftfy emoji

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, emoji
Successfully installed emoji-2.15.0 ftfy-6.3.1


In [None]:
import re, ftfy, emoji

In [None]:
def normalize_text(text):
    """Normalise text while keeping anonymisation placeholders intact.

    Steps:
      1. Repair text with ``ftfy.fix_text``.
      2. Replace emoji with a space.
      3. Replace every character outside the allowed set (letters, digits,
         whitespace and common punctuation) with a space, except inside
         placeholder tokens such as "<HOTEL_NAME>" or "<EMAIL>".
      4. Collapse whitespace runs to one space and strip the ends.

    Parameters
    ----------
    text : any
        Value to normalise. Non-strings yield "".

    Returns
    -------
    str
        The normalised text.
    """
    if not isinstance(text, str): return ""
    text = ftfy.fix_text(text)
    text = emoji.replace_emoji(text, replace=' ')

    # Splitting on a capturing group keeps the placeholders in the result at the
    # odd indices. Only the even-indexed pieces (ordinary text) are filtered;
    # otherwise '<', '>' and '_' would be stripped and "<HOTEL_NAME>" would
    # degrade to "HOTEL NAME".
    pieces = re.split(r'(<[A-Z][A-Z_]*>)', text)
    pieces = [
        piece if idx % 2 else re.sub(r'[^A-Za-z0-9\s.,;:?!\'"\-()/&%]', ' ', piece)
        for idx, piece in enumerate(pieces)
    ]
    text = ''.join(pieces)

    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['cleaned_review'] = df['cleaned_review'].apply(normalize_text)
df['cleaned_reply'] = df['cleaned_reply'].apply(normalize_text)

In [None]:
df.shape

(9952, 10)

In [None]:
df.head()

Unnamed: 0,id,title,rating,user/userLocation/shortName,placeInfo/id,placeInfo/name,placeInfo/rating,reply,cleaned_review,cleaned_reply
0,0,Kids club,5,,307512,"ITC Grand Goa, a Luxury Collection Resort & Sp...",4.6,"Dear Valued Guest,\n\nNamaste,\n\nThank you fo...",It was very pleasant and wonderful stay specia...,"Dear Valued Guest, Namaste, Thank you for taki..."
1,1,Excellent,5,,620076,Hyderabad Marriott Hotel & Convention Centre,4.4,"Dear Guest, \nThank you for your wonderful fee...",The hotel is fabulous and especially the Tattv...,"Dear Guest, Thank you for your wonderful feedb..."
2,2,Good visit,4,London,299754,JW Marriott Mumbai Juhu,4.5,"Dear Tripadvisor Member,\n\nThank you for taki...",Good experience at the JE Marriott Juhu. Servi...,"Dear Tripadvisor Member, Thank you for taking ..."
3,3,Excellent stay,5,,8435492,Seclude Mussoorie,4.5,"Dear Guest,\nThank you for sharing your experi...",We came in group during our vacations in June ...,"Dear Guest, Thank you for sharing your experie..."
4,4,"The hotel is neat very well maintained, the fo...",5,,1141615,Gulf Hotel,3.9,"Dear Guest,\n\nGreetings from Gulf Hotel\n\nWe...","The best hotel in Mumbai as per my opinion, th...","Dear Guest, Greetings from HOTEL NAME We welco..."


In [None]:
import spacy
import re

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def replace_entities(text):
    """Mask spaCy named entities in ``text`` with placeholder tags.

    The module-level pipeline ``nlp`` is run on the text. Entities are replaced
    wherever their text occurs (whole-word, case-insensitive):

      * PERSON     -> "<PERSON_NAME>"
      * ORG        -> "<HOTEL_NAME>"
      * GPE / LOC  -> "<LOCATION>"

    Runs of the same tag separated only by whitespace and/or a comma are then
    collapsed to a single tag ("<LOCATION>, <LOCATION>" -> "<LOCATION>"), and
    repeated whitespace is squeezed.

    Parameters
    ----------
    text : any
        Text to process. Non-strings and blank strings are returned unchanged.

    Returns
    -------
    The masked text, or the original value for non-string / blank input.
    """
    if not isinstance(text, str) or len(text.strip()) == 0:
        return text

    placeholder_for = {
        "PERSON": "<PERSON_NAME>",
        "ORG": "<HOTEL_NAME>",
        "GPE": "<LOCATION>",
        "LOC": "<LOCATION>",
    }

    doc = nlp(text)
    new_text = text

    # Longest entities first, so a short entity cannot break up the text of a
    # longer one before the longer one has been replaced.
    for ent in sorted(doc.ents, key=lambda e: len(e.text), reverse=True):
        placeholder = placeholder_for.get(ent.label_)
        if placeholder is None:
            continue  # other entity types are left as they are
        pattern = rf"\b{re.escape(ent.text)}\b"  # escape regex-sensitive characters
        new_text = re.sub(pattern, placeholder, new_text, flags=re.IGNORECASE)

    # Collapse consecutive identical tags. The earlier pattern
    # "(<TAG>\s*,\s*)+" only removed the comma after a tag: it left
    # "<TAG> <TAG>" in place and also stripped the comma after a single tag
    # ("Dear <PERSON_NAME>, ..." lost its comma).
    for tag in ("<PERSON_NAME>", "<HOTEL_NAME>", "<LOCATION>"):
        escaped = re.escape(tag)
        new_text = re.sub(rf"{escaped}(?:\s*,?\s*{escaped})+", tag, new_text)

    # Remove double spaces left behind by the substitutions
    new_text = re.sub(r"\s{2,}", " ", new_text).strip()

    return new_text


In [None]:
df["cleaned_review2"] = df["cleaned_review"].apply(replace_entities)
df["cleaned_response2"] = df["cleaned_reply"].apply(replace_entities)


In [None]:
# Persist the cleaned frame (all remaining columns, including the
# 'cleaned_review2' / 'cleaned_response2' outputs) to a CSV in the working directory.
# NOTE(review): no `index=` argument is passed, so pandas' default applies and the
# DataFrame index is written as an extra unnamed leading column — confirm that
# downstream readers expect it (the 'id' column was set from the index earlier).
df.to_csv('temp_clean.csv')