In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Function to load and normalize data
def load_and_normalize_data(directories, base_path='../../../raw data'):
    """Load every hotel review CSV and add a min-max normalized score.

    Each ``*.csv`` file found in ``base_path/<directory>`` is read into a
    DataFrame. Its ``Review Score`` column is rescaled to the range [0, 1]
    with a ``MinMaxScaler`` fitted on that file alone, so the scaling uses
    the minimum and maximum score observed in that particular file. The
    hotel name is taken from the file name (extension and the
    ``_reviews_2022_2024`` suffix removed) and stored in a ``Hotel`` column.
    Files that belong to the same hotel are stacked into one DataFrame.

    Args:
        directories: Iterable of sub-directory names located under
            ``base_path``.
        base_path: Directory that contains the review sub-directories.
            Defaults to the notebook's original relative data location.

    Returns:
        dict mapping hotel name -> DataFrame holding the rows of all of that
        hotel's files, with the extra columns ``normalized_score`` and
        ``Hotel`` and a fresh 0..n-1 index. Files without any data rows are
        skipped, so a hotel whose files are all empty is absent.

    Raises:
        FileNotFoundError: If one of the directories does not exist.
        KeyError: If a non-empty CSV has no ``Review Score`` column.
    """
    frames_by_hotel = {}

    for directory in directories:
        path = os.path.join(base_path, directory)

        # os.listdir gives no ordering guarantee; sorting keeps the result
        # (dict order and row order) reproducible between runs and machines.
        for filename in sorted(os.listdir(path)):
            if not filename.endswith('.csv'):
                continue

            # Extract hotel name from filename by removing undesired part
            hotel_name = os.path.splitext(filename)[0].replace('_reviews_2022_2024', '')

            df = pd.read_csv(os.path.join(path, filename))

            # A header-only file has nothing to scale and would make the
            # scaler fit fail, so it is skipped instead of aborting the load.
            if df.empty:
                continue

            # Normalize the Review Score (per file)
            scaler = MinMaxScaler(feature_range=(0, 1))
            df['normalized_score'] = scaler.fit_transform(df[['Review Score']])

            # Add a new column for hotel name
            df['Hotel'] = hotel_name

            frames_by_hotel.setdefault(hotel_name, []).append(df)

    # Concatenate once per hotel instead of growing a frame inside the loop.
    return {
        hotel_name: pd.concat(frames, ignore_index=True)
        for hotel_name, frames in frames_by_hotel.items()
    }


In [2]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import DetectorFactory
from collections import Counter
# Pin langdetect's seed; presumably so that language detection returns the
# same result for the same text on every run (confirm against langdetect docs).
DetectorFactory.seed = 0

# Shared lemmatizer instance used by preprocess_text
lemmatizer = WordNetLemmatizer()
# English stop words, kept as a set because preprocess_text tests every token
# for membership. NOTE(review): presumably needs the NLTK 'stopwords' corpus
# to be downloaded beforehand - confirm for fresh environments.
stop_words = set(stopwords.words('english'))

def preprocess_text(text, min_length=10):
    """Clean one review for modelling, or return None if it must be dropped.

    A review is dropped (None is returned) when it is not a string (for
    example a missing value that pandas delivers as NaN), when it has fewer
    than ``min_length`` whitespace-separated words, when it is not detected
    as English, or when language detection itself fails.

    Otherwise the text is lowercased, stripped of punctuation, of any other
    non-word symbols and of digits, tokenized, filtered against
    ``stop_words`` and lemmatized.

    Args:
        text: Raw review text.
        min_length: Minimum number of whitespace-separated words the raw
            review must contain. Defaults to 10.

    Returns:
        The cleaned tokens joined by single spaces, or None for a dropped
        review.
    """
    # Missing reviews are not strings and have no .split(); without this
    # guard they raise AttributeError, which the handler below does not catch.
    if not isinstance(text, str):
        return None

    # Return None if there are fewer than min_length words in the review
    if len(text.split()) < min_length:
        return None

    try:
        # Check if the review is in English
        if detect(text) != 'en':
            return None  # Return None if not in English
    except LangDetectException:
        # If language detection fails, return None
        return None

    # Convert text to lowercase
    text = text.lower()

    # Remove ASCII punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove every remaining character that is neither a word character nor
    # whitespace (emojis and other symbols)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    processed_text = ' '.join(lemmatized_tokens)

    # Remove extra whitespace
    return re.sub(r'\s+', ' ', processed_text).strip()


In [3]:

# One sub-directory of raw review CSV files per review source
directories = [
    'agoda_hotel_reviews',
    'tripadvisor_hotel_reviews',
    'klook_hotel_reviews',
    'booking_hotel_reviews',
]

# Read all sources into one DataFrame per hotel, with normalized scores
hotel_dfs = load_and_normalize_data(directories)

# Add the cleaned review text to every hotel frame. The assignment changes
# the frame stored in hotel_dfs itself, so nothing has to be written back.
for hotel, df in hotel_dfs.items():
    raw_reviews = df['Review Content']
    df['cleaned_content'] = raw_reviews.apply(preprocess_text)

In [4]:
# Creation of labels: 0 = normalized score <= 0.25, 2 = normalized score >= 0.75,
# 1 = everything in between.
input_data_dfs = {}
for hotel, df in hotel_dfs.items():
    scores = df['normalized_score']

    # Nullable integer labels: a missing score fails both comparisons and
    # would otherwise be labelled 1 as if it were a real mid-range score.
    labels = pd.Series(1, index=df.index, dtype='Int64')
    labels[scores <= 0.25] = 0
    labels[scores >= 0.75] = 2
    labels[scores.isna()] = pd.NA
    df['label'] = labels  # also adds the column to the frame held in hotel_dfs

    # Reviews rejected by preprocessing (cleaned_content is missing)
    removed_reviews = df[df['cleaned_content'].isnull()]['Review Content']

    # Keep only rows that have both cleaned text and a real label; after the
    # drop no label is missing, so the column can be a plain integer again.
    df = df.dropna(subset=['cleaned_content', 'label']).astype({'label': 'int64'})

    # Store the model input frame for this hotel
    input_data_dfs[hotel] = df

In [6]:
import os
import pandas as pd

# Output directory name -> frames written into it. The directory name is
# also used as the file-name suffix of every CSV in that directory.
export_targets = (
    ('hotel_data', hotel_dfs),
    ('input_data', input_data_dfs),
)

# Create both output directories first, before any file is written
for out_dir, _ in export_targets:
    os.makedirs(out_dir, exist_ok=True)

# Write one CSV per hotel into each output directory
for out_dir, frames in export_targets:
    for hotel_name, df in frames.items():
        df.to_csv(f'{out_dir}/{hotel_name}_{out_dir}.csv', index=False)

print("Hotel data and input data exported successfully.")


Hotel data and input data exported successfully.
