# Imported Libraries

In [2]:
import string, nltk, os, glob, pickle
from bs4 import BeautifulSoup
import re, datetime,gensim,spacy
import pandas as pd, numpy as np
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt 
from nltk.corpus import wordnet
from nltk.corpus import stopwords, words 
from nltk.sentiment import SentimentIntensityAnalyzer
import warnings
import os
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
words_english = words.words()
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
sid = SentimentIntensityAnalyzer() # for VADER SA

LookupError: 
**********************************************************************
  Resource [93mvader_lexicon[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('vader_lexicon')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93msentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt[0m

  Searched in:
    - 'C:\\Users\\Predator/nltk_data'
    - 'c:\\Users\\Predator\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\Predator\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\Predator\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\Predator\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


# Functions

In [None]:
## Define Functions #######
# def make_bigrams(texts):
# 	return [bigram_mod[doc] for doc in texts]

def drop_columns(df): #! APPLICABLE FOR TRIPADVISOR REVIEWS
    df.drop_duplicates(subset=['Author', 'Review Text'], inplace=True) # to ensure the uniqueness of reviews by author and text
    # Drop columns 'B' and 'C'
    columns_to_drop = ['Author', 'Review Date', 'Title', 'Date of Stay', 'Trip Type']
    df = df.drop(columns=columns_to_drop)
    return df

def clean_ratings(df):#! APPLICABLE FOR TRIPADVISOR REVIEWS ATM
    # Remove characters other than digits and '.0'
    df['Rating'] = df['Rating'].str.extract(r'(\d+(?:\.\d+)?)')  # Extract digits and decimal point
    # Convert the cleaned ratings column to integer type
    df['Rating'] = df['Rating'].astype(float).astype(int)  # Convert to float first to handle '.0' cases
    return df


def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text

def analyze_sentiment(text):
    '''
    Utility function to classify the polarity of a text
    using VADER.
    '''
    # Analyze sentiment using VADER
    scores = sid.polarity_scores(text)
    
    return scores['compound']
    # # Classify the sentiment
    # if scores['compound'] >= 0.05:
    #     return 1  # Positive
    # elif scores['compound'] <= -0.05:
    #     return -1  # Negative
    # else:
    #     return 0  # Neutral

# Loading of Data + Cleaning + Analysis

In [None]:
# Load data from the specified CSV files
waterfront_reviews = pd.read_csv(r'..\datasets\tripadvisor\1_Waterfront-Cebu-City-Hotel-Casino.csv')
bai_reviews = pd.read_csv(r'..\datasets\tripadvisor\3_bai-Hotel-Cebu.csv')

# Display the first few rows of each DataFrame
print("Waterfront Reviews:")
waterfront_reviews = drop_columns(waterfront_reviews) # should drop columns unnecessary for sentiment analysis
waterfront_reviews = clean_ratings(waterfront_reviews) # should clean the ratings column
# waterfront_reviews.shape # (1725, 2)
# (waterfront_reviews.isnull().sum()/(len(waterfront_reviews)))*100 # no missing values
waterfront_reviews.name = 'Waterfront Hotel and Casino'
waterfront_reviews.head()


In [None]:
print("\nBai Hotel Reviews:")
bai_reviews = drop_columns(bai_reviews)
bai_reviews = clean_ratings(bai_reviews)
# bai_reviews.shape # (726, 2)
# (bai_reviews.isnull().sum()/(len(bai_reviews)))*100 # no missing values
bai_reviews.name = 'bai Hotel'
bai_reviews.head()

In [None]:
# Load and process data for the specified hotels
hotel_dataframes = [waterfront_reviews, bai_reviews]

for hotel_df in hotel_dataframes:
    # Extract the hotel name from the DataFrame variable name
    hotel_name = hotel_df.name
    
    print('Analyzing', hotel_name)
    
    # Drop rows with NaN values in 'Review Text' column
    hotel_df.dropna(subset=['Review Text'], inplace=True)
    
    # Apply text cleaning
    hotel_df['Review Text'] = hotel_df['Review Text'].apply(clean_text)
    
    # Perform sentiment analysis and add the results to the DataFrame
    hotel_df['SA'] = hotel_df['Review Text'].apply(analyze_sentiment)
    
    # Calculate percentages of positive, neutral, and negative reviews
    pos_texts = int(len(hotel_df[hotel_df['SA'] > 0]))
    neu_texts = int(len(hotel_df[hotel_df['SA'] == 0]))
    neg_texts = int(len(hotel_df[hotel_df['SA'] < 0]))

    print("Percentage of Positive Reviews: {}%".format(pos_texts * 100 / len(hotel_df)))
    print("Percentage of Neutral Reviews: {}%".format(neu_texts * 100 / len(hotel_df)))
    print("Percentage of Negative Reviews: {}%".format(neg_texts * 100 / len(hotel_df)))
    
    # Save sentiment analysis results to a CSV file
    sentiment_count = pd.DataFrame({
        '%Positive': [pos_texts * 100 / len(hotel_df)],
        '%Neutral': [neu_texts * 100 / len(hotel_df)],
        '%Negative': [neg_texts * 100 / len(hotel_df)]
    })
    output_folder = r'sentiment count'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_filename = os.path.join(output_folder, hotel_name.replace(' ', '_') + '_sentiment_count.csv')
    sentiment_count.to_csv(output_filename, index=False)
    
    # Categorize sentiments
    condition = [hotel_df['SA'] < 0, hotel_df['SA'] == 0, hotel_df['SA'] > 0]
    choices = ['Negative', 'Neutral', 'Positive']
    hotel_df['Sentiment'] = np.select(condition, choices)
    
    # Save processed data
    output_folder = r'processed reviews'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_filename = os.path.join(output_folder, hotel_name.replace(' ', '_') + '_processed.csv')
    hotel_df.to_csv(output_filename, index=False)
    print("Processed data saved to:", output_filename)
    print('\n')
    
    # to add more analysis or processing steps when necessary
    # ...

    