# Hotel Reviews - Sentiment Classification

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import spacy
from wordcloud import WordCloud, STOPWORDS

In [None]:
data=pd.read_excel('hotel_reviews (1).xlsx')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()     # no null values

## Text Pre-processing

In [None]:
# removing trailing and leading characters
data=[Review.strip() for Review in data.Review]

In [None]:
data

In [None]:
# removing empty strings
data=[Review for Review in data if Review]

In [None]:
data[0:5]

In [None]:
# joining list into 1 string
text=' '.join(data)
text

In [None]:
import nltk
# Tokenizer models used by word_tokenize. Older NLTK releases read 'punkt';
# NLTK 3.9+ reads 'punkt_tab' instead, so request both (a resource that is
# unknown to the installed version is reported and skipped, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# removing punctuations
import string
no_punc_text=text.translate(str.maketrans('','',string.punctuation))
no_punc_text

In [None]:
# tokenization
from nltk.tokenize import word_tokenize
text_tokens=word_tokenize(no_punc_text)
print(text_tokens[0:50])

In [None]:
len(text_tokens)

In [None]:
# removing stopwords
from nltk.corpus import stopwords
# Only the stop-word corpus is needed here; 'punkt' was already downloaded
# before tokenization, so it is not requested a second time.
nltk.download('stopwords')

# Show NLTK's English stop-word list
print(stopwords.words('english'))

In [None]:
my_stop_words=stopwords.words('english')

sw_list = ['hotel','monaco','seattle']
my_stop_words.extend(sw_list)

no_stop_tokens=[word for word in text_tokens if not word in my_stop_words]

In [None]:
print(no_stop_tokens[0:50])

In [None]:
from collections import Counter 
dct=Counter(no_stop_tokens) 
dct

In [None]:
most_occur=dct.most_common(10) 
print(most_occur) 

In [None]:
# Normalise case: lower-case every token
lower_words = list(map(str.lower, no_stop_tokens))
print(lower_words[:30])

## Stemming

In [None]:
from nltk.stem import PorterStemmer
# Reduce each lower-cased token to its Porter stem
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, lower_words))
print(stemmed_tokens[:40])

## Lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_tokens=[lemmatizer.lemmatize(word) for word in lower_words]
print(lemmatized_tokens[0:40])

In [None]:
clean_text=' '.join(lemmatized_tokens)
clean_text

## Feature Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [None]:
cv=CountVectorizer()
text_cv=cv.fit_transform(lemmatized_tokens)

In [None]:
print(cv.vocabulary_)

In [None]:
print(cv.get_feature_names_out()[100:200])

In [None]:
# Count Vectorizer with N-grams ( Bigrams & Trigrams)
cv_ngram_range=CountVectorizer(analyzer='word',ngram_range=(1,3),max_features=100)
bow_matrix_ngram=cv_ngram_range.fit_transform(lemmatized_tokens)

In [None]:
print(cv_ngram_range.get_feature_names_out())
print(bow_matrix_ngram.toarray())

In [None]:
# TF-IDF Vectorizer
tfidfv_ngram_max_features=TfidfVectorizer(norm='l2',analyzer='word',ngram_range=(1,3),max_features=500)
tfidf_matix_ngram=tfidfv_ngram_max_features.fit_transform(lemmatized_tokens)

In [None]:
print(tfidfv_ngram_max_features.get_feature_names_out())
print(tfidf_matix_ngram.toarray())

## Wordcloud

In [None]:
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Draw `wordcloud` as an image on a new 40x30 figure with the axes hidden."""
    figure_size = (40, 30)
    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud)
    # Hide ticks and frame so only the cloud is visible
    plt.axis('off')

# Generate Word Cloud
# Build a separate stop-word set rather than calling STOPWORDS.add(...):
# STOPWORDS is a module-level set shared by everything that imports wordcloud,
# so mutating it would leak these corpus-specific words into other uses.
extra_stopwords = {'resort', 'room', 'said', 'say', 'nt', 'going', 'time',
                   'really', 'used', 'went', 'asked', 'day'}
cloud_stopwords = STOPWORDS.union(extra_stopwords)

wordcloud = WordCloud(width=3000, height=2000, background_color='black', max_words=80,
                      colormap='Set1', stopwords=cloud_stopwords).generate(clean_text)
plot_cloud(wordcloud)

## NER - Named Entity Recognition

In [None]:
import spacy
from spacy import displacy

In [None]:
# Load spaCy's small English pipeline; the following cells use it for named-entity recognition
nlp=spacy.load('en_core_web_sm')

In [None]:
# Run the pipeline on one hard-coded sample review
doc = nlp("nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night")

In [None]:
doc.ents     #Display all entities

In [None]:
for i in doc.ents:
    print(i, '|' , i.label_)

In [None]:
#Spacy library is not 100% precise
#to display graphical representation for tags
displacy.render(doc,style='ent',jupyter=True)

In [None]:
data1=pd.DataFrame(data)

In [None]:
data1

In [None]:
for index, row in data1.iterrows():
    review_text = row[0]
    doc = nlp(review_text)
    # Display named entities for each review
    print(f"Entities for Review {index + 1}:")
    for ent in doc.ents:
        print(ent.text, '|', ent.label_)

    # Display graphical representation for named entities
    displacy.render(doc, style='ent', jupyter=True)

## Sentiment Analysis

In [None]:
data_re=pd.read_excel('hotel_reviews (1).xlsx')

In [None]:
data_re.shape

In [None]:
data_re['Rating'].value_counts()

In [None]:
# Downsample the 5-rated reviews so the positive class is about the same size as the negative class (ratings below 3)

In [None]:
data_neg = data_re.loc[data_re["Rating"]<3]
data_neg = data_neg.reset_index(drop = True)

In [None]:
data_five = data_re.loc[data_re['Rating'] == 5]
data_five = data_five.reset_index(drop = True)

In [None]:
data_pos = data_five.loc[:len(data_neg)]

In [None]:
len(data_pos)

In [None]:
len(data_neg)

In [None]:
data_all = pd.concat([data_neg,data_pos], axis = 0)
data_all = data_all.reset_index(drop = True)

In [None]:
len(data_all)

In [None]:
data_all["Sentiment"]=np.where(data_all["Rating"] == 5, "Positive" , "Negative")

In [None]:
data_all.head()

In [None]:
# Shuffle the rows. A fixed seed (the same 42 used for the random forest)
# makes the row order, and everything derived from it, repeatable across runs.
data_all = data_all.sample(frac=1, random_state=42)
data_all = data_all.reset_index(drop=True)
data_all.tail()

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split
# random_state fixes the split so the reported accuracies are reproducible;
# stratify keeps the Positive/Negative proportion the same in train and test.
x_train, x_test, y_train, y_test = train_test_split(
    data_all.Review,
    data_all.Sentiment,
    random_state=42,
    stratify=data_all.Sentiment,
)

In [None]:
# Learn the vocabulary from the training reviews only, then encode both
# splits with that same vocabulary
v = CountVectorizer()
v.fit(x_train)
x_train_vec = v.transform(x_train)
x_test_vec = v.transform(x_test)

## Model Building

### 1) SVM

In [None]:
from sklearn import svm
SVM =svm.SVC(kernel = "linear")
SVM.fit(x_train_vec, y_train)

In [None]:
svm_score=SVM.score(x_test_vec, y_test)
svm_score

In [None]:
from sklearn.metrics import f1_score
f1_svm = f1_score(y_test,SVM.predict(x_test_vec), average = None)

In [None]:
f1_svm

In [None]:
# Sample review 1 (long, favourable wording): encode it with the fitted vectorizer `v`
# and classify it with the SVM. `review1_vec` is reused by the later model cells.
review1 = ["unique, great stay, wonderful time hotel monaco, location excellent short stroll main downtown shopping area, pet friendly room showed no signs animal hair smells, monaco suite sleeping area big striped curtains pulled closed nice touch felt cosy, goldfish named brandi enjoyed, did n't partake free wine coffee/tea service lobby thought great feature, great staff friendly, free wireless internet hotel worked suite 2 laptops, decor lovely eclectic mix pattens color palatte, animal print bathrobes feel like rock stars, nice did n't look like sterile chain hotel hotel personality excellent stay."]
review1_vec = v.transform(review1)
SVM.predict(review1_vec)

In [None]:
# Sample review 2 (unfavourable wording); `review2_vec` is reused by the later model cells
review2 = ["stay did not care hotel, expecting cozy trendy room sadly disappointed tiny run-down room strange odor, bed practically floor level afraid head pillow, no warmth room just bare necessities, feel better place money not mention 30.00 mandatory valet parking,  "]
review2_vec = v.transform(review2)
SVM.predict(review2_vec)

In [None]:
# Sample review 3 (two words only); `review3_vec` is reused by the later model cells
review3 = ["good place  "]
review3_vec = v.transform(review3)
SVM.predict(review3_vec)

### 2) Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Build the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(x_train_vec, y_train)

# Predict sentiment labels on the testing set
y_pred = nb_classifier.predict(x_test_vec)

In [None]:
# Accuracy
naivebayes_score = accuracy_score(y_test, y_pred)
naivebayes_score

In [None]:
print(nb_classifier.predict(review1_vec))
print(nb_classifier.predict(review2_vec))

In [None]:
print(nb_classifier.predict(review3_vec))

### 3) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(x_train_vec, y_train)
y_pred = rf_classifier.predict(x_test_vec)

In [None]:
# Accuracy
randomforest_score = accuracy_score(y_test, y_pred)
randomforest_score

In [None]:
print(rf_classifier.predict(review1_vec))
print(rf_classifier.predict(review2_vec))

In [None]:
print(rf_classifier.predict(review3_vec))

### 4) Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg_model = LogisticRegression()
logreg_model.fit(x_train_vec, y_train)

y_pred = logreg_model.predict(x_test_vec)

In [None]:
# Accuracy
logisticreg_score = accuracy_score(y_test, y_pred)
logisticreg_score

In [None]:
print(logreg_model.predict(review1_vec))
print(logreg_model.predict(review2_vec))

In [None]:
print(logreg_model.predict(review3_vec))

### 5) Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(x_train_vec, y_train)

y_pred = gb_classifier.predict(x_test_vec)

In [None]:
# Accuracy
gradientboosting_score = accuracy_score(y_test, y_pred)
gradientboosting_score

In [None]:
print(gb_classifier.predict(review1_vec))
print(gb_classifier.predict(review2_vec))

In [None]:
print(gb_classifier.predict(review3_vec))

### 6) KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors
knn_classifier.fit(x_train_vec, y_train)

# Predict sentiment labels on the testing set
y_pred = knn_classifier.predict(x_test_vec)

In [None]:
# Evaluate the model
knn_score = accuracy_score(y_test, y_pred)
knn_score

In [None]:
print(knn_classifier.predict(review1_vec))
print(knn_classifier.predict(review2_vec))

In [None]:
print(knn_classifier.predict(review3_vec))

## Model Summary

In [None]:
modelsummary_data= [['SVM', svm_score*100],
         ['Random Forest', randomforest_score*100],
          ['Logistic Regression', logisticreg_score*100],
        ['Gradient Boosting', gradientboosting_score*100],
        ['K Nearest Neighnours', knn_score*100]  ]


df_model_summary= pd.DataFrame(modelsummary_data, columns=['Model', 'Accuracy'])


print(df_model_summary)

In [None]:
modelsummary=df_model_summary.sort_values(by=['Accuracy'], ascending=False)
modelsummary

In [None]:
modelsummary.plot.bar(x='Model', y='Accuracy')