In [None]:
import pandas as pd
import nltk
import ssl
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
# Work around HTTPS certificate failures: if this Python build exposes
# ssl._create_unverified_context, install it as the default HTTPS context factory.
# NOTE(review): this is process-wide and affects every later HTTPS request that
# relies on the default context, not only the NLTK downloads below -- presumably
# added to get nltk.download() past certificate errors; confirm it is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context factory available; keep the default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK resources used later for tokenizing, stop-word filtering and
# lemmatizing (word_tokenize, stopwords, WordNetLemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the hotel-review CSV straight from GitHub (requires network access).
df = pd.read_csv("https://raw.githubusercontent.com/shashank2325/CMPE-255-project/main/Hotel_reviews_sentiment_extracted.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,dateAdded,address,city,name,postalCode,province,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,review,bert_sentiment_rating,rating_diff
0,0,2018-01-18T18:43:12Z,5620 Calle Real,Goleta,Best Western Plus South Coast Inn,93117,CA,3.0,"This hotel was nice and quiet. Did not know, t...",Best Western Plus Hotel,San Jose,UnitedStates,Best Western Plus Hotel This hotel was nice an...,4.0,1.0
1,1,2016-11-06T20:21:05Z,5th And San Carlos PO Box 3574,Carmel by the Sea,Best Western Carmel's Town House Lodge,93921,CA,4.0,We stayed in the king suite with the separatio...,Clean rooms at solid rates in the heart of Carmel,San Francisco,CA,Clean rooms at solid rates in the heart of Car...,4.0,0.0
2,2,2016-11-06T20:21:05Z,5th And San Carlos PO Box 3574,Carmel by the Sea,Best Western Carmel's Town House Lodge,93921,CA,3.0,"Parking was horrible, somebody ran into my ren...",Business,Prescott Valley,AZ,"Business Parking was horrible, somebody ran in...",1.0,2.0
3,3,2016-11-06T20:21:05Z,5th And San Carlos PO Box 3574,Carmel by the Sea,Best Western Carmel's Town House Lodge,93921,CA,5.0,Not cheap but excellent location. Price is som...,Very good,Guaynabo,PR,Very good Not cheap but excellent location. Pr...,4.0,1.0
4,4,2016-11-06T20:21:05Z,5th And San Carlos PO Box 3574,Carmel by the Sea,Best Western Carmel's Town House Lodge,93921,CA,2.0,If you get the room that they advertised on th...,Low chance to come back here,Reno,NV,Low chance to come back here If you get the ro...,2.0,0.0


In [None]:
# Resources shared by every clean_text() call; filled lazily on first use.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return ``(stop_words, lemmatizer)``, building both on the first call."""
    if not _CLEAN_TEXT_RESOURCES:
        # Load the stop-word list once and keep it as a set: re-reading it for
        # every token (and scanning a list) dominates the runtime otherwise.
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, replace every character outside ``[a-zA-Z0-9]``
    with a space, delete digit runs, tokenize, drop English stop words, and
    lemmatize each remaining token.

    Args:
        text: The raw review text. ``None`` and NaN (how pandas represents a
            missing cell) are treated as an empty review.

    Returns:
        The cleaned text; ``''`` when the input is missing or nothing survives
        the filtering.
    """
    # Missing values arrive as None or float NaN; NaN is the only float for
    # which ``x != x`` holds.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _clean_text_resources()

    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # Digits are removed (not replaced by a space), so "room101a" -> "rooma".
    text = re.sub(r'\d+', '', text)

    kept = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)

# Clean every review. The 'review' column is overwritten, so the raw text is no
# longer available in df after this cell runs.
df['review'] = df['review'].apply(clean_text)

In [None]:
# Average the reviewer's rating with the BERT-derived sentiment rating.
# NOTE(review): presumably both columns share the same scale -- confirm.
df['combined_rating'] = (df['reviews.rating'] + df['bert_sentiment_rating']) / 2

In [None]:
def categorize_combined_rating(rating):
    """Bucket a numeric rating into a sentiment label.

    Returns 'positive' for ratings of 4 or more, 'neutral' for ratings of at
    least 3 but below 4, and 'negative' for everything else (including values
    that fail both comparisons, such as NaN).
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for floor, label in ((4, 'positive'), (3, 'neutral')):
        if rating >= floor:
            return label
    return 'negative'
# Label each row 'positive' / 'neutral' / 'negative' from its combined rating.
df['combined_sentiment'] = df['combined_rating'].apply(categorize_combined_rating)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string sentiment labels as integers, overwriting the column.
# NOTE(review): because the column is overwritten, re-running this cell fits the
# encoder on the integer codes instead of the label strings, and class_mapping
# then no longer maps names to codes. Restart and run top to bottom.
le = LabelEncoder()
df['combined_sentiment'] = le.fit_transform(df['combined_sentiment'])
# Class name -> integer code, kept for readable reports later on.
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(class_mapping)

{'negative': 0, 'neutral': 1, 'positive': 2}


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features over unigrams and bigrams of the cleaned review text.
# NOTE(review): the vectorizer is fitted on every row of df, so any rows later
# held out for testing still shape the vocabulary and IDF weights (data leakage).
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(df['review'])

In [None]:
from sklearn.model_selection import train_test_split

y = df['combined_sentiment']

# Split the cleaned text first, then fit TF-IDF on the training reviews only, so
# held-out reviews contribute nothing to the vocabulary or the IDF weights.
# With the same random_state and row count the same rows land in each split as
# when splitting a pre-computed feature matrix.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    df['review'], y, test_size=0.2, random_state=42
)

# Rebinds `vectorizer` to one fitted on the training split; this is the object
# that must be used to transform any new text for the models trained below.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# Baseline classifiers. The forest is seeded so its bootstrap samples and
# feature sub-sampling -- and therefore the reported scores -- are reproducible.
rf_clf = RandomForestClassifier(random_state=42)
svc = SVC()

# Fit both models on the training split; the last expression displays the SVC.
rf_clf.fit(X_train, y_train)
svc.fit(X_train, y_train)

In [None]:
# Evaluate both baselines on the held-out split. Rows are labelled with the
# class names from class_mapping (instead of bare integer codes) so they read
# the same way as the XGBoost report; `labels` pins the row order to the codes.
sentiment_codes = list(class_mapping.values())
sentiment_names = list(class_mapping.keys())
for title, model in (('Random Forest:\n', rf_clf), ('Support Vector Machine:\n', svc)):
    print(title, classification_report(
        y_test,
        model.predict(X_test),
        labels=sentiment_codes,
        target_names=sentiment_names,
    ))

Random Forest:
               precision    recall  f1-score   support

           0       0.91      0.36      0.51       622
           1       0.86      0.04      0.08       546
           2       0.76      1.00      0.86      2832

    accuracy                           0.77      4000
   macro avg       0.84      0.47      0.49      4000
weighted avg       0.80      0.77      0.70      4000

Support Vector Machine:
               precision    recall  f1-score   support

           0       0.87      0.62      0.72       622
           1       0.68      0.15      0.25       546
           2       0.82      0.99      0.90      2832

    accuracy                           0.82      4000
   macro avg       0.79      0.59      0.62      4000
weighted avg       0.81      0.82      0.78      4000



In [None]:
from xgboost import XGBClassifier
# Multi-class gradient-boosted trees; settings are passed as one keyword mapping.
xgb_clf = XGBClassifier(**{
    'max_depth': 10,
    'learning_rate': 0.2,
    'n_estimators': 100,
    # Three-way softmax over the encoded sentiment classes.
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
})
# Fit on the training split; as the last expression this displays the model.
xgb_clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and report per-class metrics.
preds = xgb_clf.predict(X_test)
# classification_report documents target_names as an array-like aligned with
# `labels`; pass both explicitly instead of relying on dict iteration order.
print(classification_report(
    y_test,
    preds,
    labels=list(class_mapping.values()),
    target_names=list(class_mapping.keys()),
))

              precision    recall  f1-score   support

    negative       0.84      0.68      0.75       622
     neutral       0.55      0.34      0.42       546
    positive       0.87      0.97      0.91      2832

    accuracy                           0.84      4000
   macro avg       0.75      0.66      0.69      4000
weighted avg       0.82      0.84      0.82      4000



In [None]:
import joblib

# Persist the fitted XGBoost classifier to the working directory with joblib.
# NOTE(review): only the classifier is written here; the fitted `vectorizer` and
# the label mapping needed to score new text are not saved in this cell --
# confirm they are stored elsewhere.
model_filename = "xgboost_model.pkl"
joblib.dump(xgb_clf, model_filename)

['xgboost_model.pkl']