In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import re

In [2]:
positive_reviews = pd.read_csv('hotel_positive_reviews.csv')
neutral_reviews = pd.read_csv('hotel_neutral_reviews.csv')
negative_reviews = pd.read_csv('hotel_negative_reviews.csv')

In [3]:
positive_reviews.rename(columns={'positive reviews':'Reviews'},inplace=True)
neutral_reviews.rename(columns={'neutral reviews':'Reviews'},inplace=True)
negative_reviews.rename(columns={'negative reviews':'Reviews'},inplace=True)

In [4]:
merged_df = pd.concat([positive_reviews,neutral_reviews,negative_reviews],ignore_index=True)

In [5]:
merged_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
label_encoder = LabelEncoder()
merged_df['label'] = label_encoder.fit_transform(merged_df['sentiment'])

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower().split()               # Lowercase and tokenize
    return text

In [7]:
merged_df['cleaned_review'] = merged_df['Reviews'].apply(preprocess_text)

# Train Word2Vec model
w2v_model = Word2Vec(sentences=merged_df['cleaned_review'], vector_size=100, window=5, min_count=2, sg=1)

# Create embeddings for each review by averaging word vectors
def get_embedding(review):
    vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(100)

In [8]:
X = np.array([get_embedding(review) for review in merged_df['cleaned_review']])
y = merged_df['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=200, max_depth=6)
xgb_model.fit(X_train, y_train)

In [10]:
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       555
     neutral       1.00      1.00      1.00       561
    positive       1.00      1.00      1.00       614

    accuracy                           1.00      1730
   macro avg       1.00      1.00      1.00      1730
weighted avg       1.00      1.00      1.00      1730



In [15]:
text = u"Exceptional stay at Hotel Sunset! Friendly staff, immaculate rooms, and top-notch amenities. Perfect location and outstanding service. Highly recommended!"

In [17]:
y_pred = xgb_model.predict(text)
print(y_pred)

XGBoostError: [14:51:16] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\data\array_interface.h:135: Check failed: typestr.size() == 3 || typestr.size() == 4: `typestr' should be of format <endian><type><size of type in bytes>.