In [1]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
import pandas as pd
import re
import string

# Download necessary NLTK resources
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
train_reviews = pd.read_csv(r'C:\Users\hp\Downloads\archive (2)\Reviews\hotel_reviews.csv',header=None)

In [9]:
train_reviews

Unnamed: 0,0
0,1. Amazing hotel with excellent service and am...
1,"2. ""The staff was friendly and helpful, making..."
2,"3. ""The room was spacious, clean, and well-mai..."
3,"4. ""The bed was incredibly comfortable, ensuri..."
4,"5. ""The hotel's location was perfect, close to..."
...,...
70,"? ""The hotel was a complete rip-off."""
71,"? ""The hotel was a terrible place to relax."""
72,"? ""The hotel was a terrible base for explorin..."
73,"? ""The hotel was a terrible place to meet oth..."


In [17]:
train_reviews = train_reviews.drop(30)  # Drop row with index 30
train_reviews = train_reviews.reset_index(drop=True) #reset index
print(train_reviews)

                                                    0
0   1. Amazing hotel with excellent service and am...
1   2. "The staff was friendly and helpful, making...
2   3. "The room was spacious, clean, and well-mai...
3   4. "The bed was incredibly comfortable, ensuri...
4   5. "The hotel's location was perfect, close to...
..                                                ...
69             ?  "The hotel was a complete rip-off."
70      ?  "The hotel was a terrible place to relax."
71  ?  "The hotel was a terrible base for explorin...
72  ?  "The hotel was a terrible place to meet oth...
73  ?  "The hotel was a terrible place to create l...

[74 rows x 1 columns]


In [None]:
train_reviews

In [None]:
train_reviews['ratings'] = train_reviews[]

In [2]:
review = pd.read_csv('ReviewFINALdataset_1.csv')

In [3]:
review.head()

Unnamed: 0,travelCode,User_ID,Car_rented,review_car,review_hotel,review_flights,flight_rating,hotel_rating,car_rating,overall_rating
0,0,0,Not Rented,,Great value for money! Got a five-star experie...,"It was a standard flight. Nothing wrong, but n...",2,5,0,3.5
1,2,0,Not Rented,,"The worst hotel stay ever! The room was dirty,...",The flight was on time and the staff was very ...,5,1,0,3.0
2,7,0,Not Rented,,"The staff was helpful, but service could have ...","The flight was delayed for hours, and there wa...",2,2,0,2.0
3,11,0,Not Rented,,"The location was convenient, but the surroundi...","Had a smooth flight with no delays, and the fo...",5,2,0,3.5
4,13,0,Not Rented,,"Terrible experience! The room was not clean, a...","The cabin was dirty, and the bathrooms were no...",1,1,0,1.0


In [4]:
hotel_reviews = review[['review_hotel','hotel_rating']].copy(deep=True)

In [5]:
hotel_reviews.head()

Unnamed: 0,review_hotel,hotel_rating
0,Great value for money! Got a five-star experie...,5
1,"The worst hotel stay ever! The room was dirty,...",1
2,"The staff was helpful, but service could have ...",2
3,"The location was convenient, but the surroundi...",2
4,"Terrible experience! The room was not clean, a...",1


In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenization
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(words)

In [7]:
hotel_reviews['cleaned_review'] = hotel_reviews['review_hotel'].apply(preprocess_text)

In [8]:
def get_sentiment_score(text):
    score = sia.polarity_scores(text)['compound']
    if score >= 0.5:
        return 5  # Very Positive
    elif score >= -0.3:
        return 3  # Neutral
    else:
        return 1  # Negative

In [9]:
hotel_reviews['predicted_label'] = hotel_reviews['cleaned_review'].apply(get_sentiment_score)

In [10]:
accuracy = accuracy_score(hotel_reviews['hotel_rating'], hotel_reviews['predicted_label'])
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.20


In [11]:
hotel_reviews["hotel_rating"] = hotel_reviews["hotel_rating"].replace({2: 3})

In [12]:
hotel_reviews['hotel_rating'].value_counts()

3    34430
5     4445
1     1677
Name: hotel_rating, dtype: int64

In [13]:
accuracy = accuracy_score(hotel_reviews['hotel_rating'], hotel_reviews['predicted_label'])
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.63


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    hotel_reviews['cleaned_review'],  # Processed text data
    hotel_reviews['hotel_rating'],    # Target labels
    test_size=0.2, random_state=42    # 80% Train, 20% Test
)

In [16]:
vectorizer = TfidfVectorizer()

In [17]:
# Step 3: Fit TF-IDF on Training Data & Transform
X_train_tfidf = vectorizer.fit_transform(X_train)

# Step 4: Transform Test Data (Using Learned Vocabulary)
X_test_tfidf = vectorizer.transform(X_test)


In [18]:
# Step 5: Train a Simple Logistic Regression Model (Baseline)
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


In [19]:
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print(f"TF-IDF + Logistic Regression Accuracy: {accuracy:.4f}")

TF-IDF + Logistic Regression Accuracy: 1.0000


In [22]:
hotel_reviews['cleaned_review'].nunique()

45