In [1]:
import numpy as np
import pandas as pd
import random
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
import re
import string


In [52]:
positive_reviews = pd.read_csv('hotel_positive_reviews.csv')
neutral_reviews = pd.read_csv('hotel_neutral_reviews.csv')
negative_reviews = pd.read_csv('hotel_negative_reviews.csv')

In [53]:
positive_reviews.rename(columns={'positive reviews':'Reviews'},inplace=True)
neutral_reviews.rename(columns={'neutral reviews':'Reviews'},inplace=True)
negative_reviews.rename(columns={'negative reviews':'Reviews'},inplace=True)

In [54]:
merged_df = pd.concat([positive_reviews,neutral_reviews,negative_reviews],ignore_index=True)

In [55]:
merged_df

Unnamed: 0,sentiment,Reviews
0,positive,Everything about this stay was amazing! The ou...
1,positive,"If you're looking for a great stay, this is th..."
2,positive,"If you're looking for a great stay, this is th..."
3,positive,Highly recommend this place! The luxurious was...
4,positive,"If you're looking for a great stay, this is th..."
...,...,...
8642,negative,It was a nightmare! The unbearable was bed com...
8643,negative,The stay was completely terrible. The view fro...
8644,negative,Do yourself a favor and skip this one. The ter...
8645,negative,The stay was completely poor. The location was...


In [56]:
merged_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [57]:
merged_df

Unnamed: 0,sentiment,Reviews
0,neutral,"If you're looking for a basic stay, this is ok..."
1,neutral,"Nothing exceptional, but nothing terrible eith..."
2,positive,The stay was delightful. The bed comfort was a...
3,negative,Avoid at all costs! The shocking was bed comfo...
4,positive,Can't wait to come back! The incredible was fa...
...,...,...
8642,negative,"I had high hopes, but the dirty was security a..."
8643,neutral,"If you're looking for a basic stay, this is ok..."
8644,neutral,Fairly standard. The decent was check-in proce...
8645,positive,Best decision ever! The comfortable was bathro...


In [61]:
merged_df.to_csv("Hotel_Reviews.csv",index=False)

In [8]:
# Downloading necessary NLTK resources
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
sia = SentimentIntensityAnalyzer()


In [10]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenization
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(words)

In [11]:
merged_df['cleaned_review'] = merged_df['Reviews'].apply(preprocess_text)

In [12]:
merged_df.head()

Unnamed: 0,sentiment,Reviews,cleaned_review
0,neutral,"If you're looking for a basic stay, this is ok...",youre looking basic stay okay mediocre checkin...
1,neutral,"Nothing exceptional, but nothing terrible eith...",nothing exceptional nothing terrible either sa...
2,positive,The stay was delightful. The bed comfort was a...,stay delightful bed comfort amazing room clean...
3,negative,Avoid at all costs! The shocking was bed comfo...,avoid costs shocking bed comfort security made...
4,positive,Can't wait to come back! The incredible was fa...,cant wait come back incredible fantastic bathr...


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    merged_df['cleaned_review'],  # Processed text data
    merged_df['sentiment'],    # Target labels
    test_size=0.2, random_state=42    # 80% Train, 20% Test
)


In [14]:
vectorizer = TfidfVectorizer()

In [15]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [16]:
model = LogisticRegression()


In [17]:
model.fit(X_train_tfidf,y_train)

In [18]:
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print(f"TF-IDF + Logistic Regression Accuracy: {accuracy:.4f}")

TF-IDF + Logistic Regression Accuracy: 1.0000


In [49]:
text = ["""I recently spent one night with my family at Vivanta Vadodara, and the experience was a mix of highs and lows.
The welcome was disappointing. Instead of a refreshing drink, we were handed a small, warm water bottle, which wasn’t a great start. However, the rooms were impressive—clean, spacious, and well-maintained. The bed was comfortable, and the linens were of high quality, ensuring a restful and comfortable night.
While the bathroom fittings were modern and functional, the washroom itself was very small. We felt it could have been designed better with more space.
The hotel has a swimming pool of decent size, but its maintenance was average. There was no lifeguard on duty, which could be a safety concern. The towels provided near the pool were stinking badly. When we raised this issue, we received a lackluster response stating that the odor was "permanent" since the towels were also used for the steam room—this was quite disappointing.
On the brighter side, the food was a standout feature. We opted for the dinner and breakfast inclusive package, and both meals were delicious with a wide variety of options. The cuisine was predominantly Indian, which aligned with our preference, and we truly enjoyed every dish. The restaurant staff were warm and attentive, adding to the overall dining experience.
In summary, while the stay had its memorable moments, there were several areas that need improvement, particularly in terms of the welcome experience and poolside maintenance. However, the excellent food and restaurant hospitality were redeeming factors."""]

In [31]:
# text.reshape(1,-1)

AttributeError: 'list' object has no attribute 'reshape'

In [25]:
X_test

5601    loved place spotless room cleanliness air cond...
4103    ordinary experience acceptable pool food quali...
222     stay exceptional restaurant service amazing wi...
7230       decent stay standard security spa could better
318     favor skip one shocking food quality parking a...
                              ...                        
7835    decent stay okay bathroom facilities staff fri...
491     avoid costs awful location restaurant service ...
2344    youre looking great stay place outstanding bed...
5871    youre looking great stay place exceptional sec...
218     unforgettable experience unbelievable noise le...
Name: cleaned_review, Length: 1730, dtype: object

In [50]:
text_vec = vectorizer.transform([text[0]])

In [51]:
y_pred = model.predict(text_vec)
print(y_pred)

['neutral']


In [30]:
def preprocess_review(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters except punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [32]:
merged_df['sia_cleaned_review'] = merged_df['Reviews'].apply(preprocess_review)

In [39]:
def predict_sentiment(review):
    score = sia.polarity_scores(review)['compound']
    if score > 0.5:
        return 'positive'
    elif score < -0.5:
        return 'negative'
    else:
        return 'neutral'


In [40]:
merged_df['predicted_sentiment'] = merged_df['sia_cleaned_review'].apply(predict_sentiment)

In [41]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(merged_df['sentiment'], merged_df['predicted_sentiment'])
print(f'Accuracy: {accuracy:.2f}')

# Detailed performance report
print("\nClassification Report:")
print(classification_report(merged_df['sentiment'], merged_df['predicted_sentiment']))

Accuracy: 0.74

Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.75      0.82      2929
     neutral       0.65      0.54      0.59      2894
    positive       0.71      0.95      0.81      2824

    accuracy                           0.74      8647
   macro avg       0.75      0.75      0.74      8647
weighted avg       0.75      0.74      0.74      8647



In [38]:
merged_df

Unnamed: 0,sentiment,Reviews,cleaned_review,sia_cleaned_review,predicted_sentiment
0,neutral,"If you're looking for a basic stay, this is ok...",youre looking basic stay okay mediocre checkin...,"If youre looking for a basic stay, this is oka...",neu
1,neutral,"Nothing exceptional, but nothing terrible eith...",nothing exceptional nothing terrible either sa...,"Nothing exceptional, but nothing terrible eith...",pos
2,positive,The stay was delightful. The bed comfort was a...,stay delightful bed comfort amazing room clean...,The stay was delightful. The bed comfort was a...,pos
3,negative,Avoid at all costs! The shocking was bed comfo...,avoid costs shocking bed comfort security made...,Avoid at all costs! The shocking was bed comfo...,neu
4,positive,Can't wait to come back! The incredible was fa...,cant wait come back incredible fantastic bathr...,Cant wait to come back! The incredible was fan...,pos
...,...,...,...,...,...
8642,negative,"I had high hopes, but the dirty was security a...",high hopes dirty security location disappointing,"I had high hopes, but the dirty was security a...",neg
8643,neutral,"If you're looking for a basic stay, this is ok...",youre looking basic stay okay decent gym pool ...,"If youre looking for a basic stay, this is oka...",neu
8644,neutral,Fairly standard. The decent was check-in proce...,fairly standard decent checkin process view ro...,Fairly standard. The decent was checkin proces...,neg
8645,positive,Best decision ever! The comfortable was bathro...,best decision ever comfortable bathroom facili...,Best decision ever! The comfortable was bathro...,pos


In [42]:
merged_df[merged_df['sentiment']!=merged_df['predicted_sentiment']]

Unnamed: 0,sentiment,Reviews,cleaned_review,sia_cleaned_review,predicted_sentiment
1,neutral,"Nothing exceptional, but nothing terrible eith...",nothing exceptional nothing terrible either sa...,"Nothing exceptional, but nothing terrible eith...",positive
3,negative,Avoid at all costs! The shocking was bed comfo...,avoid costs shocking bed comfort security made...,Avoid at all costs! The shocking was bed comfo...,neutral
5,neutral,"Fairly standard. The decent was spa, and the p...",fairly standard decent spa pool neither good bad,"Fairly standard. The decent was spa, and the p...",negative
7,neutral,"Didn't impress, but didn't disappoint either. ...",didnt impress didnt disappoint either passable...,"Didnt impress, but didnt disappoint either. Th...",positive
9,negative,Do yourself a favor and skip this one. The mis...,favor skip one miserable food quality bed comf...,Do yourself a favor and skip this one. The mis...,neutral
...,...,...,...,...,...
8631,neutral,The stay was passable. The breakfast options w...,stay passable breakfast options okay air condi...,The stay was passable. The breakfast options w...,negative
8639,neutral,"Fairly standard. The moderate was bar, and the...",fairly standard moderate bar parking availabil...,"Fairly standard. The moderate was bar, and the...",negative
8641,negative,Avoid at all costs! The subpar was bar and the...,avoid costs subpar bar wifi speed made unbearable,Avoid at all costs! The subpar was bar and the...,neutral
8644,neutral,Fairly standard. The decent was check-in proce...,fairly standard decent checkin process view ro...,Fairly standard. The decent was checkin proces...,negative


In [43]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and WordNet
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


In [51]:
def preprocess_review(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters except punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words back into a single string
    text = ' '.join(words)
    
    return text

merged_df['sian_cleaned_review'] = merged_df['Reviews'].apply(preprocess_review)


In [52]:
def predict_sentiment(review):
    score = sia.polarity_scores(review)['compound']
    if score > 0.3:    # Reduced positive threshold
        return 'positive'
    elif score < -0.3:  # Reduced negative threshold
        return 'negative'
    else:
        return 'neutral'

merged_df['sian_predicted_sentiment'] = merged_df['sian_cleaned_review'].apply(predict_sentiment)

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(merged_df['sian_cleaned_review']).toarray()

# Extract SIA scores as additional features
merged_df['pos_score'] = merged_df['sian_cleaned_review'].apply(lambda x: sia.polarity_scores(x)['pos'])
merged_df['neu_score'] = merged_df['sian_cleaned_review'].apply(lambda x: sia.polarity_scores(x)['neu'])
merged_df['neg_score'] = merged_df['sian_cleaned_review'].apply(lambda x: sia.polarity_scores(x)['neg'])
merged_df['compound_score'] = merged_df['sian_cleaned_review'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Combine TF-IDF and SIA features
X_combined = np.hstack((X_tfidf, merged_df[['pos_score', 'neu_score', 'neg_score', 'compound_score']].values))

In [56]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
scores = cross_val_score(model, X_combined, merged_df['sentiment'], cv=5)
print("Cross-Validation Accuracy:", scores.mean())


Cross-Validation Accuracy: 1.0


In [53]:
merged_df

Unnamed: 0,sentiment,Reviews,cleaned_review,sia_cleaned_review,predicted_sentiment,sian_cleaned_review,sian_predicted_sentiment,pos_score,neu_score,neg_score,compound_score
0,neutral,"If you're looking for a basic stay, this is ok...",youre looking basic stay okay mediocre checkin...,"If youre looking for a basic stay, this is oka...",neutral,"youre looking basic stay, okay. mediocre check...",neutral,0.160,0.840,0.000,0.2263
1,neutral,"Nothing exceptional, but nothing terrible eith...",nothing exceptional nothing terrible either sa...,"Nothing exceptional, but nothing terrible eith...",positive,"nothing exceptional, nothing terrible either. ...",negative,0.138,0.536,0.326,-0.3527
2,positive,The stay was delightful. The bed comfort was a...,stay delightful bed comfort amazing room clean...,The stay was delightful. The bed comfort was a...,positive,"stay delightful. bed comfort amazing, room cle...",positive,0.627,0.373,0.000,0.8779
3,negative,Avoid at all costs! The shocking was bed comfo...,avoid costs shocking bed comfort security made...,Avoid at all costs! The shocking was bed comfo...,neutral,avoid costs! shocking bed comfort security mad...,neutral,0.355,0.290,0.355,0.0000
4,positive,Can't wait to come back! The incredible was fa...,cant wait come back incredible fantastic bathr...,Cant wait to come back! The incredible was fan...,positive,"cant wait come back! incredible fantastic, bat...",positive,0.520,0.480,0.000,0.8221
...,...,...,...,...,...,...,...,...,...,...,...
8642,negative,"I had high hopes, but the dirty was security a...",high hopes dirty security location disappointing,"I had high hopes, but the dirty was security a...",negative,"high hopes, dirty security location disappoint...",neutral,0.391,0.150,0.459,-0.2263
8643,neutral,"If you're looking for a basic stay, this is ok...",youre looking basic stay okay decent gym pool ...,"If youre looking for a basic stay, this is oka...",neutral,"youre looking basic stay, okay. decent gym, po...",neutral,0.192,0.808,0.000,0.2263
8644,neutral,Fairly standard. The decent was check-in proce...,fairly standard decent checkin process view ro...,Fairly standard. The decent was checkin proces...,negative,"fairly standard. decent checkin process, view ...",neutral,0.215,0.604,0.182,0.1139
8645,positive,Best decision ever! The comfortable was bathro...,best decision ever comfortable bathroom facili...,Best decision ever! The comfortable was bathro...,positive,best decision ever! comfortable bathroom facil...,positive,0.661,0.339,0.000,0.9134


In [54]:
accuracy = accuracy_score(merged_df['sentiment'], merged_df['sian_predicted_sentiment'])
print(f'Accuracy: {accuracy:.2f}')

# Detailed performance report
print("\nClassification Report:")
print(classification_report(merged_df['sentiment'], merged_df['sian_predicted_sentiment']))

Accuracy: 0.69

Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.73      0.78      2929
     neutral       0.60      0.39      0.47      2894
    positive       0.64      0.97      0.77      2824

    accuracy                           0.69      8647
   macro avg       0.70      0.69      0.67      8647
weighted avg       0.70      0.69      0.67      8647



In [10]:
!pip install transformers torch
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable


In [11]:
label_encoder = LabelEncoder()
merged_df['label'] = label_encoder.fit_transform(merged_df['sentiment'])


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    merged_df['Reviews'], merged_df['label'], test_size=0.2, random_state=42
)


In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class HotelReviewsDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len=128):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews.iloc[idx])
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }




In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model = model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.utils.data import DataLoader

# Create Dataset objects
train_dataset = HotelReviewsDataset(X_train, y_train, tokenizer, max_len=128)
test_dataset = HotelReviewsDataset(X_test, y_test, tokenizer, max_len=128)


In [15]:
# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [16]:
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 4  # 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




In [None]:
def train_model():
    model.train()
    for epoch in range(4):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1} | Loss: {avg_loss:.4f}')

train_model()


In [14]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
class HotelReviewsDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len=64):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews.iloc[idx])
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [16]:
train_dataset = HotelReviewsDataset(X_train, y_train, tokenizer, max_len=64)
test_dataset = HotelReviewsDataset(X_test, y_test, tokenizer, max_len=64)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [18]:
from torch.cuda.amp import autocast, GradScaler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 2  # 2 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Step 8: Mixed precision for faster training
scaler = GradScaler()

  scaler = GradScaler()


In [None]:
def train_model():
    model.train()
    for epoch in range(2):  # Fewer epochs for testing
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            with autocast():  # Mixed precision
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1} | Loss: {avg_loss:.4f}')

train_model()


  with autocast():  # Mixed precision
