In [2]:
import pandas as pd
df = pd.read_csv('tripadvisor_hotel_reviews.csv')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [2]:
len(df.index)

20491

In [3]:
#Data Preprocessing
import numpy as np

def create_sentiment(rating):
    
    if rating==1 or rating==2:
        return -1 # negative sentiment
    elif rating==4 or rating==5:
        return 1 # positive sentiment
    else:
        return 0 # neutral sentiment

df['Sentiment'] = df['Rating'].apply(create_sentiment)

In [4]:
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,1
1,ok nothing special charge diamond member hilto...,2,-1
2,nice rooms not 4* experience hotel monaco seat...,3,0
3,"unique, great stay, wonderful time hotel monac...",5,1
4,"great stay great stay, went seahawk game aweso...",5,1


In [4]:
from sklearn.feature_extraction.text import re

def clean_data(review):
    
    no_punc = re.sub(r'[^\w\s]', '', review)
    no_digits = ''.join([i for i in no_punc if not i.isdigit()])
    
    return(no_digits)

In [7]:
#we need to preprocess the “Review” column in order to remove punctuation, characters, and digits. 
#import re. The re module is a built-in module in Python for regular expressions,
#In this example, CountVectorizer is used to convert a list of documents
#into a matrix of word counts. The resulting matrix is a sparse matrix where rows represent documents,
#columns represent unique words, and the values represent the word counts in each document.
from sklearn.feature_extraction.text import re

def clean_data(review):
    
    no_punc = re.sub(r'[^\w\s]', '', review)
#This line of code is using a list comprehension to iterate over each character (i) in the no_punc 
#string and include only those characters that are not digits (checked using isdigit()). 
#The characters are then joined back into a string using ''.join().
    no_digits = ''.join([i for i in no_punc if not i.isdigit()])
    
    return(no_digits)

In [8]:
df['Review'][0]

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  '

In [9]:
df['Review'] = df['Review'].apply(clean_data)
df['Review'][0]

'nice hotel expensive parking got good deal stay hotel anniversary arrived late evening took advice previous reviews did valet parking check quick easy little disappointed nonexistent view room room clean nice size bed comfortable woke stiff neck high pillows not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway maybe just noisy neighbors aveda bath products nice did not goldfish stay nice touch taken advantage staying longer location great walking distance shopping overall nice experience having pay  parking night  '

In [10]:
 #how important a word is to a document in a collection or corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, #accents will not be removed i.e é, è, á
                        lowercase=False,#all words will not be converted to lower case
                        preprocessor=None)

X = tfidf.fit_transform(df['Review'])

In [11]:
from sklearn.model_selection import train_test_split#X: The features or input data.#y: The target variable or labels.
y = df['Sentiment'] # target variable
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [12]:
from sklearn.linear_model import LogisticRegression#Logistic regression is a classification algorithm that is often used when the target variable is binary (two classes).
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train,y_train) # fit the model
preds = lr.predict(X_test) # make predictions

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(preds,y_test) 

0.8549677923091938

In [14]:
df=print(X_test)

  (0, 12914)	0.1753284192881434
  (0, 30535)	0.17769375450153893
  (0, 53151)	0.18347102567170948
  (0, 11163)	0.16955114811797287
  (0, 44140)	0.16032275631846515
  (0, 13981)	0.13689824962049935
  (0, 439)	0.13095704171695213
  (0, 34534)	0.139346057797104
  (0, 12491)	0.11516770941125695
  (0, 30427)	0.09943548157394375
  (0, 17141)	0.12139628358101663
  (0, 65476)	0.20502834813199325
  (0, 70485)	0.14300746451873028
  (0, 39571)	0.13897756785637957
  (0, 53236)	0.13047311135305106
  (0, 34234)	0.12035859911817097
  (0, 67569)	0.11561901241084606
  (0, 66028)	0.10490923857914658
  (0, 1955)	0.13689824962049935
  (0, 69040)	0.10251417406599662
  (0, 60303)	0.09663319556965393
  (0, 61584)	0.10825187450746923
  (0, 10860)	0.10464324610351851
  (0, 50236)	0.10857064530077287
  (0, 35254)	0.0956808467796389
  :	:
  (5122, 58059)	0.08642276962108336
  (5122, 39192)	0.0897850849607488
  (5122, 63433)	0.05436430919002246
  (5122, 69115)	0.10833980975430967
  (5122, 25848)	0.080781773678453

In [15]:
df1=print(preds)

[1 1 1 ... 1 1 1]


In [16]:
df2=print(y_test)

11966    1
8418     1
4833     1
3415     0
7538     1
        ..
19695    1
9982     1
2685     1
3317    -1
5952     1
Name: Sentiment, Length: 5123, dtype: int64


In [17]:
df3=print(X_train)

  (0, 9164)	0.14154898057322646
  (0, 67176)	0.12887633305077095
  (0, 7871)	0.1359412447106151
  (0, 31304)	0.14154898057322646
  (0, 38132)	0.13196249279717592
  (0, 39213)	0.1359412447106151
  (0, 29209)	0.13196249279717592
  (0, 6769)	0.14154898057322646
  (0, 37643)	0.09648827267320444
  (0, 43294)	0.09911664760871146
  (0, 37199)	0.09053038194656882
  (0, 16088)	0.12237600502112538
  (0, 43030)	0.12422279470593646
  (0, 28959)	0.11797166942512781
  (0, 33676)	0.12422279470593646
  (0, 16748)	0.08240106996775112
  (0, 46168)	0.12887633305077095
  (0, 12874)	0.08480787050097864
  (0, 50045)	0.08503838214322929
  (0, 16472)	0.07711724535906182
  (0, 5330)	0.08288982677718898
  (0, 26105)	0.10196365940743045
  (0, 16410)	0.09257617331466732
  (0, 6856)	0.09160861956371918
  (0, 60875)	0.11278951724507484
  :	:
  (15367, 53697)	0.040502238550716434
  (15367, 49447)	0.03686541881482052
  (15367, 9996)	0.0331840562920967
  (15367, 44781)	0.08737190333580394
  (15367, 62622)	0.0461576832

In [18]:
df3=print(y_test)

11966    1
8418     1
4833     1
3415     0
7538     1
        ..
19695    1
9982     1
2685     1
3317    -1
5952     1
Name: Sentiment, Length: 5123, dtype: int64


In [19]:
df4=print(y_train)

6436     1
12660   -1
8380     1
10409    1
10365    1
        ..
7691     1
14690    1
3940     1
15513   -1
7814    -1
Name: Sentiment, Length: 15368, dtype: int64


In [20]:
# Example new input text
new_input_text = "I really enjoyed my stay at this hotel. The staff was friendly and the room was comfortable."

# Clean the input text
cleaned_text = clean_data(new_input_text)

# Transform the cleaned text into TF-IDF features
new_text_features = tfidf.transform([cleaned_text])

# Make predictions
new_pred = lr.predict(new_text_features)

# Print the predicted sentiment
print("Predicted Sentiment:", new_pred[0])


Predicted Sentiment: 1


In [21]:
# Example new input text
new_input_text = "i am bad."

# Clean the input text
cleaned_text = clean_data(new_input_text)

# Transform the cleaned text into TF-IDF features
new_text_features = tfidf.transform([cleaned_text])

# Make predictions
new_pred = lr.predict(new_text_features)

# Print the predicted sentiment
print("Predicted Sentiment:", new_pred[0])

Predicted Sentiment: -1


In [22]:
# Example new input text
new_input_text = 'Amazing experience'

# Clean the input text
cleaned_text = clean_data(new_input_text)

# Transform the cleaned text into TF-IDF features
new_text_features = tfidf.transform([cleaned_text])

# Make predictions
new_pred = lr.predict(new_text_features)

# Print the predicted sentiment
print("Predicted Sentiment:", new_pred[0])

Predicted Sentiment: 1


In [26]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

df = pd.read_csv('tripadvisor_hotel_reviews.csv')

def create_sentiment(rating):
    res = 0  # neutral sentiment
    if rating == 1 or rating == 2:
        res = -1  # negative sentiment
    elif rating == 4 or rating == 5:
        res = 1  # positive sentiment
    return res

df['Sentiment'] = df['Rating'].apply(create_sentiment)

def clean_data(review):
    no_punc = re.sub(r'[^\w\s]', '', review)
    no_digits = ''.join([i for i in no_punc if not i.isdigit()])
    return no_digits

df['Review'] = df['Review'].apply(clean_data)

# Simple rule-based sarcasm detection
def detect_sarcasm(review):
    # Check for the presence of certain sarcastic markers (e.g., "not")
    if "not" in review:
        return 1  # sarcasm detected
    else:
        return 0  # no sarcasm detected

df['Sarcasm'] = df['Review'].apply(detect_sarcasm)

# Combine the cleaned text and sarcasm features
df['TextWithSarcasm'] = df['Review'] + df['Sarcasm'].astype(str)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
X = tfidf.fit_transform(df['TextWithSarcasm'])
y = df['Sentiment']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)

# Predictions and Accuracy Calculation
preds = lr.predict(X_test)
accuracy = accuracy_score(preds, y_test)
print("Accuracy:", accuracy)

# Example new input text
#new_input_text = "i am not young enough to know every"
new_input_text = "Amazing experience"

# Clean the input text
cleaned_text = clean_data(new_input_text)

# Transform the cleaned text into TF-IDF features
new_text_features = tfidf.transform([cleaned_text])

# Make predictions
new_pred = lr.predict(new_text_features)

# Print the predicted sentiment
print("Predicted Sentiment:", new_pred[0])


Accuracy: 0.8594779214442547
Predicted Sentiment: 1
