In [51]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import swifter
import re
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../Dataset/tripadvisor_hotel_reviews.csv")
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [4]:
df.dtypes

Review    object
Rating     int64
dtype: object

In [5]:
df['Rating'].value_counts()

Rating
5    9054
4    6039
3    2184
2    1793
1    1421
Name: count, dtype: int64

In [6]:
df.isnull().sum()

Review    0
Rating    0
dtype: int64

In [7]:
def sentiment_conversion(num: int) -> str:
    if num > 3:
        return "Positive"
    elif num == 3:
        return "Neutral"
    else:
        return "Negative"

df['Sentiment'] = df['Rating'].apply(sentiment_conversion)

In [8]:
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,Positive
1,ok nothing special charge diamond member hilto...,2,Negative
2,nice rooms not 4* experience hotel monaco seat...,3,Neutral
3,"unique, great stay, wonderful time hotel monac...",5,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive


In [9]:
df.drop('Rating', axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,Review,Sentiment
0,nice hotel expensive parking got good deal sta...,Positive
1,ok nothing special charge diamond member hilto...,Negative
2,nice rooms not 4* experience hotel monaco seat...,Neutral
3,"unique, great stay, wonderful time hotel monac...",Positive
4,"great stay great stay, went seahawk game aweso...",Positive


In [11]:
print(f"Number of reviews: {len(df)}\n\nPositive Review:\t{df['Sentiment'].value_counts().iloc[0] / len(df) * 100}%\nNegative Review:\t{df['Sentiment'].value_counts().iloc[1] / len(df) * 100}%\nNeutral Review:\t{df['Sentiment'].value_counts().iloc[2] / len(df) * 100}%")

Number of reviews: 20491

Positive Review:	73.65672734371186%
Negative Review:	15.684934849446098%
Neutral Review:	10.658337806842027%


In [12]:
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
import string

In [13]:
stemmer = PorterStemmer()
stopwords = set(stopwords.words('english'))
stopwords.remove('not')

In [14]:
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'o

In [15]:
def clean_text(text):
    text = re.sub(f'[{string.punctuation}]', "", text.lower())
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stopwords]
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [16]:
df['Review'] = df['Review'].swifter.apply(clean_text)

Pandas Apply: 100%|██████████| 20491/20491 [02:36<00:00, 131.06it/s]


In [17]:
df.head()

Unnamed: 0,Review,Sentiment
0,nice hotel expens park got good deal stay hote...,Positive
1,ok noth special charg diamond member hilton de...,Negative
2,nice room not 4 experi hotel monaco seattl goo...,Neutral
3,uniqu great stay wonder time hotel monaco loca...,Positive
4,great stay great stay went seahawk game awesom...,Positive


In [18]:
X = df['Review']
y = df['Sentiment']

In [19]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [22]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=500, solver='liblinear', random_state=42, multi_class='ovr')
classifier.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,500


In [23]:
classifier.score(X_test, y_test)

0.8545986826055135

In [50]:
text = "It was best stay"
text = clean_text(text)
text = vectorizer.transform([text])
classifier.predict(text)

array([2])

In [25]:
import pickle

with open("../Models/classifier.pkl", "wb") as file:
    pickle.dump(classifier, file)

with open("../Models/tfidf_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

with open("../Models/label_encoder.pkl", "wb") as file:
    pickle.dump(encoder, file)