In [1]:
import pandas as pd

In [3]:
# Load the raw hotel review dataset. The path is relative, so the CSV must sit
# in the notebook's working directory or pandas raises FileNotFoundError.
DATA_FILE = "hotel_sentiment_analysis.csv"
trip = pd.read_csv(DATA_FILE)

FileNotFoundError: [Errno 2] No such file or directory: 'hotel_sentiment_analysis.csv'

In [None]:
# Preview the first five rows of the loaded reviews.
trip.head(n=5)

In [None]:
# Keep only clearly positive (5) and clearly negative (1, 2) reviews, and only
# the two columns needed for modelling. Selecting rows and columns in a single
# .loc call and taking an explicit copy avoids chained indexing, so the column
# assignment below cannot trigger a SettingWithCopyWarning.
keep_rows = trip['Rating'].isin([1, 2, 5])
trip = trip.loc[keep_rows, ['Review', 'Rating']].copy()

# Replace the numeric rating with a sentiment label: 5 -> 'Pos', 1 or 2 -> 'Neg'.
trip['Rating'] = trip['Rating'].apply(lambda rating: 'Pos' if rating == 5 else 'Neg')

In [None]:
# Renumber the rows 0..n-1 after filtering. drop=True discards the old index
# instead of inserting it as an extra 'index' column, and the assignment form
# (rather than inplace=True) keeps this cell safe to re-run.
trip = trip.reset_index(drop=True)
trip.head()

In [None]:
# Class balance: number of reviews per sentiment label.
rating_counts = trip['Rating'].value_counts()
rating_counts

In [None]:
#Data cleaning and preprocessing
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by the preprocessing step:
# the stop-word lists and the WordNet data used by the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Lemmatizer used by the preprocessing loop. Despite the name `ps`, this is a
# WordNetLemmatizer; the imported PorterStemmer is not instantiated here.
ps = WordNetLemmatizer()
# Accumulator for the cleaned review strings (one entry per review).
corpus = []

In [None]:
# Text preprocessing. For every review:
#   1. replace every non-letter character with a space
#   2. lower-case the text
#   3. split it into words
#   4. drop English stop words and lemmatize the remaining words with `ps`
#   5. join the words back into one space-separated string

# Load the stop-word list once and keep it as a set: the list is otherwise
# re-read and scanned linearly for every single word of every review.
english_stopwords = set(stopwords.words('english'))

# Rebuild the corpus from scratch so that re-running this cell never appends
# a second copy of every document.
corpus = []
for raw_review in trip['Review']:
    letters_only = re.sub('[^a-zA-Z]', " ", raw_review)
    words = letters_only.lower().split()
    kept_words = [ps.lemmatize(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(kept_words))

In [None]:
# Persist the filtered, relabelled reviews to disk without the row index.
trip.to_csv(path_or_buf='tdata.csv', index=False)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings. It is fitted on `corpus`
# below, reused to transform custom messages, and pickled with the models.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the cleaned reviews and count term occurrences,
# then convert the result to a dense array for the models below.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [None]:
# Dimensions of the dense feature matrix built from the corpus.
X.shape

In [None]:
# Binary target: 1 for 'Pos', 0 for 'Neg'. Comparing against the label directly
# does not depend on the alphabetical column order of get_dummies, always
# yields integers (get_dummies returns booleans on recent pandas), and still
# works if only one class is present.
y = (trip['Rating'] == 'Pos').astype(int).values
y

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical between runs.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.20, random_state=3)
X_train, X_test, y_train, y_test = split_parts

Naive Bayes

In [None]:
# Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model on the training word counts.
model1 = MultinomialNB().fit(X_train, y_train)

# Predictions on the held-out set; reused by the metric cells that follow.
y_pred = model1.predict(X_test)

# Mean accuracy on the test set. Kept as the cell's last expression so the
# notebook displays it instead of computing and discarding it.
model1.score(X_test, y_test)

In [None]:
# Compare the true test labels with the Naive Bayes predictions.
# For this two-class problem the confusion matrix is 2x2 and shows how many
# samples of each class were predicted correctly or incorrectly.
from sklearn.metrics import confusion_matrix

confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion_m

In [None]:
# Accuracy of the Naive Bayes predictions on the test set.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [None]:
# Precision of the Naive Bayes predictions on the test set.
from sklearn.metrics import precision_score

precision_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Recall of the Naive Bayes predictions on the test set.
from sklearn.metrics import recall_score

recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F-beta score of the Naive Bayes predictions, computed with beta=1.
from sklearn.metrics import fbeta_score

fbeta_score(y_true=y_test, y_pred=y_pred, beta=1)

In [None]:
# Try the trained Naive Bayes model on a hand-written review.
message = "the hotel was nice and comfy"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model1.predict(vect)
my_prediction_prob = model1.predict_proba(vect)

# Print the predicted sentiment, then the probability of that class
# (column 1 when the prediction is 1, column 0 otherwise).
is_positive = my_prediction[0] == 1
print("Positive" if is_positive else "Negative")
print(my_prediction_prob[0][1] if is_positive else my_prediction_prob[0][0])

In [None]:
# Second hand-written review for the Naive Bayes model, this time a complaint.
message = "the hotel was bad and the staff was rude"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model1.predict(vect)
my_prediction_prob = model1.predict_proba(vect)

# Print the predicted sentiment, then the probability of that class
# (column 1 when the prediction is 1, column 0 otherwise).
is_positive = my_prediction[0] == 1
print("Positive" if is_positive else "Negative")
print(my_prediction_prob[0][1] if is_positive else my_prediction_prob[0][0])

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier configured to use the 'liblinear' solver;
# it is trained on the same bag-of-words features in the next cell.
logreg = LogisticRegression(solver='liblinear')

In [None]:
# Fit the logistic regression on the training split. scikit-learn's fit()
# returns the estimator itself, so `model2` and `logreg` are the same object.
model2 = logreg.fit(X_train, y_train)

In [None]:
# Logistic-regression predictions on the held-out set. They are stored under a
# different name than the Naive Bayes predictions (`y_pred`).
y_pred_class = logreg.predict(X_test)

In [None]:
from sklearn import metrics

In [None]:
# Accuracy of the logistic regression. Uses y_pred_class (this model's
# predictions); y_pred holds the Naive Bayes predictions.
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# Precision of the logistic regression. Uses y_pred_class (this model's
# predictions); y_pred holds the Naive Bayes predictions.
metrics.precision_score(y_test, y_pred_class)

In [None]:
# Precision of the logistic regression, evaluated on y_pred_class rather than
# the Naive Bayes predictions in y_pred.
# NOTE(review): this cell repeats the precision computation of the cell above;
# possibly a different metric was intended here. Confirm or remove.
metrics.precision_score(y_test, y_pred_class)

In [None]:
# Recall of the logistic regression. Uses y_pred_class (this model's
# predictions); y_pred holds the Naive Bayes predictions.
metrics.recall_score(y_test, y_pred_class)

In [None]:
# F-beta score (beta=1) of the logistic regression. Uses y_pred_class (this
# model's predictions); y_pred holds the Naive Bayes predictions.
metrics.fbeta_score(y_test, y_pred_class, beta=1)

In [None]:
# ROC AUC of the logistic regression. The score is computed from the predicted
# probability of the positive class (column 1 of predict_proba) so that the
# curve reflects the model's ranking, not a single hard-label threshold, and
# it uses this model rather than the Naive Bayes predictions in y_pred.
metrics.roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

In [None]:
# Confusion matrix of the logistic regression. Uses y_pred_class (this model's
# predictions); y_pred holds the Naive Bayes predictions.
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# Try the trained logistic regression on a hand-written review.
message = "cozy stay rainy city"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model2.predict(vect)
my_prediction_prob = model2.predict_proba(vect)

# Print the predicted sentiment, then the probability of that class
# (column 1 when the prediction is 1, column 0 otherwise).
is_positive = my_prediction[0] == 1
print("Positive" if is_positive else "Negative")
print(my_prediction_prob[0][1] if is_positive else my_prediction_prob[0][0])

In [None]:
# Second hand-written review for the logistic regression, this time a complaint.
message = "the hotel was bad and the staff was rude"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model2.predict(vect)
my_prediction_prob = model2.predict_proba(vect)

# Print the predicted sentiment, then the probability of that class
# (column 1 when the prediction is 1, column 0 otherwise).
is_positive = my_prediction[0] == 1
print("Positive" if is_positive else "Negative")
print(my_prediction_prob[0][1] if is_positive else my_prediction_prob[0][0])

In [None]:
# Persist the trained model to disk so it can be reused outside this notebook
# without retraining every time.
import pickle

# Pickle files for logistic regression: the fitted model and the fitted
# CountVectorizer needed to transform new text the same way.
# `with` guarantees each file is flushed and closed once the dump finishes.
filename = 'logistic_regression_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model2, model_file)

# NOTE(review): the file name is spelled 'tranform' (sic). It is kept unchanged
# because code that loads this file may rely on the exact name.
with open('tranform_logistic.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [None]:
# Pickle files for Naive Bayes: the fitted model and the fitted
# CountVectorizer needed to transform new text the same way.
# `with` guarantees each file is flushed and closed once the dump finishes.
filename = 'naive_bayes_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model1, model_file)

with open('transform_naive.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)