In [1]:
import pandas as pd

In [2]:
#read the csv review dataset
trip = pd.read_csv("hotel_reviews.csv")

In [3]:
trip.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
# Let's create a new data frame
 
trip = trip[(trip['Rating']==5)|(trip['Rating']==2)|(trip['Rating']==1)][['Review','Rating']]

# Lets modify the Rating column
trip['Rating'] = trip['Rating'].apply(lambda rating: 'Pos' if rating==5 else 'Neg')

In [5]:
# reseting the index because after removing some rows, the index gets crowded
trip.reset_index(inplace=True)
trip.head()

Unnamed: 0,index,Review,Rating
0,1,ok nothing special charge diamond member hilto...,Neg
1,3,"unique, great stay, wonderful time hotel monac...",Pos
2,4,"great stay great stay, went seahawk game aweso...",Pos
3,5,love monaco staff husband stayed hotel crazy w...,Pos
4,6,"cozy stay rainy city, husband spent 7 nights m...",Pos


In [6]:
trip['Rating'].value_counts()

Pos    9054
Neg    3214
Name: Rating, dtype: int64

In [7]:
#Data cleaning and preprocessing
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [8]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Lemmatization object
ps = WordNetLemmatizer()
corpus = []

In [10]:
# Text preprocessing
# keep only text based
# lower all the letters
# split the words
for i in range(0,len(trip)):
    review = re.sub('[^a-zA-Z]'," ",trip['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [12]:
trip.to_csv('tdata.csv', index = False)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [15]:
X = cv.fit_transform(corpus).toarray()

In [16]:
X.shape

(12268, 34569)

In [17]:
y = pd.get_dummies(trip['Rating'])
y = y.iloc[:,1].values
y

array([0, 1, 1, ..., 0, 0, 0], dtype=uint8)

In [18]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=3)

Naive Bayes

In [19]:
#Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
model1 = MultinomialNB().fit(X_train,y_train)

# for the accuracy
model1.score(X_test,y_test)

y_pred = model1.predict(X_test)

In [20]:
#compare y test and y_pred
#confusion matrix is a 2x2 matrix and it tells,
#how many number of elements are correctly predicted.

from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test,y_pred)

confusion_m

array([[ 574,   69],
       [  44, 1767]], dtype=int64)

In [21]:
#checking accuracy score

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)

accuracy

0.9539527302363489

In [22]:
#checking precision score

from sklearn.metrics import precision_score
precision_score(y_test,y_pred)

0.9624183006535948

In [23]:
#checking recall score

from sklearn.metrics import recall_score
recall_score(y_test,y_pred)

0.9757040309221424

In [25]:
#checking f-beta score

from sklearn.metrics import fbeta_score
fbeta_score(y_test,y_pred,beta=1)

0.9690156292843433

In [27]:
# Checking the training model with custom input data
message = "the hotel was nice and comfy"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model1.predict(vect)
my_prediction_prob = model1.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Positive
0.9689465878406804


In [28]:
message = "the hotel was bad and the staff was rude"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model1.predict(vect)
my_prediction_prob = model1.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Negative
0.8445454685022524


Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')

In [30]:
model2 = logreg.fit(X_train, y_train)

In [31]:
y_pred_class = logreg.predict(X_test)

In [32]:
from sklearn import metrics

In [33]:
metrics.accuracy_score(y_test, y_pred)

0.9539527302363489

In [34]:
metrics.precision_score(y_test, y_pred)

0.9624183006535948

In [35]:
metrics.precision_score(y_test, y_pred)

0.9624183006535948

In [36]:
metrics.recall_score(y_test, y_pred)

0.9757040309221424

In [37]:
metrics.fbeta_score(y_test, y_pred, beta=1)

0.9690156292843433

In [38]:
metrics.roc_auc_score(y_test, y_pred)

0.9341972720707136

In [39]:
metrics.confusion_matrix(y_test, y_pred)

array([[ 574,   69],
       [  44, 1767]], dtype=int64)

In [40]:
message = "cozy stay rainy city"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model2.predict(vect)
my_prediction_prob = model2.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Positive
0.6713279013696746


In [41]:
message = "the hotel was bad and the staff was rude"
data = [message]
vect = cv.transform(data).toarray()
my_prediction = model2.predict(vect)
my_prediction_prob = model2.predict_proba(vect)

if my_prediction==1:
    print("Positive")
    print(my_prediction_prob[0][1])
else:
    print("Negative")
    print(my_prediction_prob[0][0])

Negative
0.8996732896446032


In [2]:
# Dump the machine learning model outsite so you can use outsite and not retrain again and again
import pickle

#pickle file for logistic regression

filename = 'logistic_regression_model.pkl'
pickle.dump(model2, open(filename, 'wb'))
pickle.dump(cv, open('tranform_logistic.pkl', 'wb'))

NameError: name 'model2' is not defined

In [1]:
#pickle file for naive bayes

filename = 'naive_bayes_model.pkl'
pickle.dump(model1, open(filename, 'wb'))
pickle.dump(cv, open('transform_naive.pkl', 'wb'))

NameError: name 'pickle' is not defined