In [1]:
import numpy as np 
import pandas as pd 
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle

#### Loading Data


In [2]:
data = pd.read_csv('hotel_reviews.csv')[:5000]
data.shape

(5000, 5)

In [3]:
data.isnull().sum() #checking for null values

User_ID         0
Description     0
Browser_Used    0
Device_Used     0
Is_Response     0
dtype: int64

In [4]:
data.head(10)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy
5,id10331,We had - rooms. One was very nice and clearly ...,InternetExplorer,Desktop,happy
6,id10332,My husband and I have stayed in this hotel a f...,Firefox,Tablet,not happy
7,id10333,My wife & I stayed in this glorious city a whi...,Google Chrome,Mobile,happy
8,id10334,My boyfriend and I stayed at the Fairmont on a...,Internet Explorer,Desktop,happy
9,id10335,"Wonderful staff, great location, but it was de...",Chrome,Tablet,not happy


In [5]:
data.drop(columns = ['User_ID', 'Browser_Used', 'Device_Used'], inplace = True)

In [6]:
data.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


In [7]:

def encode(x):
    if x == 'not happy':
        return 0
    else:
        return 1
data.Is_Response = data.Is_Response.apply(encode)
# 1 for happy
#0 for unhappy

In [8]:
data

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,0
1,I stayed at the Crown Plaza April -- - April -...,0
2,I booked this hotel through Hotwire at the low...,0
3,Stayed here with husband and sons on the way t...,1
4,My girlfriends and I stayed here to celebrate ...,0
...,...,...
4995,Hilton NYC is a wonderful hotel where I stay w...,1
4996,I have stayed at the Dunhill - or - times on b...,1
4997,We stayed at this hotel for a four day weekend...,1
4998,The Hilton Ft. Worth proved to be a great choi...,1


In [9]:
def removeApostrophe(review):
    phrase = re.sub(r"won't", "will not", review)
    phrase = re.sub(r"can\'t", "can not", review)
    phrase = re.sub(r"n\'t", " not", review)
    phrase = re.sub(r"\'re", " are", review)
    phrase = re.sub(r"\'s", " is", review)
    phrase = re.sub(r"\'d", " would", review)
    phrase = re.sub(r"\'ll", " will", review)
    phrase = re.sub(r"\'t", " not", review)
    phrase = re.sub(r"\'ve", " have", review)
    phrase = re.sub(r"\'m", " am", review)
    return phrase

In [10]:
def cleaning(df):
    all_reviews = list()
    lines = df["Description"].values.tolist()
    for text in lines:
        text = text.lower() # converting the text to lower case
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text) # removes URL'S
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text) #removes punctuation
        text = removeApostrophe(text)
        tokens = word_tokenize(text) #tokenizing
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()] #filtering only text data
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not") #removing "not" from stopwords as it is sentimental analysis
        PS = PorterStemmer()
        words = [PS.stem(w) for w in words if not w in stop_words] #stemming and removing stopwords
        words = ' '.join(words) #joining strings 
        all_reviews.append(words)
    return all_reviews

reviews = cleaning(data)


In [11]:
data['cleaned_reviews'] = reviews

In [12]:
from sklearn.model_selection import train_test_split

X = data['cleaned_reviews']
y = data["Is_Response"]
#Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC


tvec = TfidfVectorizer() #TF-IDF
clf = LogisticRegression() #Classification Model
clf2 = SVC(kernel='linear',gamma=100)
clf3 = XGBClassifier()
svc = SVC(C=5, gamma=0.1, probability=True, decision_function_shape='ovr')


from sklearn.pipeline import Pipeline
model = Pipeline([('vectorizer',tvec),('classifier',svc)])

model.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix

predictions = model.predict(X_test)

confusion_matrix(predictions, y_test)
print(model.score(X_test,y_test))

0.878


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(predictions, y_test))
print("Precision : ", precision_score(predictions, y_test, average = 'weighted'))
print("Recall : ", recall_score(predictions, y_test, average = 'weighted'))

Accuracy :  0.878
Precision :  0.8889471953901525
Recall :  0.878


In [22]:
filename = 'model_hotelreviews.pkl'
pickle.dump(model, open(filename, 'wb'))
 
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.878


In [23]:
def predict(review):
    model = pickle.load(open('model_hotelreviews.pkl', 'rb'))
    l = []
    l.append(review)
    sent = model.predict(l)
    if sent[0]== 0:
        return "Negative"
    else:
        return "Positive"

In [24]:
predict("not good")


'Negative'

0.878


In [25]:
predict("Siddharth from MyLoanCare helped well and the loan got processed fast")


'Positive'

In [35]:
customer_reviews = ['Siddharth from MyLoanCare helped well and the loan got processed fast','Its a very good experience with you. Here I found the best deal and best interest rate of all banks'
                   ,'Very good. Special thanks to Saurabh Gandhi from MyLoanCare and Ram from HDFC Bank',
                   'Amazing service. Really happy with the team. Definitely recommend it to everyone.',
                   'Worst service by the bank',
                   'had very bad expierience']

In [36]:
for i in customer_reviews:
    print(predict(i))

Positive
Positive
Positive
Positive
Negative
Negative
