In [34]:
import numpy as np 
import pandas as pd 
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

#### Loading Data


In [24]:
data = pd.read_csv('/content/hotel_reviews10000.csv')
data.shape

(10000, 5)

In [25]:
data.isnull().sum() #checking for null values

User_ID         0
Description     0
Browser_Used    0
Device_Used     0
Is_Response     0
dtype: int64

In [26]:
data.head(10)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy
5,id10331,We had - rooms. One was very nice and clearly ...,InternetExplorer,Desktop,happy
6,id10332,My husband and I have stayed in this hotel a f...,Firefox,Tablet,not happy
7,id10333,My wife & I stayed in this glorious city a whi...,Google Chrome,Mobile,happy
8,id10334,My boyfriend and I stayed at the Fairmont on a...,Internet Explorer,Desktop,happy
9,id10335,"Wonderful staff, great location, but it was de...",Chrome,Tablet,not happy


In [27]:
data.drop(columns = ['User_ID', 'Browser_Used', 'Device_Used'], inplace = True)

In [28]:
data.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


In [29]:

def encode(x):
    if x == 'not happy':
        return 0
    else:
        return 1
data.Is_Response = data.Is_Response.apply(encode)
# 1 for happy
#0 for unhappy

In [30]:
data.head(10)

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,0
1,I stayed at the Crown Plaza April -- - April -...,0
2,I booked this hotel through Hotwire at the low...,0
3,Stayed here with husband and sons on the way t...,1
4,My girlfriends and I stayed here to celebrate ...,0
5,We had - rooms. One was very nice and clearly ...,1
6,My husband and I have stayed in this hotel a f...,0
7,My wife & I stayed in this glorious city a whi...,1
8,My boyfriend and I stayed at the Fairmont on a...,1
9,"Wonderful staff, great location, but it was de...",0


In [120]:
def removeApostrophe(review):
    phrase = re.sub(r"won't", "will not", review)
    phrase = re.sub(r"can\'t", "can not", review)
    phrase = re.sub(r"n\'t", " not", review)
    phrase = re.sub(r"\'re", " are", review)
    phrase = re.sub(r"\'s", " is", review)
    phrase = re.sub(r"\'d", " would", review)
    phrase = re.sub(r"\'ll", " will", review)
    phrase = re.sub(r"\'t", " not", review)
    phrase = re.sub(r"\'ve", " have", review)
    phrase = re.sub(r"\'m", " am", review)
    return phrase

In [121]:
removeApostrophe("I can't go home")

"I can't go home"

In [35]:
def cleaning(df):
    all_reviews = list()
    lines = df["Description"].values.tolist()
    for text in lines:
        text = text.lower() # converting the text to lower case
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text) # removes URL'S
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text) #removes punctuation
        text = removeApostrophe(text)
        tokens = nltk.word_tokenize(text) #tokenizing
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()] #filtering only text data
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not") #removing "not" from stopwords as it is sentimental analysis
        PS = PorterStemmer()
        words = [PS.stem(w) for w in words if not w in stop_words] #stemming and removing stopwords
        words = ' '.join(words) #joining strings 
        all_reviews.append(words)
    return all_reviews

reviews = cleaning(data)


In [39]:
data['cleaned_reviews'] = reviews

In [40]:
data.head()

Unnamed: 0,Description,Is_Response,cleaned_reviews
0,The room was kind of clean but had a VERY stro...,0,room kind clean strong smell dog gener averag ...
1,I stayed at the Crown Plaza April -- - April -...,0,stay crown plaza april april staff friendli at...
2,I booked this hotel through Hotwire at the low...,0,book hotel hotwir lowest price could find got ...
3,Stayed here with husband and sons on the way t...,1,stay husband son way alaska cruis love hotel g...
4,My girlfriends and I stayed here to celebrate ...,0,girlfriend stay celebr th birthday plan weeken...


In [106]:
from sklearn.model_selection import train_test_split

X = data['cleaned_reviews']
y = data["Is_Response"]
#Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


tvec = TfidfVectorizer() #TF-IDF
clf = LogisticRegression(random_state=1, n_jobs=-1) #Classification Model 0.863
clf2 = SVC(kernel='linear',gamma=100) # 0.8515
clf3 = XGBClassifier() # 0.8085
dtc = DecisionTreeClassifier(criterion = 'gini',max_depth = 10, min_samples_leaf =3, random_state=101) # 0.699
rdf = RandomForestClassifier(criterion = 'gini',max_depth = 10, min_samples_leaf =3, random_state=101) # 0.7725
svc = SVC(C=5, gamma=0.1, probability=True, decision_function_shape='ovr') #0.8505


from sklearn.pipeline import Pipeline

model = Pipeline([('vectorizer', tvec),('classifier', clf)])
model.fit(X_train, y_train)
prediction = model.predict(X_test)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(prediction, y_test))


from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

print("Accuracy : ", accuracy_score(predictions, y_test))
# print("Precision : ", precision_score(predictions, y_test, average = 'weighted'))
# print("Recall : ", recall_score(predictions, y_test, average = 'weighted'))

print('train score: ', model.score(X_train, y_train))
print('test score: ', model.score(X_test, y_test))
print('pred score: ', model.score(X_test, prediction))
print("classification: ", classification_report(predictions,y_test))

[[889 121]
 [153 837]]
Accuracy :  0.7915
train score:  0.907125
test score:  0.863
pred score:  1.0
classification:                precision    recall  f1-score   support

           0       0.78      0.81      0.80       993
           1       0.81      0.77      0.79      1007

    accuracy                           0.79      2000
   macro avg       0.79      0.79      0.79      2000
weighted avg       0.79      0.79      0.79      2000



In [108]:
X_train.head()

9254    not bad build oldest skyscrap philli someth li...
1561    husband stay watertown love room clean bed com...
1670    basic want closest budget hotelmotel seattl ce...
6087    boyfriend friend move stay hotel week move new...
6669    not fanci hotel standard clean comfort locat u...
Name: cleaned_reviews, dtype: object

In [109]:
from sklearn.naive_bayes import MultinomialNB

mul = MultinomialNB()
model1 = Pipeline([('vectorizer', tvec),('classifier', mul)])
model1.fit(X_train, y_train)

y_pred = model1.predict(X_test)


In [110]:
print("Accuracy : ", accuracy_score(y_pred, y_test))
print('train score: ', model.score(X_train, y_train))
print('test score: ', model.score(X_test, y_test))
print('pred score: ', model.score(X_test, y_pred))
print("classification: ", classification_report(y_pred,y_test))

Accuracy :  0.847
train score:  0.907125
test score:  0.863
pred score:  0.922
classification:                precision    recall  f1-score   support

           0       0.82      0.88      0.85       980
           1       0.87      0.82      0.85      1020

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



In [111]:
filename = 'model_hotelreviews.pkl'
pickle.dump(model1, open(filename, 'wb'))
 
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.847


In [116]:
def predict(review):
    model = pickle.load(open('model_hotelreviews.pkl', 'rb'))

    sent = model.predict([review])
    if sent[0]== 0:
        return "Negative"
    else:
        return "Positive"

In [117]:
predict("not good")


'Negative'

0.878


In [None]:
predict("Siddharth from MyLoanCare helped well and the loan got processed fast")


'Positive'

In [None]:
customer_reviews = ['Siddharth from MyLoanCare helped well and the loan got processed fast','Its a very good experience with you. Here I found the best deal and best interest rate of all banks'
                   ,'Very good. Special thanks to Saurabh Gandhi from MyLoanCare and Ram from HDFC Bank',
                   'Amazing service. Really happy with the team. Definitely recommend it to everyone.',
                   'Worst service by the bank',
                   'had very bad expierience']

In [None]:
for i in customer_reviews:
    print(predict(i))

Positive
Positive
Positive
Positive
Negative
Negative
