## P182 - Hotel Rating Classification
**Business Objective:**
- The main objective is to identify the attributes that travelers consider when selecting a hotel. With this insight, a manager can understand which elements of their hotel most influence positive reviews and improve the hotel's brand image.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import spacy

from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

In [None]:
data=pd.read_excel("hotel_reviews.xlsx")
data

In [None]:
data["Review"][4]

In [None]:
data.info()

Every value in the `Rating` column is a float.

In [None]:
# Bar chart of the number of reviews per rating value.
rating_ax = sns.countplot(data=data, x='Rating')
rating_ax.set_title("Distribution of Rating")

In [None]:
data["Rating"].value_counts()

In [None]:
print(data["Rating"].value_counts()/len(data))

73% of ratings are good

In [None]:
Reviews = data.Review.values
Reviews

In [None]:
len(Reviews)

In [None]:
Reviews_text=' '.join(Reviews)
Reviews_text

# Tokenization

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)
Reviews_tokens=tknzr.tokenize(Reviews_text)

In [None]:
len(Reviews_tokens)

In [None]:
Reviews_tokens

In [None]:
Reviews_tokens_text=' '.join(Reviews_tokens)
Reviews_tokens_text

# Remove Punctuations

In [None]:
# Translation table that maps every character in string.punctuation to None,
# i.e. deletes it when applied with str.translate.
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
no_punc_Reviews_tokens_text = Reviews_tokens_text.translate(punctuation_table)
no_punc_Reviews_tokens_text

# Tokenization

In [None]:
from nltk.tokenize import word_tokenize
text_tokens=word_tokenize(no_punc_Reviews_tokens_text)
text_tokens

In [None]:
len(text_tokens)

In [None]:
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

In [None]:
print(stop_words)

In [None]:
# Drop English stopwords. Tokens are not lower-cased until the "Normalizing"
# step further down, so the comparison is done case-insensitively here;
# otherwise capitalised stopwords such as "The" or "And" would slip through.
# A set makes each membership test O(1) instead of scanning the stopword list.
stop_word_set = set(stop_words)
no_stop_tokens = [word for word in text_tokens if word.lower() not in stop_word_set]
# Preview only the first tokens rather than dumping the whole list.
no_stop_tokens[:100]

# Normalizing

In [None]:
# Lower-case every token that survived stopword removal.
lower_words = list(map(str.lower, no_stop_tokens))
print(lower_words[100:200])

# Stemming

In [None]:
from nltk.stem import PorterStemmer
# Reduce each lower-cased token to its Porter stem.
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, lower_words))
print(stemmed_tokens[100:200])

In [None]:
len(lower_words)/10000

In [None]:
# Split the token list into consecutive chunks of at most CHUNK_SIZE tokens.
# Slicing from the running offset gives every chunk distinct content (the
# previous code appended lower_words[0:10000] on every pass), and the final
# slice is automatically shorter when the length is not a multiple of
# CHUNK_SIZE.
CHUNK_SIZE = 10000
doc1 = [
    lower_words[start:start + CHUNK_SIZE]
    for start in range(0, len(lower_words), CHUNK_SIZE)
]

In [None]:
nlp = spacy.load('en_core_web_sm')
# NOTE(review): `doc` is rebound on every pass, so after the loop it holds only
# the spaCy Doc built from the final entry of `doc1`. The cells below that read
# `doc` therefore see that single chunk, not the whole corpus.
for chunk_tokens in doc1:
    chunk_text = ' '.join(chunk_tokens)
    doc = nlp(chunk_text)
print(doc)

In [None]:
len(doc)

# Lemmatization

In [None]:
lemmas=[token.lemma_ for token in doc]
lemmas

In [None]:
clean_reviews=' '.join(lemmas)
clean_reviews

In [None]:
len(clean_reviews)

# Feature extraction

# Count vectoriser

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
Reviews_cv=cv.fit_transform(lemmas)

In [None]:
print(cv.vocabulary_)

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Fall back to the
# old method only on releases that predate the new one.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names[100:200])

In [None]:
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` as an image on a new 40x30 figure with the axes hidden."""
    _, ax = plt.subplots(figsize=(40, 30))
    ax.imshow(wordcloud)
    ax.axis('off')
    
# Generate Word Cloud

# Add three extra tokens to wordcloud's shared STOPWORDS set.
STOPWORDS.update(('pron', 'rt', 'yeah'))
cloud_builder = WordCloud(
    width=3000,
    height=2000,
    background_color='white',
    max_words=50,
    colormap='Set1',
    stopwords=STOPWORDS,
)
# generate() returns the WordCloud instance itself.
wordcloud = cloud_builder.generate(clean_reviews)
plot_cloud(wordcloud)

# Named Entity Recognition (NER)

In [None]:
nlp=spacy.load('en_core_web_sm')

one_block=clean_reviews
doc_block=nlp(one_block)
spacy.displacy.render(doc_block,style='ent',jupyter=True)

In [None]:
for token in doc_block[100:200]:
    print(token,token.pos_)  

In [None]:
nouns_verbs=[token.text for token in doc_block if token.pos_ in ('NOUN','VERB')]
print(nouns_verbs[100:200])

In [None]:
# Counting the noun & verb tokens
from sklearn.feature_extraction.text import CountVectorizer
# Fit a fresh bag-of-words model in which every noun/verb token is treated as
# its own document.
cv = CountVectorizer()
X = cv.fit_transform(nouns_verbs)

# Summing down the rows gives one total per vocabulary column.
sum_words = X.sum(axis=0)
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

wd_df = pd.DataFrame(words_freq, columns=['word', 'count'])
wd_df[0:10]  # ten most frequent terms

In [None]:
wd_df[0:10].plot.bar(x='word',figsize=(12,8),title='Top 10 nouns and verbs')

# Emotion Mining - Sentiment Analysis

In [None]:
from nltk import tokenize
# Sentence-split each review on its own. Joining all reviews into one string
# first lets a "sentence" run across the boundary between two reviews whenever
# the earlier one does not end in terminal punctuation. Missing reviews are
# skipped so a NaN cannot break the tokenizer.
sentences = [
    sentence
    for review in data.Review.dropna()
    for sentence in tokenize.sent_tokenize(str(review))
]
# Preview only the first few sentences rather than dumping the whole list.
sentences[:10]

In [None]:
len(sentences)

In [None]:
df=pd.DataFrame(sentences,columns=['reviews'])
df

In [None]:
!pip install afinn
from afinn import Afinn
afinn = Afinn()

In [None]:
nlp=spacy.load('en_core_web_sm')

def calculate_sentiment(text:str=None):
    """Return the summed AFINN score of the lemmas in ``text``.

    The text is parsed with the notebook-level spaCy pipeline ``nlp`` and each
    token's lemma is scored with the notebook-level ``afinn`` instance. A
    missing or empty ``text`` yields ``0``.
    """
    if not text:
        return 0
    return sum(afinn.score(token.lemma_) for token in nlp(text))

In [None]:
calculate_sentiment(text='bad')

In [None]:
df['sentiment_value']=df['reviews'].apply(calculate_sentiment)
df['sentiment_value']

In [None]:
df

In [None]:
df['sentiment_value'].describe()

# Negative Reviews

In [None]:
df[df['sentiment_value']<0]

In [None]:
# sns.distplot is deprecated. histplot with a KDE overlay and density scaling
# is the replacement seaborn documents for distplot's default
# histogram-plus-KDE output. seaborn is already imported as `sns` in the
# notebook's first code cell, so it is not re-imported here.
fig, ax = plt.subplots(figsize=(15, 10))
sns.histplot(df['sentiment_value'], kde=True, stat='density',
             kde_kws=dict(cut=3), ax=ax)
ax.set_title('Distribution of sentence sentiment scores');

In [None]:
df['index']=range(0,len(df))
df

In [None]:
plt.figure(figsize=(15,10))
sns.lineplot(y='sentiment_value',x='index',data=df)

In [None]:
# Join the sentences whose sentiment score is below zero, then split the
# joined text back into sentences.
negative_mask = df['sentiment_value'] < 0
negative_text = ' '.join(df.loc[negative_mask, 'reviews'])
neg_sentences = tokenize.sent_tokenize(negative_text)
neg_sentences

In [None]:
nlp=spacy.load('en_core_web_sm')
neg_doc=nlp(' '.join(neg_sentences))
print(neg_doc)

In [None]:
lemmas=[token.lemma_ for token in neg_doc]
print(lemmas)

In [None]:
clean_neg_lemas=' '.join(lemmas)
clean_neg_lemas

In [None]:
# Generate Word Cloud
#
# `plot_cloud` is already defined by the first word-cloud cell, so it is reused
# here instead of being declared a second time. The extra stopwords are merged
# into a local copy, so this cell neither repeats nor relies on a mutation of
# wordcloud's shared STOPWORDS set.
neg_stopwords = STOPWORDS | {'pron', 'rt', 'yeah'}
wordcloud = WordCloud(width=3000, height=2000, background_color='white', max_words=50,
                      colormap='Set1', stopwords=neg_stopwords).generate(clean_neg_lemas)
plot_cloud(wordcloud)

In [None]:
data.head(10)

In [None]:
# Reviews rated below 3, re-indexed from zero.
data_neg = data[data["Rating"] < 3].reset_index(drop=True)

In [None]:
# Reviews rated exactly 5, re-indexed from zero.
data_pos = data[data["Rating"] == 5].reset_index(drop=True)

In [None]:
# Keep exactly as many 5-star reviews as there are negative ones. `.loc`
# slices by label and includes the end label, which returned one row too many;
# `.iloc` slices by position with an exclusive end.
data_posi = data_pos.iloc[:len(data_neg)]

In [None]:
# Stack the negative and positive subsets and renumber the rows from zero.
data_all = pd.concat([data_neg, data_posi], axis=0, ignore_index=True)

In [None]:
data_all.head(20)

In [None]:
len(data_all)

## Create Sentiment Column

In [None]:
# Label rows by rating: 3.0 or lower is "Negative", above 3.0 is "Positive".
is_negative = data_all["Rating"] <= 3.0
is_positive = data_all["Rating"] > 3.0
data_all.loc[is_negative, "Sentiment"] = "Negative"
data_all.loc[is_positive, "Sentiment"] = "Positive"
data_all

In [None]:
data_all

In [None]:
# Shuffle the rows with a fixed seed so that the row order, and therefore the
# train/test split built from it, is the same on every run.
data_all = data_all.sample(frac=1, random_state=33).reset_index(drop=True)

In [None]:
data_all.head(20)

# Model_Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_all.Review,data_all.Sentiment, test_size=0.3, random_state= 33)

In [None]:
countvector = CountVectorizer(ngram_range=(2,2))
X_train_v = countvector.fit_transform(X_train)
X_test_v = countvector.transform(X_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train_v, y_train)


In [None]:
predictions_lr = lr.predict(X_test_v)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
report_lr=classification_report(y_test,predictions_lr)
print(report_lr)

### Random Forest Classifier

In [None]:
# A fixed random_state makes the forest's bootstrap sampling and feature
# selection repeatable, so the reported metrics do not change between runs.
randomclassifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=33,
)
randomclassifier.fit(X_train_v, y_train)

In [None]:
predictions_RF = randomclassifier.predict(X_test_v)

In [None]:
report_RF=classification_report(y_test,predictions_RF)
print(report_RF)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_v, y_train)

In [None]:
predictions_NB = naive_bayes_classifier.predict(X_test_v)

In [None]:
report_NB=classification_report(y_test,predictions_NB)
print(report_NB)

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
svm= SVC()
svm.fit(X_train_v, y_train)

In [None]:
predictions_sv = svm.predict(X_test_v)

In [None]:
report_sv=classification_report(y_test,predictions_sv)
print(report_sv)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
KNN=KNeighborsClassifier()
KNN.fit(X_train_v, y_train)

In [None]:
predictions_knn = KNN.predict(X_test_v)

In [None]:
report_knn=classification_report(y_test,predictions_knn)
print(report_knn)

In [None]:
# Report the measured accuracy instead of a hard-coded figure, so the message
# cannot drift from what the Naive Bayes predictions actually score.
nb_accuracy = accuracy_score(y_test, predictions_NB)
print(f"final model is Naive Bayes with {nb_accuracy:.0%} acc.")

In [None]:
rev = ["Awesome food....Must visit place... Couldnt enjoy swimming pool facility. Rooms are clean.. Very good experince...."]
rev_vec = countvector.transform(rev)

In [None]:
 naive_bayes_classifier.predict(rev_vec)

In [None]:
rev2 = ["In room AC not working properly,No remote working,All games n swimming pool is time restrictions, it's very bad thing"]
rev2_vec = countvector.transform(rev2)
naive_bayes_classifier.predict(rev2_vec)