### Text Classification And Sentiment Analysis On Hotel Reviews
This project builds text classification methods for the domain of hotel reviews. The goal is to implement classifiers that predict the rating of a review. The reviews are vectorized with term frequency-inverse document frequency (tf-idf), and supervised text classification models are trained to see whether it is possible to predict review ratings. The motivation of this project is to predict review ratings using the Logistic Regression classification algorithm.

### Strategy and Process
- Pre-processing: stop-word removal, punctuation removal, etc.
- Applying LabelEncoder to the review column.
- Building a sentiment-analysis model on the text column and checking its output against the review column.
- Polarity check.
- Building a machine-learning classification model and checking whether it gives better performance.
- Creating a CSV file from the text (review) column.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import string
import re

# Machine learning model
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import nltk
from nltk import corpus
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import brown, stopwords
from nltk import pos_tag

import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

from collections import Counter
from itertools import chain

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from textblob import TextBlob

import os
#os.getcwd()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK data packages listed below.
for resource_id in (
    'maxent_ne_chunker',
    'words',
    'treebank',
    'maxent_treebank_pos_tagger',
    'punkt',
):
    nltk.downloader.download(resource_id)
nltk.download('averaged_perceptron_tagger')

In [None]:
# Load the hotel reviews from a UTF-8 encoded CSV file in the working directory.
data = pd.read_csv(r"hotel_reviews.csv", encoding="utf-8")

# Preview the first rows of the loaded frame.
data.head()


In [None]:
# Remove the "language" column from the frame.
data=data.drop(columns=["language"])

In [None]:
# Encode feedback as a binary label: 1 for "positive", 0 for every other value.
# NOTE(review): this runs before the dropna cell, so a missing feedback value
# also becomes 0 here and is no longer NaN afterwards — confirm that is intended.
data.feedback = [1 if each == "positive" else 0 for each in data.feedback]

In [None]:
# Show the (rows, columns) size of the frame.
data.shape

In [None]:
# Report, per column, whether it contains at least one NaN value.
data.isna().any()

In [None]:
# Drop every row that has a NaN in any column (the index is not reset).
data=data.dropna(axis=0)

In [None]:
import matplotlib.pyplot as plt

# Share of each feedback class as a percentage, rounded to two decimals.
feedback_share = round(data.feedback.value_counts(normalize=True) * 100, 2)

plt.figure(figsize=(12, 6))
plt.title("Percentage Distributions by Review Type")
g = plt.pie(
    feedback_share,
    explode=(0.025, 0.025),
    labels=feedback_share.index,
    colors=["c", "m"],
    autopct="%1.1f%%",
    startangle=180,
)
plt.show()

In [None]:
### Cleaning data

In [None]:
# Ordered (pattern, replacement) rules used by decontract_text. Every pattern
# accepts both the ASCII apostrophe (') and the typographic one (’) and is
# matched case-insensitively. Order matters: irregular and compound forms are
# rewritten before the generic suffix rules get a chance to split them.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # irregular forms
        (r"won['’]t", "will not"),
        (r"can['’]t", "can not"),
        # compound forms
        (r"n['’]t['’]ve", " not have"),
        (r"['’]t['’]ve", " not have"),
        (r"['’]d['’]ve", " would have"),
        (r"['’]clock", "f the clock"),  # "o'clock" -> "of the clock"
        (r"['’]cause", " because"),
        # general suffixes; the trailing \b keeps quoted words such as
        # 'top' or 'so' from being mistaken for contractions
        (r"n['’]t", " not"),
        (r"['’]re\b", " are"),
        (r"['’]s\b", " is"),
        (r"['’]d\b", " would"),
        (r"['’]ll\b", " will"),
        (r"['’]t\b", " not"),
        (r"['’]ve\b", " have"),
        (r"['’]m\b", " am"),
    )
)


def decontract_text(text):
    """Expand English contractions in *text* and return the new string.

    Handles irregular forms ("won't" -> "will not"), compound forms
    ("couldn't've" -> "could not have") and the common suffixes
    ('re, 's, 'd, 'll, 't, 've, 'm). Both straight and curly apostrophes are
    recognised, and matching ignores case, so "Won't" is expanded like
    "won't" (the replacement text itself is always lower-case).

    Note that every "'s" is expanded to " is", possessives included.

    Args:
        text: The string to process.

    Returns:
        The string with contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

In [None]:
# Expand contractions in both free-text columns.
for column in ("title", "text"):
    data[column] = data[column].apply(decontract_text)

In [None]:
import string
def clean_text_round1(t):
    """Normalise a piece of text for later tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. delete every ASCII punctuation character;
      3. delete every word that contains a digit;
      4. turn each run of line breaks into a single space, so that words on
         adjacent lines are not glued together.

    Args:
        t: The string to clean.

    Returns:
        The cleaned string. Inner whitespace is not collapsed, so removing a
        word can leave two consecutive spaces.
    """
    t = t.lower()
    t = re.sub(r'[%s]' % re.escape(string.punctuation), '', t)
    t = re.sub(r'\w*\d\w*', '', t)
    t = re.sub(r'[\r\n]+', ' ', t)
    return t

In [None]:
# Run the first cleaning pass over both free-text columns.
for column in ("title", "text"):
    data[column] = data[column].apply(clean_text_round1)

In [None]:
from nltk.corpus import stopwords
# Download NLTK's stop-word lists and keep the English ones in a set, which
# gives constant-time membership tests during filtering.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
ps = PorterStemmer()


def clean_text_round2(text):
    """Remove stop words from each document and stem the remaining words.

    A fresh list is built on every call. Previously the results were appended
    to a shared module-level list, so a second call also returned (and
    aliased) everything produced by the first one.

    Args:
        text: An iterable of strings (for example a pandas Series); each
            string is split on whitespace.

    Returns:
        A new list with one space-joined string of stemmed, non-stop-word
        tokens per input document, in input order.
    """
    cleaned = []
    for document in text:
        kept = [word for word in document.split() if word not in stop_words]
        cleaned.append(' '.join(ps.stem(word) for word in kept))
    return cleaned

In [None]:
# Stop-word-filter and stem the titles of the negative reviews (feedback == 0).
corpus_title=clean_text_round2(data["title"].loc[data["feedback"]==0])

In [None]:
# Part-of-speech tag the whitespace-split words of every negative title.
negative_tag = [nltk.pos_tag(title.split()) for title in corpus_title]

In [None]:
# Collect every word tagged 'NN' across all tagged negative titles.
negative_noun = [
    word
    for tagged_title in negative_tag
    for word, tag in tagged_title
    if tag == 'NN'
]

In [None]:
# Frequency table of the negative-title nouns, keyed in first-seen order.
d1 = {}
for noun in negative_noun:
    d1[noun] = d1.get(noun, 0) + 1
# To list the most frequent entries first:
#   sorted(d1.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Concatenate all cleaned negative titles into one space-separated string.
n_title = " ".join(corpus_title)

In [None]:
from wordcloud import WordCloud
# Build a word cloud from the joined negative-title text on a white background.
wordcloud1 = WordCloud(background_color='white', width=3000, height=2500).generate(n_title)
plt.figure(figsize=(8,8))
plt.imshow(wordcloud1)
plt.axis('off')  # hide the axes; only the image is of interest
plt.title("Words which indicate negative title ")
plt.show()

In [None]:
# Collect every word tagged 'JJ' across all tagged negative titles.
negative_adjective = [
    word
    for tagged_title in negative_tag
    for word, tag in tagged_title
    if tag == 'JJ'
]

In [None]:
# Frequency table of the negative-title adjectives, keyed in first-seen order.
d2 = {}
for adjective in negative_adjective:
    d2[adjective] = d2.get(adjective, 0) + 1
# To list the most frequent entries first:
#   sorted(d2.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Stop-word-filter and stem the review text of the positive reviews (feedback == 1).
corpus_text=clean_text_round2(data["text"].loc[data["feedback"]==1])

In [None]:
# Part-of-speech tag the whitespace-split words of every entry in corpus_text.
tagword_p = [nltk.pos_tag(review.split()) for review in corpus_text]

In [None]:
# Adjectives (tag 'JJ') found in the tagged positive-feedback texts.
positive_adj = [
    word
    for tagged_review in tagword_p
    for word, tag in tagged_review
    if tag == 'JJ'
]

In [None]:
# Number of occurrences of each positive adjective, keyed in first-seen order.
# NOTE(review): this rebinds d1, which an earlier cell used for the
# negative-title noun counts; those counts are lost after this cell runs.
d1 = {}
for adjective in positive_adj:
    d1[adjective] = d1.get(adjective, 0) + 1
# To list the most frequent entries first:
#   sorted(d1.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Concatenate all entries of corpus_text into one space-separated string.
p_reviews = " ".join(corpus_text)

In [None]:
from wordcloud import WordCloud
# Build a word cloud from the joined positive-review text on a white background.
# NOTE(review): this rebinds wordcloud1, replacing the negative-title cloud.
wordcloud1 = WordCloud(background_color='white', width=3000, height=2500).generate(p_reviews)
plt.figure(figsize=(8,8))
plt.imshow(wordcloud1)
plt.axis('off')  # hide the axes; only the image is of interest
plt.title("Words which indicate positive_feedback ")
plt.show()

In [None]:
import nltk
# Download the NLTK 'brown' corpus.
# NOTE(review): presumably needed by the TextBlob calls in the next cell — confirm.
nltk.download('brown')

In [None]:
from textblob import TextBlob

# Print TextBlob's sentences, words, POS tags and noun phrases for every
# entry of corpus_text.
blob = None
for review in corpus_text:

    blob = TextBlob(review)
    print(blob.sentences)

    print('\n', blob.words)
    print('\n', blob.tags)

    print('\n', blob.noun_phrases)

# Sentence-level report. This loop sits after the loop above, so it only
# covers the LAST review that was processed.
# NOTE(review): the 'Rings' / 'Lord' labels and the fixed word positions look
# like leftovers from a tutorial text — confirm what should be reported here.
if blob is not None:  # corpus_text may be empty
    words = blob.words
    for sentence in blob.sentences:
        print('\nnoun phrases in sentence : ', sentence.noun_phrases)
        print(sentence.sentiment)
        print('\ntext sentiment: ', blob.sentiment)
        print('\nFind the start point of Rings: ', blob.find('Rings'))
        # The fixed positions below do not exist in short reviews; skip the
        # corresponding line instead of raising IndexError.
        if len(words) > 4:
            print('\nsingular of word Rings: ', words[4].singularize())
        print('\ncount appearance of word Lord: ', words.count('Lord'))
        if len(words) > 1:
            print('\nplural of word Lord: ', words[1].pluralize())
        if len(words) >= 7:
            print('\nroot word written: ', words[-7].lemmatize())

### polarity and subjectivity

In [None]:
def getAnalysis(score):
    """Map a polarity score to a binary label: 0 if negative, otherwise 1."""
    if score < 0:
        return 0
    return 1


# Per-review TextBlob sentiment: subjectivity, polarity and the binary label
# derived from the polarity.
Scores_analyis = []
polarity = []
subjectivity = []
for review in data['text']:
    # Evaluate the sentiment once; the result carries both values.
    sentiment = TextBlob(review).sentiment
    subjectivity.append(sentiment.subjectivity)
    polarity.append(sentiment.polarity)
    Scores_analyis.append(getAnalysis(sentiment.polarity))

data['subjectivity'] = subjectivity
data['polarity'] = polarity
data['Scores_analyis'] = Scores_analyis
data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Re-encode the polarity-based labels with LabelEncoder.
# NOTE(review): Scores_analyis already holds the 0/1 values returned by
# getAnalysis, so this presumably leaves them unchanged when both classes
# occur — confirm the encoding step is needed.
lb = LabelEncoder()
data['Scores_analyis'] = lb.fit_transform(data['Scores_analyis'])
# The polarity-based label acts as the prediction, the feedback column as the truth.
y_pred = data['Scores_analyis'].values
y = data['feedback'].values

# Compare the polarity-based labels against the feedback labels.
print('Confusion matrix : \n', confusion_matrix(y, y_pred))
print('Classification report: \n', classification_report(y, y_pred))

In [None]:
# Join every review text into one space-separated string.
texts = " ".join(i for i in data.text)


from PIL import Image

# Load the image file that is passed to WordCloud as its mask.
mask = np.array(Image.open('Dolfine.png'))
# Generate the cloud with the wordcloud package's STOPWORDS set and
# collocations disabled.
wordcloud = WordCloud(width= 1500, height = 1000, random_state=1, background_color='black',
                      colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(texts)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis("off")  # hide the axes; only the image is of interest
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the review texts, the target is the binary feedback label.
x, y = data["text"], data['feedback']
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
print(f'x_train: {len(x_train)}')
print(f'x_test: {len(x_test)}')
print(f'y_train: {len(y_train)}')
print(f'y_test: {len(y_test)}')
# The lines below were pasted into the cell as bare statements and appear to
# be the output of a previous run; they are kept here as a comment only.
#   x_train : 35038
#   x_test  : 3894
#   y_train : 35038
#   y_test  : 3894

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

# TF-IDF features feeding a logistic-regression classifier, fitted on the
# training split.
tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver="lbfgs")
model = Pipeline([("vectorizer", tvec), ("classifier", clf2)])
model.fit(x_train, y_train)

predictions = model.predict(x_test)
# scikit-learn expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which
# transposed the matrix.
confusion_matrix(y_test, predictions)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_pred = model.predict(x_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
# precision and recall are not: with the arguments swapped each one reports
# the other quantity, weighted by the wrong class supports.
print(f'Accurcy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted")}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted")}')

In [None]:
# Predicting values
from random import randint

# Pick a random review by POSITION. Rows were removed with dropna without
# resetting the index, so the integer labels have gaps and a label lookup
# (data.text[row]) can raise KeyError; .iloc always selects the row-th entry.
row = randint(0, data.text.shape[0] - 1)
sample_text = data.text.iloc[row]
sample_text

In [None]:
# Predict the feedback label for the sampled review (the model expects a list of texts).
model.predict([sample_text])

In [None]:
from random import randint

# Draw another random review by POSITION. The index has gaps after dropna,
# so a label lookup (data.text[row]) can raise KeyError; .iloc cannot.
row = randint(0, data.text.shape[0] - 1)
sample_text = data.text.iloc[row]
sample_text

In [None]:
# Predict the feedback label for the newly sampled review.
model.predict([sample_text])

In [None]:
from nltk.corpus import stopwords
# Build the set of English stop words.
# NOTE(review): repeats an earlier cell that already downloads the list and
# defines stop_words the same way.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
import locationtagger

# For every review, keep the country_cities attribute of the locations that
# locationtagger finds in its text.
Country_cities = [
    locationtagger.find_locations(text=review).country_cities
    for review in data["text"]
]

In [None]:
from collections import Counter

# Per-review features; each list gets one entry per review:
#   Stop_words      - tokens that are in stop_words
#   Length_words    - number of tokens
#   Duplicate_words - sorted tokens that occur more than once
#   Noun_words      - tokens tagged "NN"
#   Adjective_words - tokens tagged "JJ"
Duplicate_words = []
Length_words = []
Stop_words = []
Noun_words = []
Adjective_words = []
for review in data["text"]:
    tokens = word_tokenize(review)
    tagged = nltk.pos_tag(tokens)
    token_counts = Counter(tokens)

    Stop_words.append([tok for tok in tokens if tok in stop_words])
    Length_words.append(len(tokens))
    Duplicate_words.append(sorted(tok for tok, seen in token_counts.items() if seen > 1))
    Noun_words.append([tok for tok, tag in tagged if tag == "NN"])
    Adjective_words.append([tok for tok, tag in tagged if tag == "JJ"])
    

In [None]:
# Assemble the per-review feature lists into one frame, one row per review.
feature_columns = {
    'Stop_words': Stop_words,
    'Duplicate_words': Duplicate_words,
    'Length_words': Length_words,
    'Noun_words': Noun_words,
    'Adjective_words': Adjective_words,
    'Country_cities': Country_cities,
}
dataset = pd.DataFrame(
    list(zip(*feature_columns.values())),
    columns=list(feature_columns),
)

In [None]:
# Display the assembled feature frame.
dataset

In [None]:
# Write the feature frame to "Hotels_NLP.csv" in the working directory.
dataset.to_csv("Hotels_NLP.csv")