In [None]:
#  Importing Important Packages

In [None]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import re
import spacy
from nltk.corpus import sentiwordnet as swn
from IPython.display import clear_output
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly
plotly.offline.init_notebook_mode (connected = True)
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk import ngrams
# The following code creates a word-document matrix.
from sklearn.feature_extraction.text import CountVectorizer
# Modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Reading Data

In [None]:
data=pd.read_csv('../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')

In [None]:
## Word count per review.
## str.split() with no separator splits on any run of whitespace, so repeated or
## trailing spaces are not counted as extra (empty) "words" the way split(" ") would.
words_per_review = data.Review.str.split().str.len()
words_per_review.hist(bins = 100)
plt.xlabel('Review Length (words)')
plt.ylabel('Frequency')
plt.show()

In [None]:
data.shape


In [None]:
data.head(5)

In [None]:
# Percentage of reviews falling into each rating value
rating_counts = data['Rating'].value_counts()
percent_val = rating_counts.mul(100).div(len(data))
percent_val

In [None]:
# Bar chart of the rating distribution; labelled so the figure can be read on its own.
ax = percent_val.plot.bar()
ax.set(xlabel='Rating', ylabel='Share of reviews (%)', title='Rating distribution')
plt.show()

In [None]:
# Mapping the ratings to a binary label: 1 when Rating > 3, otherwise 0.
# (A previous `>= 3` mapping was removed: it was overwritten immediately and had no effect.)
data['sentiment'] = np.where(data.Rating > 3,1,0)

## Removing neutral reviews (Rating == 3).
# .copy() makes the filtered frame independent of the original, so the columns added in
# later cells do not raise SettingWithCopyWarning. Row labels are deliberately left
# untouched because later cells look rows up by label (e.g. data['Review'][3]).
data = data[data.Rating != 3].copy()


# Making a copy of the reviews to edit

In [None]:
# Independent copy of the raw 'Review' column, set aside (per the original note) for the
# text after stopword removal. It is not referenced again in the visible part of this notebook.
Edited_Review = data['Review'].copy()

In [None]:
data.shape

# Having a look at the first ten reviews in the data

In [None]:
data.head(10)

# Preprocessing Function

1. Converting words to lower/upper case
2. Removing special characters
3. Removing stopwords and high/low-frequency words
4. Lemmatization

In [None]:
data['reviews_text_new'] = data['Review'].str.lower()


In [None]:
# Tokenize each lower-cased review, keeping both the per-review token lists
# and one pooled list used to count the distinct tokens.
token_lists_lower = []
tokens_lower = []
for review_text in data['reviews_text_new']:
    review_tokens = word_tokenize(review_text)
    token_lists_lower.append(review_tokens)
    tokens_lower.extend(review_tokens)
print("Number of unique tokens now: ",len(set(tokens_lower)))

In [None]:
### Collecting, per review, the characters that are neither alphanumeric nor a space
spl_chars = data['reviews_text_new'].apply(
    lambda review: [ch for ch in review if not (ch.isalnum() or ch == ' ')]
)

## Flattening the per-review lists into one list
flat_list = []
for review_chars in spl_chars:
    flat_list.extend(review_chars)

In [None]:
# Keep the lower-cased text as it was before special characters are stripped.
review_backup = data['reviews_text_new'].copy()
# Replace every run of non-alphanumeric characters with a single space.
# regex=True is required: from pandas 2.0 on, Series.str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged.
data['reviews_text_new'] = data['reviews_text_new'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

In [None]:
noise_words = []
eng_stop_words = stopwords.words('english')

In [None]:
# Demonstration on a single review (row label 3): split its tokens into
# stopwords and non-stopwords, preserving their order of appearance.
stop_words = set(eng_stop_words)
sentence = data['reviews_text_new'][3]
words = nltk.word_tokenize(sentence)

stopword = [token for token in words if token in stop_words]
without_stop_words = [token for token in words if token not in stop_words]

print('-- Original Sentence --\n', sentence)
print('\n-- Stopwords in the sentence --\n', stopword)
print('\n-- Non-stopwords in the sentence --\n', without_stop_words)

In [None]:
def stopwords_removal(stop_words, sentence):
    """Tokenize `sentence` with NLTK and return, in order, the tokens not found in `stop_words`."""
    kept_tokens = []
    for token in nltk.word_tokenize(sentence):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

data['reviews_text_nonstop'] = data['reviews_text_new'].apply(lambda row: stopwords_removal(stop_words, row))
data[['reviews_text_new','reviews_text_nonstop']]

In [None]:
def make_sentences(data,name):
    """Join the token lists stored in column `name` back into plain sentences, in place.

    Each cell of ``data[name]`` is expected to be a list of string tokens. The tokens
    are joined with single spaces, any run of whitespace is collapsed to one space and
    leading/trailing whitespace is removed. A cell that already holds a string (for
    example because the notebook cell was run twice) is only normalised, never split
    into characters, so the function is safe to call repeatedly.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    name : str
        Name of the column to convert.

    Returns
    -------
    None
    """
    def _to_sentence(tokens):
        # Iterating a str would yield single characters, so pass strings through as-is.
        text = tokens if isinstance(tokens, str) else ' '.join(tokens)
        # Removing double spaces if created
        return re.sub(r'\s+', ' ', text).strip()

    data[name] = data[name].apply(_to_sentence)

In [None]:
# Converting all the texts back to sentences
make_sentences(data,'reviews_text_nonstop')

In [None]:
data .head(5)

# Lemmatization Function

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# First letter of the NLTK POS tag -> WordNet part of speech
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    """Lemmatize a whitespace-separated string and return the lemmas joined by single spaces.

    Each token is POS-tagged with NLTK; the first letter of its tag selects the WordNet
    part of speech through `wordnet_map`, falling back to noun for any other tag.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

data["After_lemmatization"] = data['reviews_text_nonstop'].apply(lemmatize_words)

# Results of Preprocessing data (Removing stopwords & Lemmatization)

In [None]:
data.head(6)

In [None]:
# Show the same review (row label 3) at each preprocessing stage
review_stages = (
    ("- Old Review -", 'Review'),
    ("\n- New Review -", 'reviews_text_nonstop'),
    ("\n- Last Edit Review -", 'After_lemmatization'),
)
for stage_title, stage_column in review_stages:
    print(stage_title)
    print(data[stage_column][3])

In [None]:
data['reviews_text_final'] = data['After_lemmatization'].copy()

In [None]:
data[['reviews_text_final','sentiment']].head(5)

# Building a machine learning model

# Bag-of-words and n-grams

# Divide into training and test sets:

# Applying logistic regression

In [None]:
### Bag-of-words features for the classifier
### 1. Only unigrams are extracted (ngram_range=(1,1)); widen the range here to add 2-/3-/4-grams
### 2. The vectorizer is fitted on `reviews_text_new` (lower-cased, special characters replaced by spaces),
###    not on the stopword-free / lemmatized columns, so stopwords are part of the features

bow_counts = CountVectorizer(tokenizer= word_tokenize,
                             lowercase=True,
                             ngram_range=(1,1))

bow_data = bow_counts.fit_transform(data.reviews_text_new)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(bow_data, data['sentiment'], test_size=0.2, random_state=0)
X_train_bow, X_test_bow, y_train_bow, y_test_bow = split_parts

In [None]:
# Build the logistic-regression classifier and fit it on the training split
# (fit() returns the estimator itself, so it can be chained).
lr_model_all_new = LogisticRegression(max_iter=200).fit(X_train_bow, y_train_bow)

# Predictions for the held-out split
test_pred_lr_all = lr_model_all_new.predict(X_test_bow)

## Key performance metrics: per-class precision / recall / F1
print(classification_report(y_true=y_test_bow, y_pred=test_pred_lr_all))

In [None]:
# Attach the vectorizer's vocabulary to the model object so it is stored with the model when it is dumped.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(bow_counts, "get_feature_names_out"):
    lr_model_all_new.feature_names = bow_counts.get_feature_names_out().tolist()
else:
    lr_model_all_new.feature_names = bow_counts.get_feature_names()

In [None]:
from joblib import dump, load 

# save model to file 
dump(lr_model_all_new, filename="Sentiment_Analysis_unigram2.joblib")

In [None]:
# import a saved joblib model 
loaded_joblib_model = load(filename="Sentiment_Analysis_unigram2.joblib")

In [None]:
len(loaded_joblib_model.feature_names)

In [None]:
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Vocabulary of the fitted vectorizer, as a list in feature order.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(bow_counts, "get_feature_names_out"):
    feats = bow_counts.get_feature_names_out().tolist()
else:
    feats = bow_counts.get_feature_names()
feats_len = len(feats)

# Sample review to score: lower-case it and delete ASCII punctuation.
sent ='My stay was extremely comfortable. A beautiful hotel surrounded by wonderful staff in a great location.'
sent = sent.lower()
sent = sent.translate(str.maketrans('', '', string.punctuation))

# Drop English stopwords, then rebuild a plain string for the lemmatizer.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(sent)
filtered_sentence = [w for w in word_tokens if w not in stop_words]
listToStr = ' '.join(map(str, filtered_sentence))

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
# NOTE(review): this redefinition returns a *list* of lemmas, whereas the `lemmatize_words`
# defined in the lemmatization section returns a joined string. After this cell runs, the
# name refers to the list-returning version.
def lemmatize_words(text):
    """Tokenize `text` with NLTK, POS-tag it and return the list of lemmas.

    The first letter of each POS tag selects the WordNet part of speech through
    `wordnet_map`; any other tag falls back to noun.
    """
    pos_tagged_text = nltk.pos_tag(word_tokenize(text))
    return [lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text]
lemmatized_output = lemmatize_words(listToStr)

# NOTE(review): the vectorizer was fitted on `reviews_text_new`, which keeps stopwords and is
# not lemmatized, so the stopword removal and lemmatization above do not mirror the training
# text - confirm this is intended.

# Count each lemma, then lay the counts out in vocabulary order (0 for absent words).
sent_dict = {}
for word in lemmatized_output:
    sent_dict[word] = sent_dict.get(word, 0) + 1
sent_features = [sent_dict.get(feature, 0) for feature in feats]

In [None]:
# Second sample review, run through the same steps as the previous cell:
# lower-case, delete punctuation, drop stopwords, lemmatize.
# `string`, `stop_words` and the list-returning `lemmatize_words` come from the previous cell;
# they are reused here instead of being re-created as identical copies.
sent ='The condition of the rooms were very bad. Bed sheets, linens were dirty.'
sent = sent.lower()
sent = sent.translate(str.maketrans('', '', string.punctuation))
word_tokens = word_tokenize(sent)
filtered_sentence = [w for w in word_tokens if w not in stop_words]
listToStr = ' '.join(map(str, filtered_sentence))
lemmatized_output = lemmatize_words(listToStr)
# NOTE(review): `sent_features` is not rebuilt for this sentence, so the later predict call
# still scores the sentence from the previous cell - confirm which one is meant to be scored.
print(lemmatized_output)

In [None]:
len(feats)

In [None]:

df = pd.DataFrame(feats, columns=["features"])
df.to_csv('unigram.csv', index=False)

In [None]:
len(sent_features)

In [None]:
joblib_y_preds = loaded_joblib_model.predict([sent_features])

In [None]:
print(joblib_y_preds)