In [1]:
from spacy.lang.en import English

In [3]:
import spacy

In [7]:
nlp = spacy.load('en_core_web_md')

In [11]:
import pandas as pd
import numpy as np

In [12]:
reviews = pd.read_csv('tripadvisor_hotel_reviews.csv')
reviews.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [13]:
#Create Columns for Excellent, Satisfactory, and Unsatisfactory ratings

reviews['Excellent'] = np.where(reviews['Rating'] == 5, 1, 0)
reviews['Satisfactory'] = np.where((reviews['Rating'] == 4) | (reviews['Rating'] == 3) , 1, 0)
reviews['Unsatisfactory'] = np.where((reviews['Rating'] == 2) | (reviews['Rating'] == 1) , 1, 0)

In [14]:
# create one column for feature engineering
conditions = [ (reviews['Rating'] <= 2),
            (reviews['Rating'] <= 4), 
            (reviews['Rating'] ==5)
    ]

# create a list of the values we want to assign for each condition
values = ['unsat', 'satis', 'excel']

# create a new column and use np.select to assign values to it using our lists as arguments
reviews['Label'] = np.select(conditions, values)

In [15]:
excellentReviews=reviews.Excellent.sum()
satisfactoryReviews= reviews.Satisfactory.sum()
unsatisfactoryReviews = reviews.Unsatisfactory.sum()
allReviews = len(reviews)

#Double check assignments were done properly with simple T/F test
excellentReviews + satisfactoryReviews + unsatisfactoryReviews == allReviews

True

In [16]:
print('The percentage of excellent reviews is', "{0:.0%}".format(excellentReviews/allReviews) )
print('The percentage of satisfactory reviews is', "{0:.0%}".format(satisfactoryReviews/allReviews) )
print('The percentage of unsatisfactory reviews is', "{0:.0%}".format(unsatisfactoryReviews/allReviews) )

The percentage of excellent reviews is 44%
The percentage of satisfactory reviews is 40%
The percentage of unsatisfactory reviews is 16%


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re
import spacy
from spacy.lang.en import English
parser = English()

In [10]:
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”"]
class CleanTextTransformer(TransformerMixin):
   def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
   def fit(self, X, y=None, **fit_params):
        return self
def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text
def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    return tokens

In [17]:
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”"]
class CleanTextTransformer(TransformerMixin):
   def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
   def fit(self, X, y=None, **fit_params):
        return self
def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text
def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    return tokens

In [20]:
from sklearn.model_selection import train_test_split

X = reviews['Review']
y = reviews.Label

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 53)

In [23]:
def printNMostInformative(vectorizer, clf, N):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)


vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
clf = LinearSVC()

pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# data

train1 = reviews['Review'].tolist()
labelsTrain1 = reviews['Label'].tolist()
test1 = reviews['Review'].tolist()
labelsTest1 = reviews['Label'].tolist()
# train
pipe.fit(train1, labelsTrain1)
# test
preds = pipe.predict(test1)
print("accuracy:", accuracy_score(labelsTest1, preds))
print("Top 10 features used to predict: ")

printNMostInformative(vectorizer, clf, 10)
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
transform = pipe.fit_transform(train1, labelsTrain1)
vocab = vectorizer.get_feature_names()
for i in range(len(train1)):
    s = ""
    indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]
    numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]
    for idx, num in zip(indexIntoVocab, numOccurences):
        s += str((vocab[idx], num))



accuracy: 0.1832999853594261
Top 10 features used to predict: 
Class 1 best: 
(-0.013274212938553912, '')
Class 2 best: 
(-0.013274212938553912, '')


In [24]:
from sklearn import metrics
print(metrics.classification_report(labelsTest1, preds, 
                                    target_names=reviews['Label'].unique()))

              precision    recall  f1-score   support

       satis       0.35      0.00      0.00      9054
       unsat       0.38      0.09      0.14      8223
       excel       0.16      0.94      0.28      3214

    accuracy                           0.18     20491
   macro avg       0.30      0.34      0.14     20491
weighted avg       0.33      0.18      0.10     20491



# New Attempts

In [26]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_md')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [27]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [28]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [38]:
from sklearn.model_selection import train_test_split

X = reviews['Review'] # the features we want to analyze
ylabels = reviews['Excellent'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [39]:
reviews.head()

Unnamed: 0,Review,Rating,Excellent,Satisfactory,Unsatisfactory,Label
0,nice hotel expensive parking got good deal sta...,4,0,1,0,satis
1,ok nothing special charge diamond member hilto...,2,0,0,1,unsat
2,nice rooms not 4* experience hotel monaco seat...,3,0,1,0,satis
3,"unique, great stay, wonderful time hotel monac...",5,1,0,0,excel
4,"great stay great stay, went seahawk game aweso...",5,1,0,0,excel


In [40]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x00000254DA6F8940>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [None]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))