In [2]:
import importlib
import utils
importlib.reload(utils)

import pandas as pd
import numpy as np
import sklearn

In [3]:
input_data = pd.read_csv('london_hotel_reviews.csv', encoding = 'latin-1')

# set some display options
pd.set_option('display.max_colwidth', None)

"""
Dataset columns:

'Property Name' - name of the hotel, not used
'Review Rating' - integer rating from 1 to 5, this is the 'y' part of our model
'Review Title' - not used 
'Review Text' - this is the 'x' part of our model
'Location Of The Reviewer' - not used
'Date Of Review' - not used
"""

#print(input_data.shape)

"\nDataset columns:\n\n'Property Name' - name of the hotel, not used\n'Review Rating' - integer rating from 1 to 5, this is the 'y' part of our model\n'Review Title' - not used \n'Review Text' - this is the 'x' part of our model\n'Location Of The Reviewer' - not used\n'Date Of Review' - not used\n"

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

def process_dataset(data):
    """
    Returns a DataFrame with 3 columns: Review Rating, Review Text, and Processed Text
    
    Processed text is a list of words ready for feature extraction
    """

    data['temp'] = data['Review Text'].map(utils.in_english)
    data = data[data['temp'] == True]
    data = data.drop(columns = ['temp', 'Property Name', 'Review Title', 'Location Of The Reviewer', 'Date Of Review'])
    
    data['Processed Text'] = data['Review Text'].map(utils.clean_text)

    return data

def get_matrix_data(data):
    """
    Returns a (n, m) matrix and an array of length n
    Bag of 1-grams model

    This function does the actual feature extraction
    """

    vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
    X = vectorizer.fit_transform(data['Processed Text']).toarray()
    y = data['Review Rating'].to_numpy()

    return X, y

def get_matrix_data2(data):
    """
    Same as previous but applies tf-idf over the bag of 1-grams
    """

    vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
    X = vectorizer.fit_transform(data['Processed Text']).toarray()
    X = TfidfTransformer().fit_transform(X).toarray()
    y = data['Review Rating'].to_numpy()

    return X, y


def logistic_classifier(X_train, y_train, _C = 1.0):
    return LogisticRegression(C=_C).fit(X_train, y_train)

In [27]:
processed_data = process_dataset(input_data)
processed_data_bin = processed_data.copy()
processed_data_bin['Review Rating'] = processed_data_bin['Review Rating'].map(lambda x : 1 if x > 3 else 0)

In [10]:
# processed_data.to_csv(r'processed_data_export.csv')
# processed_data_bin.to_csv(r'processed_data_bin_export.csv')

#processed_data = pd.read_csv('processed_data_export.csv', encoding='latin-1')
#processed_data_bin = pd.read_csv('processed_data_bin_export.csv', encoding='latin-1')

In [31]:
#X, y = get_matrix_data(processed_data)
#X_train, y_train, X_test, y_test = utils.split_matrix_data(X, y, 0.9)

X_bin, y_bin = get_matrix_data2(processed_data_bin)
X_bin_train, y_bin_train, X_bin_test, y_bin_test = utils.split_matrix_data(X_bin, y_bin, 0.9)



In [12]:
regression = LogisticRegression(C=1.6,max_iter=150).fit(X_train, y_train)

regression.score(X_test, y_test)

# 5 classes accuracy (rating is from 1 to 5):
# bag of words: ~74%
# tf-idf: ~75%

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7493495229835212

In [17]:
nb = MultinomialNB(force_alpha=True)
nb.fit(X_train, y_train)

nb.score(X_test, y_test)

# 5 classes accuracy (rating is from 1 to 5):
# bag of words: ~72.5%
# tf-idf: ~69%

0.47528187337380745

In [30]:
nb = MultinomialNB(force_alpha=True).fit(X_bin_train, y_bin_train)
nb.score(X_bin_test, y_bin_test)

# 2 classes accuracy (positive or negative):
# bag of words: ~93%
# tf-idf: ~89%

0.9336513443191674