In [2]:
import importlib
import utils
importlib.reload(utils)

import pandas as pd
import numpy as np
import sklearn


In [None]:
# Load the raw hotel reviews into a DataFrame.
# The file is decoded as latin-1 (presumably because it is not valid UTF-8 - TODO confirm).
input_data = pd.read_csv('london_hotel_reviews.csv', encoding = 'latin-1')

# set some display options
# (None removes the column-width limit, so long review texts are shown untruncated)
pd.set_option('display.max_colwidth', None)

# The string below is a bare expression that documents the dataset schema.
# NOTE(review): being the last expression of the cell, it will likely be echoed
# as the cell's output - confirm that this is intended.
"""
Dataset columns:

'Property Name' - name of the hotel, not used
'Review Rating' - integer rating from 1 to 5, this is the 'y' part of our model
'Review Title' - not used 
'Review Text' - this is the 'x' part of our model
'Location Of The Reviewer' - not used
'Date Of Review' - not used
"""

# Uncomment to check the (rows, columns) shape of the loaded data
#print(input_data.shape)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

def process_dataset(data):
    """
    Returns a new DataFrame with the English reviews and their processed text

    Only rows whose 'Review Text' passes utils.in_english are kept. The unused
    metadata columns are removed, which for the documented dataset leaves
    Review Rating and Review Text, and a 'Processed Text' column is appended.

    The DataFrame passed in is never modified, so calling this function (or
    re-running the cell that calls it) has no side effects on the raw data.

    Parameters:
        data - DataFrame holding at least 'Review Text', 'Property Name',
               'Review Title', 'Location Of The Reviewer' and 'Date Of Review'

    Processed Text is whatever utils.clean_text returns for each review
    (presumably a list of words ready for feature extraction - verify in utils)

    Raises KeyError when one of the columns listed above is missing
    """

    # Keep the language flag in a standalone Series instead of writing a
    # helper column into the caller's frame.
    is_english = data['Review Text'].map(utils.in_english)

    unused_columns = ['Property Name', 'Review Title', 'Location Of The Reviewer', 'Date Of Review']

    # '== True' is kept on purpose: rows are selected only when in_english
    # returned a value equal to True (e.g. NaN never selects a row).
    reviews = data[is_english == True].drop(columns=unused_columns)

    # A 'temp' column never appears in the result (older versions of this
    # function left one behind in the input frame); ignore it when absent.
    reviews = reviews.drop(columns=['temp'], errors='ignore')

    # assign() builds a new frame, so nothing is written onto a filtered slice
    return reviews.assign(**{'Processed Text': reviews['Review Text'].map(utils.clean_text)})

def get_matrix_data(data):
    """
    Feature extraction with a bag of 1-grams model

    Counts the tokens found in 'Processed Text' and reads the labels from
    'Review Rating'.

    Returns X, a dense (n, m) matrix of token counts (n rows of data,
    m distinct tokens), and y, an array of length n with the ratings
    """

    def as_is(tokens):
        # the text is already tokenized, hand it back unchanged
        return tokens

    bag_of_words = CountVectorizer(tokenizer=as_is, lowercase=False)
    token_counts = bag_of_words.fit_transform(data['Processed Text'])
    ratings = data['Review Rating'].to_numpy()

    return token_counts.toarray(), ratings

def get_matrix_data2(data):
    """
    Feature extraction with tf-idf applied over the bag of 1-grams

    Counts the tokens found in 'Processed Text', re-weights the counts with
    TfidfTransformer (default settings) and reads the labels from
    'Review Rating'.

    Returns X, a sparse (n, m) matrix of tf-idf weights (n rows of data,
    m distinct tokens), and y, an array of length n with the ratings
    """

    vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

    # The counts stay sparse: TfidfTransformer takes sparse input and returns
    # a sparse matrix anyway, so a dense (n, m) copy in between would only
    # cost memory without changing the result.
    counts = vectorizer.fit_transform(data['Processed Text'])
    X = TfidfTransformer().fit_transform(counts)
    y = data['Review Rating'].to_numpy()

    return X, y


def logistic_classifier(X_train, y_train, _C = 1.0):
    """
    Fits a LogisticRegression on the training data and returns the fitted model

    _C is passed to LogisticRegression as its C parameter
    (inverse of the regularization strength)
    """
    model = LogisticRegression(C=_C)
    model.fit(X_train, y_train)
    return model

In [None]:
# Keep the English reviews only and add the 'Processed Text' column
processed_data = process_dataset(input_data)

In [None]:
# Features are the tf-idf weighted bag of 1-grams; use get_matrix_data instead for raw counts
X, y = get_matrix_data2(processed_data)

In [None]:
# presumably 0.9 is the fraction of rows assigned to the training set - verify in utils.split_matrix_data
X_train, y_train, X_test, y_test = utils.split_matrix_data(X, y, 0.9)

In [None]:
# Fit the classifier on the training split (C=1.6, at most 150 solver iterations)
regression = LogisticRegression(C=1.6,max_iter=150).fit(X_train, y_train)

# Last expression of the cell: mean accuracy on the held-out test split
regression.score(X_test, y_test)

# 5 classes accuracy (rating is from 1 to 5):
# bag of words: ~74%
# tf-idf: ~75%