# Logistic Regression

## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np

# Train Test Split

In [2]:
# Load the cleaned hotel-review dataset from the local data directory
df = pd.read_csv('./data/cleaned_hotelreviews.csv')

# 80% of the rows go to the training split and the remainder to the test
# split; the fixed random_state keeps the partition reproducible across runs.
training_data, testing_data = train_test_split(
    df,
    train_size=0.8,
    random_state=3000,
)

# TFIDF vectors

In [3]:
# Get labels
Y_train = training_data['class'].values
Y_test = testing_data['class'].values

# Get features
# TF-IDF weighting with English stop words removed; terms appearing in more
# than 95% of the documents are dropped (max_df = 0.95).
tfidf_vectorizer = TfidfVectorizer(use_idf = True,stop_words = 'english', max_df = 0.95)

# fit_transform learns the vocabulary/IDF weights from the training reviews and
# returns their document-term matrix in a single pass, so the training corpus
# is not vectorized twice. The reviews are cast to unicode strings first
# (presumably to coerce missing/non-string entries -- confirm against the data).
X_train = tfidf_vectorizer.fit_transform(training_data['reviews'].values.astype('U'))

# The test reviews are only transformed with the vocabulary learned above
X_test = tfidf_vectorizer.transform(testing_data['reviews'].values.astype('U'))

# Alias used by the later cells that vectorize new, unseen text
feature_transformer = tfidf_vectorizer

# Train Model

In [4]:
# Configure the logistic regression classifier (L2 penalty, liblinear solver)
scikit_log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)

# 10-fold cross-validated accuracy on the training split.
# NOTE(review): X_train was vectorized with a TF-IDF fitted on the whole
# training split, so each validation fold has already influenced the
# vocabulary/IDF weights -- the CV score may be slightly optimistic.
scores = cross_val_score(scikit_log_reg, X_train, Y_train, cv=10, scoring='accuracy')
print(scores.mean())

# Final fit on the full training split; the fitted estimator is kept as `model`
model = scikit_log_reg.fit(X_train, Y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]0.919060429866023
[LibLinear]

# Evaluate Accuracy

In [5]:
# Predict labels for the held-out test split and report mean accuracy
Y_pred = model.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.9f}'.format(model.score(X_test, Y_test)))

# Store the result under its own name: assigning it to `confusion_matrix`
# would shadow the imported sklearn function and make this cell fail with
# "'numpy.ndarray' object is not callable" when it is run a second time.
conf_mat = confusion_matrix(Y_test, Y_pred)
print(conf_mat)

Accuracy of logistic regression classifier on test set: 0.920688394
[[65807  6148]
 [ 7083 87785]]


# Test New Data

In [6]:
# Vectorize one unseen review with the fitted transformer, then classify it
test_features = feature_transformer.transform(
    ['the toilet is unclean service but however, waiter did a good job in ensuring our needs are met']
)
test_pred = model.predict(test_features)
test_pred

array(['negative'], dtype=object)

In [7]:
# IGNORE BELOW #

# Accuracy

Accuracy evaluates the fraction of correct predictions. In our case, it is the number of times the PRIMARY category appeared among the top-k predicted categories (k is the `top_k` value set in the evaluation cell) divided by the total number of categorization tasks.

## MRR

Unlike accuracy, MRR takes the rank of the first correct answer into consideration (in our case rank of the correctly predicted PRIMARY category). The formula for MRR is as follows:

![image.png](attachment:image.png)

$$\mathrm{MRR} = \frac{1}{|Q|} \sum_{i=1}^{|Q|} \frac{1}{\mathrm{rank}_i}$$

where $Q$ is the set of all classification tasks in our test set and $\mathrm{rank}_i$ is the position of the correctly predicted category for the $i$-th task. The closer the correctly predicted category is to the top of the ranking, the higher the MRR.

In [8]:
def get_top_k_predictions(model,X_test,k):
    """Return the k most probable class labels for every sample.

    Args:
        model: fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: feature matrix accepted by ``model.predict_proba``.
        k: number of labels to keep per sample; must be at least 1. If k
            exceeds the number of classes, all classes are returned.

    Returns:
        A list with one inner list per sample, holding up to k labels ordered
        from most to least probable.

    Raises:
        ValueError: if k is smaller than 1. (With k == 0 the slice ``[-k:]``
            would silently select every class instead of none.)
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))

    # get probabilities instead of predicted labels, since we want the top k
    probs = model.predict_proba(X_test)

    # argsort is ascending, so the last k columns are the k most probable
    # class indices; reversing them yields descending order of probability
    best_n = np.argsort(probs, axis=1)[:, -k:][:, ::-1]

    # map class indices back to their labels
    return [[model.classes_[class_idx] for class_idx in row] for row in best_n]

def collect_preds(Y_test,Y_preds):
    """Pair each sample's ground-truth label with its predicted labels.

    Returns a list of ``[[true_label], predictions]`` entries, one per element
    of ``Y_preds``; the true label is wrapped in a single-element list and is
    looked up in ``Y_test`` by position.
    """
    paired = []
    for position, predicted in enumerate(Y_preds):
        gold = [Y_test[position]]
        paired.append([gold, predicted])
    return paired

def compute_accuracy(eval_items:list):
    """Fraction of items whose predictions contain at least one true label.

    Args:
        eval_items: sequence of ``[true_labels, predicted_labels]`` entries,
            as produced by ``collect_preds``.

    Returns:
        Number of hits divided by the number of items, as a float. An empty
        ``eval_items`` yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not eval_items:
        return 0.0

    correct = 0
    for item in eval_items:
        # set gives constant-time membership checks on the predicted labels
        machine_pred = set(item[1])

        # an item counts once, no matter how many of its true labels match
        if any(cat in machine_pred for cat in item[0]):
            correct += 1

    return correct / float(len(eval_items))

def compute_mrr_at_k(items:list):
    """Compute the mean reciprocal rank (MRR) over all evaluation items.

    Args:
        items: sequence of ``[true_labels, ranked_predictions]`` entries. The
            cutoff k is implied by the length of each prediction list.

    Returns:
        The average of the per-item reciprocal ranks, as a float. An empty
        ``items`` yields 0.0 (previously this raised ``UnboundLocalError``
        because the result was only assigned inside the loop).
    """
    if not items:
        return 0.0

    rr_total = 0
    for item in items:
        rr_total = rr_total + _reciprocal_rank(item[0], item[1])

    # divide once, after all reciprocal ranks have been accumulated
    return rr_total / float(len(items))

def _reciprocal_rank(true_labels: list, machine_preds: list):
    """Compute the reciprocal rank of the first correct prediction.

    Args:
        true_labels: the ground-truth labels of one item.
        machine_preds: predicted labels, best first.

    Returns:
        ``1 / rank`` of the first prediction found in ``true_labels``
        (ranks start at 1), or 0.0 when no prediction is correct.
    """
    for rank, predicted in enumerate(machine_preds, start=1):
        if predicted in true_labels:
            # only the first correct position matters for RR
            return 1 / float(rank)

    return 0.0

In [9]:
# Evaluate accuracy and MRR on the test split
top_k = 1
preds = get_top_k_predictions(model, X_test, top_k)

# Pair every ground-truth label with its ranked predictions (list of lists)
eval_items = collect_preds(Y_test, preds)

# Evaluation numbers on the test set
accuracy = compute_accuracy(eval_items)
mrr_at_k = compute_mrr_at_k(eval_items)

# The metric is the mean reciprocal rank, so the label reads "MRR"
print('Accuracy={0}; MRR={1}'.format(accuracy,mrr_at_k))

Accuracy=0.9206883942861596; MMR=0.9206883942861596


# TEST WITH NEW DATA

In [10]:
# Vectorize a short unseen review and show its single most probable class
test_features = feature_transformer.transform(
    ['the toilet is clean service']
)
get_top_k_predictions(model, test_features, 1)

[['positive']]