# Aspect Category Detection - Basic Classifiers

## Load necessary libraries

In [19]:
import re
import numpy as np
import pandas as pd
from asp_agg_utils import _readXML, _clean_text
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# List of classifiers
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

## Load datasets

In [2]:
# Directory holding the restaurant review XML files, relative to this notebook.
data_parent_dir = "../Datasets/"
filenames = [
    "Restaurants_Train.xml",
    "restaurants-trial.xml",
    "Restaurants_Test_Data_phaseB.xml",
]
# Parse every file with the project helper; the list order (train, val, test)
# is what the unpacking on the next line relies on.
datasets = [_readXML(data_parent_dir + filename) for filename in filenames]
train, val, test = datasets

## Preprocessing

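1. Clean text
 - expand contractions
 - remove punctuation
 - collapse consecutive whitespace into a single space
 - strip leading and trailing spaces

 Note: in the sample output below, words such as "our", "is", "the", "with" and "and" are also dropped, while the parentheses and the final period are kept as separate tokens, so this list may not match what `_clean_text` actually does — check its implementation.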

In [3]:
# Apply the project's text cleaner to the review column of every split
# (datasets is ordered train, val, test).
x_train, x_val, x_test = [dataset.review.apply(_clean_text) for dataset in datasets]

# Show one review before and after cleaning as a quick sanity check.
print("Original sentence:\n" + train["review"][7])
print("Preprocessed:\n" + x_train[7])

Original sentence:
Our agreed favorite is the orrechiete with sausage and chicken (usually the waiters are kind enough to split the dish in half so you get to sample both meats).
Preprocessed:
agreed favorite orrechiete sausage chicken ( usually waiters kind enough split dish half get sample meats ) .


2. Convert the label lists into one binary (0/1) indicator column per aspect category

In [4]:
def _to_categorical_columns(df, classes=None):
    """Turn a column of label collections into a 0/1 indicator DataFrame.

    Parameters
    ----------
    df : pandas.Series
        Each element is an iterable of labels for one row
        (e.g. ``["food", "service"]``).
    classes : sequence, optional
        Fixed label vocabulary, in the desired column order. When omitted the
        vocabulary is inferred from ``df`` itself, which means two different
        splits can end up with different columns. Pass the training columns
        here to guarantee that every split is encoded identically.

    Returns
    -------
    pandas.DataFrame
        One indicator column per class, sharing ``df``'s index.
    """
    mlb = MultiLabelBinarizer(classes=classes)
    indicators = mlb.fit_transform(df.values)
    return pd.DataFrame(indicators, columns=mlb.classes_, index=df.index)

In [5]:
y_train, y_val, y_test = [_to_categorical_columns(dataset["aspect"]) for dataset in datasets]

# Each split is binarized on its own, so the indicator columns only line up if
# every split contains the same set of aspects. Fail fast here rather than
# scoring predictions against misaligned label columns later on.
assert list(y_val.columns) == list(y_train.columns), "val label columns differ from train"
assert list(y_test.columns) == list(y_train.columns), "test label columns differ from train"

print(train["aspect"][:5])
y_train.head()

0                          [service]
1    [food, anecdotes/miscellaneous]
2                             [food]
3                          [service]
4          [anecdotes/miscellaneous]
Name: aspect, dtype: object


Unnamed: 0,ambience,anecdotes/miscellaneous,food,price,service
0,0,0,0,0,1
1,0,1,1,0,0
2,0,0,1,0,0
3,0,0,0,0,1
4,0,1,0,0,0


## Build classifier
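A wrapper class that builds a one-vs-rest classifier inside a pipeline (TF-IDF features by default) and can report accuracy, precision, recall and F1.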

In [20]:
class BasicClassifier():
    """One-vs-rest multi-label classifier wrapped in a scikit-learn ``Pipeline``.

    Parameters
    ----------
    clf_type : str
        ``"NB"`` (multinomial naive Bayes), ``"SVM"`` (linear SVC) or ``"DT"``
        (decision tree). Any other value falls back to logistic regression.
    emb : object, optional
        Feature-extraction step placed in front of the classifier. When
        ``None`` a default ``TfidfVectorizer`` is used. Presumably this must be
        a scikit-learn compatible transformer, since it is used as a
        ``Pipeline`` step.
    """

    def __init__(self, clf_type, emb = None):
        # Forward ``emb`` to the factory; it used to be dropped here, so a
        # custom embedding step was silently replaced by TF-IDF.
        self.classifier = self._create_classifier(clf_type, emb)

    def _create_classifier(self, clf_type, emb = None):
        """Build the one-vs-rest estimator for ``clf_type`` and wrap it in a pipeline."""
        if clf_type == "NB":
            clf = OneVsRestClassifier(MultinomialNB(fit_prior = True, class_prior = None))
        elif clf_type == "SVM":
            clf = OneVsRestClassifier(LinearSVC())
        elif clf_type == "DT":
            # Fixed seed so the tree is reproducible between runs.
            clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
        else:
            # Unrecognized type names fall back to logistic regression.
            clf = OneVsRestClassifier(LogisticRegression())
        return self._classifier_wrapper(clf, emb)

    def _classifier_wrapper(self, clf, emb):
        """Return a two-step pipeline: feature extraction followed by ``clf``."""
        if emb is None:
            return Pipeline([
                            ('tfidf', TfidfVectorizer()),
                            ('clf', clf),
                            ])
        else:
            return Pipeline([
                            ('emb', emb),
                            ('clf', clf),
                            ])

    def fit(self, data, labels):
        """Fit the whole pipeline on ``data`` and ``labels``."""
        self.classifier.fit(data, labels)

    def predict(self, data, labels = None):
        """Predict labels for ``data``, or score the predictions against ``labels``.

        Returns the raw predictions when ``labels`` is ``None``. Otherwise
        returns a tuple ``(accuracy, prf)`` where ``prf`` is the result of
        ``precision_recall_fscore_support(labels, preds)``.
        """
        preds = self.classifier.predict(data)
        if labels is None:
            return preds
        return accuracy_score(labels, preds), precision_recall_fscore_support(labels, preds)

A function that builds a model, trains it, and reports its prediction performance on the validation and test sets

In [42]:
def _report_split_metrics(clf, split_name, data, labels):
    """Score ``clf`` on one split and print accuracy and label-averaged P/R/F1."""
    acc, prf1 = clf.predict(data, labels)
    print("{} acc: {:.4f}".format(split_name, acc))

    # prf1 holds per-label arrays (precision, recall, f-score, support);
    # average the first three over the labels to get one number each.
    precision, recall, f1 = (np.average(scores) for scores in prf1[:3])
    print("{} precision: {:.4f}".format(split_name, precision))
    print("{} recall: {:.4f}".format(split_name, recall))
    print("{} f1: {:.4f}".format(split_name, f1))


def build_and_run_model(clf_type, x_train, y_train, x_val = None, y_val = None, x_test = None, y_test = None):
    """Train a ``BasicClassifier`` and print its validation/test metrics.

    Parameters
    ----------
    clf_type : str
        Classifier type name passed straight to ``BasicClassifier``.
    x_train, y_train
        Training texts and their binary label columns.
    x_val, y_val : optional
        Validation texts and labels; metrics are printed when ``x_val`` is given.
    x_test, y_test : optional
        Test texts and labels; metrics are printed when ``x_test`` is given.

    Returns
    -------
    BasicClassifier
        The fitted classifier.

    Raises
    ------
    ValueError
        If features are supplied for a split without the matching labels.
    """
    # Without labels there is nothing to score against; catch this before the
    # (potentially slow) training step instead of failing afterwards.
    if x_val is not None and y_val is None:
        raise ValueError("y_val is required when x_val is given")
    if x_test is not None and y_test is None:
        raise ValueError("y_test is required when x_test is given")

    clf = BasicClassifier(clf_type)
    print('-------- Processing: {}--------'.format(clf_type))
    clf.fit(x_train, y_train)

    if x_val is not None:
        _report_split_metrics(clf, "Val", x_val, y_val)
        print()  # blank line between the validation and test reports

    if x_test is not None:
        _report_split_metrics(clf, "Test", x_test, y_test)
    return clf

Initialize each classifier, train it, and report its scores

`Naive Bayes (NB)`

In [43]:
# Train naive Bayes and print its metrics; the returned model is discarded.
_ = build_and_run_model("NB", x_train, y_train, x_val=x_val, y_val=y_val, x_test=x_test, y_test=y_test)

-------- Processing: NB--------
Val acc: 0.7700
Val precision: 0.9902
Val recall: 0.5840
Val f1: 0.6726

Test acc: 0.4587
Test precision: 0.9634
Test recall: 0.3255
Test f1: 0.4274


`Support Vector Machine (SVM)`

In [44]:
# Train the linear SVM and print its metrics; the returned model is discarded.
_ = build_and_run_model("SVM", x_train, y_train, x_val=x_val, y_val=y_val, x_test=x_test, y_test=y_test)

-------- Processing: SVM--------
Val acc: 0.9700
Val precision: 0.9907
Val recall: 0.9790
Val f1: 0.9843

Test acc: 0.6837
Test precision: 0.9029
Test recall: 0.7154
Test f1: 0.7958


`Decision Tree (DT)`

In [45]:
# Train the decision tree and print its metrics; the returned model is discarded.
_ = build_and_run_model("DT", x_train, y_train, x_val=x_val, y_val=y_val, x_test=x_test, y_test=y_test)

-------- Processing: DT--------
Val acc: 0.9800
Val precision: 0.9957
Val recall: 0.9785
Val f1: 0.9867

Test acc: 0.5513
Test precision: 0.7535
Test recall: 0.7717
Test f1: 0.7592


`Logistic Regression`

In [46]:
# "Default" matches none of the named types, so BasicClassifier falls back to
# logistic regression. The returned model is discarded.
_ = build_and_run_model("Default", x_train, y_train, x_val=x_val, y_val=y_val, x_test=x_test, y_test=y_test)

-------- Processing: Default--------
Val acc: 0.8500
Val precision: 0.9856
Val recall: 0.7881
Val f1: 0.8567

Test acc: 0.5487
Test precision: 0.9470
Test recall: 0.5005
Test f1: 0.6214




## Scrap code which was not used