# Naive Bayes Aspect classification

"""
AIT726 Final project - Part 1 - Aspect classification using the Naive Bayes algorithm on the SemEval’16 dataset (1708 training and 587 testing instances) and the Foursquare dataset (849 testing instances)

Authors: Yasas, Prashanti, Ashwini

Command to run the file: run aspect_classification.ipynb

Flow: 
Data loading -  For all the restaurant reviews we parsed the XML files and retrieved the reviews for SemEval’16 and foursquare datasets.

Preprocessing and Feature Extraction - Tokenization and lemmatization of reviews are performed to get word tokens and their root words. Text vectorization is then performed using a term frequency-inverse document frequency (TF-IDF) vectorizer with an n-gram range of 1-3. In addition to the TF-IDF features, we provided predefined term categories (obtained from a vocabulary file) for each word in the input review sentence as features for aspect classification.
 
Baseline Models: Naive Bayes Classifier 

Cross-Validation and Error Analysis: Performed five-fold cross-validation on the training data and performed the error analysis using the predictions obtained in cross-validation. 

Train Models: Trained the models using the optimal hyper-parameters explored in the cross validation process on the whole training dataset. 

Evaluation: Evaluated the trained models on the test data (SemEval’16 and Foursquare datasets)

Note : For more details please check README file
"""

In [None]:
%matplotlib inline

    """ Imported all required libraries """

In [None]:
from absa.config import DATA_PATHS
from absa.dataset import load_dataset
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import SVC
from sklearn.metrics import f1_score

import nltk
from nltk.corpus import stopwords

# Load the spaCy English pipeline once; `nlp` is called by lemmatize() to
# obtain the lemma of every token in a sentence.
# NOTE(review): the 'en' shortcut name is presumably only resolvable on older
# spaCy releases - confirm against the installed version.
import spacy
nlp = spacy.load('en')

# Build the set of NLTK English stopwords.
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus
# reader to a plain set, so `stopwords.words(...)` is unavailable afterwards.
stopwords = set(stopwords.words('english'))

    """Load and display semeval16 training dataset
       Load - Load dataset using load_dataset method ( Reads formatted XML file from the provided path )"""

In [None]:
# Path of the raw SemEval'16 training file, looked up in the project config.
train_ds_path = DATA_PATHS['asba.semeval16.raw.train']

# presumably one row per (sentence id, category) annotation - verify against
# absa.dataset.load_dataset
df_train = load_dataset(train_ds_path)

# Collapse to one row per sentence id: keep the first `text` seen for the id
# and gather all of its `category` values into a list (the multi-label target).
df_train = pd.DataFrame({
    'text': df_train.groupby('id')['text'].first(),
    'categories': df_train.groupby('id')['category'].apply(list),
})

# Notebook display of the first rows.
df_train.head()

    """Create lookup dictionary using the vocabulary.txt ( In this Vocabulary we have mappings of words to aspect                  categories ) 

In [None]:
# Map every vocabulary term (lower-cased) to the index of the section it
# appears in. Sections of vocabulary.txt are separated by a blank line and
# hold one term per line; the section index is used as the category id.
lookup = {}
with open('../resources/vocabulary.txt', 'r', encoding='utf-8') as f:
    for idx, section in enumerate(f.read().split('\n\n')):
        for key in section.split('\n'):
            # A trailing newline or an extra blank line yields an empty (or
            # whitespace-only) entry. Such a key is a substring of virtually
            # every sentence and would add a bogus count to its category for
            # all inputs of find_terms, so it is not registered.
            if not key.strip():
                continue
            lookup[key.lower()] = idx
# Notebook display of the resulting mapping.
lookup

    """ find_terms - Looks for categories for each word in the sentence """

In [None]:
def find_terms(sents):
    """Yield, per sentence, how many vocabulary terms of each category it contains.

    Every key of the module-level ``lookup`` dict is tested as a
    case-insensitive substring of the sentence, so a key contributes at most
    one count per sentence regardless of how often it occurs.

    Args:
        sents: Iterable of sentence strings.

    Yields:
        dict mapping a category index (a value of ``lookup``) to the number
        of distinct ``lookup`` keys of that category found in the sentence.
        A sentence without any match yields an empty dict.
    """
    for sent in sents:
        # Lower-case once per sentence instead of once per vocabulary key.
        lowered = sent.lower()
        term_keys = {}
        for key, category in lookup.items():
            # The empty string is a substring of every string; counting it
            # would inflate its category for every sentence, so skip it.
            if key and key in lowered:
                term_keys[category] = term_keys.get(category, 0) + 1
        yield term_keys

#list(find_terms(['Pizza was too hot!']))

# Extract Features

In [None]:
    """ lemmatize  """

In [None]:
def lemmatize(x):
    """Return the text *x* with each token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline; the ``lemma_``
    of every resulting token is collected in order and the lemmas are joined
    with single spaces.
    """
    lemmas = []
    for tok in nlp(x):
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

    """ Feature extraction - Text Vectorization using TF-IDF (Ngrams considered - Unigram, bigram and trigram)
        The obtained key value pairs are vectorized using DictVectorizer. Later a featureunion of both results are 
        performed """

In [None]:
# Feature extractor: the outputs of the two branches below are combined
# into a single feature matrix by make_union.
pipeline = make_union(
    # Stopwords: kept - no `stop_words` argument is passed (removal was
    # dropped as part of error analysis)
    # Lemmatize: Yes
    # Branch 1: TF-IDF over uni-, bi- and tri-grams of the lemmatized text.
    # NOTE(review): supplying `preprocessor` presumably bypasses the
    # vectorizer's built-in lower-casing - confirm that casing is handled
    # as intended by lemmatize().
    TfidfVectorizer(preprocessor=lemmatize, ngram_range=(1, 3)),
    # Branch 2: the per-sentence {category index: count} dicts produced by
    # find_terms, turned into numeric columns by DictVectorizer.
    make_pipeline(
        FunctionTransformer(lambda x: list(find_terms(x))),
        DictVectorizer()
    ),
)


# Generate the training dataset: fit both branches on the training sentences
# and convert the result to a dense array (presumably because GaussianNB
# does not accept sparse input - confirm).
x_train = pipeline.fit_transform(df_train.text).toarray()

# Notebook display of the feature-matrix shape.
x_train.shape

# Prepare Labels

    """As we have multiple aspects we have used MultiLabelBinarizer to create y_train"""

In [None]:
# Binarizer that turns per-sentence category lists into a multi-label
# indicator matrix.
mlb = MultiLabelBinarizer()

# Fitting on the training labels fixes the set of categories that `mlb`
# encodes; the same fitted object is reused for the test sets.
y_train = mlb.fit_transform(df_train.categories)

# Notebook display of the label-matrix shape.
y_train.shape

# Cross Validation

    """ multiclass/multilabel Gaussian Naive Bayes model is used to train semeval training dataset with 5-fold                     cross validation. Respective precision_micro, recall_micro and f1_micro scores are presented """

In [None]:
# Micro-averaged metrics computed for every fold.
scoring = ['precision_micro', 'recall_micro', 'f1_micro']

# One GaussianNB per category (one-vs-rest) to handle the multi-label target.
clf = OneVsRestClassifier(GaussianNB())

# 5-fold cross-validation on the training features and labels.
scores = cross_validate(clf, x_train, y_train, scoring=scoring, cv=5)

# Report each metric averaged over the folds.
print('Micro Precision Score\t', np.average(scores['test_precision_micro']))
print('Micro Recall Score\t', np.average(scores['test_recall_micro']))
print('Micro F1 Score\t\t', np.average(scores['test_f1_micro']))

# Error Analysis

    """ Export results to an excel for performing error Analysis """

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# Cross-validated (5-fold) predictions for the training sentences, used for
# error analysis.
train_pred = cross_val_predict(clf, x_train, y_train, cv=5)

In [None]:
# Map the binary prediction matrix back to category labels per sentence.
prds = mlb.inverse_transform(train_pred)

In [None]:
# Store the predicted categories next to the gold ones, as plain lists.
df_train['predictions'] = [list(x) for x in prds]

In [None]:
# Make sure the target folder exists first: DataFrame.to_excel does not
# create missing parent directories and would fail on a fresh checkout.
import os
os.makedirs('./output', exist_ok=True)

# Export sentences, gold categories and cross-validated predictions to a
# spreadsheet for manual error analysis.
df_train.to_excel('./output/ac_nb.xlsx')

# Train Model

    """Model fit"""

In [None]:
# Fit the final classifier on the whole training set; this fitted `clf` is
# the one used for the test-set predictions below.
clf.fit(x_train, y_train)

# Evaluate Model

    """Load and display semeval16 testing dataset
       Load - Load dataset using load_dataset method ( Reads formatted XML file from the provided path )"""

In [None]:
# Path of the SemEval'16 gold-annotated test file, looked up in the project
# config.
test_ds_path = DATA_PATHS['asba.semeval16.raw.test.gold']

# presumably one row per (sentence id, category) annotation - verify against
# absa.dataset.load_dataset
df_test = load_dataset(test_ds_path)

# Collapse to one row per sentence id: first `text` seen for the id plus the
# list of all its `category` values.
df_test = pd.DataFrame({
    'text': df_test.groupby('id')['text'].first(),
    'categories': df_test.groupby('id')['category'].apply(list),
})

# Notebook display of the first rows.
df_test.head()

    """ Predict the results on semval2016 testing dataset """

In [None]:
# Testing: transform only (no refit), so the test sentences are encoded with
# the feature extractor fitted on the training data, then densified.
x_test = pipeline.transform(df_test.text).toarray()

# Predicted multi-label indicator matrix from the fitted classifier.
y_pred = clf.predict(x_test)

# Notebook display of the prediction-matrix shape.
y_pred.shape

In [None]:
# Encode the gold test categories with the binarizer fitted on the training
# labels, so the columns line up with `y_pred`.
# NOTE(review): categories unseen during fitting are presumably dropped with
# a warning - confirm.
y_true = mlb.transform(df_test.categories)

# Notebook display of the label-matrix shape.
y_true.shape

    """ Evaluate the results for semval2016 testing dataset """"

In [None]:
# Micro-averaged F1 on the SemEval'16 test set, shown as a (label, value)
# pair by the notebook.
('f1_score', f1_score(y_true, y_pred, average='micro'))

# Foursquare dataset

    """Load and display Foursquare testing dataset
       Load - Load dataset using load_dataset method ( Reads formatted XML file from the provided path )"""

In [None]:
# Path of the Foursquare gold-annotated test file, looked up in the project
# config.
test_fs_ds_path = DATA_PATHS['asba.foursquare.raw.test.gold']

# presumably one row per (sentence id, category) annotation - verify against
# absa.dataset.load_dataset
df_test_fs = load_dataset(test_fs_ds_path)

# Collapse to one row per sentence id: first `text` seen for the id plus the
# list of all its `category` values.
df_test_fs = pd.DataFrame({
    'text': df_test_fs.groupby('id')['text'].first(),
    'categories': df_test_fs.groupby('id')['category'].apply(list),
})

# Notebook display of the first rows.
df_test_fs.head()

    """ Predict the results on Foursquare testing dataset """

In [None]:
# Testing: transform only (no refit), so the Foursquare sentences are encoded
# with the feature extractor fitted on the SemEval training data, then
# densified.
x_test_fs = pipeline.transform(df_test_fs.text).toarray()

# Predicted multi-label indicator matrix from the fitted classifier.
y_pred_fs = clf.predict(x_test_fs)

# Notebook display of the prediction-matrix shape.
y_pred_fs.shape

In [None]:
# Encode the gold Foursquare categories with the binarizer fitted on the
# training labels, so the columns line up with `y_pred_fs`.
# NOTE(review): categories unseen during fitting are presumably dropped with
# a warning - confirm.
y_true_fs = mlb.transform(df_test_fs.categories)

# Notebook display of the label-matrix shape.
y_true_fs.shape

    """ Evaluate the results for Foursquare testing dataset """"

In [None]:
# Micro-averaged F1 on the Foursquare test set, shown as a (label, value)
# pair by the notebook.
('f1_score', f1_score(y_true_fs, y_pred_fs, average='micro'))