In [35]:
%matplotlib inline

In [36]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

from absa.config import DATA_PATHS
from absa.dataset import load_dataset

# Load the spaCy pipeline once for the whole notebook; `lemmatize` calls this
# object on every text it receives.
# NOTE(review): 'en' is a shortcut-link name; shortcut links were removed in
# spaCy v3 -- confirm the installed spaCy version supports it.
nlp = spacy.load('en')

In [37]:
# Path of the dataset registered under the raw SemEval-16 training key.
train_ds_path = DATA_PATHS['asba.semeval16.raw.train']

# Keep only the four columns used below, then collect the polarity labels of
# every (id, text, category) combination into a single list per row.
df_train = (
    load_dataset(train_ds_path)
    .loc[:, ['id', 'text', 'category', 'polarity']]
    .groupby(['id', 'text', 'category'])['polarity']
    .apply(list)
    .reset_index(name='polarity')
)

df_train.head()

Unnamed: 0,id,text,category,polarity
0,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,[negative]
1,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,[negative]
2,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,[negative]
3,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,[negative]
4,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#STYLE_OPTIONS,[negative]


# Prepare Labels

In [38]:
# Binarize the per-row polarity lists: one indicator column per distinct label
# found while fitting. The fitted `mlb` is reused later for the test labels.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(df_train['polarity'])

y_train.shape

(2258, 3)

# Extract Features

In [39]:
def lemmatize(x):
    """Return ``x`` rewritten as its token lemmas separated by single spaces.

    The text is passed to the module-level ``nlp`` callable and the ``lemma_``
    attribute of every resulting token is joined with ``' '``.
    """
    lemmas = (token.lemma_ for token in nlp(x))
    return ' '.join(lemmas)

def select_column(df, column):
    """Return the values of a single column of ``df``.

    Args:
        df: Object supporting ``.loc[:, column]`` label indexing (a pandas
            DataFrame in this notebook).
        column: Label used as the column selector.

    Returns:
        The ``.values`` attribute of the selected column.
    """
    selected = df.loc[:, column]
    return selected.values


def reshape_array(array, shape):
    """Reshape ``array`` to ``shape``.

    Args:
        array: Object exposing a ``reshape`` method (a NumPy array here).
        shape: Iterable of dimension sizes; its entries are unpacked and
            passed to ``array.reshape`` as separate positional arguments.

    Returns:
        Whatever ``array.reshape`` returns for the given dimensions.
    """
    reshaped = array.reshape(*shape)
    return reshaped

In [40]:
# Feature union of two branches, both fed the same DataFrame:
#   1. TF-IDF over 1- to 3-grams of the lemmatised `text` column;
#   2. one-hot encoding of the `category` column.
pipeline = make_union(
    make_pipeline(
        FunctionTransformer(select_column, kw_args={'column': 'text'}),
        # Passing `preprocessor` replaces TfidfVectorizer's own preprocessing
        # stage, so `lemmatize` receives the raw text.
        TfidfVectorizer(preprocessor=lemmatize, ngram_range=(1, 3)),
    ),
    make_pipeline(
        FunctionTransformer(select_column, kw_args={'column': 'category'}),
        # Turn the 1-D column into a single-feature 2-D array for the encoder.
        FunctionTransformer(reshape_array, kw_args={'shape': (-1, 1)}),
        # Encode categories that were not seen during fit as an all-zero row
        # instead of raising when the fitted pipeline transforms new data.
        OneHotEncoder(handle_unknown='ignore'),
    ),
)

# The union output is sparse; convert it to a dense array.
x_train = pipeline.fit_transform(df_train).toarray()

x_train.shape

(2258, 28231)

# Cross Validation

In [41]:
# Micro-averaged metrics to collect on every cross-validation fold.
scoring = ['precision_micro', 'recall_micro', 'f1_micro']

# One Gaussian naive Bayes estimator per label column.
clf = OneVsRestClassifier(GaussianNB())

scores = cross_validate(
    clf,
    x_train,
    y_train,
    scoring=scoring,
    cv=3,
)

# Report the mean of each metric over the folds.
print('Micro Precision Score\t', scores['test_precision_micro'].mean())
print('Micro Recall Score\t', scores['test_recall_micro'].mean())
print('Micro F1 Score\t\t', scores['test_f1_micro'].mean())

# -----------------------------------------

# Without #AspectCategory As Feature

# Micro Precision Score	 0.569311128799821
# Micro Recall Score	 0.5921557834734619
# Micro F1 Score		 0.5804751685040593

# -----------------------------------------

# With #AspectCategory as Feature

# Micro Precision Score	 0.5703663218941576
# Micro Recall Score	 0.5917189114769569
# Micro F1 Score		 0.5808142238635255

Micro Precision Score	 0.6634040895737204
Micro Recall Score	 0.6486659359280148
Micro F1 Score		 0.655863225325498


# Train Model

In [42]:
# Fit the one-vs-rest classifier on the complete training matrix; this fitted
# `clf` is the model the evaluation cells predict with.
clf.fit(x_train, y_train)

OneVsRestClassifier(estimator=GaussianNB(priors=None, var_smoothing=1e-09),
                    n_jobs=None)

# Evaluate Model

In [43]:
# Path of the dataset registered under the raw SemEval-16 gold test key.
test_ds_path = DATA_PATHS['asba.semeval16.raw.test.gold']

# Same preparation as the training frame: keep the four columns used below and
# collect the polarity labels of each (id, text, category) into one list.
df_test = (
    load_dataset(test_ds_path)
    .loc[:, ['id', 'text', 'category', 'polarity']]
    .groupby(['id', 'text', 'category'])['polarity']
    .apply(list)
    .reset_index(name='polarity')
)

df_test.head()

Unnamed: 0,id,text,category,polarity
0,en_BlueRibbonSushi_478218171:0,Yum!,FOOD#QUALITY,[positive]
1,en_BlueRibbonSushi_478218171:1,Serves really good sushi.,FOOD#QUALITY,[positive]
2,en_BlueRibbonSushi_478218171:2,Not the biggest portions but adequate.,FOOD#STYLE_OPTIONS,[neutral]
3,en_BlueRibbonSushi_478218171:3,Green Tea creme brulee is a must!,FOOD#QUALITY,[positive]
4,en_BlueRibbonSushi_478218171:4,Don't leave the restaurant without it.,FOOD#QUALITY,[positive]


In [44]:
# Testing: build the test features with the already-fitted `pipeline`
# (transform only, no refit), densify them, and predict the label indicators.
x_test = pipeline.transform(df_test).toarray()

y_pred = clf.predict(x_test)

y_pred.shape

(743, 3)

In [45]:
# Encode the gold polarity lists with the already-fitted binarizer so the
# indicator columns line up with those of `y_pred`.
y_true = mlb.transform(df_test['polarity'])

y_true.shape

(743, 3)

In [46]:
# Micro-averaged F1 of the test predictions, displayed as a (name, value) pair.
('f1_score', f1_score(y_true, y_pred, average='micro'))

('f1_score', 0.6278145695364239)

In [47]:
# Cross-validated predictions for every training row (5 folds), exported next
# to the gold labels for manual error analysis.
# `cross_val_predict` and `os` come from the import cell at the top.
train_pred = cross_val_predict(clf, x_train, y_train, cv=5)

# Map each indicator row back to its polarity labels.
prds = mlb.inverse_transform(train_pred)

df_train['predictions'] = [list(labels) for labels in prds]

# `to_excel` does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs('./output', exist_ok=True)
df_train.to_excel('./output/sc_nb.xlsx')