In [48]:
%matplotlib inline

In [49]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

from absa.config import DATA_PATHS
from absa.dataset import load_dataset
# Shared spaCy pipeline; `lemmatize` calls it once per document.
# NOTE(review): 'en' is a shortcut model name; shortcut names are
# presumably only resolvable on spaCy 2.x - confirm the pinned version or
# switch to the full package name (e.g. 'en_core_web_sm').
nlp = spacy.load('en')

In [55]:
# Read the raw training split. Presumably one row per annotation, so a
# sentence id can occur several times - TODO confirm against load_dataset.
train_ds_path = DATA_PATHS['asba.semeval16.raw.train']
train_annotations = load_dataset(train_ds_path)

# Collapse to one row per sentence id: keep the first text seen for the id
# and gather every category / target of that id into a list.
annotations_by_id = train_annotations.groupby('id')
df_train = pd.DataFrame({
    'text': annotations_by_id['text'].first(),
    'categories': annotations_by_id['category'].apply(list),
    'target': annotations_by_id['target'].apply(list),
})

df_train.head()

Unnamed: 0_level_0,text,categories,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1004293:0,Judging from previous posts this used to be a ...,[RESTAURANT#GENERAL],[place]
1004293:1,"We, there were four of us, arrived at noon - t...",[SERVICE#GENERAL],[staff]
1004293:2,"They never brought us complimentary noodles, i...",[SERVICE#GENERAL],[NULL]
1004293:3,The food was lousy - too sweet or too salty an...,"[FOOD#QUALITY, FOOD#STYLE_OPTIONS]","[food, portions]"
1004293:4,"After all that, they complained to me about th...",[SERVICE#GENERAL],[NULL]


In [51]:
# Build `lookup`: lower-cased vocabulary term -> index of the section it
# appears in. Sections in the vocabulary file are separated by a blank line
# and hold one term per line. A term repeated in a later section keeps the
# later section's index, because the assignment below overwrites it.
lookup = {}
with open('../resources/vocabulary.txt', 'r', encoding='utf-8') as f:
    for idx, section in enumerate(f.read().split('\n\n')):
        for line in section.split('\n'):
            # Strip padding and drop blank lines (e.g. the one produced by a
            # trailing newline): an empty key would be a substring of every
            # sentence and so would fire on all of them in `find_terms`.
            key = line.strip().lower()
            if key:
                lookup[key] = idx
lookup

{'restaurant': 0,
 'bar': 0,
 'bill': 0,
 'booking': 0,
 'reservation': 0,
 'breakfast': 0,
 'chef': 0,
 'dinner': 0,
 'lunch': 0,
 'main course': 0,
 'menu': 1,
 'service charge': 0,
 'starter': 0,
 'tip': 0,
 'all day breakfast': 1,
 'apple crumble': 1,
 'apple pie': 2,
 'bacon and eggs': 1,
 'baked potato or jacket potato': 1,
 'beefburger or hamburger': 1,
 'boiled potatoes': 1,
 'boiled rice': 1,
 'bread and butter pudding': 2,
 'catch of the day': 1,
 'cauliflower cheese': 2,
 'cheese and biscuits': 1,
 'cheeseburger': 3,
 'cheesecake': 2,
 'chicken': 2,
 'chocolate cake': 2,
 'chocolate mousse': 1,
 'crème brûlée': 1,
 'curry': 2,
 'custard': 2,
 'danish pastry': 1,
 'dish of the day': 1,
 'egg and chips': 1,
 'fillet steak': 1,
 'fish and chips': 1,
 'fish pie': 2,
 'french fries or fries': 1,
 'fried rice': 1,
 'fruit salad': 2,
 'full english breakfast': 1,
 'green salad': 1,
 'ice cream': 2,
 'lasagne': 2,
 'lemon meringue pie': 2,
 'lemon tart': 1,
 'mashed potatoes': 1,
 '

In [52]:
def find_terms(sents, vocabulary=None):
    """Count vocabulary hits per section for each sentence.

    Matching is a plain, case-insensitive substring test, so a term such as
    'bar' also fires inside a longer word that contains it.

    Parameters
    ----------
    sents : iterable of str
        Sentences to scan.
    vocabulary : mapping of str to hashable, optional
        Lower-cased term -> section id. When omitted, the module-level
        `lookup` mapping is used, which is the original behaviour.

    Yields
    ------
    dict
        One dict per sentence, mapping section id -> number of distinct
        vocabulary terms of that section found in the sentence. Sections
        without a hit are absent; a sentence without any hit yields `{}`.
    """
    if vocabulary is None:
        vocabulary = lookup
    for sent in sents:
        # Lower-case once per sentence instead of once per vocabulary term.
        lowered = sent.lower()
        term_keys = {}
        for key, section in vocabulary.items():
            # Skip empty terms: '' is a substring of every string and would
            # add a spurious hit to its section for every sentence.
            if key and key in lowered:
                term_keys[section] = term_keys.get(section, 0) + 1
        yield term_keys

# Quick sanity check of `find_terms` on a single sentence.
sample_sents = ['Pizza was too hot!']
list(find_terms(sample_sents))

[{2: 1, 4: 1}]

# Extract Features

In [53]:
def lemmatize(x):
    """Return `x` with each token replaced by its lemma.

    The text is passed through the module-level `nlp` pipeline; the
    `lemma_` of every resulting token is collected in order and the lemmas
    are joined with single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(x))
    return ' '.join(lemmas)

In [54]:
# Branch 1: TF-IDF over word 1- to 3-grams of the lemmatized text.
# Stopwords: kept (no stop_words argument is passed).
# Lemmatize: yes, via the `lemmatize` preprocessor.
# NOTE(review): a custom `preprocessor` replaces the vectorizer's built-in
# lowercasing step, so token casing is whatever `lemmatize` returns -
# confirm this is intended.
tfidf_ngrams = TfidfVectorizer(preprocessor=lemmatize, ngram_range=(1, 3))

# Branch 2: per-sentence vocabulary-section hit counts from `find_terms`,
# turned into one column per section id by DictVectorizer.
vocab_hits = make_pipeline(
    FunctionTransformer(lambda sents: list(find_terms(sents))),
    DictVectorizer(),
)

# Concatenate both branches column-wise.
pipeline = make_union(tfidf_ngrams, vocab_hits)

# Fit on the training sentences and densify the resulting matrix.
x_train = pipeline.fit_transform(df_train['text']).toarray()

x_train.shape

(1708, 28224)

# Prepare Labels

In [40]:
# Encode each sentence's list of categories as a binary indicator row
# (one column per distinct category seen in the training data).
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(df_train['categories'])

y_train.shape

(1708, 12)

# Cross Validation

In [41]:
# Micro-averaged metrics to collect on every fold.
scoring = ['precision_micro', 'recall_micro', 'f1_micro']

# One binary logistic regression per category label.
clf = OneVsRestClassifier(LogisticRegression())

scores = cross_validate(clf, x_train, y_train, scoring=scoring, cv=5)

# Report the mean of each metric across the 5 folds.
for label, metric in [
    ('Micro Precision Score\t', 'test_precision_micro'),
    ('Micro Recall Score\t', 'test_recall_micro'),
    ('Micro F1 Score\t\t', 'test_f1_micro'),
]:
    print(label, np.mean(scores[metric]))

# Results recorded from earlier runs, kept for comparison:
#
#   Without lemmatization
#     Micro Precision Score   0.8088599767846789
#     Micro Recall Score      0.26736160581030693
#     Micro F1 Score          0.401698438726358
#
#   With lemmatization
#     Micro Precision Score   0.8175558802393373
#     Micro Recall Score      0.27265653423258635
#     Micro F1 Score          0.4087834386991657

Micro Precision Score	 0.8175558802393373
Micro Recall Score	 0.27265653423258635
Micro F1 Score		 0.4087834386991657


# Train Model

In [42]:
# Refit the classifier on the complete training matrix; the fitted estimator
# is the cell's last expression, so its repr is displayed.
clf.fit(X=x_train, y=y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

# Evaluate Model

In [43]:
# Read the gold-annotated test split and collapse it to one row per
# sentence id, mirroring the training frame (no 'target' column here).
test_ds_path = DATA_PATHS['asba.semeval16.raw.test.gold']
test_annotations = load_dataset(test_ds_path)

test_by_id = test_annotations.groupby('id')
df_test = pd.DataFrame({
    'text': test_by_id['text'].first(),
    'categories': test_by_id['category'].apply(list),
})

df_test.head()

Unnamed: 0_level_0,text,categories
id,Unnamed: 1_level_1,Unnamed: 2_level_1
en_BlueRibbonSushi_478218171:0,Yum!,[FOOD#QUALITY]
en_BlueRibbonSushi_478218171:1,Serves really good sushi.,[FOOD#QUALITY]
en_BlueRibbonSushi_478218171:2,Not the biggest portions but adequate.,[FOOD#STYLE_OPTIONS]
en_BlueRibbonSushi_478218171:3,Green Tea creme brulee is a must!,[FOOD#QUALITY]
en_BlueRibbonSushi_478218171:4,Don't leave the restaurant without it.,[FOOD#QUALITY]


In [44]:
# Testing: vectorize the test sentences with the already-fitted feature
# union (transform only, no refit), then predict the label indicators.
test_features = pipeline.transform(df_test['text'])
x_test = test_features.toarray()

y_pred = clf.predict(x_test)

y_pred.shape

(587, 12)

In [45]:
# Encode the gold test categories with the binarizer fitted on the training
# labels, so the columns line up with `y_pred`.
y_true = mlb.transform(df_test['categories'])

y_true.shape

(587, 12)

In [46]:
# Micro-averaged F1 of the predictions against the gold test labels.
test_f1_micro = f1_score(y_true, y_pred, average='micro')
('f1_score', test_f1_micro)

('f1_score', 0.5046728971962616)

# Get Predictions on Train Data

In [None]:
# Out-of-fold predictions for the training sentences: each row is predicted
# by a model fitted on the other folds (5-fold split).
train_pred = cross_val_predict(clf, x_train, y_train, cv=5)

# Turn the indicator rows back into collections of category labels.
prds = mlb.inverse_transform(train_pred)

# Store them next to the gold categories as plain lists.
df_train['predictions'] = [list(x) for x in prds]

# Create the export directory first: to_excel fails when the parent
# directory of the target path does not exist.
Path('./output').mkdir(parents=True, exist_ok=True)
df_train.to_excel('./output/ac_lr.xlsx')

In [None]:
# Write the current training frame to a second workbook.
# NOTE(review): `df_train` is not modified between the two exports visible
# here, so this file presumably duplicates ac_lr.xlsx - confirm it is needed.
df_train.to_excel(excel_writer='./output/target.xlsx')