# Text classification for aspect detection

## 1. Prepare data for classification

In [None]:
# Mount Google Drive into the Colab filesystem so that the data files under
# /content/drive/ referenced by the cells below can be read.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


### Get text and targets from xml

In [None]:
from lxml import etree

# define data paths
train_set = "/content/drive/My Drive/TFM-private/absa-2016-rest-train.xml"
test_set = "/content/drive/My Drive/TFM-private/absa-2016-rest-test.xml"


def _collect_review_data(xml_tree):
    """Collect texts, categories and targets from a parsed review file.

    Args:
        xml_tree: parsed lxml tree that contains ``Review`` elements.

    Returns:
        A 6-tuple of lists, in this order:

        * all_texts: every sentence text of the file, flat.
        * texts: one list of sentence texts per review.
        * texts_per_sentence: per review, one single-item list per sentence.
        * categories_per_sentence: per review, the list of ``category``
          attribute values of each sentence (empty when it has no opinion).
        * targets: per review, all ``target`` attribute values, flat.
        * targets_per_sentence: per review, the list of ``target`` attribute
          values of each sentence (empty when it has no opinion).
    """
    all_texts = []
    texts = []
    texts_per_sentence = []
    categories_per_sentence = []
    targets = []
    targets_per_sentence = []

    for review in xml_tree.xpath('.//Review'):
        sentence_positions = review.xpath('sentences/sentence')

        # sentence texts of this review
        rev_texts = [elem.text for elem in review.xpath('sentences/sentence/text')]
        all_texts.extend(rev_texts)
        texts.append(rev_texts)
        texts_per_sentence.append([[text] for text in rev_texts])

        # categories and targets, grouped by sentence
        categories_per_sentence.append(
            [elem.xpath('Opinions/Opinion/@category') for elem in sentence_positions]
        )
        targets_per_sentence.append(
            [elem.xpath('Opinions/Opinion/@target') for elem in sentence_positions]
        )

        # all targets of the review, not grouped by sentence
        targets.append(review.xpath('sentences/sentence/Opinions/Opinion/@target'))

    return (all_texts, texts, texts_per_sentence,
            categories_per_sentence, targets, targets_per_sentence)


# parse XML documents (train and test are parsed separately)
parser = etree.XMLParser()
tree = etree.parse(train_set, parser)
tree_test = etree.parse(test_set, parser)

# get texts, categories and targets for train
(all_texts, texts, texts_per_sentence,
 categories_per_sentence, targets, targets_per_sentence) = _collect_review_data(tree)

# get texts, categories and targets for test; kept in their own *_test lists
# so that test sentences never end up in the training data
(all_texts_test, texts_test, texts_per_sentence_test,
 categories_per_sentence_test, targets_test,
 targets_per_sentence_test) = _collect_review_data(tree_test)

### Handle sentences without targets

In [None]:
# replace empty list for target/category with NONE/NO_CAT
# this way we have same number of sentences and targets
#
# The placeholder is appended to the sentence list itself (in place), so
# targets_per_sentence / categories_per_sentence also see it. list.append()
# returns None, therefore the sentence list - not the return value of
# append() - is what gets collected.

replaced_targets = []
replaced_categories = []

for review in targets_per_sentence:
    for sentence in review:
        if not sentence:
            sentence.append("NONE")
        replaced_targets.append(sentence)

for review in categories_per_sentence:
    for sentence in review:
        if not sentence:
            sentence.append("NO_CAT")
        replaced_categories.append(sentence)

In [None]:
# flatten reviews -> sentences -> categories in two steps:
# first one entry per sentence, then one entry per category
flat_list = []
for review_categories in categories_per_sentence:
    flat_list.extend(review_categories)

flat_categories = []
for sentence_categories in flat_list:
    flat_categories.extend(sentence_categories)

print(flat_categories)

### Repeat each text once per category

In [None]:
## for train data
# number of categories attached to each sentence, in document order
category_quantity = [
    len(sentence)
    for review in categories_per_sentence
    for sentence in review
]

# repeat every sentence text once for each of its categories
multiplied_texts = [
    text
    for text, count in zip(all_texts, category_quantity)
    for _ in range(count)
]

print(all_texts)
print(multiplied_texts)

### Create dataframe of texts and categories

In [None]:
import pandas as pd

# pair every category with the sentence text it belongs to, one row per pair
rows = list(zip(flat_categories, multiplied_texts))
df = pd.DataFrame(rows, columns=['Category', 'Text'])
df.head(50)

Unnamed: 0,Category,Text
0,FOOD#QUALITY,Yum!
1,FOOD#QUALITY,Serves really good sushi.
2,FOOD#STYLE_OPTIONS,Not the biggest portions but adequate.
3,FOOD#QUALITY,Green Tea creme brulee is a must!
4,FOOD#QUALITY,Don't leave the restaurant without it.
5,RESTAURANT#GENERAL,No Comparison
6,RESTAURANT#GENERAL,– I can't say enough about this place.
7,FOOD#QUALITY,It has great sushi and even better service.
8,SERVICE#GENERAL,It has great sushi and even better service.
9,SERVICE#GENERAL,The entire staff was extremely accomodating an...


### Format labels for classifier

In [None]:
### safe to run more than once: values that already carry __label__ are left as they are

# get flair format for labels
def format_label_to_flair(flair_label):
    """Prefix every value of the 'Category' column with '__label__'.

    The column is converted to strings first. Values that already start with
    '__label__' are kept unchanged, so calling the function again on the same
    dataframe does not stack the prefix.

    Args:
        flair_label: dataframe with a 'Category' column; modified in place.

    Returns:
        The same dataframe object that was passed in.
    """
    prefix = '__label__'
    categories = flair_label['Category'].astype(str)
    already_prefixed = categories.str.startswith(prefix)
    # keep prefixed values, add the prefix everywhere else
    flair_label['Category'] = categories.where(already_prefixed, prefix + categories)
    return flair_label

# format labels
training_data = format_label_to_flair(df)

# write one "<label><TAB><text>" line per row.
# header=False: the column names are not a training example and must not be
# written as the first line of the file.
# to_csv() returns None when a path is given, so keep the path itself.
train_file = "train_categories.txt"
training_data.to_csv(train_file, index=False, header=False, sep="\t")

display(training_data)

Unnamed: 0,Category,Text
0,__label__RESTAURANT#GENERAL,Judging from previous posts this used to be a ...
1,__label__SERVICE#GENERAL,"We, there were four of us, arrived at noon - t..."
2,__label__SERVICE#GENERAL,"They never brought us complimentary noodles, i..."
3,__label__FOOD#QUALITY,The food was lousy - too sweet or too salty an...
4,__label__FOOD#STYLE_OPTIONS,The food was lousy - too sweet or too salty an...
...,...,...
2794,__label__SERVICE#GENERAL,The waitress came to check in on us every few ...
2795,__label__SERVICE#GENERAL,I couldn't ignore the fact that she reach over...
2796,__label__SERVICE#GENERAL,She then put the check down without asking if ...
2797,__label__RESTAURANT#GENERAL,"I wish I could like this place more, and I wis..."


Unnamed: 0,Category,Text
0,__label__FOOD#QUALITY,Yum!
1,__label__FOOD#QUALITY,Serves really good sushi.
2,__label__FOOD#STYLE_OPTIONS,Not the biggest portions but adequate.
3,__label__FOOD#QUALITY,Green Tea creme brulee is a must!
4,__label__FOOD#QUALITY,Don't leave the restaurant without it.
...,...,...
943,__label__RESTAURANT#GENERAL,"All considered, I have to say that Ray's Boath..."
944,__label__SERVICE#GENERAL,While I could have done without the youth who ...
945,__label__FOOD#QUALITY,While I could have done without the youth who ...
946,__label__RESTAURANT#MISCELLANEOUS,While I could have done without the youth who ...


## 2. Start classifying

### Install Flair

In [None]:
# Install Flair, upgrading it if an older version is already present.
# NOTE(review): a bare `pip` line only works where the notebook front end runs
# it as a command - confirm for the target environment.
pip install --upgrade flair

### Load the data set

In [None]:
from flair.data import Corpus                                                                                                                                                           
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, FastTextEmbeddings, DocumentRNNEmbeddings                                                             
from flair.models import TextClassifier                                                                                                                                                 
from flair.trainers import ModelTrainer

# get the corpus
corpus_folder = '/content/drive/MyDrive/TFM-private/flair-experiments/'

# name under which the corpus stores the labels read from the files
label_type = 'class'

# only train and test files are given; no dev file is passed
corpus: Corpus = ClassificationCorpus(corpus_folder,
                                      train_file='train_categories.txt',
                                      test_file='test_categories.txt',
                                      label_type=label_type)

# create the label dictionary for exactly that label type
label_dict = corpus.make_label_dictionary(label_type=label_type)

2022-01-16 11:13:48,527 Reading data from /content/drive/MyDrive/TFM-private/flair-experiments
2022-01-16 11:13:48,528 Train: /content/drive/MyDrive/TFM-private/flair-experiments/train_categories.txt
2022-01-16 11:13:48,534 Dev: None
2022-01-16 11:13:48,535 Test: /content/drive/MyDrive/TFM-private/flair-experiments/test_categories.txt
2022-01-16 11:13:49,777 Initialized corpus /content/drive/MyDrive/TFM-private/flair-experiments/ (label type name is 'class')
2022-01-16 11:13:49,781 Computing label dictionary. Progress:


  cpuset_checked))
100%|██████████| 2519/2519 [00:01<00:00, 1573.80it/s]

2022-01-16 11:13:51,575 Corpus contains the labels: class (#2519)
2022-01-16 11:13:51,576 Created (for label 'None') Dictionary with 14 tags: <unk>, RESTAURANT#GENERAL, SERVICE#GENERAL, FOOD#QUALITY, FOOD#STYLE_OPTIONS, DRINKS#STYLE_OPTIONS, RESTAURANT#PRICES, RESTAURANT#MISCELLANEOUS, AMBIENCE#GENERAL, NO_CAT, FOOD#PRICES, LOCATION#GENERAL, DRINKS#QUALITY, DRINKS#PRICES





### Install word embeddings and instantiate the text classifier

In [None]:
# 3. make a list of word embeddings
word_embeddings = [
                   WordEmbeddings('en-crawl'),
                   #FlairEmbeddings('news-forward'),
                   #FlairEmbeddings('news-backward'),
]

# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                   hidden_size=512,
                                                                   reproject_words=True,
                                                                   reproject_words_dimension=256,
                                                                   rnn_type='LSTM',
)

# 5. create the text classifier
# label_type must name the label type the corpus uses ('class' for this
# ClassificationCorpus) so that training and evaluation read the gold labels
# from the same place the predictions are written to
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type='class')

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

2022-01-16 11:15:08,263 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpt42g_3i1


100%|██████████| 1200000128/1200000128 [01:05<00:00, 18213625.93B/s]

2022-01-16 11:16:14,648 copying /tmp/tmpt42g_3i1 to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M.vectors.npy





2022-01-16 11:16:18,858 removing temp file /tmp/tmpt42g_3i1
2022-01-16 11:16:19,489 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M not found in cache, downloading to /tmp/tmprlkue6rq


100%|██████████| 39323680/39323680 [00:03<00:00, 9971602.96B/s] 

2022-01-16 11:16:24,061 copying /tmp/tmprlkue6rq to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M





2022-01-16 11:16:24,101 removing temp file /tmp/tmprlkue6rq


### Run the trainer

In [None]:
# 7. start the training
# results are written to the 'flair-ACD-model' folder; train for 5 epochs
# and do not add the dev split to the training data
trainer.train(
    'flair-ACD-model',
    max_epochs=5,
    train_with_dev=False,
)

2022-01-16 11:17:29,177 ----------------------------------------------------------------------------------------------------
2022-01-16 11:17:29,179 Model: "TextClassifier(
  (loss_function): CrossEntropyLoss()
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings(
        'en-crawl'
        (embedding): Embedding(1000001, 300)
      )
    )
    (word_reprojection_map): Linear(in_features=300, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=14, bias=True)
  (weights): None
  (weight_tensor) None
)"
2022-01-16 11:17:29,180 ----------------------------------------------------------------------------------------------------
2022-01-16 11:17:29,181 Corpus: "Corpus: 2519 train + 280 dev + 948 test sentences"
2022-01-16 11:17:29,183 -------------------------------------------------------------------

  cpuset_checked))


2022-01-16 11:17:30,236 epoch 1 - iter 7/79 - loss 0.07910656 - samples/sec: 356.84 - lr: 0.100000
2022-01-16 11:17:30,495 epoch 1 - iter 14/79 - loss 0.07651971 - samples/sec: 893.43 - lr: 0.100000
2022-01-16 11:17:30,767 epoch 1 - iter 21/79 - loss 0.07446519 - samples/sec: 906.36 - lr: 0.100000
2022-01-16 11:17:31,024 epoch 1 - iter 28/79 - loss 0.07246271 - samples/sec: 956.27 - lr: 0.100000
2022-01-16 11:17:31,310 epoch 1 - iter 35/79 - loss 0.07064093 - samples/sec: 856.61 - lr: 0.100000
2022-01-16 11:17:31,589 epoch 1 - iter 42/79 - loss 0.06979712 - samples/sec: 890.70 - lr: 0.100000
2022-01-16 11:17:31,846 epoch 1 - iter 49/79 - loss 0.06925279 - samples/sec: 1045.59 - lr: 0.100000
2022-01-16 11:17:32,131 epoch 1 - iter 56/79 - loss 0.06880635 - samples/sec: 794.58 - lr: 0.100000
2022-01-16 11:17:32,428 epoch 1 - iter 63/79 - loss 0.06845964 - samples/sec: 865.17 - lr: 0.100000
2022-01-16 11:17:32,723 epoch 1 - iter 70/79 - loss 0.06852284 - samples/sec: 812.98 - lr: 0.100000


  cpuset_checked))


2022-01-16 11:17:42,412 epoch 2 - iter 7/79 - loss 0.06751744 - samples/sec: 675.83 - lr: 0.100000
2022-01-16 11:17:42,681 epoch 2 - iter 14/79 - loss 0.06380077 - samples/sec: 943.94 - lr: 0.100000
2022-01-16 11:17:42,919 epoch 2 - iter 21/79 - loss 0.06399323 - samples/sec: 978.77 - lr: 0.100000
2022-01-16 11:17:43,213 epoch 2 - iter 28/79 - loss 0.06414655 - samples/sec: 896.55 - lr: 0.100000
2022-01-16 11:17:43,469 epoch 2 - iter 35/79 - loss 0.06421369 - samples/sec: 896.54 - lr: 0.100000
2022-01-16 11:17:43,728 epoch 2 - iter 42/79 - loss 0.06431578 - samples/sec: 957.06 - lr: 0.100000
2022-01-16 11:17:44,371 epoch 2 - iter 49/79 - loss 0.06440535 - samples/sec: 936.80 - lr: 0.100000
2022-01-16 11:17:44,648 epoch 2 - iter 56/79 - loss 0.06438360 - samples/sec: 819.71 - lr: 0.100000
2022-01-16 11:17:44,891 epoch 2 - iter 63/79 - loss 0.06407876 - samples/sec: 957.20 - lr: 0.100000
2022-01-16 11:17:45,136 epoch 2 - iter 70/79 - loss 0.06430723 - samples/sec: 991.09 - lr: 0.100000
2

  cpuset_checked))


2022-01-16 11:18:07,084 epoch 3 - iter 7/79 - loss 0.06123751 - samples/sec: 805.54 - lr: 0.100000
2022-01-16 11:18:07,764 epoch 3 - iter 14/79 - loss 0.06261674 - samples/sec: 893.49 - lr: 0.100000
2022-01-16 11:18:08,046 epoch 3 - iter 21/79 - loss 0.06250937 - samples/sec: 878.41 - lr: 0.100000
2022-01-16 11:18:08,299 epoch 3 - iter 28/79 - loss 0.06314919 - samples/sec: 897.85 - lr: 0.100000
2022-01-16 11:18:08,564 epoch 3 - iter 35/79 - loss 0.06246283 - samples/sec: 921.30 - lr: 0.100000
2022-01-16 11:18:08,836 epoch 3 - iter 42/79 - loss 0.06252058 - samples/sec: 888.29 - lr: 0.100000
2022-01-16 11:18:09,102 epoch 3 - iter 49/79 - loss 0.06241298 - samples/sec: 901.60 - lr: 0.100000
2022-01-16 11:18:09,362 epoch 3 - iter 56/79 - loss 0.06240581 - samples/sec: 902.78 - lr: 0.100000
2022-01-16 11:18:09,918 epoch 3 - iter 63/79 - loss 0.06234993 - samples/sec: 423.66 - lr: 0.100000
2022-01-16 11:18:10,134 epoch 3 - iter 70/79 - loss 0.06245366 - samples/sec: 1107.07 - lr: 0.100000


  cpuset_checked))


2022-01-16 11:18:33,589 epoch 4 - iter 7/79 - loss 0.05782342 - samples/sec: 577.44 - lr: 0.100000
2022-01-16 11:18:33,854 epoch 4 - iter 14/79 - loss 0.06060231 - samples/sec: 875.32 - lr: 0.100000
2022-01-16 11:18:34,139 epoch 4 - iter 21/79 - loss 0.06017948 - samples/sec: 908.54 - lr: 0.100000
2022-01-16 11:18:34,412 epoch 4 - iter 28/79 - loss 0.05952499 - samples/sec: 859.51 - lr: 0.100000
2022-01-16 11:18:35,051 epoch 4 - iter 35/79 - loss 0.06002453 - samples/sec: 944.04 - lr: 0.100000
2022-01-16 11:18:35,316 epoch 4 - iter 42/79 - loss 0.05986240 - samples/sec: 908.19 - lr: 0.100000
2022-01-16 11:18:35,572 epoch 4 - iter 49/79 - loss 0.05965300 - samples/sec: 940.55 - lr: 0.100000
2022-01-16 11:18:35,822 epoch 4 - iter 56/79 - loss 0.05971062 - samples/sec: 975.17 - lr: 0.100000
2022-01-16 11:18:36,089 epoch 4 - iter 63/79 - loss 0.05972656 - samples/sec: 887.83 - lr: 0.100000
2022-01-16 11:18:36,336 epoch 4 - iter 70/79 - loss 0.05969080 - samples/sec: 970.81 - lr: 0.100000
2

  cpuset_checked))


2022-01-16 11:18:59,969 epoch 5 - iter 7/79 - loss 0.05802462 - samples/sec: 637.30 - lr: 0.100000
2022-01-16 11:19:00,244 epoch 5 - iter 14/79 - loss 0.05774764 - samples/sec: 866.19 - lr: 0.100000
2022-01-16 11:19:00,561 epoch 5 - iter 21/79 - loss 0.05801063 - samples/sec: 799.00 - lr: 0.100000
2022-01-16 11:19:00,853 epoch 5 - iter 28/79 - loss 0.05919675 - samples/sec: 783.49 - lr: 0.100000
2022-01-16 11:19:01,160 epoch 5 - iter 35/79 - loss 0.05818667 - samples/sec: 798.21 - lr: 0.100000
2022-01-16 11:19:01,430 epoch 5 - iter 42/79 - loss 0.05852587 - samples/sec: 927.98 - lr: 0.100000
2022-01-16 11:19:01,720 epoch 5 - iter 49/79 - loss 0.05799724 - samples/sec: 860.18 - lr: 0.100000
2022-01-16 11:19:01,964 epoch 5 - iter 56/79 - loss 0.05847798 - samples/sec: 967.72 - lr: 0.100000
2022-01-16 11:19:02,610 epoch 5 - iter 63/79 - loss 0.05838911 - samples/sec: 924.02 - lr: 0.100000
2022-01-16 11:19:02,868 epoch 5 - iter 70/79 - loss 0.05826648 - samples/sec: 922.18 - lr: 0.100000
2

  cpuset_checked))


2022-01-16 11:20:05,611 1.0	0.662	0.7966	0.4895
2022-01-16 11:20:05,618 
Results:
- F-score (micro) 0.7966
- F-score (macro) 0.2088
- Accuracy 0.4895

By class:
                          precision    recall  f1-score   support

            FOOD#QUALITY     1.0000    0.9739    0.9868       614
         SERVICE#GENERAL     1.0000    0.6951    0.8201       246
      RESTAURANT#GENERAL     1.0000    0.7597    0.8634       233
                  NO_CAT     1.0000    0.0222    0.0435        90
        AMBIENCE#GENERAL     0.0000    0.0000    0.0000        66
      FOOD#STYLE_OPTIONS     0.0000    0.0000    0.0000        55
RESTAURANT#MISCELLANEOUS     0.0000    0.0000    0.0000        33
             FOOD#PRICES     0.0000    0.0000    0.0000        23
          DRINKS#QUALITY     0.0000    0.0000    0.0000        22
       RESTAURANT#PRICES     0.0000    0.0000    0.0000        21
        LOCATION#GENERAL     0.0000    0.0000    0.0000        13
    DRINKS#STYLE_OPTIONS     0.0000    0.0000 

{'dev_loss_history': [tensor(0.0713, device='cuda:0'),
  tensor(0.0651, device='cuda:0'),
  tensor(0.0621, device='cuda:0'),
  tensor(0.0598, device='cuda:0'),
  tensor(0.0557, device='cuda:0')],
 'dev_score_history': [0.6991260923845194,
  0.7436918990703851,
  0.7608695652173912,
  0.7734806629834254,
  0.7854137447405329],
 'test_score': 0.7966386554621848,
 'train_loss_history': [0.06799007839227111,
  0.06457391666391152,
  0.06248788168241979,
  0.05987774223124711,
  0.058183053742802866]}