# Format data for multilabel classification

In [None]:
# Mount Google Drive in the Colab runtime; the data paths used in this
# notebook all live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


This notebook uses the original ABSA dataset in .xml format and our own dataset with resolved coreference in .csv format. These are used to build a .csv file for training a multilabel classification model. Altogether, three models were trained: 

1.   Original ABSA data
2.   Automatically resolved coreference data
3.   Gold/manual coreference data





In [None]:
# root of the project folder on the mounted drive
project_dir = "/content/drive/My Drive/TFM_Rosa-Maria"

# experiment docs (one text per row):
# original ABSA, automatic coreference, gold coreference
coref_dir = project_dir + "/multilabel-classification/coreference-data"
train_set = coref_dir + "/coref_texts_train.csv"
test_set = coref_dir + "/coref_texts_test.csv"

# original dataset in xml format
absa_dir = project_dir + "/ABSA-data"
xml_train = absa_dir + "/absa-2016-rest-train.xml"
xml_test = absa_dir + "/absa-2016-rest-test.xml"

## Generate random ids

Random IDs will be used in the first column of the final .csv file.

In [None]:
import random
from random import seed

# set seed for number generation so the same ids are produced on every run
seed(1)

# number of documents that need an id:
# 1708 for absa train
# 587 for absa test
n_documents = 1708

# get unique id for each text
# random.sample draws without replacement, so no id can occur twice
# (repeated random.randint calls can return the same value more than once).
# ids are taken from 0..10000 inclusive and stored as strings.
randomlist = [str(n) for n in random.sample(range(0, 10001), n_documents)]

# every document must have its own id
assert len(set(randomlist)) == n_documents

print(randomlist)

['2201', '9325', '1033', '4179', '1931', '8117', '7364', '7737', '6219', '3439', '1537', '7993', '464', '6386', '7090', '9952', '34', '7297', '4363', '3748', '9685', '1674', '5200', '501', '365', '416', '8870', '150', '6245', '3548', '6915', '475', '8644', '3632', '7174', '8123', '9058', '3818', '5663', '3782', '3584', '7530', '4747', '352', '6818', '9116', '1638', '3045', '4856', '1980', '5450', '8205', '6915', '8318', '3110', '4970', '4655', '9626', '8181', '8278', '6444', '9650', '565', '7868', '3977', '6623', '6788', '2834', '6014', '8991', '6139', '1416', '7191', '8330', '1768', '2682', '8535', '6443', '6070', '8023', '484', '7689', '712', '5054', '9718', '9472', '6448', '2791', '2762', '8228', '3718', '201', '3268', '8841', '8983', '3803', '6626', '8417', '5633', '9466', '5788', '7522', '4411', '8978', '9976', '93', '6286', '8396', '2117', '8498', '9197', '3366', '6981', '919', '7882', '5975', '9338', '9083', '3274', '8269', '6773', '7945', '5845', '6789', '5670', '25', '8822', '

## Get list of categories

In [None]:
from lxml import etree

# parse the training XML file with a default lxml parser
parser = etree.XMLParser()
tree = etree.parse(xml_train, parser)

# collect the opinion categories of every sentence, review by review
categories_per_sentence = [
    sentence.xpath('Opinions/Opinion/@category')
    for review in tree.xpath('.//Review')
    for sentence in review.xpath('sentences/sentence')
]

# keep only the sentences that have at least one opinion category
opinion_categories = [cats for cats in categories_per_sentence if cats != []]

# check output: one category list per line, then the number of lists kept
for cats in opinion_categories:
    print(cats)

print(len(opinion_categories))

['RESTAURANT#GENERAL']
['SERVICE#GENERAL']
['SERVICE#GENERAL']
['FOOD#QUALITY', 'FOOD#STYLE_OPTIONS']
['SERVICE#GENERAL']
['RESTAURANT#GENERAL']
['FOOD#QUALITY']
['RESTAURANT#GENERAL']
['FOOD#QUALITY', 'FOOD#QUALITY']
['DRINKS#STYLE_OPTIONS', 'DRINKS#PRICES']
['RESTAURANT#PRICES', 'FOOD#QUALITY']
['RESTAURANT#GENERAL']
['SERVICE#GENERAL']
['SERVICE#GENERAL']
['FOOD#QUALITY']
['FOOD#QUALITY', 'FOOD#QUALITY', 'RESTAURANT#MISCELLANEOUS']
['RESTAURANT#GENERAL']
['RESTAURANT#GENERAL']
['SERVICE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#PRICES']
['RESTAURANT#GENERAL']
['RESTAURANT#GENERAL']
['FOOD#QUALITY', 'SERVICE#GENERAL', 'AMBIENCE#GENERAL']
['FOOD#QUALITY']
['RESTAURANT#GENERAL']
['RESTAURANT#GENERAL']
['FOOD#QUALITY', 'FOOD#QUALITY', 'FOOD#QUALITY']
['FOOD#QUALITY']
['FOOD#QUALITY', 'FOOD#STYLE_OPTIONS']
['AMBIENCE#GENERAL', 'AMBIENCE#GENERAL', 'AMBIENCE#GENERAL']
['DRINKS#STYLE_OPTIONS', 'SERVICE#GENERAL']
['FOOD#QUALITY', 'FOOD#QUALITY']
['SERVICE#GENERAL']
['RESTAURANT#GENERAL']
['RESTA

## Get texts

In [None]:
import pandas as pd

# open file as dataframe (tab-separated, no header row)
df = pd.read_csv(train_set, sep="\t", header=None)

# the texts are in the first column (label 0, because the file has no header);
# `texts` was never assigned before, so this cell raised a NameError on a
# fresh kernel
texts = df[0]

# get texts and save as list
text_list = texts.tolist()
texts.head()

0    Judging from previous posts this used to be a ...
1    We, there were four of us, arrived at noon - t...
2    The staff never brought us complimentary noodl...
3    The food was lousy-too sweet or too salty and ...
4    After all that, the staff complained to me abo...
Name: 0, dtype: object

## Create dataframe

In [None]:
# one row per text: its random id, the text itself and its list of categories
columns = {
    'id': randomlist,
    'comment_text': text_list,
    'categories': opinion_categories,
}
df = pd.DataFrame(columns)
df.head(10)

Unnamed: 0,id,comment_text,categories
0,2201,Judging from previous posts this used to be a ...,[RESTAURANT#GENERAL]
1,9325,"We, there were four of us, arrived at noon - t...",[SERVICE#GENERAL]
2,1033,The staff never brought us complimentary noodl...,[SERVICE#GENERAL]
3,4179,The food was lousy-too sweet or too salty and ...,"[FOOD#QUALITY, FOOD#STYLE_OPTIONS]"
4,1931,"After all that, the staff complained to me abo...",[SERVICE#GENERAL]
5,8117,Avoid this place!,[RESTAURANT#GENERAL]
6,7364,"I have eaten at saul, many times, the food is ...",[FOOD#QUALITY]
7,7737,Saul is the best restaurant on Smith Street an...,[RESTAURANT#GENERAL]
8,6219,The duck confit is always amazing and the foie...,"[FOOD#QUALITY, FOOD#QUALITY]"
9,3439,The wine list is interesting and has many good...,"[DRINKS#STYLE_OPTIONS, DRINKS#PRICES]"


## Create one-hot encoded dataframe

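A one-hot encoded matrix is created for the categories: one column per category, with a 1 in every category that applies to the text (a text can have several).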

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# fit the binarizer on the category lists and get one 0/1 column per category
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df['categories'])

# wrap the matrix in a dataframe whose columns are named after the categories
df2 = pd.DataFrame(label_matrix, columns=mlb.classes_)
df2.head(10)

Unnamed: 0,AMBIENCE#GENERAL,DRINKS#PRICES,DRINKS#QUALITY,DRINKS#STYLE_OPTIONS,FOOD#PRICES,FOOD#QUALITY,FOOD#STYLE_OPTIONS,LOCATION#GENERAL,RESTAURANT#GENERAL,RESTAURANT#MISCELLANEOUS,RESTAURANT#PRICES,SERVICE#GENERAL
0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0
9,0,1,0,1,0,0,0,0,0,0,0,0


## Create final .csv file

In [None]:
# DataFrame.insert raises a ValueError when the column already exists, so
# drop any id/comment_text columns left over from a previous run of this cell;
# this makes the cell safe to re-run
df2 = df2.drop(columns=['id', 'comment_text'], errors='ignore')

# insert ID and document in columns 0 and 1
df2.insert(0, 'id', randomlist)
df2.insert(1, 'comment_text', text_list)

# save file (without the dataframe index)
df2.to_csv('coref-train-multilabel.csv', index=False)

In [None]:
# display the first rows of the final table: id, comment_text and one
# 0/1 column per category
df2.head()

Unnamed: 0,id,comment_text,AMBIENCE#GENERAL,DRINKS#PRICES,DRINKS#QUALITY,DRINKS#STYLE_OPTIONS,FOOD#PRICES,FOOD#QUALITY,FOOD#STYLE_OPTIONS,LOCATION#GENERAL,RESTAURANT#GENERAL,RESTAURANT#MISCELLANEOUS,RESTAURANT#PRICES,SERVICE#GENERAL
0,2201,Judging from previous posts this used to be a ...,0,0,0,0,0,0,0,0,1,0,0,0
1,9325,"We, there were four of us, arrived at noon - t...",0,0,0,0,0,0,0,0,0,0,0,1
2,1033,The staff never brought us complimentary noodl...,0,0,0,0,0,0,0,0,0,0,0,1
3,4179,The food was lousy-too sweet or too salty and ...,0,0,0,0,0,1,1,0,0,0,0,0
4,1931,"After all that, the staff complained to me abo...",0,0,0,0,0,0,0,0,0,0,0,1
