# Clustering Analysis

We load the POS-tagged data for both the laptop and restaurant domains. Ground-truth labels already exist for this data, so to make the analysis easier we set the number of clusters in each model equal to the number of unique label categories.

In [1]:
import json

# Load the laptop POS-tag JSON file (presumably the same schema as the restaurant
# file below — confirm; it is not used in the visible part of the notebook).
with open('../data_2015/pos tag/laptop_pos_tag.json') as f:
    laptop_tags = json.load(f)
# Load the restaurant POS-tag JSON file. Later cells iterate over it as a list of
# sentence records and read each record's 'pos_tag' and 'term ground truth' entries.
with open('../data_2015/pos tag/restaurant_pos_tag.json') as f:
    restaurant_tags = json.load(f)

In [24]:
def strip_prefix(examples):
    """Trim surrounding whitespace from the ``'term'`` entry of one example.

    Despite the name, this calls ``.strip()`` with no arguments, so it removes
    leading and trailing whitespace rather than a specific prefix (assumes the
    ``'term'`` value is a ``str``).

    Args:
        examples: Mutable mapping with a ``'term'`` key.

    Returns:
        The same ``examples`` object, updated in place.
    """
    trimmed_term = examples['term'].strip()
    examples['term'] = trimmed_term
    return examples
from datasets import load_from_disk
# Load the restaurant aspect-term dataset that was previously saved to disk.
restaurant_dataset = load_from_disk('../data_2015/mnli/restaurant_aspect_term')
# Run strip_prefix over the dataset so every 'term' value is whitespace-trimmed.
cleaned_restaurant_dataset = restaurant_dataset.map(strip_prefix)

Loading cached processed dataset at e:\UQ\REIT4882\unsupervised-absa\data_2015\mnli\restaurant_aspect_term\cache-457cf310432690a9.arrow


Collect the ground-truth categories for every term.

In [2]:
# restaurant_categories: ground-truth term -> set of categories it was labelled with.
# unique_category: every category seen on at least one kept term.
restaurant_categories = {}
unique_category = set()
for sentence in restaurant_tags:
    pos_tags = [word['word'] for word in sentence['pos_tag']]
    for tag in sentence['term ground truth']:
        # Keep only ground-truth terms that exactly match one of the sentence's
        # POS-tagged words; anything else is skipped.
        if tag['term'] not in pos_tags:
            continue
        unique_category.add(tag['category'])
        restaurant_categories.setdefault(tag['term'], set()).add(tag['category'])

In [3]:
# Display how many distinct ground-truth categories were collected into unique_category.
f"There are a total of {len(unique_category)} categories for terms"

'There are a total of 6 categories for terms'

## Analysis of Each Word Embedding Model

In [4]:
import pandas as pd
import numpy as np
def cluster_aggregation(categories, input_word, prediction_label, unique_category):
    """Count, per cluster, how many clustered words carry each category.

    Args:
        categories: Mapping from a word to the collection of categories it is
            labelled with.
        input_word: Iterable of words, aligned element-wise with
            ``prediction_label``.
        prediction_label: Iterable of cluster indices, one per word. Each index
            selects a row of the count table, so it must be a valid index into
            a list of ``len(unique_category)`` rows.
        unique_category: Collection of all category names. Its length fixes the
            number of rows and its members become the columns.

    Returns:
        A ``pandas.DataFrame`` with one row per cluster index and one column
        per category, holding the counts.

    Words that are not keys of ``categories`` are ignored. A word labelled with
    several categories increments every one of them.
    """
    # One zero-initialised counter per cluster; the number of clusters is taken
    # to be the number of categories.
    counts_per_cluster = [dict.fromkeys(unique_category, 0) for _ in unique_category]

    for word, cluster_idx in zip(input_word, prediction_label):
        if word not in categories:
            continue
        for category in categories[word]:
            counts_per_cluster[cluster_idx][category] += 1

    return pd.DataFrame.from_records(counts_per_cluster)

### Raw Counts

In [5]:
from unsupervised_absa.clustering import ClusteringModel
from sklearn.cluster import KMeans

In [63]:
import copy
def merge_polarity(row):
    """Return copies of a row's POS-tag entries with a ``'polarity'`` key added.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'pos_tag'``
            entry (iterable of dicts that each have a ``'word'`` key) and a
            ``'term_polarity'`` entry that is indexed by those words.

    Returns:
        A new list of deep-copied POS-tag dicts, each extended with the
        polarity looked up for its word. The input dicts are not modified.

    The lookup ``row['term_polarity'][word]`` is a plain index, so a word with
    no polarity entry raises instead of being skipped.
    """
    def _with_polarity(pos):
        enriched = copy.deepcopy(pos)
        enriched['polarity'] = row['term_polarity'][pos['word']]
        return enriched

    return [_with_polarity(pos) for pos in row['pos_tag']]

#### Glove

In [6]:
import numpy as np
# Load the saved GloVe embedding object; it is used below as a mapping whose keys are
# words and whose values are stacked as vectors. `.item()` unwraps the loaded array
# into that Python object (presumably a dict saved with np.save — confirm).
# NOTE(review): allow_pickle=True unpickles file contents, which can execute
# arbitrary code — only load files from a trusted source.
restaurant_glove_embedding = np.load('../data_2015/word embedding data/pretrained/restaurant_glove_embedding.npy', allow_pickle=True).item()
# Stack all embedding values along a new first axis: one row per word
# (assumes every value is an array of the same shape — TODO confirm).
np_restaurant_glove_embedding = np.stack(list(restaurant_glove_embedding.values()), axis=0)
# Words in the same order as the rows above (keys() and values() iterate in the same order).
restaurant_word = np.stack(list(restaurant_glove_embedding.keys()), axis=0)
# One cluster per ground-truth category; random_state is fixed for repeatable runs.
km = KMeans(len(unique_category), n_init=10, max_iter=1000, tol=1e-04, random_state=42)
# ClusteringModel is a project-local wrapper. fit() returns a pair; later cells read
# the per-word cluster indices from result['Labels'].
# NOTE(review): cosine_distance=False presumably keeps the default (Euclidean) metric — confirm.
cluster = ClusteringModel(km)
result, new_embedding = cluster.fit(np_restaurant_glove_embedding, cosine_distance=False)

[32m2023-04-24 08:34:35.791[0m | [1mINFO    [0m | [36munsupervised_absa.clustering[0m:[36mfit[0m:[36m52[0m - [1mStart clustering KMeans(max_iter=1000, n_clusters=6, n_init=10, random_state=42) with 1038 Datapoints.[0m


In [7]:
# Tabulate, for each cluster (row), how many of its words carry each ground-truth
# category (column).
cluster_aggregation(restaurant_categories, restaurant_word, result['Labels'], unique_category)

Unnamed: 0,LOCATION,FOOD,DRINKS,RESTAURANT,SERVICE,AMBIENCE
0,1,14,16,5,12,13
1,0,94,0,0,0,0
2,3,5,0,4,3,11
3,1,4,1,1,9,2
4,0,0,0,0,3,3
5,0,24,4,4,3,10


In [8]:
# Hand-assigned category name for each K-Means cluster index (presumably chosen by
# reading the per-cluster category counts — confirm).
glove_labels = {
    0: 'DRINKS',
    1: 'FOOD',
    2: 'RESTAURANT',
    3: 'SERVICE',
    4: 'AMBIENCE',
    5: 'LOCATION',
}
# Translate every word's cluster index into its category name ...
labels = [glove_labels[cluster_idx] for cluster_idx in result['Labels']]
# ... and pair each word with the category of the cluster it landed in.
cluster_result = dict(zip(restaurant_word, labels))

In [9]:
from datasets import load_from_disk
# Load the restaurant aspect-term dataset from disk (rebinds restaurant_dataset).
restaurant_dataset = load_from_disk('../data_2015/mnli/restaurant_aspect_term')

In [10]:
import copy
# Build a copy of every sentence record in which each POS-tag entry also carries
# the category of the cluster its (lower-cased) word was assigned to.
# restaurant_tags itself is left untouched.
new_result = []
for sentence in restaurant_tags:
    new_sentence = copy.deepcopy(sentence)
    for idx, tag in enumerate(sentence['pos_tag']):
        # cluster_result is indexed directly, so a word without a cluster raises KeyError.
        key = tag['word'].lower()
        new_tag = {**copy.deepcopy(tag), 'category': cluster_result[key]}
        new_sentence['pos_tag'][idx] = new_tag
    new_result.append(new_sentence)

In [47]:
import pandas as pd
# Convert the whitespace-cleaned dataset to a DataFrame.
restaurant_df = cleaned_restaurant_dataset.to_pandas() 
# Collapse to one entry per sentence text: a dict mapping each term to its polarity.
# If a term occurs more than once for the same text, the last polarity wins.
# Note: restaurant_df is rebound here from the DataFrame to the groupby/apply result.
restaurant_df = restaurant_df.groupby('text').apply(lambda x: {term: polarity for term, polarity in zip(x['term'], x['polarity'])})
# Name the values so the column is called 'term_polarity' after to_frame().
restaurant_df.name = 'term_polarity'
restaurant_df = restaurant_df.to_frame()
# One row per sentence record annotated with cluster categories (from new_result).
new_df = pd.DataFrame.from_records(new_result)
# Left join on 'text': every text present in restaurant_df is kept, whether or not
# new_df has a matching record (unmatched rows get missing values in new_df's columns).
merged_df = restaurant_df.merge(new_df, on='text', how='left')

In [65]:
# Attach a 'polarity' entry to every POS-tag dict of each row.
merged_df['pos_tag'] = merged_df.apply(merge_polarity, axis=1)
# Drop the helper and ground-truth columns from the frame itself. Chaining
# .drop(columns=...) onto the Series returned by apply() does nothing, because
# Series.drop ignores `columns`; the drop has to be applied to the DataFrame.
# Note: this removes 'term_polarity', so re-running this cell requires re-running
# the cell that builds merged_df first.
merged_df = merged_df.drop(columns=['term_polarity', 'category ground truth'])

In [75]:
# Spot-check: show the ground-truth terms stored for the row labelled 6.
merged_df['term ground truth'][6]

[{'start': '2',
  'end': '11',
  'polarity': 'negative',
  'term': 'gentleman',
  'category': 'SERVICE'}]

In [68]:
from datasets import Dataset
# Convert the merged frame into a datasets.Dataset and persist it to disk.
Dataset.from_pandas(merged_df).save_to_disk('../data_2015/aspect term result/glove embedding')

Saving the dataset (0/1 shards):   0%|          | 0/1159 [00:00<?, ? examples/s]

In [76]:
import numpy as np
# Same steps as for the plain GloVe embedding, but loading the "word_count" variant of
# the embedding file. This rebinds restaurant_glove_embedding, restaurant_word, km,
# cluster, result and new_embedding, so earlier clustering results are replaced.
# NOTE(review): allow_pickle=True unpickles file contents, which can execute
# arbitrary code — only load files from a trusted source.
restaurant_glove_embedding = np.load('../data_2015/word embedding data/pretrained/restaurant_glove_embedding_word_count.npy', allow_pickle=True).item()
# Stack all embedding values along a new first axis: one row per word
# (assumes every value is an array of the same shape — TODO confirm).
np_restaurant_glove_embedding = np.stack(list(restaurant_glove_embedding.values()), axis=0)
# Words in the same order as the rows above.
restaurant_word = np.stack(list(restaurant_glove_embedding.keys()), axis=0)
# One cluster per ground-truth category; random_state is fixed for repeatable runs.
km = KMeans(len(unique_category), n_init=10, max_iter=1000, tol=1e-04, random_state=42)
# Project-local wrapper; fit() returns a pair, and result['Labels'] holds the
# per-word cluster indices used by the following cells.
cluster = ClusteringModel(km)
result, new_embedding = cluster.fit(np_restaurant_glove_embedding, cosine_distance=False)

[32m2023-04-24 09:57:09.781[0m | [1mINFO    [0m | [36munsupervised_absa.clustering[0m:[36mfit[0m:[36m52[0m - [1mStart clustering KMeans(max_iter=1000, n_clusters=6, n_init=10, random_state=42) with 330 Datapoints.[0m


In [77]:
# Tabulate, for each cluster (row), how many of its words carry each ground-truth
# category (column).
cluster_aggregation(restaurant_categories, restaurant_word, result['Labels'], unique_category)

Unnamed: 0,LOCATION,FOOD,DRINKS,RESTAURANT,SERVICE,AMBIENCE
0,5,2,1,5,2,9
1,0,0,0,0,3,1
2,0,38,4,0,0,0
3,0,5,2,2,9,3
4,0,4,1,1,5,6
5,0,2,0,1,1,1


In [None]:
# Hand-assigned category name for each K-Means cluster index. Every index produced
# by the clustering (0-5) needs exactly one entry: a repeated key silently discards
# the earlier value, and a missing key raises KeyError in the lookup below.
# NOTE(review): 'LOCATION' is assigned to both cluster 0 and cluster 5 while 'DRINKS'
# is assigned to none — confirm this is intended.
glove_labels = {
    0: 'LOCATION',
    1: 'SERVICE',
    2: 'FOOD',
    3: 'RESTAURANT',
    4: 'AMBIENCE',
    5: 'LOCATION',
}
# Translate every word's cluster index into its category name ...
labels = list(map(lambda x: glove_labels[x], result['Labels']))
# ... and pair each word with the category of the cluster it landed in.
cluster_result = dict(zip(restaurant_word, labels))