# Extract Embeddings

This section extracts the word embedding of each noun tagged during the POS-tagging process. These terms will then be clustered to identify the specific domain they belong to.

## Short words removed

In [5]:
import json
from tqdm.auto import tqdm

# Load the POS-tagged records. JSON is UTF-8 by specification, so the encoding
# is pinned instead of relying on the platform default.
with open('data/pos tag/semeval_pos_tag_remove_short_words.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

# Flatten the `word` field of every `pos_tag` entry of every record into one
# list. Duplicates and original casing are kept at this point.
list_of_words = [
    pos_data['word']
    for row in tqdm(pos_tags)
    for pos_data in row['pos_tag']
]

100%|██████████| 6055/6055 [00:00<00:00, 3257215.69it/s]


In [6]:
# Lower-case every word and remove all the duplicates.
# The result is sorted so the vocabulary has a stable order: iterating a set
# of strings can differ between interpreter runs (hash randomisation), so
# anything produced in list order (e.g. the saved embeddings) would otherwise
# not line up with the words across re-executions of the notebook.
list_of_words = sorted({word.lower() for word in list_of_words})

### Word Embedding

In [19]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the `bert-large-uncased` checkpoint.
# NOTE(review): a CUDA device is requested here, yet the log captured below
# this cell reports "device: cpu" — confirm which device is actually used.
bert_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_WORD,
    'bert-large-uncased',
    device="cuda",
)

[32m2023-04-14 16:31:27.084[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [20]:
# Run the BERT extractor over the current `list_of_words`.
# NOTE(review): the recorded progress bar counts 1393 items — the length
# printed for the low-frequency-filtered list later in this notebook — while
# the BART run in this section counts 3906. The execution counts are out of
# order, so verify `list_of_words` held this section's list when the cell ran.
bert_embedding_data = bert_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [03:07<00:00,  7.41it/s]


In [21]:
import numpy as np

# Write the BERT extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_bert_embedding.npy',
    arr=bert_embedding_data,
)

### Bart-Large

In [3]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the `facebook/bart-large` checkpoint.
# NOTE(review): a CUDA device is requested here, yet the log captured below
# this cell reports "device: cpu" — confirm which device is actually used.
bart_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_WORD,
    'facebook/bart-large',
    device="cuda",
)

[32m2023-04-14 16:01:49.825[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [7]:
# Run the BART extractor over the current `list_of_words`.
bart_embedding_data = bart_embedding.extract(list_of_words)

100%|██████████| 3906/3906 [13:12<00:00,  4.93it/s] 


In [8]:
import numpy as np

# Write the BART extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_bart_embedding.npy',
    arr=bart_embedding_data,
)

### Deberta-v3-Large

In [29]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the `microsoft/deberta-v3-large` checkpoint
# (no explicit device argument is passed here).
deberta_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_WORD,
    'microsoft/deberta-v3-large',
)

[32m2023-04-13 19:01:21.564[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [30]:
# Run the DeBERTa extractor over the current `list_of_words`.
deberta_embedding_data = deberta_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [13:49<00:00,  1.68it/s]


In [31]:
import numpy as np

# Write the DeBERTa extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_deberta_embedding.npy',
    arr=deberta_embedding_data,
)

### Glove

In [32]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the 'glove' embeddings using ModelType.WORD.
glove_embedding = ExtractEmbedding(
    ModelType.WORD,
    'glove',
)

[32m2023-04-13 19:15:14.245[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [33]:
# Run the GloVe extractor over the current `list_of_words`.
glove_embedding_data = glove_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:00<00:00, 5872.58it/s]


In [34]:
import numpy as np

# Write the GloVe extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_glove_embedding.npy',
    arr=glove_embedding_data,
)

### FastText

In [35]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the 'en' identifier using ModelType.WORD
# (FastText, per this section's heading).
fast_text_embedding = ExtractEmbedding(
    ModelType.WORD,
    'en',
)

[32m2023-04-13 19:15:27.209[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [36]:
# Run the FastText extractor over the current `list_of_words`.
fast_text_embedding_data = fast_text_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:00<00:00, 5394.54it/s]


In [37]:
import numpy as np

# Write the FastText extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_fast_text_embedding.npy',
    arr=fast_text_embedding_data,
)

### Word2Vec

Word2Vec is not available in flair, so we convert the gensim model into a format that flair can load.

#### Using Word2Vec pretrained on google news

In [38]:
import gensim.downloader

# Fetch the pretrained Google News word2vec vectors. With return_path=True the
# call hands back the local file path instead of a loaded model.
model_path = gensim.downloader.load(
    'word2vec-google-news-300',
    return_path=True,
)

In [39]:
# Parse the downloaded binary word2vec file into gensim KeyedVectors, then
# re-save it in gensim's own format under models/.
vectors = gensim.models.KeyedVectors.load_word2vec_format(
    model_path,
    binary=True,
)
vectors.save(
    'models/word2vec-google.gensim',
    pickle_protocol=4,
)

In [40]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor from the converted word2vec file saved under models/.
word2vec_embedding = ExtractEmbedding(
    ModelType.WORD,
    'models/word2vec-google.gensim',
)

[32m2023-04-13 19:16:52.221[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [41]:
# Run the Word2Vec extractor over the current `list_of_words`.
word2vec_embedding_data = word2vec_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:00<00:00, 1637.42it/s]


In [42]:
import numpy as np

# Write the Word2Vec extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_word2vec_embedding.npy',
    arr=word2vec_embedding_data,
)

### Stacked Embedding

Stacked embedding of the embedding models [DeBERTa-v3-large, BERT-base-uncased, GloVe, FastText, Word2Vec]

In [45]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build one extractor from five models. The two lists are parallel: the i-th
# model type is paired with the i-th model identifier.
stacked_embedding = ExtractEmbedding(
    [
        ModelType.TRANSFORMER_WORD,
        ModelType.TRANSFORMER_WORD,
        ModelType.WORD,
        ModelType.WORD,
        ModelType.WORD,
    ],
    [
        'microsoft/deberta-v3-large',
        'bert-base-uncased',
        'glove',
        'en',
        'models/word2vec-google.gensim',
    ],
)

[32m2023-04-13 19:25:37.792[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [46]:
# Run the stacked extractor over the current `list_of_words`.
stacked_embedding_data = stacked_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [24:47<00:00,  1.07s/it]


In [47]:
import numpy as np

# Write the stacked extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_stacked_embedding.npy',
    arr=stacked_embedding_data,
)

### sup-promcse-roberta-large word embedding

In [48]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `YuxinJiang/sup-promcse-roberta-large`, using
# ModelType.TRANSFORMER_DOC.
promcse_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'YuxinJiang/sup-promcse-roberta-large',
)

[32m2023-04-13 19:50:35.373[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [49]:
# Run the PromCSE extractor over the current `list_of_words`.
promcse_embedding_data = promcse_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [04:46<00:00,  4.86it/s]


In [50]:
import numpy as np

# Write the PromCSE word-level extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/promcse_word_embedding.npy',
    arr=promcse_embedding_data,
)

### sup-simcse-roberta-large word embedding

In [51]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `princeton-nlp/sup-simcse-roberta-large`, using
# ModelType.TRANSFORMER_DOC.
simcse_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'princeton-nlp/sup-simcse-roberta-large',
)

[32m2023-04-13 19:55:30.908[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [52]:
# Run the SimCSE extractor over the current `list_of_words`.
simcse_embedding_data = simcse_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [04:36<00:00,  5.04it/s]


In [53]:
import numpy as np

# Write the SimCSE word-level extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/simcse_word_embedding.npy',
    arr=simcse_embedding_data,
)

### all-MiniLM-L6-v2

In [54]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `sentence-transformers/all-MiniLM-L6-v2`, using
# ModelType.TRANSFORMER_DOC.
all_MiniLM_L6_v2_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'sentence-transformers/all-MiniLM-L6-v2',
)

[32m2023-04-13 20:00:09.429[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [55]:
# Run the all-MiniLM-L6-v2 extractor over the current `list_of_words`.
all_MiniLM_L6_v2_embedding_data = all_MiniLM_L6_v2_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:22<00:00, 61.28it/s]


In [56]:
import numpy as np

# Write the all-MiniLM-L6-v2 word-level extraction result to disk as a .npy file.
np.save(
    file='data/word embedding data/all_MiniLM_L6_v2_word_embedding.npy',
    arr=all_MiniLM_L6_v2_embedding_data,
)

## Low Frequency (words with count 1) Words Removed

In [10]:
import json
from tqdm.auto import tqdm

# Load the POS-tagged records for the low-count-filtered variant. JSON is
# UTF-8 by specification, so the encoding is pinned instead of relying on the
# platform default.
with open('data/pos tag/semeval_pos_tag_remove_short_words_and_low_counts.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

# Flatten the `word` field of every `pos_tag` entry of every record into one
# list. This rebinds `list_of_words`, so every later cell sees this list.
list_of_words = [
    pos_data['word']
    for row in tqdm(pos_tags)
    for pos_data in row['pos_tag']
]

100%|██████████| 6055/6055 [00:00<00:00, 3673732.20it/s]


In [11]:
# Lower-case every word and remove all the duplicates.
# The result is sorted so the vocabulary has a stable order: iterating a set
# of strings can differ between interpreter runs (hash randomisation), so
# anything produced in list order (e.g. the saved embeddings) would otherwise
# not line up with the words across re-executions of the notebook.
list_of_words = sorted({word.lower() for word in list_of_words})

In [12]:
# Show how many unique lower-cased words remain after filtering.
len(list_of_words)

1393

### Word Embedding

In [16]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the `bert-large-uncased` checkpoint
# (no explicit device argument is passed here).
bert_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_WORD,
    'bert-large-uncased',
)

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 16.0kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 571/571 [00:00<00:00, 170kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 595kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.00MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [01:21<00:00, 16.6MB/s]
[32m2023-04-14 16:27:37.585[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [17]:
# Run the BERT extractor over the current `list_of_words`.
bert_embedding_data = bert_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [03:40<00:00,  6.31it/s]


In [18]:
import numpy as np

# Write the BERT extraction result for this section to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_bert_embedding_word_count.npy',
    arr=bert_embedding_data,
)

### Bart-Large

In [13]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the `facebook/bart-large` checkpoint.
# NOTE(review): a CUDA device is requested here, yet the log captured below
# this cell reports "device: cpu" — confirm which device is actually used.
bart_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_WORD,
    'facebook/bart-large',
    device="cuda",
)

[32m2023-04-14 16:19:38.267[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [14]:
# Run the BART extractor over the current `list_of_words`.
bart_embedding_data = bart_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [03:23<00:00,  6.86it/s]


In [15]:
import numpy as np

# Write the BART extraction result for this section to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_bart_embedding_word_count.npy',
    arr=bart_embedding_data,
)

### Deberta-v3-Large

In [None]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the `microsoft/deberta-v3-large` checkpoint
# (no explicit device argument is passed here).
deberta_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_WORD,
    'microsoft/deberta-v3-large',
)

In [63]:
# Run the DeBERTa extractor over the current `list_of_words`.
deberta_embedding_data = deberta_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [24:32<00:00,  1.06s/it]


In [64]:
import numpy as np

# Write the DeBERTa extraction result for this section to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_deberta_embedding_word_count.npy',
    arr=deberta_embedding_data,
)

### Glove

In [65]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the 'glove' embeddings using ModelType.WORD.
glove_embedding = ExtractEmbedding(
    ModelType.WORD,
    'glove',
)

[32m2023-04-13 20:26:48.862[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [66]:
# Run the GloVe extractor over the current `list_of_words`.
glove_embedding_data = glove_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:00<00:00, 4154.65it/s]


In [67]:
import numpy as np

# Write the GloVe extraction result for this section to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_glove_embedding_word_count.npy',
    arr=glove_embedding_data,
)

### FastText

In [68]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for the 'en' identifier using ModelType.WORD
# (FastText, per this section's heading).
fast_text_embedding = ExtractEmbedding(
    ModelType.WORD,
    'en',
)

[32m2023-04-13 20:27:07.933[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [69]:
# Run the FastText extractor over the current `list_of_words`.
fast_text_embedding_data = fast_text_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:00<00:00, 3428.09it/s]


In [70]:
import numpy as np

# Write the FastText extraction result for this section to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_fast_text_embedding_word_count.npy',
    arr=fast_text_embedding_data,
)

### Word2Vec

Word2Vec is not available in flair, so we convert the gensim model into a format that flair can load.

#### Using Word2Vec pretrained on google news

In [71]:
import gensim.downloader

# Fetch the pretrained Google News word2vec vectors. With return_path=True the
# call hands back the local file path instead of a loaded model.
model_path = gensim.downloader.load(
    'word2vec-google-news-300',
    return_path=True,
)

In [72]:
# Parse the downloaded binary word2vec file into gensim KeyedVectors, then
# re-save it in gensim's own format under models/.
vectors = gensim.models.KeyedVectors.load_word2vec_format(
    model_path,
    binary=True,
)
vectors.save(
    'models/word2vec-google.gensim',
    pickle_protocol=4,
)

In [73]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor from the converted word2vec file saved under models/.
word2vec_embedding = ExtractEmbedding(
    ModelType.WORD,
    'models/word2vec-google.gensim',
)

[32m2023-04-13 20:29:15.467[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [74]:
# Run the Word2Vec extractor over the current `list_of_words`.
word2vec_embedding_data = word2vec_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:00<00:00, 1543.02it/s]


In [75]:
import numpy as np

# Write the Word2Vec extraction result for this section to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_word2vec_embedding_word_count.npy',
    arr=word2vec_embedding_data,
)

### Stacked Embedding

Stacked embedding of the embedding models [DeBERTa-v3-large, BERT-base-uncased, GloVe, FastText, Word2Vec]

In [76]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build one extractor from five models. The two lists are parallel: the i-th
# model type is paired with the i-th model identifier.
stacked_embedding = ExtractEmbedding(
    [
        ModelType.TRANSFORMER_WORD,
        ModelType.TRANSFORMER_WORD,
        ModelType.WORD,
        ModelType.WORD,
        ModelType.WORD,
    ],
    [
        'microsoft/deberta-v3-large',
        'bert-base-uncased',
        'glove',
        'en',
        'models/word2vec-google.gensim',
    ],
)

[32m2023-04-13 20:31:47.082[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [77]:
# Run the stacked extractor over the current `list_of_words`.
stacked_embedding_data = stacked_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [24:37<00:00,  1.06s/it]


In [78]:
import numpy as np

# Write the stacked extraction result for this section to disk as a .npy file.
np.save(
    file='data/word embedding data/sem_eval_stacked_embedding_word_count.npy',
    arr=stacked_embedding_data,
)

### sup-promcse-roberta-large word embedding

In [88]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `YuxinJiang/sup-promcse-roberta-large`, using
# ModelType.TRANSFORMER_DOC.
promcse_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'YuxinJiang/sup-promcse-roberta-large',
)

[32m2023-04-13 21:10:01.653[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [89]:
# Run the PromCSE extractor over the current `list_of_words`.
promcse_embedding_data = promcse_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [04:10<00:00,  5.57it/s]


In [90]:
import numpy as np

# Write the PromCSE word-level extraction result for this section to disk.
np.save(
    file='data/word embedding data/promcse_word_embedding_word_count.npy',
    arr=promcse_embedding_data,
)

### sup-simcse-roberta-large word embedding

In [91]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `princeton-nlp/sup-simcse-roberta-large`, using
# ModelType.TRANSFORMER_DOC.
simcse_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'princeton-nlp/sup-simcse-roberta-large',
)

[32m2023-04-13 21:14:18.024[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [92]:
# Run the SimCSE extractor over the current `list_of_words`.
simcse_embedding_data = simcse_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [05:11<00:00,  4.48it/s]


In [93]:
import numpy as np

# Write the SimCSE word-level extraction result for this section to disk.
np.save(
    file='data/word embedding data/simcse_word_embedding_word_count.npy',
    arr=simcse_embedding_data,
)

### all-MiniLM-L6-v2

In [94]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `sentence-transformers/all-MiniLM-L6-v2`, using
# ModelType.TRANSFORMER_DOC.
all_MiniLM_L6_v2_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'sentence-transformers/all-MiniLM-L6-v2',
)

[32m2023-04-13 21:19:31.160[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [95]:
# Run the all-MiniLM-L6-v2 extractor over the current `list_of_words`.
all_MiniLM_L6_v2_embedding_data = all_MiniLM_L6_v2_embedding.extract(list_of_words)

100%|██████████| 1393/1393 [00:35<00:00, 39.47it/s]


In [96]:
import numpy as np

# Write the all-MiniLM-L6-v2 word-level extraction result for this section to disk.
np.save(
    file='data/word embedding data/all_MiniLM_L6_v2_word_embedding_word_count.npy',
    arr=all_MiniLM_L6_v2_embedding_data,
)

### Sentence Embedding

In [None]:
from datasets import load_from_disk
# Load the dataset previously saved under 'data/preprocessed data'.
preprocessed_dataset = load_from_disk('data/preprocessed data')

In [11]:
# Convert the dataset to a DataFrame and keep one row per distinct `text`
# value, so each sentence is embedded only once.
preprocessed_df = (
    preprocessed_dataset
    .to_pandas()
    .drop_duplicates(subset=['text'])
)

#### sup-promcse-roberta-large

In [None]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `YuxinJiang/sup-promcse-roberta-large`, using
# ModelType.TRANSFORMER_DOC.
promcse_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'YuxinJiang/sup-promcse-roberta-large',
)

[32m2023-04-12 13:28:07.020[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [None]:
# Run the PromCSE extractor over the `text` column of the de-duplicated frame
# (a pandas Series is passed here, not a list).
promcse_embedding_data = promcse_embedding.extract(preprocessed_df['text'])

100%|██████████| 6055/6055 [17:53<00:00,  5.64it/s]


In [None]:
import numpy as np

# Write the PromCSE sentence-level extraction result to disk as a .npy file.
np.save(
    file='data/sentence embedding data/promcse_embedding.npy',
    arr=promcse_embedding_data,
)

#### sup-simcse-roberta-large

In [None]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `princeton-nlp/sup-simcse-roberta-large`, using
# ModelType.TRANSFORMER_DOC.
simcse_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'princeton-nlp/sup-simcse-roberta-large',
)

[32m2023-04-12 13:46:04.884[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [None]:
# Run the SimCSE extractor over the `text` column of the de-duplicated frame
# (a pandas Series is passed here, not a list).
simcse_embedding_data = simcse_embedding.extract(preprocessed_df['text'])

100%|██████████| 6055/6055 [18:21<00:00,  5.50it/s]  


In [None]:
import numpy as np

# Write the SimCSE sentence-level extraction result to disk as a .npy file.
np.save(
    file='data/sentence embedding data/simcse_embedding.npy',
    arr=simcse_embedding_data,
)

#### all-MiniLM-L6-v2

In [None]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build the extractor for `sentence-transformers/all-MiniLM-L6-v2`, using
# ModelType.TRANSFORMER_DOC.
all_MiniLM_L6_v2_embedding = ExtractEmbedding(
    ModelType.TRANSFORMER_DOC,
    'sentence-transformers/all-MiniLM-L6-v2',
)

[32m2023-04-12 14:04:28.041[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [None]:
# Run the all-MiniLM-L6-v2 extractor over the `text` column of the
# de-duplicated frame (a pandas Series is passed here, not a list).
all_MiniLM_L6_v2_embedding_data = all_MiniLM_L6_v2_embedding.extract(preprocessed_df['text'])

100%|██████████| 6055/6055 [01:10<00:00, 86.18it/s]


In [None]:
import numpy as np

# Write the all-MiniLM-L6-v2 sentence-level extraction result to disk as a .npy file.
np.save(
    file='data/sentence embedding data/all_MiniLM_L6_v2_embedding.npy',
    arr=all_MiniLM_L6_v2_embedding_data,
)

#### Stacked Sentence Embedding

Stacked embedding of the embedding models [YuxinJiang/sup-promcse-roberta-large, princeton-nlp/sup-simcse-roberta-large, sentence-transformers/all-MiniLM-L6-v2]

In [None]:
from unsupervised_absa.embedding import ExtractEmbedding, ModelType

# Build one extractor from three models. The two lists are parallel: the i-th
# model type is paired with the i-th model identifier.
stacked_sentence_embedding = ExtractEmbedding(
    [
        ModelType.TRANSFORMER_DOC,
        ModelType.TRANSFORMER_DOC,
        ModelType.TRANSFORMER_DOC,
    ],
    [
        'YuxinJiang/sup-promcse-roberta-large',
        'princeton-nlp/sup-simcse-roberta-large',
        'sentence-transformers/all-MiniLM-L6-v2',
    ],
)

[32m2023-04-12 14:05:47.821[0m | [1mINFO    [0m | [36munsupervised_absa.embedding[0m:[36m__init__[0m:[36m80[0m - [1mTagger model instantiated with device: cpu[0m


In [None]:
# Run the stacked sentence extractor over the `text` column of the
# de-duplicated frame (a pandas Series is passed here, not a list).
stacked_sentence_embedding_data = stacked_sentence_embedding.extract(preprocessed_df['text'])

100%|██████████| 6055/6055 [37:26<00:00,  2.70it/s]


In [None]:
import numpy as np

# Write the stacked sentence-level extraction result to disk as a .npy file.
np.save(
    file='data/sentence embedding data/stacked_sentence_embedding.npy',
    arr=stacked_sentence_embedding_data,
)