# Load data

In [None]:
# Read the test sentences, one per line, stripping only the trailing newline.
# The corpus is Vietnamese text, so the encoding is pinned to UTF-8 instead of
# relying on whatever the platform default happens to be.
with open('/content/drive/MyDrive/_UIT-VSFC/test/sents.txt', encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]

In [None]:
# Parse the test labels: one integer per line of the sentiments file.
with open('/content/drive/MyDrive/_UIT-VSFC/test/sentiments.txt') as f:
    sentiments = list(map(lambda raw: int(raw.rstrip('\n')), f))

In [None]:
from datasets import Dataset

# Pair each sentence with its label by position and wrap both columns in a
# Dataset, without first binding `test_data` to an intermediate dict.
test_data = Dataset.from_dict({'text': lines, 'sentiments': sentiments})

In [None]:
# Peek at the first row to check that the text and label columns loaded.
test_data[0]

{'text': 'nói tiếng anh lưu loát .', 'sentiments': 2}

In [None]:
# Drop every row labelled 1; the predictor further down only emits 0 or 2.
# (Presumably 1 is the neutral class -- confirm against the dataset description.)
pos_neg_only = test_data.filter(lambda x: x['sentiments'] != 1)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Row counts before and after removing label-1 rows.
print(len(test_data))
print(len(pos_neg_only))

3166
2999


# Word2Vec Semantic Orientation

In [2]:
import py_vncorenlp

# Download the VnCoreNLP components from the original repository into
# /content; the model is loaded from the same folder in the next cell.
py_vncorenlp.download_model(save_dir='/content')

In [3]:
# Load VnCoreNLP from the download folder with only the "wseg" and "pos"
# annotators enabled (word segmentation and POS tagging); get_pos() below reads
# the resulting 'wordForm' and 'posTag' fields.
model = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"], save_dir='/content')

In [None]:
def get_pos(annotated):
    """Split the first annotated sentence into parallel word and POS-tag lists.

    Only ``annotated[0]`` is read; any further entries are ignored.

    Args:
        annotated: Indexable whose element ``0`` is a sequence of token dicts,
            each carrying a ``'wordForm'`` and a ``'posTag'`` key.

    Returns:
        Tuple ``(words, tags)`` of two equally long lists.
    """
    words, tags = [], []
    for token in annotated[0]:
        words.append(token['wordForm'])
        tags.append(token['posTag'])
    return words, tags

In [None]:
def extract_opinion_phrases(sentence, pos_tags, patterns=None):
    """Collect adjacent word pairs whose POS-tag pair matches an opinion pattern.

    Args:
        sentence: Sequence of words.
        pos_tags: Sequence of POS tags, parallel to ``sentence``. Any sequence
            type is accepted (a tuple works as well as a list).
        patterns: Optional iterable of two-tag patterns to match. Defaults to
            N-A, V-A, R-A, R-V and V-R. The candidate pattern N-V is
            deliberately not part of the default; pass it explicitly to try it.

    Returns:
        Tuple ``(extracted_phrases, extracted_pos)``: the matching two-word
        slices of ``sentence`` and, in the same order, their tag pairs as lists.
    """
    if patterns is None:
        patterns = (('N', 'A'),
                    ('V', 'A'),
                    ('R', 'A'),
                    ('R', 'V'),
                    ('V', 'R'))
    # Tuples in a set: hashable lookup that does not depend on whether the
    # caller passed the tags as a list or a tuple.
    wanted = {tuple(pattern) for pattern in patterns}

    extracted_phrases = []
    extracted_pos = []
    # Stop one short of the end: the last word has no right-hand neighbour, and
    # a pair is only meaningful where both a word pair and a tag pair exist.
    for start in range(min(len(sentence), len(pos_tags)) - 1):
        tag_pair = tuple(pos_tags[start:start + 2])
        if tag_pair in wanted:
            extracted_phrases.append(sentence[start:start + 2])
            extracted_pos.append(list(tag_pair))
    return extracted_phrases, extracted_pos

In [None]:
import torch

In [None]:
# The pretrained word2vec table is a large file, so it is kept as a `datasets`
# dataset on disk and loaded from there, which is faster than going through gensim.
from datasets import load_from_disk
word2vec = load_from_disk('/content/drive/MyDrive/pho_w2v')
# Reverse index: word -> row number in the dataset.
word_dict = dict((word, idx) for idx, word in enumerate(word2vec['word']))

def word_embed(word):
    """Look up ``word`` in the dataset-backed embedding table.

    Args:
        word: Vocabulary entry to look up.

    Returns:
        A ``torch.Tensor`` built from the row's ``'embed'`` field, or ``None``
        when ``word`` has no entry in ``word_dict``.
    """
    row = word_dict.get(word)
    if row is None:
        return None
    return torch.Tensor(word2vec[row]['embed'])

In [None]:
import gensim
# Alternative embedding source: custom vectors stored in binary word2vec format.
# NOTE: this rebinds `word2vec`, replacing the object loaded from `pho_w2v`
# earlier in the notebook, so only one of the two sources is active at a time.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Colab Notebooks/WebMining/custom_w2v/custom_w2v.bin', binary=True)

In [None]:
def word_embed(word):
    """Look up ``word`` in the keyed-vector table held in ``word2vec``.

    Args:
        word: Vocabulary entry to look up.

    Returns:
        The vector as a ``torch.Tensor``, or ``None`` when the lookup raises
        ``KeyError`` (the word is out of vocabulary).
    """
    try:
        vector = word2vec[word]
    except KeyError:
        return None
    return torch.Tensor(vector)

In [None]:
def phrase_embed(phrase, dim=100):
    """Average the embeddings of the in-vocabulary words of ``phrase``.

    Words for which ``word_embed`` returns ``None`` are skipped. Each word is
    looked up exactly once.

    Args:
        phrase: Iterable of words.
        dim: Size of the fallback vector returned when no word of the phrase
            has an embedding. It does not constrain the size of real
            embeddings; the mean simply takes the shape of the word vectors.

    Returns:
        The mean word vector as a tensor. When no word is known, a zero vector
        of length ``dim`` is returned instead of dividing 0 by 0, which would
        produce a vector of NaNs.
    """
    total = None
    count = 0
    for word in phrase:
        vector = word_embed(word)
        if vector is None:
            continue
        # `total + vector` allocates a new tensor, so the looked-up vectors are
        # never modified in place.
        total = vector if total is None else total + vector
        count += 1

    if count == 0:
        return torch.zeros(dim)
    return total / count

In [None]:
import torch.nn.functional as F

In [None]:
def semantic_orientation(phrases, pos_word='tốt', neg_word='kém'):
    """Score a sentence by summing the polarity of its phrase embeddings.

    Each phrase contributes ``cos(phrase, pos_anchor) - cos(phrase, neg_anchor)``,
    where the anchors are the embeddings of ``pos_word`` and ``neg_word``.

    Args:
        phrases: Iterable of phrases, each an iterable of words.
        pos_word: Word whose embedding is the positive anchor.
        neg_word: Word whose embedding is the negative anchor.

    Returns:
        The summed polarity: the int ``0`` when no phrase contributes,
        otherwise a tensor.

    Raises:
        ValueError: If either anchor word has no embedding.
    """
    pos_anchor = word_embed(pos_word)
    neg_anchor = word_embed(neg_word)
    if pos_anchor is None or neg_anchor is None:
        # Fail with a readable message rather than a TypeError from deep inside
        # cosine_similarity.
        raise ValueError(
            'anchor word missing from the embedding vocabulary: '
            f'{pos_word!r} / {neg_word!r}'
        )

    sentence_semantic = 0
    for phrase in phrases:
        phrase_embedded = phrase_embed(phrase)
        # A phrase with no in-vocabulary word can come back as a NaN vector;
        # adding its similarity would turn the whole sentence score into NaN,
        # so such a phrase is skipped and contributes nothing.
        if torch.isnan(phrase_embedded).any():
            continue
        pos_similarity = F.cosine_similarity(phrase_embedded, pos_anchor, dim=0)
        neg_similarity = F.cosine_similarity(phrase_embedded, neg_anchor, dim=0)
        sentence_semantic += pos_similarity - neg_similarity
    return sentence_semantic

In [None]:
def semantic_orientation2(phrases):
    """Score a sentence word by word instead of phrase by phrase.

    Every in-vocabulary word of every phrase contributes its cosine similarity
    to the embedding of 'tốt' minus its cosine similarity to the embedding of
    'kém'. Words without an embedding are skipped.

    Args:
        phrases: Iterable of phrases, each an iterable of words.

    Returns:
        The summed polarity: the int ``0`` when nothing contributes, otherwise
        a tensor.
    """
    good = word_embed('tốt')
    bad = word_embed('kém')

    def polarity(vector):
        toward_good = F.cosine_similarity(vector, good, dim=0)
        toward_bad = F.cosine_similarity(vector, bad, dim=0)
        return toward_good - toward_bad

    vectors = (word_embed(token) for group in phrases for token in group)
    return sum(polarity(vector) for vector in vectors if vector is not None)

In [None]:
def eval_test_data(example):
    """Annotate one example and store its semantic-orientation score.

    Args:
        example: Mapping with a ``'text'`` entry; it is modified in place.

    Returns:
        The same ``example`` with ``'score'`` set to ``0`` when no opinion
        phrase is extracted, otherwise to the value of
        ``semantic_orientation`` over the extracted phrases.
    """
    words, tags = get_pos(model.annotate_text(example['text']))
    phrases, _ = extract_opinion_phrases(words, tags)
    # semantic_orientation2(phrases) is the word-level alternative scorer.
    example['score'] = semantic_orientation(phrases) if len(phrases) != 0 else 0
    return example

In [None]:
def check_example_with_phrase(example):
    """Tell whether an example yields at least one opinion phrase.

    Args:
        example: Mapping with a ``'text'`` entry.

    Returns:
        ``True`` when ``extract_opinion_phrases`` finds one or more phrases in
        the annotated text, ``False`` otherwise.
    """
    tokens, tags = get_pos(model.annotate_text(example['text']))
    phrases, _ = extract_opinion_phrases(tokens, tags)
    return len(phrases) != 0

In [None]:
# Keep only the examples from which at least one opinion phrase can be
# extracted; check_example_with_phrase annotates each text to decide.
pos_neg_only_with_phrase = pos_neg_only.filter(check_example_with_phrase)



  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
from tqdm import tqdm

# Score every remaining example in dataset order, so the list lines up
# row-for-row with pos_neg_only_with_phrase.
scores = []
for example in tqdm(pos_neg_only_with_phrase):
    score = eval_test_data(example)['score']
    # float() normalises the score (a tensor, or the int 0) to a plain number.
    scores.append(float(score))

100%|██████████| 2572/2572 [00:04<00:00, 636.74it/s]


In [None]:
# Attach the scores as a "scores" column; predicted_label reads this column.
pos_neg_only_scored = pos_neg_only_with_phrase.add_column("scores", scores)

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
def predicted_label(example):
    """Turn an example's score into a predicted label.

    Args:
        example: Mapping with a numeric ``'scores'`` entry; it is modified in
            place.

    Returns:
        The same ``example`` with ``'predicted'`` set to ``2`` for a score
        above zero, ``0`` for a score below zero, and ``-1`` otherwise.
    """
    value = example['scores']
    label = -1  # used when the score is neither above nor below zero
    if value > 0:
        label = 2
    elif value < 0:
        label = 0
    example['predicted'] = label
    return example

In [None]:
# Add the "predicted" column by applying predicted_label to every row.
pos_neg_only_scored = pos_neg_only_scored.map(predicted_label)

  0%|          | 0/2572 [00:00<?, ?ex/s]

In [None]:
# Accuracy over the scored subset only: rows labelled 1 and rows without any
# extracted phrase were filtered out earlier and are not in the denominator.
correct = pos_neg_only_scored.filter(lambda x: x['sentiments'] == x['predicted'])
print(len(correct))
print('Accuracy:', len(correct) / len(pos_neg_only_scored))

  0%|          | 0/3 [00:00<?, ?ba/s]

2055
Accuracy: 0.7989891135303266


#### Word2Vec trained on the training data

##### If examples with no extracted phrase are counted as incorrect

|          | No neutral embed | With neutral embed |
| -------- | ---------------- | ------------------ |
| Method 1 | 0.69             | 0.687              |
| Method 2 | 0.687            | 0.684              |

##### If examples with no extracted phrase are excluded

|          | No neutral embed | With neutral embed |
| -------- | ---------------- | ------------------ |
| Method 1 | 0.815            | 0.811              |
| Method 2 | 0.812            | 0.808              |

#### Pre-trained Word2Vec

|          | Accuracy         |
| -------- | ---------------- | 
| Method 1 | 0.554            | 
| Method 2 | 0.551            | 

# Experiment with cluster word2vec and train word2vec





## Train word2vec

In [None]:
from datasets import Dataset

# Training sentences are Vietnamese text: pin UTF-8 rather than rely on the
# platform default encoding.
# NOTE: `lines` and `sentiments` are rebound here, replacing the test-set lists
# loaded at the top of the notebook.
with open('/content/drive/MyDrive/_UIT-VSFC/train/sents.txt', encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]

# One integer label per line of the sentiments file.
with open('/content/drive/MyDrive/_UIT-VSFC/train/sentiments.txt') as f:
    sentiments = [int(line.rstrip('\n')) for line in f]

# Pair sentences and labels by position in a single Dataset.
train_data = Dataset.from_dict({'text': lines, 'sentiments': sentiments})

In [None]:
# Annotate the first training sentence as a sanity check of the tagger output.
annotated = model.annotate_text(train_data[0]['text'])

In [None]:
# Show the (words, POS tags) pair extracted from that annotation.
get_pos(annotated)

(['slide', 'giáo_trình', 'đầy_đủ', '.'], ['N', 'N', 'A', 'CH'])

In [None]:
def tokenize_sentence(example):
    """Add the segmented words of an example's text under ``'words'``.

    Args:
        example: Mapping with a ``'text'`` entry; it is modified in place.

    Returns:
        The same ``example`` with ``'words'`` set to the word list returned by
        ``get_pos``; the POS tags are discarded.
    """
    example['words'] = get_pos(model.annotate_text(example['text']))[0]
    return example

In [None]:
# Add a "words" column holding the segmented words of every training sentence.
train_data = train_data.map(tokenize_sentence)



  0%|          | 0/11426 [00:00<?, ?ex/s]

In [None]:
# Drop rows labelled 1 before training, mirroring the filter applied to the
# test set (this is the "no neutral" variant saved at the end of the notebook).
train_data = train_data.filter(lambda x: x['sentiments'] != 1)

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
from gensim.models import Word2Vec

# Train 100-dimensional vectors on the segmented training sentences;
# phrase_embed accumulates into a length-100 vector, so the sizes must agree.
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword (newer releases
# use `vector_size=`) -- confirm against the installed gensim version.
# NOTE(review): no seed is set and workers=2, so runs may not be reproducible.
custom_w2v = Word2Vec(sentences=train_data['words'], size=100, window=5, min_count=1, workers=2)

In [None]:
# Shape of the learned embedding matrix.
custom_w2v.wv.vectors.shape

(3568, 100)

In [None]:
# Save the vectors in binary word2vec format.
# NOTE(review): this path differs from the custom_w2v/custom_w2v.bin file loaded
# earlier in the notebook -- confirm which file the evaluation should read.
custom_w2v.wv.save_word2vec_format('/content/drive/MyDrive/Colab Notebooks/WebMining/custom_w2v_no_neutral.bin', binary=True)