In [1]:
import torch
import torch.nn as nn
import pickle
import requests

import sys
import os
sys.path.append(os.path.abspath('../..'))
from app.classes.glove_model import Glove

In [2]:
# Location of the pickled model, relative to the notebook's working directory.
GLOVE_MODEL_PATH = '../../app/models/glove_gensim/glove_gensim.pkl'

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that were produced by this project / a trusted source.
with open(GLOVE_MODEL_PATH, 'rb') as f:
    # `model` is used as a module-level global by the analogy helpers below.
    model = pickle.load(f)

In [3]:
# Word-analogy test file; sections are introduced by header lines such as
# ": gram7-past-tense", each followed by whitespace-separated question lines.
analogy_url = "https://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt"

# Seconds to wait on the server before giving up (requests has no default timeout).
_ANALOGY_REQUEST_TIMEOUT = 30


def _load_analogy_section(url, section_start, section_end):
    """Download the analogy file and return the questions between two headers.

    Args:
        url: Address of the plain-text analogy file.
        section_start: Header prefix that opens the wanted range (exclusive).
        section_end: Header prefix that closes the wanted range (exclusive).

    Returns:
        A list of token lists, one per non-empty line found after
        ``section_start`` and before ``section_end``. Header lines of any
        intermediate section (lines starting with ``:``) are skipped so the
        result contains question rows only. Returns an empty list if
        ``section_start`` never appears.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    response = requests.get(url, timeout=_ANALOGY_REQUEST_TIMEOUT)
    response.raise_for_status()

    questions = []
    in_section = False
    for line in response.text.strip().split('\n'):
        if line.startswith(section_start):
            in_section = True
        elif line.startswith(section_end):
            # Stops the scan even if the start header has not been seen yet.
            break
        elif in_section and line and not line.startswith(':'):
            questions.append(line.split())
    return questions


def load_word_analogy_data_for_syntactic_accuracy(url):
    """Return the analogy questions of the ``gram7-past-tense`` section.

    Args:
        url: Address of the plain-text analogy file.

    Returns:
        List of token lists for the lines between the ``: gram7-past-tense``
        and ``: gram8-plural`` headers.
    """
    return _load_analogy_section(url, ': gram7-past-tense', ': gram8-plural')


def load_word_analogy_data_for_semantic_accuracy(url):
    """Return the analogy questions from ``capital-common-countries`` up to ``currency``.

    Any section that sits between those two headers in the file is included;
    its header line is dropped rather than returned as a row.

    Args:
        url: Address of the plain-text analogy file.

    Returns:
        List of token lists for the lines between the
        ``: capital-common-countries`` and ``: currency`` headers.
    """
    return _load_analogy_section(url, ': capital-common-countries', ': currency')

syntactic_analogy_data = load_word_analogy_data_for_syntactic_accuracy(analogy_url)
semantic_analogy_data = load_word_analogy_data_for_semantic_accuracy(analogy_url)

# Fail fast: the loaders return an empty list when their section header is not
# found, and an empty list would otherwise surface later as a 0% accuracy.
assert syntactic_analogy_data, "no questions found in the syntactic (gram7-past-tense) section"
assert semantic_analogy_data, "no questions found in the semantic (capital-common-countries) section"

In [4]:
def predict_analogy(word_a, word_b, word_c):
    """Answer the analogy "word_a is to word_b as word_c is to ?".

    Queries the module-level ``model`` with ``word_c`` and ``word_b`` as
    positive terms and ``word_a`` as the negative term.

    Args:
        word_a: First word of the reference pair (subtracted).
        word_b: Second word of the reference pair (added).
        word_c: First word of the pair to complete (added).

    Returns:
        The first element of the first entry returned by
        ``model.most_similar``.
    """
    neighbours = model.most_similar(
        positive=[word_c, word_b],
        negative=[word_a],
    )
    top_match = neighbours[0]
    return top_match[0]

In [5]:
def evaluate_semantic_accuracy(analogy_data):
    """Compute the fraction of analogy questions whose predicted word is exact.

    Rows that do not have exactly four tokens are ignored. A question whose
    prediction fails with a lookup error is still counted in the denominator,
    i.e. it is scored as wrong.

    Words are passed to ``predict_analogy`` unchanged, so matching is
    case-sensitive.
    NOTE(review): if the benchmark's capitalisation differs from the model's
    vocabulary, every affected question is scored wrong - confirm whether the
    inputs should be lower-cased before evaluation.

    Args:
        analogy_data: Iterable of token lists ``[a, b, c, expected]``.

    Returns:
        ``correct / total`` as a float, or ``0`` when no question was scored.
    """
    correct = 0
    total = 0

    for question in analogy_data:
        if len(question) != 4:
            continue
        word_a, word_b, word_c, word_d = question
        try:
            predicted_word = predict_analogy(word_a, word_b, word_c)
        except (KeyError, IndexError):
            # KeyError: presumably an out-of-vocabulary word (confirm against
            # the model class); IndexError: the model returned no candidates.
            # Anything else (including KeyboardInterrupt) is a real problem
            # and is allowed to propagate instead of being hidden.
            predicted_word = None

        if predicted_word == word_d:
            correct += 1

        total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy

In [6]:
def evaluate_syntactic_accuracy(analogy_data):
    """Compute exact-match accuracy on "-ing -> -ed" analogy questions.

    Only four-token rows of the shape ``[*ing, *ed, *ing, *ed]`` are scored;
    every other row is ignored and does not affect the denominator. A scored
    question whose prediction fails with a lookup error counts as wrong.

    Args:
        analogy_data: Iterable of token lists ``[a, b, c, expected]``.

    Returns:
        ``correct / total`` over the scored questions as a float, or ``0``
        when no question matched the pattern.
    """
    syntactic_correct = 0
    syntactic_total = 0

    for question in analogy_data:
        if len(question) != 4:
            continue
        word_a, word_b, word_c, word_d = question

        # Keep only questions where both pairs follow the -ing / -ed suffix pattern.
        follows_pattern = (
            word_a.endswith("ing") and word_b.endswith("ed")
            and word_c.endswith("ing") and word_d.endswith("ed")
        )
        if not follows_pattern:
            continue

        try:
            predicted_word = predict_analogy(word_a, word_b, word_c)
        except (KeyError, IndexError):
            # KeyError: presumably an out-of-vocabulary word (confirm against
            # the model class); IndexError: the model returned no candidates.
            # Other exceptions (including KeyboardInterrupt) propagate.
            predicted_word = None

        if predicted_word == word_d:
            syntactic_correct += 1

        syntactic_total += 1

    syntactic_accuracy = syntactic_correct / syntactic_total if syntactic_total > 0 else 0
    return syntactic_accuracy

In [7]:
# Score the loaded model on each benchmark subset; both results are fractions
# (correct / total), or 0 when no question was scored.
syntactic_accuracy = evaluate_syntactic_accuracy(syntactic_analogy_data)
semantic_accuracy = evaluate_semantic_accuracy(semantic_analogy_data)

# Scale the fractions to percentages with two decimals for display.
print(f"Syntactic Accuracy: {syntactic_accuracy * 100:.2f}%")
print(f"Semantic Accuracy: {semantic_accuracy * 100:.2f}%")

Syntactic Accuracy: 2.29%
Semantic Accuracy: 0.00%
