GloVe

In [1]:
from torchtext.vocab import Vectors
import numpy as np

# Embedding tables already loaded in this process, keyed by file path.
# Parsing the GloVe text file is by far the most expensive step, so it is
# done once and reused instead of being repeated on every similarity query.
_GLOVE_CACHE = {}


def _load_glove(path='./glove.6B/glove.6B.300d.txt'):
    """Return the torchtext ``Vectors`` table for *path*, loading it once."""
    if path not in _GLOVE_CACHE:
        _GLOVE_CACHE[path] = Vectors(name=path)
    return _GLOVE_CACHE[path]


def simalarity(word1, word2):
    """Return the cosine similarity of two words' pretrained GloVe vectors.

    Args:
        word1: First word; looked up in the embedding table's ``stoi`` map.
        word2: Second word; looked up the same way.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)`` for the two word vectors.

    Raises:
        Whatever ``stoi[word]`` raises when a word is not in the vocabulary
        (presumably ``KeyError``; confirm against the torchtext version used).
    """
    GloVectors = _load_glove()
    word1_vec = GloVectors.vectors[GloVectors.stoi[word1]].numpy()
    word2_vec = GloVectors.vectors[GloVectors.stoi[word2]].numpy()
    dot = np.dot(word1_vec.T, word2_vec)
    # Divide by each Euclidean norm in turn, in the same order as before,
    # so results match the reference values recorded next to the demo calls.
    return dot / np.sqrt(np.sum(word1_vec ** 2)) / np.sqrt(
        np.sum(word2_vec ** 2))

# Demo queries; the trailing comment on each pair is the value recorded
# from an earlier run with the 300-d glove.6B vectors.
for first_word, second_word in (
    ('well', 'good'),    # 0.7045711
    ('bad', 'good'),     # 0.64452195
    ('normal', 'good'),  # 0.41142386
):
    print(simalarity(first_word, second_word))

ModuleNotFoundError: No module named 'torchtext'

In [2]:
import warnings
import numpy as np
import time
from gensim import corpora
import jieba
from tqdm import tqdm
# Install an "ignore" filter so warnings are not shown from here on.
warnings.filterwarnings("ignore")
np.random.seed(2021) # Fix NumPy's global random seed so runs are repeatable



In [3]:
def randmatrix(m, n):
    """Return an ``m x n`` array initialised with the Xavier/Glorot scheme.

    Entries are drawn uniformly from ``[-b, b)`` with
    ``b = sqrt(6 / (m + n))``, using NumPy's global random state.
    """
    bound = np.sqrt(6.0 / (m + n))
    shape = (m, n)
    return np.random.uniform(low=-bound, high=bound, size=shape)


def log_of_array_ignoring_zeros(M):
    """Return a copy of *M* with the natural log applied to positive entries.

    Entries that are zero or negative are left exactly as they are, which
    avoids ``log(0) = -inf`` for cells with no co-occurrences.

    Args:
        M: NumPy array of counts. It is never modified.

    Returns:
        A new array of the same shape. Floating-point inputs keep their
        dtype; any other dtype (e.g. integer counts) is promoted to float,
        because writing logs back into an integer array would truncate them.
    """
    if np.issubdtype(M.dtype, np.floating):
        log_M = M.copy()
    else:
        log_M = M.astype(float)
    mask = log_M > 0
    log_M[mask] = np.log(log_M[mask])
    return log_M


def noise(n, scale=0.01):
    """Return zero-mean Gaussian noise of shape *n*.

    Samples come from NumPy's global random state with standard
    deviation *scale*.
    """
    return np.random.normal(loc=0, scale=scale, size=n)


class AdaGradOptimizer:
    """AdaGrad step-size rule for a single parameter array.

    A running sum of squared gradients is kept per element (stored in
    ``_momentum``); each step is the gradient scaled by the learning rate
    and divided by the square root of that sum.
    """

    def __init__(self, learning_rate, initial_accumulator_value=0.1, momentum=None):
        self.learning_rate = learning_rate
        self.initial_accumulator_value = initial_accumulator_value
        # Accumulator of squared gradients; created on the first step when
        # the caller does not supply one.
        self._momentum = momentum

    def get_step(self, grad):
        """Fold *grad* into the accumulator and return the update to subtract."""
        if self._momentum is None:
            start = np.ones_like(grad) * self.initial_accumulator_value
            self._momentum = start
        # In-place add, so an accumulator passed to the constructor is the
        # same object that gets updated here.
        self._momentum += grad * grad
        scaled_grad = self.learning_rate * grad
        return scaled_grad / np.sqrt(self._momentum)


class GloVe(object):
    """Dense NumPy GloVe model trained by full-matrix AdaGrad updates.

    The model learns word vectors ``W``, context vectors ``C`` and biases
    ``bw``/``bc`` such that ``W @ C.T + bw + bc.T`` approximates the log of
    the co-occurrence matrix, with each cell weighted by
    ``(min(count, xmax) / xmax) ** alpha``.

    Args:
        n: Dimensionality of the learned embeddings.
        max_iter: Number of update steps performed by ``fit``.
        learning_rate: Learning rate given to every ``AdaGradOptimizer``.
    """

    def __init__(self, n, max_iter, learning_rate):
        self.n = n
        self.max_iter = max_iter
        self.xmax = 100      # Counts are clipped at this value for weighting.
        self.alpha = 0.75    # Exponent of the weighting function.
        self.mittens = 0     # Weight of the pull towards initial embeddings.
        self.learning_rate = learning_rate
        # The attributes below are stored but not read by any method of
        # this class; they are kept so existing attribute access still works.
        self.tol = 1e-4
        self.display_progress = 100
        self.model = None
        self.log_dir = None
        self.log_subdir = None
        self.n_words = None  # Set from the co-occurrence matrix in fit().
        self.errors = list()  # Loss value of every iteration of the last fit.
        self.test_mode = False  # When True, fit() checks gradient shapes.

    def _initialize(self, coincidence):
        """Record the vocabulary size and derive weights and log-counts.

        Returns:
            ``(weights, log_coincidence)``, both shaped like *coincidence*.
        """
        self.n_words = coincidence.shape[0]
        bounded = np.minimum(coincidence, self.xmax)
        weights = (bounded / float(self.xmax)) ** self.alpha
        log_coincidence = log_of_array_ignoring_zeros(coincidence)
        return weights, log_coincidence

    def fit(self, X, vocab=None, initial_embedding_dict=None):
        """Train on co-occurrence matrix *X* and return the embeddings.

        Args:
            X: Square co-occurrence count matrix (``n_words x n_words``).
            vocab: Words in row order of *X*; only used together with
                *initial_embedding_dict*.
            initial_embedding_dict: Optional mapping word -> vector of
                length ``n`` used to warm-start the matching rows.

        Returns:
            ``W + C``, an ``n_words x n`` array of word embeddings.
        """
        weights, log_coincidence = self._initialize(X)
        self._initialize_w_c_b(self.n_words, vocab, initial_embedding_dict)
        # Parameters were just re-initialised, so state from an earlier
        # fit() must not carry over: stale AdaGrad accumulators would damp
        # the new run (or fail to broadcast if the vocabulary size changed),
        # and the loss history would mix two runs.
        self.errors = list()
        self.optimizers = self._new_optimizers()
        m_loop = tqdm(range(self.max_iter))
        for iteration in m_loop:
            pred = self._make_prediction()
            gradients, error = self._get_gradients_and_error(
                pred, log_coincidence, weights)
            if self.test_mode:
                self._check_shapes(gradients)
            self.errors.append(error)
            self._apply_updates(gradients)
            m_loop.set_description("Iteration {}:error {:4.4f}".format(iteration + 1, error))
        return self.W + self.C

    def _check_shapes(self, gradients):
        """Assert that every gradient has the shape of its parameter."""
        assert gradients['W'].shape == self.W.shape
        assert gradients['C'].shape == self.C.shape
        assert gradients['bw'].shape == self.bw.shape
        assert gradients['bc'].shape == self.bc.shape

    def _initialize_w_c_b(self, n_words, vocab, initial_embedding_dict):
        """Randomly initialise ``W``, ``C``, ``bw`` and ``bc``.

        Rows of ``W`` and ``C`` whose word appears in
        *initial_embedding_dict* are overwritten with half of the supplied
        vector plus a little noise.
        """
        self.W = randmatrix(n_words, self.n)  # Word weights.
        self.C = randmatrix(n_words, self.n)  # Context weights.
        # Default: no word has a prior embedding. Defining these
        # unconditionally keeps the Mittens branch of
        # `_get_gradients_and_error` safe (it then selects zero rows) when
        # `mittens > 0` but no initial embeddings were given.
        self.original_embedding = np.zeros((n_words, self.n))
        self.has_embedding = np.zeros(n_words, dtype=bool)
        if initial_embedding_dict:
            assert self.n == len(next(iter(initial_embedding_dict.values())))

            self.original_embedding = np.zeros((len(vocab), self.n))
            self.has_embedding = np.zeros(len(vocab), dtype=bool)

            for i, w in enumerate(vocab):
                if w in initial_embedding_dict:
                    self.has_embedding[i] = 1
                    embedding = np.array(initial_embedding_dict[w])
                    self.original_embedding[i] = embedding
                    # Divide the original embedding into W and C,
                    # plus some noise to break the symmetry that would
                    # otherwise cause both gradient updates to be
                    # identical.
                    self.W[i] = 0.5 * embedding + noise(self.n)
                    self.C[i] = 0.5 * embedding + noise(self.n)
            # This is for testing. It differs from
            # `self.original_embedding` only in that it includes the
            # random noise we added above to break the symmetry.
            self.G_start = self.W + self.C

        self.bw = randmatrix(n_words, 1)
        self.bc = randmatrix(n_words, 1)
        self.ones = np.ones((n_words, 1))

    def _make_prediction(self):
        """Return the predicted log co-occurrence for every word pair."""
        # Here we make use of numpy's broadcasting rules: `bw` is a column
        # added to every column, `bc.T` a row added to every row.
        pred = np.dot(self.W, self.C.T) + self.bw + self.bc.T
        return pred

    def _get_gradients_and_error(self,
                                 predictions,
                                 log_coincidence,
                                 weights):
        """Compute the loss and its gradients for the current parameters.

        Returns:
            ``(gradients, error)`` where *gradients* is a dict with keys
            ``'W'``, ``'C'``, ``'bw'`` and ``'bc'`` and *error* is the
            weighted squared error (plus the Mittens penalty when
            ``mittens > 0``).
        """
        # First we compute the GloVe gradients
        diffs = predictions - log_coincidence
        weighted_diffs = np.multiply(weights, diffs)
        wgrad = weighted_diffs.dot(self.C)
        cgrad = weighted_diffs.T.dot(self.W)
        bwgrad = weighted_diffs.sum(axis=1).reshape(-1, 1)
        bcgrad = weighted_diffs.sum(axis=0).reshape(-1, 1)
        error = (0.5 * np.multiply(weights, diffs ** 2)).sum()

        # Then we add the Mittens term (only if mittens > 0)
        if self.mittens > 0:
            curr_embedding = self.W + self.C
            distance = curr_embedding[self.has_embedding, :] - \
                       self.original_embedding[self.has_embedding, :]
            wgrad[self.has_embedding, :] += 2 * self.mittens * distance
            cgrad[self.has_embedding, :] += 2 * self.mittens * distance
            error += self.mittens * (
                    np.linalg.norm(distance, ord=2, axis=1) ** 2).sum()
        return {'W': wgrad, 'C': cgrad, 'bw': bwgrad, 'bc': bcgrad}, error

    def _new_optimizers(self):
        """Return a fresh AdaGrad optimizer for each parameter array."""
        return {obj: AdaGradOptimizer(self.learning_rate)
                for obj in ['W', 'C', 'bw', 'bc']}

    def _apply_updates(self, gradients):
        """Take one AdaGrad step on ``W``, ``C``, ``bw`` and ``bc`` in place."""
        # fit() installs fresh optimizers; this fallback only serves callers
        # that invoke `_apply_updates` directly without going through fit().
        if not hasattr(self, 'optimizers'):
            self.optimizers = self._new_optimizers()
        self.W -= self.optimizers['W'].get_step(gradients['W'])
        self.C -= self.optimizers['C'].get_step(gradients['C'])
        self.bw -= self.optimizers['bw'].get_step(gradients['bw'])
        self.bc -= self.optimizers['bc'].get_step(gradients['bc'])


In [5]:
def leftRight(c_pos, max_len, window):
    """Return the ``(left, right)`` bounds of the window around *c_pos*.

    The window spans *window* positions on each side of *c_pos*; ``left``
    is clamped to 0 and ``right`` (exclusive) is clamped to *max_len*.
    """
    left = max(c_pos - window, 0)
    right = min(c_pos + window + 1, max_len)
    return left, right


def getCoMatriex(texts, token_id, window=2):
    """Count word co-occurrences within a sliding window.

    Args:
        texts: Tokenised documents (a sequence of token sequences).
        token_id: Mapping from token to its row/column index.
        window: Number of positions considered on each side of a word.

    Returns:
        A ``len(token_id) x len(token_id)`` float array whose ``[a, b]``
        cell counts how often token ``b`` occurs inside the window of
        token ``a``. A word is never counted against itself, neither at
        the centre position nor when the same word recurs in the window.
    """
    vocab_size = len(token_id)
    counts = np.zeros(shape=[vocab_size, vocab_size])

    for sentence in texts:
        length = len(sentence)
        for center_idx, center_word in enumerate(sentence):
            start, stop = leftRight(center_idx, length, window)
            row = token_id[center_word]
            for ctx_idx in range(start, stop):
                ctx_word = sentence[ctx_idx]
                col = token_id[ctx_word]
                # Skip the centre position and repeats of the centre word.
                if ctx_idx == center_idx or ctx_word == center_word:
                    continue
                counts[row][col] += 1
    return counts



def getCorpora(texts):
    """Build a gensim ``Dictionary`` from *texts*.

    Returns:
        ``(dictionary, token2id)``: the dictionary object and its
        ``token2id`` attribute.
    """
    dictionary = corpora.Dictionary(texts)
    return dictionary, dictionary.token2id

# Model training

if __name__ == "__main__":
    # Read the corpus, one document per line.
    with open("doc.txt", "r", encoding='utf-8') as f:
        sentences = f.readlines()

    # Segment every line with jieba. readlines() keeps each line's trailing
    # newline and jieba returns whitespace runs as tokens of their own, so
    # whitespace-only tokens are dropped here; otherwise '\n' and ' ' would
    # become vocabulary entries and pollute the co-occurrence counts.
    # Lines that contain no real token are skipped.
    texts = []
    for text in sentences:
        tokens = [tok for tok in jieba.lcut(text) if tok.strip()]
        if tokens:
            texts.append(tokens)
    n_dims = 10

    # Build the corpus dictionary and the token -> id mapping.
    dct, token_id = getCorpora(texts)

    # perf_counter is monotonic, which makes it the right clock for
    # measuring an elapsed interval.
    stratTime = time.perf_counter()
    # Compute the co-occurrence matrix.
    wordComatrix = getCoMatriex(texts, token_id, window=2)
    print("total time cost:", time.perf_counter() - stratTime)

    # Configure the GloVe model.
    glove = GloVe(n=n_dims, max_iter=6000, learning_rate=0.004)
    # Train it and obtain the word embeddings.
    wordEmbedding = glove.fit(wordComatrix)
    print(wordEmbedding.shape)
    # Look up the embedding of one word; the corpus is not guaranteed to
    # contain it, so report that instead of failing with a KeyError.
    if 'jieba' in token_id:
        print("jieba的词向量为:", wordEmbedding[token_id['jieba']])
    else:
        print("'jieba' is not in the corpus vocabulary")

FileNotFoundError: [Errno 2] No such file or directory: 'doc.txt'