In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class SkipGram(nn.Module):
    """Skip-gram word-embedding model trained with negative sampling.

    A single embedding table is used for center, context and negative words.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context, negatives):
        """Return the scalar negative-sampling loss for one batch.

        Args:
            center: index tensor of center words, expected shape [batch_size].
            context: index tensor of positive context words, expected shape
                [batch_size].
            negatives: index tensor of sampled negative words, expected shape
                [batch_size, k].

        Returns:
            The sum of the mean positive loss and the mean negative loss.
        """
        lookup = self.embed
        center_vec = lookup(center)        # [batch_size, embed_dim]
        context_vec = lookup(context)      # [batch_size, embed_dim]
        negative_vecs = lookup(negatives)  # [batch_size, k, embed_dim]

        # Positive pairs: dot product of each center with its true context.
        pos_logits = (center_vec * context_vec).sum(dim=1)  # [batch_size]
        pos_term = F.logsigmoid(pos_logits).mean()

        # Negative pairs: dot product of each center with each of its k
        # negatives; the sign is flipped so low scores are rewarded.
        center_col = center_vec.unsqueeze(-1)  # [batch_size, embed_dim, 1]
        neg_logits = torch.bmm(negative_vecs, center_col).squeeze(-1)  # [batch_size, k]
        neg_term = F.logsigmoid(neg_logits.neg()).mean()

        return -pos_term - neg_term

# mypy: allow-untyped-defs
from typing import Optional

import torch
from torch import Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import Parameter

from .module import Module


__all__ = ["Embedding", "EmbeddingBag"]


class Embedding(Module):
    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.

    This module is often used to store word embeddings and retrieve them using indices.
    The input to the module is a list of indices, and the output is the corresponding
    word embeddings.

    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
                                     therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
                                     i.e. it remains as a fixed "pad". For a newly constructed Embedding,
                                     the embedding vector at :attr:`padding_idx` will default to all zeros,
                                     but can be updated to another value to be used as the padding vector.
        max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
                                    is renormalized to have norm :attr:`max_norm`.
        norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse of frequency of
                                                the words in the mini-batch. Default ``False``.
        sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor.
                                 See Notes for more details regarding sparse gradients.

    Attributes:
        weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
                         initialized from :math:`\mathcal{N}(0, 1)`

    Shape:
        - Input: :math:`(*)`, IntTensor or LongTensor of arbitrary shape containing the indices to extract
        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`

    .. note::
        Keep in mind that only a limited number of optimizers support
        sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
        :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)

    .. note::
        When :attr:`max_norm` is not ``None``, :class:`Embedding`'s forward method will modify the
        :attr:`weight` tensor in-place. Since tensors needed for gradient computations cannot be
        modified in-place, performing a differentiable operation on ``Embedding.weight`` before
        calling :class:`Embedding`'s forward method requires cloning ``Embedding.weight`` when
        :attr:`max_norm` is not ``None``. For example::

            n, d, m = 3, 5, 7
            embedding = nn.Embedding(n, d, max_norm=1.0)
            W = torch.randn((m, d), requires_grad=True)
            idx = torch.tensor([1, 2])
            a = (
                embedding.weight.clone() @ W.t()
            )  # weight must be cloned for this to be differentiable
            b = embedding(idx) @ W.t()  # modifies weight in-place
            out = a.unsqueeze(0) + b.unsqueeze(1)
            loss = out.sigmoid().prod()
            loss.backward()

    Examples::

        >>> # an Embedding module containing 10 tensors of size 3
        >>> embedding = nn.Embedding(10, 3)
        >>> # a batch of 2 samples of 4 indices each
        >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> embedding(input)
        tensor([[[-0.0251, -1.6902,  0.7172],
                 [-0.6431,  0.0748,  0.6969],
                 [ 1.4970,  1.3448, -0.9685],
                 [-0.3677, -2.7265, -0.1685]],

                [[ 1.4970,  1.3448, -0.9685],
                 [ 0.4362, -0.4004,  0.9400],
                 [-0.6431,  0.0748,  0.6969],
                 [ 0.9124, -2.3616,  1.1151]]])


        >>> # example with padding_idx
        >>> embedding = nn.Embedding(10, 3, padding_idx=0)
        >>> input = torch.LongTensor([[0, 2, 0, 5]])
        >>> embedding(input)
        tensor([[[ 0.0000,  0.0000,  0.0000],
                 [ 0.1535, -2.0309,  0.9315],
                 [ 0.0000,  0.0000,  0.0000],
                 [-0.1655,  0.9897,  0.0635]]])

        >>> # example of changing `pad` vector
        >>> padding_idx = 0
        >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx)
        >>> embedding.weight
        Parameter containing:
        tensor([[ 0.0000,  0.0000,  0.0000],
                [-0.7895, -0.7089, -0.0364],
                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
        >>> with torch.no_grad():
        ...     embedding.weight[padding_idx] = torch.ones(3)
        >>> embedding.weight
        Parameter containing:
        tensor([[ 1.0000,  1.0000,  1.0000],
                [-0.7895, -0.7089, -0.0364],
                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
    """

    # NOTE(review): this excerpt shows only the docstring and attribute
    # declarations; no __init__ or forward is visible here.

    # Names of the configuration attributes declared as constants.
    # Presumably consumed by the base Module / TorchScript machinery - TODO
    # confirm against the imported Module class.
    __constants__ = [
        "num_embeddings",
        "embedding_dim",
        "padding_idx",
        "max_norm",
        "norm_type",
        "scale_grad_by_freq",
        "sparse",
    ]

    # Type declarations for the instance attributes. `weight` and `freeze`
    # are annotated here but are not listed in __constants__, and `freeze`
    # is not described in the docstring above.
    num_embeddings: int
    embedding_dim: int
    padding_idx: Optional[int]
    max_norm: Optional[float]
    norm_type: float
    scale_grad_by_freq: bool
    weight: Tensor
    freeze: bool
    sparse: bool

In [None]:
import random

def get_centers_and_contexts(corpus, max_window_size):
    """Extract (center, context) training pairs for skip-gram.

    Only positions that have a full window of ``max_window_size`` tokens on
    both sides are used as centers; tokens too close to either end of the
    corpus are skipped.

    The context of a center is the ``max_window_size`` tokens before it
    followed by the ``max_window_size`` tokens after it. The center token
    itself is NOT part of its own context (previously it was included, which
    made every word a positive example for itself).

    Args:
        corpus: sequence of tokens (or token indices).
        max_window_size: number of tokens taken on each side of the center.
            Values below zero are treated as zero, in which case every token
            is a center with an empty context.

    Returns:
        A tuple ``(centers, contexts)`` of equal length, where
        ``contexts[j]`` is the list of context tokens for ``centers[j]``.
    """
    window = max(0, max_window_size)
    n = len(corpus)
    centers, contexts = [], []
    # range() is empty when the corpus is shorter than a full window.
    for i in range(window, n - window):
        left = list(corpus[i - window:i])
        right = list(corpus[i + 1:i + 1 + window])
        centers.append(corpus[i])
        contexts.append(left + right)
    return centers, contexts
class RandomGenerator:
    """Draw indices ``0 .. n-1`` at random according to ``n`` sampling weights.

    Samples are produced in batches with ``random.choices`` and cached, so
    that individual ``draw`` calls mostly just read from the cache.
    """

    def __init__(self, sampling_weights):
        """Set up the sampler.

        Args:
            sampling_weights: list of weights; index ``j`` is drawn with
                probability proportional to ``sampling_weights[j]``.
        """
        self.sampling_weights = sampling_weights
        # One candidate index per weight.
        self.population = list(range(len(sampling_weights)))
        # Cache of pre-drawn samples and the read position inside it.
        self.candidates = []
        self.i = 0

    def _refill(self):
        """Replace the cache with a fresh batch and rewind the read position."""
        batch_size = len(self.population)
        self.candidates = random.choices(
            self.population, self.sampling_weights, k=batch_size
        )
        self.i = 0

    def draw(self):
        """Return the next sampled index, refilling the cache when it runs out."""
        if self.i >= len(self.candidates):
            self._refill()
        picked = self.candidates[self.i]
        self.i += 1
        return picked
# Demo: build a sampler over indices 0, 1, 2 with weights 2, 3, 4,
# collect 20 draws into `a`, then print one further draw followed by `a`.
generator=RandomGenerator([2,3,4])
a=[generator.draw() for _ in range(20)]
print(generator.draw())

print("a=",a)

def get_negatives(all_contexts, vocab, counter, k):
    """Sample negative (noise) word indices for every context list.

    Each vocabulary index ``i`` is weighted by
    ``counter[vocab.to_token(i)] ** 0.75``, i.e. the frequency of its token
    raised to the power 0.75.

    Args:
        all_contexts: iterable of context lists (word indices).
        vocab: object supporting ``len(vocab)`` and ``vocab.to_token(index)``.
        counter: mapping from token to its frequency.
        k: number of negatives to draw per context word.

    Returns:
        A list with one entry per context list; each entry holds
        ``len(contexts) * k`` sampled indices, none of which occurs in the
        corresponding context list.
    """
    weights = []
    for idx in range(len(vocab)):
        token = vocab.to_token(idx)
        weights.append(counter[token] ** 0.75)

    sampler = RandomGenerator(weights)
    all_negatives = []
    for contexts in all_contexts:
        wanted = len(contexts) * k
        picked = []
        while len(picked) < wanted:
            candidate = sampler.draw()
            # A word that is a true context word cannot serve as a negative.
            if candidate in contexts:
                continue
            picked.append(candidate)
        all_negatives.append(picked)
    return all_negatives
#小批量加载训练实例
def batchify(data):
    """Collate skip-gram examples with negative sampling into one minibatch.

    Every example's context words and negative words are concatenated and
    right-padded with 0 so that all rows share the length of the longest
    example in the batch.

    Args:
        data: iterable of ``(center, context, negatives)`` triples, where
            ``center`` is a word index and ``context`` / ``negatives`` are
            lists of word indices.

    Returns:
        A tuple of four tensors:
            centers: long tensor, shape [batch_size].
            contexts_negatives: long tensor, shape [batch_size, maxlen];
                context indices, then negative indices, then 0 padding.
            masks: float tensor, shape [batch_size, maxlen]; 1 for real
                entries and 0 for padding.
            labels: float tensor, shape [batch_size, maxlen]; 1 for context
                words, -1 for negatives and 0 for padding.
    """
    # Materialise once: the data is scanned twice (max length, then rows).
    samples = list(data)
    maxlen = max((len(c) + len(n) for _, c, n in samples), default=0)

    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negatives in samples:
        cur_len = len(context) + len(negatives)
        pad = [0] * (maxlen - cur_len)
        centers.append(center)
        # Rows must be padded to a common length or tensor creation fails.
        contexts_negatives.append(list(context) + list(negatives) + pad)
        masks.append([1] * cur_len + pad)
        labels.append([1] * len(context) + [-1] * len(negatives) + pad)

    return (
        torch.tensor(centers, dtype=torch.long),
        torch.tensor(contexts_negatives, dtype=torch.long),
        torch.tensor(masks, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )


# Demo of the label layout for one row: four 1s followed by two -1s.
a = [4 * [1] + 2 * [-1]]
print("a=", a)

1
a= [2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1]
a= [[1, 1, 1, 1, -1, -1]]
