In [1]:
#dataloader
CBOW_N_WORDS=4
SKIPGRAM_N_WORDS=4
MIN_WORD_FREQUENCY=50
MAX_SEQUENCE_LENGTH=256
EMBED_DIMENSION=300
EMBED_MAX_NORM=1

from torchtext.data import get_tokenizer
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("You can now install TorchText using pip!")
tokens


  from .autonotebook import tqdm as notebook_tqdm


['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']

In [2]:
from torchtext.data import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
from modelscope.msdatasets import MsDataset
train_data_iter = to_map_style_dataset(MsDataset.load('wikitext', subset_name='wikitext-2-v1', split='train'))
valid_data_iter = to_map_style_dataset(MsDataset.load('wikitext', subset_name='wikitext-2-v1', split='validation'))



In [3]:
train_data_iter=[i['text'] for i in train_data_iter if i['text'] != '' ]
valid_data_iter=[i['text'] for i in valid_data_iter if i['text'] != '' ]

In [4]:
valid_data_iter[:10]

[' = Homarus gammarus = \n',
 ' Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into <unk> larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n',
 ' = = Description = = \n',
 ' Homarus gammarus is a large <unk> , with a body length up to 60 centimetres ( 24 in ) and weighing up to 5 – 6 kilograms ( 11 – 13 lb ) , although the lobsters caught in lobster pots are usually 23 – 38 cm ( 9 – 15 in ) long and weigh 0 @

In [5]:
vocab = build_vocab_from_iterator(
        map(tokenizer, train_data_iter+valid_data_iter),
        specials=["<unk>"],
        min_freq=MIN_WORD_FREQUENCY,
    )
vocab.set_default_index(vocab["<unk>"])

In [6]:
len(vocab)

4393

In [7]:
import torch
def collate_cbow(batch,text_pipeline):
    batch_input, batch_output = [], []
    for text in batch:
        text_tokens_ids=text_pipeline(text)
        if len(text_tokens_ids) < CBOW_N_WORDS * 2 + 1:
            continue
        if MAX_SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]
        for idx in range(len(text_tokens_ids) - CBOW_N_WORDS * 2):
            token_id_sequence = text_tokens_ids[idx : (idx + CBOW_N_WORDS * 2 + 1)]
            output = token_id_sequence.pop(CBOW_N_WORDS)
            input_ = token_id_sequence
            batch_input.append(input_)
            batch_output.append(output)
    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)
    return batch_input, batch_output

In [8]:
def collate_skipgram(batch, text_pipeline):
    """
    Collate_fn for Skip-Gram model to be used with Dataloader.
    `batch` is expected to be list of text paragrahs.
    
    Context is represented as N=SKIPGRAM_N_WORDS past words 
    and N=SKIPGRAM_N_WORDS future words.
    
    Long paragraphs will be truncated to contain
    no more that MAX_SEQUENCE_LENGTH tokens.
    
    Each element in `batch_input` is a middle word.
    Each element in `batch_output` is a context word.
    """
    batch_input, batch_output = [], []
    for text in batch:
        text_tokens_ids = text_pipeline(text)

        if len(text_tokens_ids) < SKIPGRAM_N_WORDS * 2 + 1:
            continue

        if MAX_SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]

        for idx in range(len(text_tokens_ids) - SKIPGRAM_N_WORDS * 2):
            token_id_sequence = text_tokens_ids[idx : (idx + SKIPGRAM_N_WORDS * 2 + 1)]
            input_ = token_id_sequence.pop(SKIPGRAM_N_WORDS)
            outputs = token_id_sequence

            for output in outputs:
                batch_input.append(input_)
                batch_output.append(output)

    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)
    return batch_input, batch_output

In [9]:
text_pipeline = lambda x: vocab(tokenizer(x))

In [10]:
from functools import partial
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    train_data_iter,
    batch_size=96,
    shuffle=True,
    collate_fn=partial(collate_skipgram, text_pipeline=text_pipeline)
    )

val_dataloader= DataLoader(
    valid_data_iter,
    batch_size=96,
    shuffle=True,
    collate_fn=partial(collate_skipgram, text_pipeline=text_pipeline)
)
vocab_size = len(vocab.get_stoi())
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 4393


In [11]:
import torch.nn as nn

In [12]:
class CBOW(nn.Module):
    def __init__(self,vocab_size:int):
        super(CBOW,self).__init__()
        self.embeddings=nn.Embedding(num_embeddings=vocab_size,embedding_dim=EMBED_DIMENSION,max_norm=EMBED_MAX_NORM)
        self.linear=nn.Linear(in_features=EMBED_DIMENSION,out_features=vocab_size)
    def forward(self,inputs_):
        x=self.embeddings(inputs_)
        x=x.mean(dim=1)
        x=self.linear(x)
        return x

In [13]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size: int):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(
            in_features=EMBED_DIMENSION,
            out_features=vocab_size,
        )
    def forward(self, inputs_):
        x = self.embeddings(inputs_)
        x = self.linear(x)
        return x

In [14]:
model = SkipGram(vocab_size=vocab_size)
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.025)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [15]:
import os
import numpy as np
import json
import torch



class Trainer:
    """Main class for model training"""
    
    def __init__(
        self,
        model,
        epochs,
        train_dataloader,
        train_steps,
        val_dataloader,
        val_steps,
        checkpoint_frequency,
        criterion,
        optimizer,
        device,
        model_dir,
        model_name,
    ):  
        self.model = model
        self.epochs = epochs
        self.train_dataloader = train_dataloader
        self.train_steps = train_steps
        self.val_dataloader = val_dataloader
        self.val_steps = val_steps
        self.criterion = criterion
        self.optimizer = optimizer
        self.checkpoint_frequency = checkpoint_frequency
        self.device = device
        self.model_dir = model_dir
        self.model_name = model_name

        self.loss = {"train": [], "val": []}
        self.model.to(self.device)

    def train(self):
        for epoch in range(self.epochs):
            lr_lambda = lambda epoch: max(0.001, (5 - epoch) / 5)  # 最低学习率=0.001
            lr_scheduler = LambdaLR(optimizer, lr_lambda, verbose=True)
            print("Current LR:", optimizer.param_groups[0]['lr'])
            self._train_epoch()
            self._validate_epoch()
            print(
                "Epoch: {}/{}, Train Loss={:.5f}, Val Loss={:.5f}".format(
                    epoch + 1,
                    self.epochs,
                    self.loss["train"][-1],
                    self.loss["val"][-1],
                )
            )

            lr_scheduler.step()

            # if self.checkpoint_frequency:
            #     self._save_checkpoint(epoch)
        self.save_model()

    def _train_epoch(self):
        self.model.train()
        running_loss = []

        for i, batch_data in enumerate(self.train_dataloader, 1):
            inputs = batch_data[0].to(self.device)
            labels = batch_data[1].to(self.device)
            
            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss.append(loss.item())

            if i == self.train_steps:
                break

        epoch_loss = np.mean(running_loss)
        self.loss["train"].append(epoch_loss)

    def _validate_epoch(self):
        self.model.eval()
        running_loss = []

        with torch.no_grad():
            for i, batch_data in enumerate(self.val_dataloader, 1):
                inputs = batch_data[0].to(self.device)
                labels = batch_data[1].to(self.device)

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                running_loss.append(loss.item())

                if i == self.val_steps:
                    break

        epoch_loss = np.mean(running_loss)
        self.loss["val"].append(epoch_loss)

    def _save_checkpoint(self, epoch):
        """Save model checkpoint to `self.model_dir` directory"""
        epoch_num = epoch + 1
        if epoch_num % self.checkpoint_frequency == 0:
            model_path = "checkpoint_{}.pt".format(str(epoch_num).zfill(3))
            model_path = os.path.join(self.model_dir, model_path)
            torch.save(self.model, model_path)

    def save_model(self):
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, './model.pt')

    def save_loss(self):
        """Save train/val loss as json file to `self.model_dir` directory"""
        loss_path = os.path.join(self.model_dir, "loss.json")
        with open(loss_path, "w") as fp:
            json.dump(self.loss, fp)

In [16]:
trainer = Trainer(
    model=model,
    epochs=20,
    train_dataloader=train_dataloader,
    train_steps=100,
    val_dataloader=val_dataloader,
    val_steps=100,
    checkpoint_frequency=20,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    model_dir='.',
    model_name='c',
)

trainer.train()
print("Training finished.")

trainer.save_model()
trainer.save_loss()
    

Adjusting learning rate of group 0 to 2.5000e-02.
Current LR: 0.025
Epoch: 1/20, Train Loss=5.80971, Val Loss=5.64698
Adjusting learning rate of group 0 to 2.0000e-02.
Adjusting learning rate of group 0 to 2.5000e-02.
Current LR: 0.025
Epoch: 2/20, Train Loss=5.58196, Val Loss=5.63213
Adjusting learning rate of group 0 to 2.0000e-02.
Adjusting learning rate of group 0 to 2.5000e-02.
Current LR: 0.025
Epoch: 3/20, Train Loss=5.56033, Val Loss=5.61181
Adjusting learning rate of group 0 to 2.0000e-02.
Adjusting learning rate of group 0 to 2.5000e-02.
Current LR: 0.025
Epoch: 4/20, Train Loss=5.54211, Val Loss=5.59848
Adjusting learning rate of group 0 to 2.0000e-02.
Adjusting learning rate of group 0 to 2.5000e-02.
Current LR: 0.025
Epoch: 5/20, Train Loss=5.53832, Val Loss=5.59105
Adjusting learning rate of group 0 to 2.0000e-02.
Adjusting learning rate of group 0 to 2.5000e-02.
Current LR: 0.025
Epoch: 6/20, Train Loss=5.52926, Val Loss=5.57200
Adjusting learning rate of group 0 to 2.00

In [27]:
#推理阶段
model.eval()
model.to(device)
def preprocess_text(text, vocab, context_window=1):
    token_indices =  text_pipeline(text)
    inputs, targets = [], []
    for i in range(context_window, len(token_indices) - context_window):
        context = token_indices[i-context_window : i] + token_indices[i+1 : i+context_window+1]
        inputs.append(context)
    return torch.tensor(inputs,dtype=torch.long)  
def predict_word(model, context_tensor, vocab, top_k=1):
    with torch.no_grad():
        logits = model(context_tensor)  # [num_samples, vocab_size]
        probs = torch.softmax(logits, dim=1)  # [num_samples, vocab_size]
    
    # 对每个样本分别取 top_k
    top_probs, top_indices = torch.topk(probs, k=top_k)  # [num_samples, k]
    
    idx_to_word = vocab.get_itos()
    predictions = []
    for i in range(top_probs.shape[0]):  # 遍历每个样本
        sample_preds = [
            (idx_to_word[idx.item()], prob.item()) 
            for idx, prob in zip(top_indices[i], top_probs[i])
        ]
        predictions.extend(sample_preds)
    
    return predictions

# 使用示例
text = "I wonder who am i in this world"
context_tensor = preprocess_text(text, vocab) 
context_tensor = context_tensor.to(device)
predictions = predict_word(model, context_tensor, vocab)
print("Top predictions:", predictions)

Top predictions: [('outbreak', 0.9938902854919434), ('pitcher', 0.9838512539863586), ('cong', 0.9765623211860657), ('i', 0.9908341765403748), ('pitcher', 0.9838512539863586), ('outbreak', 0.9938902854919434), ('i', 0.9926117062568665), ('communication', 0.9883968234062195), ('95', 0.986205518245697), ('occasion', 0.9820840954780579), ('knots', 0.9804263114929199), ('war', 0.9809697866439819)]
