In [51]:
import numpy as np
from collections import defaultdict
import string
import tqdm as tqdm
import pickle


In [21]:
# Đọc nội dung từ tệp raw_content.txt
with open('raw_content_small.txt', 'r', encoding='utf-8') as file:
    text = file.read()

print(text[:100])


Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle

This eBook is for the


In [7]:
def preprocess_text(text):
    text = text.lower()
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    return text.split()

text_preprocessed = preprocess_text(text)
print(text_preprocessed[:100])

['\ufeff', 'project', 'gutenbergs', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'by', 'arthur', 'conan', 'doyle', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergnet', 'title', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'author', 'arthur', 'conan', 'doyle', 'release', 'date', 'november', '29', '2002', 'ebook', '1661', 'last', 'updated', 'may', '20', '2019', 'language', 'english', 'character', 'set', 'encoding', 'utf8', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'produced', 'by', 'an', 'anonymous', 'project']


In [34]:
vocab = set()
word2index = {}
index2word = {}
word_freq = defaultdict(int)

def build_vocab(text):
    for word in text:
        vocab.add(word)
        word_freq[word] += 1



build_vocab(text_preprocessed)
print(vocab)
word2index = {word: i for i, word in enumerate(vocab)}
index2word = {i: word for word, i in word2index.items()}



In [16]:
window_size=2

def generate_training_data(text):
    training_data = []
    words = preprocess_text(text)

    for i, target_word in enumerate(words):
        context_words = []
        for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
            if j != i:
                context_words.append(words[j])
        if context_words:
            training_data.append((context_words, target_word))

    return training_data

training_data = generate_training_data(text)
print(training_data[:10])

[(['project', 'gutenbergs'], '\ufeff'), (['\ufeff', 'gutenbergs', 'the'], 'project'), (['\ufeff', 'project', 'the', 'adventures'], 'gutenbergs'), (['project', 'gutenbergs', 'adventures', 'of'], 'the'), (['gutenbergs', 'the', 'of', 'sherlock'], 'adventures'), (['the', 'adventures', 'sherlock', 'holmes'], 'of'), (['adventures', 'of', 'holmes', 'by'], 'sherlock'), (['of', 'sherlock', 'by', 'arthur'], 'holmes'), (['sherlock', 'holmes', 'arthur', 'conan'], 'by'), (['holmes', 'by', 'conan', 'doyle'], 'arthur')]


In [52]:
class CBOW:
    def __init__(self, window_size=2, embedding_size=100, learning_rate=0.001):
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        self.vocab = set()
        self.word2index = {}
        self.index2word = {}
        self.word_freq = defaultdict(int)
        self.W1 = None
        self.W2 = None

    def preprocess_text(self, text):
        text = text.lower()
        translator = str.maketrans('', '', string.punctuation)
        text = text.translate(translator)
        return text.split()

    def build_vocab(self, text):
        for word in text:
            self.vocab.add(word)
            self.word_freq[word] += 1

        self.word2index = {word: i for i, word in enumerate(self.vocab)}
        self.index2word = {i: word for word, i in self.word2index.items()}

    def generate_training_data(self, text):
        training_data = []
        words = self.preprocess_text(text)

        for i, target_word in enumerate(words):
            context_words = []
            for j in range(max(0, i - self.window_size), min(len(words), i + self.window_size + 1)):
                if j != i:
                    context_words.append(words[j])
            if context_words:
                training_data.append((context_words, target_word))

        return training_data

    def initialize_weights(self):
        vocab_size = len(self.vocab)
        self.W1 = np.random.uniform(-0.5/self.embedding_size, 0.5/self.embedding_size, (vocab_size, self.embedding_size))
        self.W2 = np.random.uniform(-0.5/self.embedding_size, 0.5/self.embedding_size, (self.embedding_size, vocab_size))

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum(axis=0)

    def forward(self, context_words):
        context_word_vectors = np.mean([self.W1[self.word2index[word]] for word in context_words if word in self.word2index], axis=0)
        output = np.dot(self.W2.T, context_word_vectors)
        return self.softmax(output)

    def backward(self, context_words, target_word, y_pred):
        y_true = np.zeros(len(self.vocab))
        y_true[self.word2index[target_word]] = 1
        error = y_pred - y_true

        context_word_vectors = np.mean([self.W1[self.word2index[word]] for word in context_words if word in self.word2index], axis=0)
        dW2 = np.outer(context_word_vectors, error)
        dW1 = np.dot(self.W2, error)

        for word in context_words:
            self.W1[self.word2index[word]] -= self.learning_rate * dW1
        self.W2 -= self.learning_rate * dW2

    def train(self, text, epochs=10):
        text_preprocessed = preprocess_text(text)
        self.build_vocab(text_preprocessed)
        
        self.initialize_weights()
        training_data = self.generate_training_data(text)

        for epoch in range(epochs):
            print(epoch)
            total_loss = 0
            np.random.shuffle(training_data)
            
            for context_words, target_word in training_data:
                
                y_pred = self.forward(context_words)
                self.backward(context_words, target_word, y_pred)
                total_loss += -np.log(y_pred[self.word2index[target_word]])

            print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    def most_similar(self, word, top_n=5):
        if word not in self.word2index:
            return []
        word_vector = self.W1[self.word2index[word]]
        similarity_scores = np.dot(self.W1, word_vector)
        sorted_indices = np.argsort(similarity_scores)[::-1]
        similar_words = [self.index2word[i] for i in sorted_indices[:top_n]]
        return similar_words
    
    def save_model(self, file_name):
        model_state = {
            'vocab': self.vocab,
            'word2index': self.word2index,
            'index2word': self.index2word,
            'word_freq': self.word_freq,
            'W1': self.W1,
            'W2': self.W2
        }
        with open(file_name, 'wb') as file:
            pickle.dump(model_state, file)


In [54]:
# Đọc nội dung từ tệp raw_content.txt
with open('raw_content.txt', 'r', encoding='utf-8') as file:
    text = file.read()
# Khởi tạo và huấn luyện mô hình CBOW
cbow_model = CBOW(window_size=2, embedding_size=100, learning_rate=0.001)
cbow_model.train(text, epochs=1)

# In ra các từ tương tự với từ 'sherlock' trong top 5
print("Words similar to 'sherlock':", cbow_model.most_similar('sherlock', top_n=5))

0


In [53]:
cbow_model.save_model('cbow_model.pkl')

AttributeError: 'CBOW' object has no attribute 'save_model'