# Training Word Embedding Models

### Data

In [6]:
from datasets import load_from_disk
import gensim
preprocessed_dataset = load_from_disk('../data/preprocessed_data')
preprocessed_df = preprocessed_dataset.to_pandas()
preprocessed_df = preprocessed_df.drop_duplicates(subset=['text'])
laptop_df = preprocessed_df[preprocessed_df['domain'] == 'laptops']
restaurant_df = preprocessed_df[preprocessed_df['domain'] == 'restaurants']

In [7]:
gensim_data = list(preprocessed_df['text'].apply(gensim.utils.simple_preprocess))
gensim_laptop = list(laptop_df['text'].apply(gensim.utils.simple_preprocess))
gensim_restaurant = list(restaurant_df['text'].apply(gensim.utils.simple_preprocess))

### Train Word2Vec

In [45]:
import gensim.downloader
from gensim.models import KeyedVectors
model_path = gensim.downloader.load('word2vec-google-news-300', return_path=True)
print(model_path)
# load the model to keyedvector and save it as keyedvector
vectors = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

C:\Users\User/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


#### Train on all semeval

In [46]:
import multiprocessing
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
model = gensim.models.Word2Vec(
    window=2,
    min_count=5,
#     sample=6e-5, 
    alpha=0.03, 
    min_alpha=0.0007, 
    workers=cores-1,
    epochs=50
)
model.build_vocab(gensim_data, progress_per=10)
total_examples = model.corpus_count

In [47]:
model.build_vocab([list(vectors.key_to_index.keys())], update=True)
# train on your data
model.train(gensim_data, total_examples=total_examples, epochs=model.epochs)
model_wv = model.wv
model_wv.save_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_semeval.gensim')
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_semeval.gensim', binary=False)
word_vectors.save('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_semeval.gensim', pickle_protocol=4)

#### Train on laptop

In [48]:
import multiprocessing
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
laptop_model = gensim.models.Word2Vec(
    window=2,
    min_count=5,
#     sample=6e-5, 
    alpha=0.03, 
    min_alpha=0.0007, 
    workers=cores-1,
    epochs=50
)
laptop_model.build_vocab(gensim_laptop, progress_per=10)
total_examples = laptop_model.corpus_count

In [49]:
laptop_model.build_vocab([list(vectors.key_to_index.keys())], update=True)
# train on your data
laptop_model.train(gensim_data, total_examples=total_examples, epochs=laptop_model.epochs)
model_wv = laptop_model.wv
model_wv.save_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_laptop.gensim')
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_laptop.gensim', binary=False)
word_vectors.save('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_laptop.gensim', pickle_protocol=4)

#### Train on restaurant

In [50]:
import multiprocessing
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
restaurant_model = gensim.models.Word2Vec(
    window=2,
    min_count=5,
#     sample=6e-5, 
    alpha=0.03, 
    min_alpha=0.0007, 
    workers=cores-1,
    epochs=50
)

In [52]:

restaurant_model.build_vocab(gensim_restaurant, progress_per=10)
total_examples = restaurant_model.corpus_count
restaurant_model.build_vocab([list(vectors.key_to_index.keys())], update=True)
# train on your data
restaurant_model.train(gensim_data, total_examples=total_examples, epochs=restaurant_model.epochs)
model_wv = restaurant_model.wv
model_wv.save_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_restaurant.gensim')
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_restaurant.gensim', binary=False)
word_vectors.save('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_restaurant.gensim', pickle_protocol=4)

### Train FastText

#### Train on SemEval

In [53]:
import gensim
from gensim.test.utils import datapath
model = gensim.models.fasttext.load_facebook_model(datapath('E:/UQ/REIT4882/unsupervised-absa/models/cc.en.300.bin'))

In [54]:
model.build_vocab(gensim_data, update=True)
model.train(corpus_iterable=gensim_data, total_examples=len(gensim_data), epochs=50)

(751075, 4082300)

In [55]:
model.wv.save_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_semeval.gensim')
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_semeval.gensim', binary=False)
word_vectors.save('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_semeval.gensim', pickle_protocol=4)

#### Train on Laptops

In [60]:
import gensim
from gensim.test.utils import datapath
laptop_model = gensim.models.fasttext.load_facebook_model(datapath('E:/UQ/REIT4882/unsupervised-absa/models/cc.en.300.bin'))

In [61]:
laptop_model.build_vocab(gensim_laptop, update=True)
laptop_model.train(corpus_iterable=gensim_laptop, total_examples=len(gensim_laptop), epochs=50)

(351171, 2111450)

In [62]:
laptop_model.wv.save_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_laptop.gensim')
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_laptop.gensim', binary=False)
word_vectors.save('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_laptop.gensim', pickle_protocol=4)

### Train on Restaurant

In [63]:
import gensim
from gensim.test.utils import datapath
restaurant_model = gensim.models.fasttext.load_facebook_model(datapath('E:/UQ/REIT4882/unsupervised-absa/models/cc.en.300.bin'))

In [64]:
restaurant_model.build_vocab(gensim_restaurant, update=True)
restaurant_model.train(corpus_iterable=gensim_restaurant, total_examples=len(gensim_restaurant), epochs=50)

(364849, 1970850)

In [65]:
restaurant_model.wv.save_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_restaurant.gensim')
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_restaurant.gensim', binary=False)
word_vectors.save('E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_restaurant.gensim', pickle_protocol=4)

### Train Glove

In [4]:
import gensim

In [23]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove_file = datapath('E:/UQ/REIT4882/unsupervised-absa/examples/glove.gensim')
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)

  _ = glove2word2vec(glove_file, tmp_file)


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [10]:
gensim_data = list(preprocessed_df['text'].apply(gensim.utils.simple_preprocess))

In [12]:
import multiprocessing
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
model = gensim.models.Word2Vec(
    window=2,
    min_count=5,
#     sample=6e-5, 
    alpha=0.03, 
    min_alpha=0.0007, 
    negative=0,
    workers=cores-1
)
model.build_vocab(gensim_data, progress_per=10)

In [None]:
model.build_vocab([list(glove_vectors.vocab.keys())], update=True)