# Training Word Embedding Models

### Data

In [4]:
from datasets import load_from_disk
import gensim
# Read the preprocessed dataset from disk and turn it into a DataFrame,
# keeping only the first occurrence of each distinct 'text' value.
preprocessed_dataset = load_from_disk('../data/preprocessed_data')
preprocessed_df = preprocessed_dataset.to_pandas().drop_duplicates(subset=['text'])

# Per-domain views of the deduplicated frame, selected on the 'domain' column.
laptop_df = preprocessed_df.loc[preprocessed_df['domain'].eq('laptops')]
restaurant_df = preprocessed_df.loc[preprocessed_df['domain'].eq('restaurants')]

In [5]:
# Tokenise each text with gensim's simple_preprocess: one token list per row,
# for the whole corpus and for each domain subset.
gensim_data = [gensim.utils.simple_preprocess(text) for text in preprocessed_df['text']]
gensim_laptop = [gensim.utils.simple_preprocess(text) for text in laptop_df['text']]
gensim_restaurant = [gensim.utils.simple_preprocess(text) for text in restaurant_df['text']]

### Train Word2Vec

In [45]:
import gensim.downloader
from gensim.models import KeyedVectors
# Ask gensim-data for the local path of the Google News word2vec file and show it.
model_path = gensim.downloader.load('word2vec-google-news-300', return_path=True)
print(model_path)
# Parse the binary word2vec file into a KeyedVectors object.
vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)

C:\Users\User/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


#### Train on all SemEval

In [46]:
import multiprocessing
# Use all but one CPU core for training, but never fewer than one worker:
# on a single-core machine `cores - 1` would be 0.
cores = multiprocessing.cpu_count()
model = gensim.models.Word2Vec(
    window=2,
    min_count=5,
    alpha=0.03,
    min_alpha=0.0007,
    workers=max(1, cores - 1),
    epochs=50,
)
# Build the vocabulary from the full tokenised corpus.
model.build_vocab(gensim_data, progress_per=10)
# Remember the corpus size now; the next cell passes it to train().
total_examples = model.corpus_count

In [47]:
semeval_w2v_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_semeval.gensim'

# Offer every key of the pretrained vectors to the model as one extra pseudo-sentence.
# NOTE(review): each key occurs only once here while the model uses min_count=5, and no
# pretrained weights are copied in this cell -- confirm this extends the vocabulary as intended.
model.build_vocab([list(vectors.key_to_index)], update=True)

# Train on the full tokenised corpus, using the example count captured before the update.
model.train(gensim_data, total_examples=total_examples, epochs=model.epochs)

# Export in word2vec text format, reload as plain KeyedVectors, then overwrite the
# same file via KeyedVectors.save (pickle protocol 4).
model_wv = model.wv
model_wv.save_word2vec_format(semeval_w2v_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(semeval_w2v_path, binary=False)
word_vectors.save(semeval_w2v_path, pickle_protocol=4)

#### Train on laptop

In [48]:
import multiprocessing
# Use all but one CPU core for training, but never fewer than one worker:
# on a single-core machine `cores - 1` would be 0.
cores = multiprocessing.cpu_count()
laptop_model = gensim.models.Word2Vec(
    window=2,
    min_count=5,
    alpha=0.03,
    min_alpha=0.0007,
    workers=max(1, cores - 1),
    epochs=50,
)
# Build the vocabulary from the laptop-domain sentences only.
laptop_model.build_vocab(gensim_laptop, progress_per=10)
# Remember the laptop corpus size now, before any later vocabulary update.
total_examples = laptop_model.corpus_count

In [49]:
laptop_w2v_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_laptop.gensim'

# Offer every key of the pretrained vectors to the model as one extra pseudo-sentence.
# NOTE(review): each key occurs only once here while the model uses min_count=5, and no
# pretrained weights are copied in this cell -- confirm this extends the vocabulary as intended.
laptop_model.build_vocab([list(vectors.key_to_index.keys())], update=True)

# Train on the laptop-domain sentences only, so the corpus matches both the vocabulary
# built for this model and the example count handed to train().
laptop_model.train(gensim_laptop, total_examples=len(gensim_laptop), epochs=laptop_model.epochs)

# Export in word2vec text format, reload as plain KeyedVectors, then overwrite the
# same file via KeyedVectors.save (pickle protocol 4).
model_wv = laptop_model.wv
model_wv.save_word2vec_format(laptop_w2v_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(laptop_w2v_path, binary=False)
word_vectors.save(laptop_w2v_path, pickle_protocol=4)

#### Train on restaurant

In [50]:
import multiprocessing
# Use all but one CPU core for training, but never fewer than one worker:
# on a single-core machine `cores - 1` would be 0.
cores = multiprocessing.cpu_count()
restaurant_model = gensim.models.Word2Vec(
    window=2,
    min_count=5,
    alpha=0.03,
    min_alpha=0.0007,
    workers=max(1, cores - 1),
    epochs=50,
)

In [52]:

restaurant_w2v_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/word2vec_restaurant.gensim'

# Build the vocabulary from the restaurant-domain sentences and remember the corpus
# size before the second build_vocab call below.
restaurant_model.build_vocab(gensim_restaurant, progress_per=10)
total_examples = restaurant_model.corpus_count

# Offer every key of the pretrained vectors to the model as one extra pseudo-sentence.
# NOTE(review): each key occurs only once here while the model uses min_count=5, and no
# pretrained weights are copied in this cell -- confirm this extends the vocabulary as intended.
restaurant_model.build_vocab([list(vectors.key_to_index.keys())], update=True)

# Train on the restaurant-domain sentences only, so the corpus matches both the
# vocabulary built for this model and the example count handed to train().
restaurant_model.train(gensim_restaurant, total_examples=total_examples, epochs=restaurant_model.epochs)

# Export in word2vec text format, reload as plain KeyedVectors, then overwrite the
# same file via KeyedVectors.save (pickle protocol 4).
model_wv = restaurant_model.wv
model_wv.save_word2vec_format(restaurant_w2v_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(restaurant_w2v_path, binary=False)
word_vectors.save(restaurant_w2v_path, pickle_protocol=4)

### Train FastText

#### Train on SemEval

In [53]:
import gensim
from gensim.test.utils import datapath
# Load the pretrained Facebook fastText binary as a full gensim model.
model = gensim.models.fasttext.load_facebook_model(
    datapath('E:/UQ/REIT4882/unsupervised-absa/models/cc.en.300.bin')
)

In [54]:
# Update the loaded model's vocabulary with the full corpus, then train on it for 50 epochs.
model.build_vocab(corpus_iterable=gensim_data, update=True)
model.train(
    corpus_iterable=gensim_data,
    total_examples=len(gensim_data),
    epochs=50,
)

(751075, 4082300)

In [55]:
fasttext_semeval_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_semeval.gensim'

# Export the word vectors in word2vec text format, reload them as plain KeyedVectors,
# then overwrite the same file via KeyedVectors.save (pickle protocol 4).
model.wv.save_word2vec_format(fasttext_semeval_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(fasttext_semeval_path, binary=False)
word_vectors.save(fasttext_semeval_path, pickle_protocol=4)

#### Train on Laptops

In [60]:
import gensim
from gensim.test.utils import datapath
# Load a fresh copy of the pretrained Facebook fastText binary for the laptop domain.
laptop_model = gensim.models.fasttext.load_facebook_model(
    datapath('E:/UQ/REIT4882/unsupervised-absa/models/cc.en.300.bin')
)

In [61]:
# Update the loaded model's vocabulary with the laptop sentences, then train on them for 50 epochs.
laptop_model.build_vocab(corpus_iterable=gensim_laptop, update=True)
laptop_model.train(
    corpus_iterable=gensim_laptop,
    total_examples=len(gensim_laptop),
    epochs=50,
)

(351171, 2111450)

In [62]:
fasttext_laptop_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_laptop.gensim'

# Export the word vectors in word2vec text format, reload them as plain KeyedVectors,
# then overwrite the same file via KeyedVectors.save (pickle protocol 4).
laptop_model.wv.save_word2vec_format(fasttext_laptop_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(fasttext_laptop_path, binary=False)
word_vectors.save(fasttext_laptop_path, pickle_protocol=4)

#### Train on Restaurant

In [63]:
import gensim
from gensim.test.utils import datapath
# Load a fresh copy of the pretrained Facebook fastText binary for the restaurant domain.
restaurant_model = gensim.models.fasttext.load_facebook_model(
    datapath('E:/UQ/REIT4882/unsupervised-absa/models/cc.en.300.bin')
)

In [64]:
# Update the loaded model's vocabulary with the restaurant sentences, then train on them for 50 epochs.
restaurant_model.build_vocab(corpus_iterable=gensim_restaurant, update=True)
restaurant_model.train(
    corpus_iterable=gensim_restaurant,
    total_examples=len(gensim_restaurant),
    epochs=50,
)

(364849, 1970850)

In [65]:
fasttext_restaurant_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/fast_text_restaurant.gensim'

# Export the word vectors in word2vec text format, reload them as plain KeyedVectors,
# then overwrite the same file via KeyedVectors.save (pickle protocol 4).
restaurant_model.wv.save_word2vec_format(fasttext_restaurant_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(fasttext_restaurant_path, binary=False)
word_vectors.save(fasttext_restaurant_path, pickle_protocol=4)

### Train GloVe

#### Training Function

In [30]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec


def train_gensim(gensim_data, glove_vectors):
    """Train a Word2Vec model on phrase-merged tokens and return its word vectors.

    Frequent bigrams are detected with gensim ``Phrases`` and merged into single
    tokens; a second pass over the merged tokens picks up longer phrases
    (trigrams). A Word2Vec model is then trained on the result.

    Args:
        gensim_data: Re-iterable collection of tokenised sentences (each a list
            of tokens); it is traversed more than once.
        glove_vectors: Vectors object exposing ``key_to_index``; its keys are
            offered to the model's vocabulary before training.

    Returns:
        The trained model's word vectors (``model.wv``).
    """
    # First pass: learn frequent bigrams and merge them into single tokens.
    bigram_phraser = Phraser(Phrases(gensim_data, min_count=5))
    bigramed_tokens = [bigram_phraser[sent] for sent in gensim_data]

    # Second pass over the merged tokens to pick up trigrams.
    trigram_phraser = Phraser(Phrases(bigramed_tokens, min_count=5))
    trigramed_tokens = [trigram_phraser[sent] for sent in bigramed_tokens]

    # Small model built on our own corpus.
    model = Word2Vec(
        window=2,
        min_count=5,
        alpha=0.03,
        min_alpha=0.0007,
        epochs=50,
    )
    model.build_vocab(trigramed_tokens)
    # Capture the corpus size now, before the second build_vocab call below.
    total_examples = model.corpus_count

    # Offer the keys of the vectors passed in by the caller (not a notebook global)
    # as one extra pseudo-sentence.
    # NOTE(review): each key occurs only once here while min_count=5, and no pretrained
    # weights are copied into the model -- confirm this extends the vocabulary as intended.
    model.build_vocab([list(glove_vectors.key_to_index.keys())], update=True)

    # Train on the phrase-merged corpus.
    model.train(trigramed_tokens, total_examples=total_examples, epochs=model.epochs)

    # Only the word vectors are needed for analysis; the model itself is released
    # when this function returns.
    return model.wv

In [3]:
import gensim.downloader
from gensim.models import KeyedVectors
# Ask gensim-data for the local path of the GloVe wiki-gigaword file and show it.
model_path = gensim.downloader.load('glove-wiki-gigaword-300', return_path=True)
print(model_path)
# Parse the text-format vector file into a KeyedVectors object.
vectors = KeyedVectors.load_word2vec_format(model_path, binary=False)

C:\Users\User/gensim-data\glove-wiki-gigaword-300\glove-wiki-gigaword-300.gz


#### Train All

In [31]:
glove_semeval_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/glove_semeval.gensim'

# Train on the full corpus; train_gensim returns word vectors, not a model object.
model = train_gensim(gensim_data, vectors)

# Export in word2vec text format, reload as plain KeyedVectors, then overwrite the
# same file via KeyedVectors.save (pickle protocol 4).
model.save_word2vec_format(glove_semeval_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(glove_semeval_path, binary=False)
word_vectors.save(glove_semeval_path, pickle_protocol=4)

#### Train Laptop

In [32]:
glove_laptop_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/glove_laptop.gensim'

# Train on the laptop sentences; train_gensim returns word vectors, not a model object.
model = train_gensim(gensim_laptop, vectors)

# Export in word2vec text format, reload as plain KeyedVectors, then overwrite the
# same file via KeyedVectors.save (pickle protocol 4).
model.save_word2vec_format(glove_laptop_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(glove_laptop_path, binary=False)
word_vectors.save(glove_laptop_path, pickle_protocol=4)

#### Train Restaurant

In [33]:
glove_restaurant_path = 'E:/UQ/REIT4882/unsupervised-absa/models/further_pretrain/glove_restaurant.gensim'

# Train on the restaurant sentences; train_gensim returns word vectors, not a model object.
model = train_gensim(gensim_restaurant, vectors)

# Export in word2vec text format, reload as plain KeyedVectors, then overwrite the
# same file via KeyedVectors.save (pickle protocol 4).
model.save_word2vec_format(glove_restaurant_path)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(glove_restaurant_path, binary=False)
word_vectors.save(glove_restaurant_path, pickle_protocol=4)