In [8]:
import numpy as np
import pandas as pd
import json
import gensim
import config
import pickle
import torch
from gensim.models import KeyedVectors
from tqdm.notebook import tqdm
from helper import words_sentence,build_word2vec
from model import BCModel

In [9]:
# Load the preprocessed dataset from the location configured in `config`.
dataframe = pd.read_csv(filepath_or_buffer=config.preprocessed_dataset_file)

In [10]:
# Extract the words and sentences from the dataset, then hand the sentences
# to build_word2vec together with the configured embedding size.
# NOTE(review): judging by the cell output, build_word2vec trains a model and
# saves it to disk -- confirm in helper.py.
all_words, all_sentence = words_sentence(dataframe)
build_word2vec(all_sentence, config.EMBED_SIZE)

Total number of words : 86254
Total number of sentence : 49582
Length of samples :  49582
Length of vocab   :  86254
Training and saving model...
Done
Model saved to Chapter - 1 : Coding a basic project/dataset/prep_emb_vec.pkl


In [11]:
def tsne_data_create_w2v(max_words=10000):
    """Export the pickled word2vec embeddings and their word labels as TSV files.

    Two files are written under ``checkpoints/tsne_embeddings/``:

    * ``imdb_w2v_embedding_matrix_w2v.tsv`` -- the first ``max_words`` rows of
      ``embedding_vector`` from the pickle at ``config.emb_vec_file``.
    * ``imdb_w2v_meta_data_w2v.tsv`` -- the first ``max_words`` words of
      ``index2word`` from the JSON vocabulary at ``config.vocab_file_name``.

    Args:
        max_words: Maximum number of rows written to each file
            (defaults to 10000).

    Raises:
        KeyError: If ``index2word`` is missing or lacks one of the consecutive
            string keys ``"0" .. str(len(index2word) - 1)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(config.emb_vec_file, 'rb') as emb_file:
        pickle_data = pickle.load(emb_file)
    with open(config.vocab_file_name, "r") as vocab_file:
        local_vocab = json.load(vocab_file)
    index2word = local_vocab['index2word']

    embedding_df = pd.DataFrame(pickle_data['embedding_vector']).iloc[:max_words, :]
    embedding_df.to_csv(
        "checkpoints/tsne_embeddings/imdb_w2v_embedding_matrix_w2v.tsv",
        sep="\t", header=None, index=False,
    )

    # JSON object keys are strings, hence the str(x) lookup.
    # NOTE(review): this assumes row i of the embedding matrix belongs to
    # index2word[str(i)] -- confirm against build_word2vec.
    only_words = [index2word[str(x)] for x in range(len(index2word))]
    metadata_df = pd.DataFrame({"metadata": only_words}).iloc[:max_words, :]
    metadata_df.to_csv(
        "checkpoints/tsne_embeddings/imdb_w2v_meta_data_w2v.tsv",
        sep="\t", header=None, index=False,
    )
    print("Ready for TSNE")

In [14]:
def tsne_data_create_nn_embedding(
    checkpoint_path="checkpoints/bidi_lstm_max_mean_pool_concat_pretrained_w2v__seq2seq_hidden_64_embed_32.pt",
    max_words=10000,
):
    """Export a trained model's embedding weights and word labels as TSV files.

    The checkpoint is loaded on the CPU, a ``BCModel`` is rebuilt from the
    stored ``params`` and its ``embedding.weight`` matrix is exported together
    with the matching words from the vocabulary file named in
    ``params["vocab_file_name"]``.

    Two files are written under ``checkpoints/tsne_embeddings/``:
    ``imdb_checkpoints_embedding_matrix.tsv`` and
    ``imdb_checkpoints_metadata.tsv``.

    Args:
        checkpoint_path: Path of the checkpoint holding ``params`` and
            ``model_state_dict``.
        max_words: Maximum number of rows written to each file
            (defaults to 10000).

    Raises:
        AssertionError: If the number of embedding rows and metadata rows
            differ. This is checked before anything is written.
        KeyError: If the checkpoint or vocabulary lacks an expected key.
    """
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    params = checkpoint['params']
    my_model = BCModel(params, params['vocab_len'])
    my_model.load_state_dict(checkpoint['model_state_dict'])

    # detach() drops the autograd graph so the weights convert to NumPy.
    emb_matrix = my_model.embedding.weight.detach().numpy()
    emb = pd.DataFrame(emb_matrix).iloc[:max_words, :]

    with open(params["vocab_file_name"], "r") as vocab_file:
        vocab = json.load(vocab_file)
    index2word = vocab["index2word"]
    # JSON object keys are strings, hence the str(x) lookup.
    only_words = [index2word[str(x)] for x in range(len(index2word))]
    word_vectors_df = pd.DataFrame({"metadata": only_words}).iloc[:max_words, :]

    # Validate before writing so a mismatch never leaves a half-written export.
    assert emb.shape[0] == word_vectors_df.shape[0], (
        "embedding rows (%d) and metadata rows (%d) differ"
        % (emb.shape[0], word_vectors_df.shape[0])
    )

    print("Saving nn.Module embeddings and metadadta...")
    emb.to_csv(
        "checkpoints/tsne_embeddings/imdb_checkpoints_embedding_matrix.tsv",
        sep='\t', header=None, index=False,
    )
    word_vectors_df.to_csv(
        "checkpoints/tsne_embeddings/imdb_checkpoints_metadata.tsv",
        sep="\t", header=None, index=False,
    )
    print("Saved sucussfully")

In [15]:
# Only the trained nn.Module embeddings are exported here; uncomment the next
# line to also export the pickled word2vec embeddings.
# tsne_data_create_w2v()
tsne_data_create_nn_embedding()

Saving nn.Module embeddings and metadadta...
Saved sucussfully


#### Sanity check: all words exist in both pretrained and randomly trained data

In [None]:
# Collect the words of the local vocabulary that have no entry in the
# word2vec vocabulary, then print how many there are.

# Load the word2vec data (contains 'word2index').
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(config.emb_vec_file, 'rb') as emb_file:
    w2v_vocab = pickle.load(emb_file)
# Load the local vocabulary (contains 'word2index').
with open(config.vocab_file_name, "r") as vocab_file:
    local_vocab = json.load(vocab_file)

# Set of word2vec words, used for constant-time membership tests.
w2v_words = set(w2v_vocab['word2index'].keys())

# Walk the local vocabulary in order and keep the words word2vec lacks.
not_found_words = [
    word
    for word in tqdm(local_vocab['word2index'].keys())
    if word not in w2v_words
]
print(len(not_found_words))

In [51]:
# Display the local-vocabulary words that are missing from the word2vec vocabulary.
not_found_words

['PAD', 'SOS', 'EOS', 'UNK']