In [None]:
import re
from collections import Counter
import random

import gensim.downloader as api

import torch
from torch import nn
from torch.nn.functional import normalize
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import time

import string
import nltk
from nltk.corpus import stopwords 

import swifter
import pandas as pd
import numpy as np

In [1]:
import pandas as pd

In [None]:
# Load the raw note export; the NOTE_TEXT, PERSON_ID and NOTE_DATE columns are used below.
# NOTE(review): this path is relative to '../data/' while later cells read from './data/' — confirm the working directory.
df_data = pd.read_csv('../data/V_NOTE_MHAV.csv')

In [None]:
def extract_content(text):
    """Split one raw note into its individual message bodies.

    The note is split on every "<author> (<details>) YYYY/MM/DD HH:MM: " header
    matched by ``pattern_date``. For each non-blank piece, everything up to and
    including the first ``<toPt ...>`` tag (if any) is discarded and the
    remainder is stripped.

    :param text: Raw note text. Non-string values (e.g. the NaN pandas uses for
        a missing cell) yield an empty list instead of raising.
    :return: List of stripped message strings, in order of appearance.
    """
    # Missing cells arrive as float NaN; re.split would raise TypeError on them.
    if not isinstance(text, str):
        return []

    pattern_date = r'\s*\w+\s\(\s*[\w]+[-]*\w*[,]*\s*[\w.]*\s*[\w\s]*\w+[,]*\s*\w*[\w.]*\s*[-]*\w*\)\s+\d{4}/\d{2}/\d{2}\s+\d{2}:\d{2}:\s+'
    pattern_patient = r'<toPt\s*\w*>'

    messages_final = []
    for message in re.split(pattern_date, text):
        # re.split yields empty/blank pieces around adjacent or leading headers.
        if message.strip() == '':
            continue

        match = re.search(pattern_patient, message)
        if match is not None:
            # Keep only the text that follows the recipient tag.
            message = message[match.end():]
        messages_final.append(message.strip())
    return messages_final


In [None]:
# One list of message bodies per note (see extract_content).
df_data['EXTRACT_NOTE_TEXT'] = df_data['NOTE_TEXT'].swifter.apply(extract_content)

In [None]:
# Keep only the identifying columns plus the extracted message lists.
df_data_for_w2v = df_data[['PERSON_ID','NOTE_DATE','EXTRACT_NOTE_TEXT']]

In [None]:
# One row per message. pandas' explode turns an empty list into a single NaN row,
# so downstream steps can see non-string values in EXTRACT_NOTE_TEXT.
df_data_for_w2v = df_data_for_w2v.explode('EXTRACT_NOTE_TEXT').reset_index(drop=True)

In [None]:
# Number of message rows after exploding.
len(df_data_for_w2v)

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
# ekphrasis pre-processor used by preprocess(): normalises the listed entity
# types, unpacks contractions, and tokenises with a lower-casing SocialTokenizer.
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated (currently disabled)
#     annotate={"hashtag", "allcaps", "elongated", "repeated",
#         'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="english", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="english", 
    
    unpack_hashtags=False,  # do not perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionary (currently disabled).
#     dicts=[emoticons]
)


In [None]:
# Download the NLTK stop-word corpus (network access on first run) and keep the
# English list as a set for fast membership tests in preprocess().
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
# Translation table built once: every punctuation character except the
# apostrophe becomes a space, and every digit is deleted.
_PUNCT_DIGIT_TABLE = str.maketrans(
    string.punctuation.replace('\'', ''),
    ' ' * (len(string.punctuation) - 1),
    string.digits,
)


def preprocess(text):
    """Tokenise one message and drop stop words.

    Newlines become spaces, punctuation (except the apostrophe) becomes spaces,
    digits are removed, then the text goes through ``text_processor`` and
    tokens found in ``stop_words`` (or a lone apostrophe) are filtered out.

    :param text: Message text. Non-string values (e.g. the NaN rows that
        ``DataFrame.explode`` produces for empty lists) yield an empty list.
    :return: List of remaining tokens.
    """
    # explode() turns empty message lists into NaN; treat those as "no tokens".
    if not isinstance(text, str):
        return []

    cleaned = text.replace('\n', ' ').translate(_PUNCT_DIGIT_TABLE)
    tokens = text_processor.pre_process_doc(cleaned)
    # Remove stop words
    return [word for word in tokens if word not in stop_words and word != '\'']

In [None]:
# Token list per message (see preprocess).
df_data_for_w2v['NOTE_WORD'] = df_data_for_w2v['EXTRACT_NOTE_TEXT'].swifter.apply(preprocess)

In [None]:
# Manually annotated vocabulary list; the cells below read its WORD, 'First name',
# 'Last name', VUNetID, Keep and mapping columns.
vocab_jw = pd.read_csv('./data/R37 vocabulary list with context sentence 2.csv', index_col=0)

In [None]:
# Words flagged (non-null annotation) as first names, last names or VUNetIDs.
first_name = set(vocab_jw.loc[vocab_jw['First name'].notna(), 'WORD'].to_list())
last_name = set(vocab_jw.loc[vocab_jw['Last name'].notna(), 'WORD'].to_list())
vunetid = set(vocab_jw.loc[vocab_jw['VUNetID'].notna(), 'WORD'].to_list())
# Any word annotated as either kind of personal name.
name = first_name | last_name

In [None]:
# Words that process_name() removes from every message.
# NOTE(review): this selects rows where the 'Keep' column is NOT null, which reads
# as the opposite of "drop" — confirm how the 'Keep' column is encoded.
drop_word = set(vocab_jw[vocab_jw['Keep'].isna() == False].WORD.to_list())

In [None]:
# Spelling corrections: annotated WORD -> replacement taken from the `mapping` column.
_mapped_rows = vocab_jw[vocab_jw.mapping.notna()]
word_map = dict(zip(_mapped_rows.WORD, _mapped_rows.mapping))
# Set of words that have a correction.
mispell = set(word_map)

In [None]:
def process_name(words, names=None, vunetids=None, mapping=None, dropped=None):
    """Mask identifiers, apply spelling corrections and drop unwanted tokens.

    Each token is processed in this order:
      1. tokens in ``names`` become ``'<name>'``;
      2. tokens in ``vunetids`` become ``'<vunetid>'``;
      3. tokens with an entry in ``mapping`` are replaced by it;
      4. tokens in ``dropped`` are removed.

    The previous implementation tested ``name`` in step 2 as well, so VUNetIDs
    were never masked.

    :param words: List of tokens for one message.
    :param names: Set of personal names; defaults to the notebook-level ``name``.
    :param vunetids: Set of VUNetIDs; defaults to the notebook-level ``vunetid``.
    :param mapping: Dict of corrections; defaults to the notebook-level ``word_map``.
    :param dropped: Set of tokens to remove; defaults to the notebook-level ``drop_word``.
    :return: New list of processed tokens.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    names = name if names is None else names
    vunetids = vunetid if vunetids is None else vunetids
    mapping = word_map if mapping is None else mapping
    dropped = drop_word if dropped is None else dropped

    processed = []
    for word in words:
        if word in names:
            word = '<name>'
        if word in vunetids:
            word = '<vunetid>'
        word = mapping.get(word, word)
        if word not in dropped:
            processed.append(word)
    return processed

In [None]:
# Mask identifiers / fix spellings / drop tokens in every message (see process_name).
df_data_for_w2v['NOTE_WORD'] = df_data_for_w2v['NOTE_WORD'].swifter.apply(process_name)

In [None]:
# Flatten the per-message token lists into one corpus-wide token list.
words = [
    token
    for note_tokens in df_data_for_w2v.NOTE_WORD.to_list()
    for token in note_tokens
]

In [None]:
def create_lookup_tables(words):
    """
    Build the two vocabulary lookup tables.

    Ids are assigned by descending corpus frequency (id 0 = most frequent word);
    words with equal counts keep their order of first appearance.

    :param words: Input list of words
    :return: Two dictionaries, vocab_to_int, int_to_vocab
    """
    # most_common() sorts by count, descending, with a stable sort.
    ranked = [token for token, _ in Counter(words).most_common()]
    int_to_vocab = dict(enumerate(ranked))
    vocab_to_int = {token: position for position, token in enumerate(ranked)}
    return vocab_to_int, int_to_vocab

In [None]:
# Corpus size before any frequency filtering: total tokens and distinct tokens.
print("Total words in text: {}".format(len(words)))
print("Unique words: {}".format(len(set(words)))) # `set` removes any duplicate words

In [None]:
# Frequency-ranked vocabulary ids, and the whole corpus re-expressed as ids.
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]

In [None]:
# Occurrence count per vocabulary id.
word_counts = Counter(int_words)
#print(list(word_counts.items())[0])  # dictionary of int_words, how many times they appear

total_count = len(int_words)
# Relative corpus frequency per vocabulary id.
freqs = {word: count/total_count for word, count in word_counts.items()}
# Frequent-word subsampling is NOT applied; the formula is kept only for reference:
# p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
# Instead, rare words are discarded: only ids that occur at least 5 times
# are kept in the training token list.
train_words = [word for word in int_words if word_counts[word] >= 5]

In [None]:
# NOTE(review): freq_dist is not referenced by any other visible cell — confirm it is still needed.
freq_dist = pd.read_csv("freq_dist.csv")

In [None]:
# Corpus size after dropping rare words: total tokens and distinct ids.
print("Total words in text: {}".format(len(train_words)))
print("Unique words: {}".format(len(set(train_words)))) # `set` removes any duplicate words

In [None]:
# Replace each message's tokens by their vocabulary ids, dropping ids seen fewer than 5 times.
# This overwrites NOTE_WORD in place, so the cell is not re-runnable: a second run
# would look ids up in vocab_to_int, which is keyed by the original tokens.
df_data_for_w2v['NOTE_WORD'] = df_data_for_w2v.NOTE_WORD.apply(lambda x : [vocab_to_int[i] for i in x if word_counts[vocab_to_int[i]] >= 5])

In [None]:
# Number of retained token ids per message.
df_data_for_w2v['LEN'] = df_data_for_w2v['NOTE_WORD'].apply(len)

In [None]:
# Only messages with at least two retained tokens can produce (center, context) pairs.
# .copy() makes this an independent frame, so adding the WORD_PAIR column later
# does not trigger SettingWithCopyWarning or write through a view.
train_data_for_w2v = df_data_for_w2v[df_data_for_w2v['LEN'] >= 2].copy()

In [None]:
def get_context(words, idx, window_size=10):
    '''Return the words inside a randomly sized window around position `idx`.

    The radius is drawn uniformly from 1 .. window_size // 2 (inclusive) with
    numpy's global RNG; the word at `idx` itself is excluded, and the window is
    clipped at the start of the list.
    '''
    radius = np.random.randint(1, window_size // 2 + 1)
    left_edge = max(idx - radius, 0)
    before = words[left_edge:idx]
    after = words[idx + 1:idx + radius + 1]
    return list(before + after)

In [None]:
def word_pair(words):
    """Build (center, context) pairs for every position of one message.

    For each position a fresh random window is drawn via get_context, and one
    pair is emitted per context word, in window order.
    """
    pairs = []
    for position, center in enumerate(words):
        neighbours = get_context(words, position)
        pairs.extend((center, neighbour) for neighbour in neighbours)
    return pairs

In [None]:
# Seed both RNGs: get_context() draws its window radius from numpy's global RNG,
# which random.seed() does not affect.
# NOTE(review): if swifter dispatches this apply to parallel workers, the seeds set
# here may not reach them — confirm before relying on bit-for-bit reproducibility.
random.seed(0)
np.random.seed(0)
train_data_for_w2v['WORD_PAIR'] = train_data_for_w2v.NOTE_WORD.swifter.apply(word_pair)

In [None]:
# One list of (center, context) pairs per message.
word_pairs = train_data_for_w2v.WORD_PAIR.to_list()

In [None]:
# Flatten to a single corpus-wide list of (center, context) pairs.
word_pairs = [pair for note_pairs in word_pairs for pair in note_pairs]

In [None]:
# Total number of training pairs.
len(word_pairs)

In [None]:
# Frequencies of the retained words (count >= 5), sorted descending so that position i
# lines up with vocabulary id i (ids were assigned by descending frequency).
word_freqs = np.array(sorted([val for val in freqs.values() if val >= 5 / total_count ], reverse=True))
# Renormalise over the retained vocabulary only.
unigram_dist = word_freqs/word_freqs.sum()
# Negative-sampling distribution: unigram frequencies raised to 0.75, renormalised.
noise_dist = torch.from_numpy(unigram_dist**(0.75)/np.sum(unigram_dist**(0.75)))

In [None]:
# Restrict the vocabulary to the retained ids.
# NOTE(review): 10683 is hard-coded; it presumably equals len(noise_dist) for this corpus — confirm,
# and note that cells above (e.g. the NOTE_WORD id mapping) need the unfiltered dict.
vocab_to_int = {word:idx for word, idx in vocab_to_int.items() if idx < 10683}

In [None]:
import gensim.downloader as api
# Pre-trained Google News word2vec vectors, loaded through gensim's downloader.
google = api.load('word2vec-google-news-300')

In [None]:
# One row per retained vocabulary word. Row order follows iteration over a set of ids,
# so it should not be assumed to match id order.
word_df = pd.DataFrame({'Word':[int_to_vocab[i] for i in list(set(train_words))]})

In [None]:
def get_emb(x):
    """Return the Google News vector for `x`, or NaN when the lookup raises KeyError."""
    try:
        vector = google[x]
    except KeyError:
        vector = np.nan
    return vector

In [None]:
# Pre-trained vector per word (NaN when the word has none).
word_df['embed'] = word_df['Word'].swifter.apply(get_emb)

In [None]:
# Keep only the words that have a pre-trained vector.
word_df = word_df[word_df['embed'].notna()]

In [None]:
# True where the word has NO pre-trained vector. Because rows without a vector were
# filtered out in the previous cell, this column is all False when cells run in order.
word_df['is_google'] = word_df.embed.isna()

In [None]:
# Flag words that appear in the medical word list.
# NOTE(review): `med_word` is only defined in a later cell (from medical_word_similar_06.csv),
# so this cell raises NameError under Restart & Run All — the med_df cells need to move above it.
word_df['is_medi'] = word_df['Word'].isin(med_word)

In [None]:
def is_google(x):
    """Row predicate: True only when the row is neither medical nor flagged in 'is_google'.

    Medical rows always give False; otherwise the existing 'is_google' flag is inverted.
    """
    return (not x['is_medi']) and (not x['is_google'])

In [None]:
# Recompute the flag row by row (see is_google).
word_df['is_google'] = word_df.apply(is_google, axis=1)

In [None]:
# Corpus frequency of each row's word, looked up through its vocabulary id.
# The previous positional list `[freqs[i] for i in range(10683)]` assumed word_df
# had exactly 10683 rows in id order, which stops holding once rows without a
# pre-trained vector have been filtered out.
word_df['freq'] = word_df['Word'].map(lambda w: freqs[vocab_to_int[w]])

In [None]:
# Natural log of the corpus frequency.
word_df['log_freq'] = np.log(word_df['freq'])

In [None]:
# Matrix of pre-trained vectors (one row per word_df row) and the matching words;
# both are read as globals by most_similar / most_similar_cos.
weights = np.array(word_df.embed.to_list())
nonmed_words = np.array(word_df.Word.to_list())

In [None]:
def most_similar(x):
    """Words whose pre-trained vector has cosine similarity >= 0.5 with `x`.

    Candidates come from the global `weights` / `nonmed_words`. Results are
    ordered by descending similarity and the top-ranked entry is skipped
    (it is the query word itself when `x` is a row of `weights`).
    """
    query = np.array(x)
    unit_query = query / np.linalg.norm(query)
    unit_rows = weights / np.linalg.norm(weights, axis=1, keepdims=True)
    scores = np.dot(unit_rows, np.expand_dims(unit_query, axis=1)).squeeze()
    # Number of candidates at or above the 0.5 threshold (includes the top hit).
    n_above = len([s for s in np.sort(scores)[::-1] if s >= 0.5])
    ranking = np.argsort(scores)[::-1]
    return nonmed_words[ranking][1:n_above]

In [None]:
def most_similar_cos(x):
    """Cosine similarities (>= 0.5) between `x` and the rows of the global `weights`.

    Values are sorted in descending order and the top-ranked value is skipped,
    so the result lines up element-for-element with most_similar(x).

    :param x: Query vector.
    :return: List of similarity values.
    """
    word_vec = np.array(x)
    normalize_word = word_vec / np.linalg.norm(word_vec)
    normalize_vec = weights / np.linalg.norm(weights, axis=1, keepdims=True)
    cosine = np.dot(normalize_vec, np.expand_dims(normalize_word, axis=1))
    # Only the sorted values are needed here; the argsort the old version also
    # computed was never used.
    values = np.sort(cosine.squeeze())[::-1]
    return [value for value in values if value >= 0.5][1:]

In [None]:
# Neighbouring words (cosine >= 0.5) per row.
word_df['similar'] = word_df['embed'].swifter.apply(most_similar)

In [None]:
# Matching cosine values for the neighbours above.
word_df['cosine'] = word_df['embed'].swifter.apply(most_similar_cos)

In [None]:
import ast
# Medical-word neighbour table; the SIMILAR and COS columns are stored as Python
# literals and parsed with ast.literal_eval on load.
med_df = pd.read_csv('./data/medical_word_similar_06.csv',converters={'SIMILAR':ast.literal_eval, 'COS':ast.literal_eval})

In [None]:
# List of medical words.
# NOTE(review): the earlier `is_medi` cell uses this name, so this definition must run before it.
med_word = med_df.WORD.to_list()

In [None]:
def softmax(x):
    """Numerically stable softmax along the first axis.

    Subtracting the maximum before exponentiating leaves the result unchanged
    but prevents overflow (inf / inf = NaN) for large inputs.

    :param x: Array-like of scores.
    :return: numpy array of the same shape that sums to 1 along axis 0;
        an empty input gives an empty array.
    """
    scores = np.asarray(x, dtype=float)
    if scores.size == 0:
        # max() is undefined for empty input; nothing to normalise anyway.
        return scores
    exps = np.exp(scores - scores.max(axis=0))
    return exps / exps.sum(axis=0)

In [None]:
# Turn each row's cosine values into sampling probabilities.
word_df['cosine'] = word_df['cosine'].swifter.apply(softmax)

In [None]:
# Replace neighbour words by their vocabulary ids (overwrites the column, so not re-runnable).
word_df['similar'] = word_df.similar.swifter.apply(lambda x: [vocab_to_int[xx] for xx in x])

In [None]:
# Vocabulary ids of each medical word's neighbours.
# NOTE(review): raises KeyError if a SIMILAR entry is not in the (filtered) vocab_to_int — confirm the CSV was built from the same vocabulary.
med_df['similar'] = med_df.SIMILAR.swifter.apply(lambda x: [vocab_to_int[xx] for xx in x])

In [None]:
# Sampling probabilities for each medical word's neighbours.
med_df['cosine'] = med_df['COS'].swifter.apply(softmax)

In [None]:
def _first_match_lookup(frame, word_col):
    """Map each word in `frame[word_col]` to (similar ids, cosine list) of its first row."""
    lookup = {}
    for word, sim, cos in zip(frame[word_col], frame['similar'], frame['cosine']):
        if word not in lookup:  # first occurrence wins, like `.values[0]`
            lookup[word] = (sim, list(cos))
    return lookup


# Build both lookups once instead of scanning the DataFrames for every vocabulary id.
_med_lookup = _first_match_lookup(med_df, 'WORD')
_word_lookup = _first_match_lookup(word_df, 'Word')

# Per vocabulary id: neighbour ids and their sampling probabilities.
# The medical table takes precedence over the pre-trained table; ids found in
# neither get empty lists.
similar_word = []
cosine = []
for i in range(10683):
    word = int_to_vocab[i]
    sim, cos = _med_lookup.get(word) or _word_lookup.get(word) or ([], [])
    similar_word.append(sim)
    cosine.append(cos)

In [None]:
# Alias read as a global by the Dataset class: neighbour ids per vocabulary id.
similar_list = similar_word

In [None]:
# Vocabulary ids without / with at least one prior-knowledge neighbour.
no_prior = [i for i, x in enumerate(similar_list) if len(x) == 0]
prior = [i for i, x in enumerate(similar_list) if len(x) > 0]

In [None]:
# Alias read as a global by the Dataset class: neighbour sampling probabilities per vocabulary id.
softmax_list = cosine

In [None]:
# Persist, per vocabulary id, whether any prior-knowledge neighbour exists.
pd.DataFrame([len(x)>0 for x in softmax_list]).to_csv('has_prior_knowledge.csv')

In [None]:
# Inverse corpus frequency per vocabulary id; the Dataset returns it as `g`, which
# the loss uses to scale the prior-knowledge term.
gamma = {word: 1 / freqs[word] for word in word_counts}

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Skip-gram training examples with negative samples and a prior-knowledge neighbour.

    Reads these notebook-level globals: ``gamma`` (weight per center id),
    ``noise_dist`` (negative-sampling distribution), ``similar_list`` and
    ``softmax_list`` (neighbour ids and their sampling probabilities per id).

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``; re-running the cell would otherwise make
    the class inherit from its own previous definition.
    """

    def __init__(self, word_pairs, n_samples=5, n_similar=5):
        """
        :param word_pairs: Sequence of (center_id, context_id) pairs.
        :param n_samples: Number of negative samples drawn per pair.
        :param n_similar: Stored but not used by __getitem__.
        """
        self.word_pairs = word_pairs
        self.n_samples = n_samples
        self.n_similar = n_similar

    def __len__(self):
        return len(self.word_pairs)

    def __getitem__(self, idx):
        """Return (center, context, noise ids tensor, similar id, gamma weight)."""
        center, context = self.word_pairs[idx]
        g = gamma[center]

        # Rejection-sample negatives so none equals the center or the context word.
        noise = torch.zeros(self.n_samples, dtype=torch.int64)
        filled = 0
        while filled < self.n_samples:
            neg = torch.multinomial(noise_dist, 1, replacement=True).item()
            if neg == center or neg == context:
                continue
            noise[filled] = neg
            filled += 1

        # Draw one prior-knowledge neighbour; fall back to the center word itself
        # when the word has none.
        candidates = similar_list[center]
        if len(candidates) > 0:
            pick = torch.multinomial(torch.Tensor(softmax_list[center]), 1, replacement=True).item()
            similar = candidates[pick]
        else:
            similar = center
        return center, context, noise, similar, g


In [None]:
# Small-batch loader; `Dataset` here is the notebook's own class defined above.
# Both names are reassigned inside the training loop below.
dataset = Dataset(word_pairs)
dataloader = DataLoader(dataset, batch_size=5, shuffle=True, num_workers=12, drop_last=True)

In [None]:
class SkipGramNeg(nn.Module):
    """Skip-gram model with separate input (center) and output (context) embedding tables."""

    # The default vocab_to_int / int_to_vocab are the notebook globals as they
    # exist when this cell is executed (defaults bind at definition time).
    def __init__(self, n_vocab, n_embed, vocab_to_int=vocab_to_int, int_to_vocab=int_to_vocab, noise_dist=None):
        """
        :param n_vocab: Number of rows in each embedding table.
        :param n_embed: Embedding dimension.
        :param vocab_to_int: Word -> id mapping used by most_similar.
        :param int_to_vocab: Id -> word mapping used by most_similar.
        :param noise_dist: Stored on the instance; not used by the model itself.
        """
        super().__init__()

        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist
        self.vocab_to_int = vocab_to_int
        self.int_to_vocab = int_to_vocab

        # define embedding layers for input and output words
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        self.out_embed = nn.Embedding(n_vocab, n_embed)

        # Initialize both embedding tables with uniform distribution
        nn.init.uniform_(self.in_embed.weight, -1, 1)
        nn.init.uniform_(self.out_embed.weight, -1, 1)

    @staticmethod
    def create_lookup_tables(words):
        """Return (vocab_to_int, int_to_vocab) with ids ordered by descending frequency.

        Declared static: it takes no ``self``, so without the decorator it could
        only be called on the class, not on an instance.
        """
        word_counts = Counter(words)
        # sorting the words from most to least frequent in text occurrence
        sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
        # create int_to_vocab dictionaries
        int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
        vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

        return vocab_to_int, int_to_vocab

    def forward_input(self, input_words):
        """Return the input (center-word) embeddings for a tensor of ids."""
        return self.in_embed(input_words)

    def forward_output(self, output_words):
        """Return the output (context-word) embeddings for a tensor of ids."""
        return self.out_embed(output_words)

    def forward_noise(self, batch_size, n_samples, noise_words):
        """Return output embeddings of the negative samples, shaped (batch_size, n_samples, n_embed)."""
        return self.out_embed(noise_words).view(batch_size, n_samples, self.n_embed)

    def most_similar(self, word, n_similarity):
        """Return the `n_similarity` nearest words to `word` by cosine similarity.

        Uses this instance's input embeddings on whatever device they currently
        live on. The previous version used the global ``model`` instead of
        ``self`` and moved it to CUDA while indexing with a CPU tensor.

        :param word: Query word; must be a key of ``self.vocab_to_int``.
        :param n_similarity: Number of entries to return (the query word itself
            ranks first).
        :return: Dict word -> cosine similarity, in descending order.
        """
        weights = self.in_embed.weight.detach()
        unit_rows = weights / weights.norm(dim=1, keepdim=True)
        unit_word = unit_rows[self.vocab_to_int[word]]
        cosine = unit_rows.mv(unit_word)
        values, indices = torch.sort(cosine, descending=True)
        return dict(zip(
            [self.int_to_vocab[x.item()] for x in indices[:n_similarity]],
            [y.item() for y in values[:n_similarity]],
        ))

In [None]:
class ExpNegLoss(nn.Module):
    '''PK-word2vec model loss calculation.

    Negative-sampling skip-gram loss plus a prior-knowledge term that penalises
    (1 - cosine) between (a) the center word's input vector and the similar
    word's input vector and (b) the context word's output vector and the similar
    word's output vector. The prior-knowledge term is scaled by ``alpha * g``.
    '''
    def __init__(self, alpha=None):
        """
        :param alpha: Weight of the prior-knowledge term. When None (default),
            the notebook-level global ``alpha`` is read at call time.
        """
        super().__init__()
        self.alpha = alpha

    def forward(self, input_vectors, output_vectors, noise_vectors, similar_input_vectors, similar_output_vectors, g):
        """Return the mean loss over the batch.

        :param input_vectors: (batch, embed) input embeddings of the center words.
        :param output_vectors: (batch, embed) output embeddings of the context words.
        :param noise_vectors: (batch, n_samples, embed) output embeddings of the negatives.
        :param similar_input_vectors: (batch, embed) input embeddings of the similar words.
        :param similar_output_vectors: (batch, embed) output embeddings of the similar words.
        :param g: Per-example weight of the prior-knowledge term, shape (batch,).
        """
        batch_size, embed_size = input_vectors.shape

        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)
        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)
        similar_input_vectors = similar_input_vectors.view(batch_size, 1, embed_size)
        similar_output_vectors = similar_output_vectors.view(batch_size, embed_size, 1)

        # Correct-pair term. logsigmoid is the numerically stable form of
        # sigmoid().log(); explicit reshapes (instead of a bare squeeze) keep the
        # batch dimension even when batch_size == 1.
        out_loss = nn.functional.logsigmoid(torch.bmm(output_vectors, input_vectors)).view(batch_size)

        # Cosine similarity between the similar word's and the center word's input vectors.
        similar_input_loss = torch.bmm(
            normalize(similar_input_vectors, dim=2), normalize(input_vectors, dim=1)
        ).view(batch_size)

        # Cosine similarity between the context word's and the similar word's output vectors.
        similar_output_loss = torch.bmm(
            normalize(output_vectors, dim=2), normalize(similar_output_vectors, dim=1)
        ).view(batch_size)

        similar_loss = (1 - similar_input_loss) + (1 - similar_output_loss)

        # Negative-sample term, summed over the sampled noise vectors.
        noise_loss = nn.functional.logsigmoid(
            torch.bmm(noise_vectors.neg(), input_vectors)
        ).squeeze(2).sum(1)

        # `alpha` below is the notebook-level global, used only when no weight
        # was given to the constructor.
        weight = alpha if self.alpha is None else self.alpha

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss
        return (-out_loss - noise_loss + weight * g * similar_loss).mean()

In [None]:
class ExpNegLossNC(nn.Module):
    '''Comparison model without context-vector regularisation.

    Same as the proposed loss except that the prior-knowledge term only uses the
    input (center) vectors: (1 - cosine) between the center word's and the
    similar word's input vectors, scaled by ``alpha * g``.
    '''
    def __init__(self, alpha=None):
        """
        :param alpha: Weight of the prior-knowledge term. When None (default),
            the notebook-level global ``alpha`` is read at call time.
        """
        super().__init__()
        self.alpha = alpha

    def forward(self, input_vectors, output_vectors, noise_vectors, similar_input_vectors, similar_output_vectors, g):
        """Return the mean loss over the batch.

        ``similar_output_vectors`` is accepted so the three loss classes share one
        signature; it is not used here.
        """
        batch_size, embed_size = input_vectors.shape

        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)
        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)
        similar_input_vectors = similar_input_vectors.view(batch_size, 1, embed_size)

        # Correct-pair term (stable log-sigmoid; explicit reshape keeps the batch
        # dimension even when batch_size == 1).
        out_loss = nn.functional.logsigmoid(torch.bmm(output_vectors, input_vectors)).view(batch_size)

        # Cosine similarity between the similar word's and the center word's input vectors.
        similar_input_loss = torch.bmm(
            normalize(similar_input_vectors, dim=2), normalize(input_vectors, dim=1)
        ).view(batch_size)

        similar_loss = (1 - similar_input_loss)

        # Negative-sample term, summed over the sampled noise vectors.
        noise_loss = nn.functional.logsigmoid(
            torch.bmm(noise_vectors.neg(), input_vectors)
        ).squeeze(2).sum(1)

        # `alpha` below is the notebook-level global, used only when no weight
        # was given to the constructor.
        weight = alpha if self.alpha is None else self.alpha

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss
        return (-out_loss - noise_loss + weight * g * similar_loss).mean()

In [None]:
class ExpNegLossND(nn.Module):
    '''Comparison model without down-weighting.

    Same terms as the proposed loss, but the prior-knowledge term is scaled by a
    constant (``scale * alpha``) instead of the per-word weight ``g``.
    '''
    def __init__(self, alpha=None, scale=10000):
        """
        :param alpha: Weight of the prior-knowledge term. When None (default),
            the notebook-level global ``alpha`` is read at call time.
        :param scale: Constant multiplier that replaces the per-word weight ``g``.
        """
        super().__init__()
        self.alpha = alpha
        self.scale = scale

    def forward(self, input_vectors, output_vectors, noise_vectors, similar_input_vectors, similar_output_vectors, g):
        """Return the mean loss over the batch.

        ``g`` is accepted so the three loss classes share one signature; it is
        not used here.
        """
        batch_size, embed_size = input_vectors.shape

        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)
        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)
        similar_input_vectors = similar_input_vectors.view(batch_size, 1, embed_size)
        similar_output_vectors = similar_output_vectors.view(batch_size, embed_size, 1)

        # Correct-pair term (stable log-sigmoid; explicit reshape keeps the batch
        # dimension even when batch_size == 1).
        out_loss = nn.functional.logsigmoid(torch.bmm(output_vectors, input_vectors)).view(batch_size)

        # Cosine similarity between the similar word's and the center word's input vectors.
        similar_input_loss = torch.bmm(
            normalize(similar_input_vectors, dim=2), normalize(input_vectors, dim=1)
        ).view(batch_size)

        # Cosine similarity between the context word's and the similar word's output vectors.
        similar_output_loss = torch.bmm(
            normalize(output_vectors, dim=2), normalize(similar_output_vectors, dim=1)
        ).view(batch_size)

        similar_loss = (1 - similar_input_loss) + (1 - similar_output_loss)

        # Negative-sample term, summed over the sampled noise vectors.
        noise_loss = nn.functional.logsigmoid(
            torch.bmm(noise_vectors.neg(), input_vectors)
        ).squeeze(2).sum(1)

        # `alpha` below is the notebook-level global, used only when no weight
        # was given to the constructor.
        weight = alpha if self.alpha is None else self.alpha

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss
        return (-out_loss - noise_loss + self.scale * weight * similar_loss).mean()

In [None]:
# Values of alpha (prior-knowledge weight) swept by the training loop, in this order.
a_list = [0.0001,0.00005,0.0005,0.001,0.1,0.00001,0.000001,0]

In [None]:
# Probe sets of vocabulary ids used to track mean cosine similarity during training.
# Ids are frequency-ranked, so these are the first, a middle and the last 1000 ids.
# NOTE(review): the bounds are hard-coded for a vocabulary of 10683 ids — confirm against len(noise_dist).
high_freq_word = list(range(1000))
mid_freq_word = list(range(4842,5842))
low_freq_word = list(range(9683,10683))
# Random samples of ids with / without prior-knowledge neighbours (uses the `random` module's global state).
prior_word = random.sample(prior, 2000)
no_prior_word = random.sample(no_prior, 2000)

In [None]:
def udf_similar(model, word, n_similarity):
    """Cosine similarities between word id `word` and its `n_similarity` nearest neighbours.

    The model is moved to the CPU as a side effect. Similarities are computed on
    the input embedding table, sorted in descending order, and the top entry
    (the word itself) is skipped.
    """
    cpu_model = model.to('cpu')
    table = cpu_model.in_embed.weight.data
    query = cpu_model.in_embed(torch.tensor(word))
    unit_query = query / query.norm()
    unit_table = table / table.norm(dim=1, keepdim=True)
    scores = unit_table.mm(unit_query.unsqueeze(1)).squeeze()
    ranked, _ = torch.sort(scores, descending=True)
    return [score.item() for score in ranked[1:n_similarity + 1]]

In [None]:
# Result tables filled by the training loop: one row per (probe group, model, epoch).
# NOTE(review): neither table records which alpha a row belongs to, although the loop sweeps several.
cos_emb = pd.DataFrame(columns=['Frequency','Model','Mean Cosine Similarity','epoch'])
cos_emb_pk = pd.DataFrame(columns=['Prior Knowledge','Model','Mean Cosine Similarity','epoch'])

In [None]:
def mean(lst):
    """Arithmetic mean of a non-empty sequence of numbers."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters shared by every run of the alpha sweep.
embedding_dim = 45
batch_size = 10000
n_samples = 5
epochs = 10

# Probe groups evaluated after every epoch: (label, vocabulary ids).
_freq_groups = [('High', high_freq_word), ('Mid', mid_freq_word), ('Low', low_freq_word)]
_prior_groups = [('True', prior_word), ('False', no_prior_word)]


def _train_step(net, criterion, optimizer, inputs, targets, noises, similars, g):
    """Run one optimisation step of `net` on a single batch and return the loss tensor."""
    input_vectors = net.forward_input(inputs)
    output_vectors = net.forward_output(targets)
    noise_vectors = net.forward_noise(inputs.shape[0], noises.shape[1], noises)
    similar_input_vectors = net.forward_input(similars)
    similar_output_vectors = net.forward_output(similars)

    # negative sampling loss
    loss = criterion(input_vectors, output_vectors, noise_vectors,
                     similar_input_vectors, similar_output_vectors, g)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss


def _mean_top_cosine(net, word_ids, k=10):
    """Mean cosine similarity between each probe word and its k nearest neighbours."""
    sims = []
    for w in word_ids:
        sims.extend(udf_similar(net, w, k))
    return mean(sims)


def _append_rows(frame, rows):
    """Return `frame` with `rows` (list of dicts) appended.

    DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    """
    new_rows = pd.DataFrame(rows, columns=frame.columns)
    if frame.empty:
        return new_rows
    return pd.concat([frame, new_rows], ignore_index=True)


dataset = Dataset(word_pairs)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=12, drop_last=True)

# NOTE(review): rows from every alpha are written to the same result tables without an
# alpha column, so later plots aggregate over the whole sweep — confirm this is intended.
for a in a_list:
    # `alpha` stays a notebook-level global: the loss classes read it at call time.
    alpha = a

    # Three models trained side by side on identical batches.
    model = SkipGramNeg(len(noise_dist), embedding_dim, noise_dist=noise_dist).to(device)
    modelnc = SkipGramNeg(len(noise_dist), embedding_dim, noise_dist=noise_dist).to(device)
    modelnd = SkipGramNeg(len(noise_dist), embedding_dim, noise_dist=noise_dist).to(device)

    # using the losses that we defined
    criterion1 = ExpNegLoss()
    optimizer1 = optim.Adam(model.parameters(), lr=0.005)
    criterion2 = ExpNegLossNC()
    optimizer2 = optim.Adam(modelnc.parameters(), lr=0.005)
    criterion3 = ExpNegLossND()
    optimizer3 = optim.Adam(modelnd.parameters(), lr=0.005)

    runs = [
        (model, criterion1, optimizer1),      # proposed model
        (modelnc, criterion2, optimizer2),    # without context-vector term
        (modelnd, criterion3, optimizer3),    # without down-weighting
    ]

    # train for some number of epochs
    for e in range(epochs):

        # The evaluation at the end of each epoch moves the models to the CPU
        # (see udf_similar), so move them back before training resumes.
        for net, _, _ in runs:
            net.to(device)

        # get our input, target batches
        for input_words, target_words, noise_words, similar_words, g in dataloader:
            inputs = torch.as_tensor(input_words, dtype=torch.long).to(device)
            targets = torch.as_tensor(target_words, dtype=torch.long).to(device)
            noises = torch.as_tensor(noise_words, dtype=torch.long).to(device)
            similars = torch.as_tensor(similar_words, dtype=torch.long).to(device)
            g = torch.as_tensor(g, dtype=torch.double).to(device)

            for net, criterion, optimizer in runs:
                loss = _train_step(net, criterion, optimizer, inputs, targets, noises, similars, g)

        # Frequency probes: proposed model vs. the model without down-weighting.
        freq_rows = []
        for label, word_ids in _freq_groups:
            freq_rows.append({'Frequency': label, 'Model': 'Proposed Model',
                              'Mean Cosine Similarity': _mean_top_cosine(model, word_ids), 'epoch': e})
            freq_rows.append({'Frequency': label, 'Model': 'Without Downsampling',
                              'Mean Cosine Similarity': _mean_top_cosine(modelnd, word_ids), 'epoch': e})
        cos_emb = _append_rows(cos_emb, freq_rows)

        # Prior-knowledge probes: proposed model vs. the model without the context-vector term.
        prior_rows = []
        for label, word_ids in _prior_groups:
            prior_rows.append({'Prior Knowledge': label, 'Model': 'Proposed Model',
                               'Mean Cosine Similarity': _mean_top_cosine(model, word_ids), 'epoch': e})
            prior_rows.append({'Prior Knowledge': label, 'Model': 'Without Context Vector',
                               'Mean Cosine Similarity': _mean_top_cosine(modelnc, word_ids), 'epoch': e})
        cos_emb_pk = _append_rows(cos_emb_pk, prior_rows)

        print('epoch : ', e)

In [None]:
# Persist the frequency-probe results (all alphas and epochs).
cos_emb.to_csv('no_downsampling.csv')

In [None]:
def change_name(x):
    """Map an internal model label to its plot legend label.

    "Proposed Model" becomes "With down-weighting"; every other value becomes
    "Without down-weighting".
    """
    return "With down-weighting" if x == "Proposed Model" else "Without down-weighting"

In [None]:
# Relabel models for the plot legend (overwrites the column, so not re-runnable).
cos_emb['Model'] = cos_emb['Model'].swifter.apply(change_name)

In [None]:
# NOTE(review): `plt` (matplotlib.pyplot) and `sns` (seaborn) are not imported in any
# visible cell — add them to the import cell so this runs on a fresh kernel.

# Configure the theme BEFORE creating the figure: rcParams such as figure.dpi and
# axes.edgecolor are read when the figure/axes are created, so setting them
# afterwards (as before) had no effect on the first run of this cell.
sns.set_theme(
    style="ticks",
    rc={"figure.dpi": 300, "figure.figsize": (7, 6), "axes.edgecolor": "black"},
)
fig, axs = plt.subplots(1, 1, figsize=(10, 6.5))

# Mean cosine similarity per epoch: colour = model, line style = frequency group.
ax = sns.lineplot(data=cos_emb, x="epoch", y="Mean Cosine Similarity", hue="Model", style="Frequency", ax=axs, palette=["#82b0d4", "#fc8072"])
axs.tick_params(axis='both', which='major', labelsize=18)
axs.grid()
axs.set_xlabel("Training epoch", fontsize=18)
axs.set_ylabel("Mean cosine similarity", fontsize=18)
# Legend placed above the axes, spread over three columns.
axs.legend(fontsize=18, bbox_to_anchor=(0, 1.01, 1, 0.2), loc="lower left", mode="expand", ncol=3);

In [None]:
# Pickle the whole model object currently bound to `model`.
# NOTE(review): after the sweep above, `model` is the run for the LAST alpha in a_list,
# which may not match the 'a_01' in this file name — confirm which run should be saved.
torch.save(model, './sample_1_similar/word2vec_cosine_a_01_ng.model')

In [None]:
def udf_most_similar(model, word, n_similarity):
    """Top `n_similarity` words by cosine similarity to `word`, using the input embeddings.

    Returns a dict word -> similarity in descending order; the query word itself
    ranks first. The model and its embeddings must be on the CPU.
    """
    table = model.in_embed.weight.data
    query = model.in_embed(torch.tensor(model.vocab_to_int[word]))
    unit_query = query / query.norm()
    unit_table = table / table.norm(dim=1, keepdim=True)
    scores = unit_table.mm(unit_query.unsqueeze(1)).squeeze()
    ranked_scores, ranked_ids = torch.sort(scores, descending=True)
    neighbours = [model.int_to_vocab[i.item()] for i in ranked_ids[:n_similarity]]
    similarities = [s.item() for s in ranked_scores[:n_similarity]]
    return dict(zip(neighbours, similarities))

In [None]:
# Nearest neighbours of 'tamoxifen' in the trained input embeddings (model moved to CPU first).
udf_most_similar(model.to('cpu'),'tamoxifen',10)

In [None]:
# Same query through the model's own method.
model.most_similar('tamoxifen',10)