In [1]:
import numpy as np
import pickle as pkl
import pandas as pd

In [2]:
# Load the word -> vector lookup used by get_word_vector.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code on load -- only use this with a trusted, locally produced file.
# The `with` block closes the .npz archive's file handle as soon as the payload
# has been read, instead of leaving that to garbage collection.
with np.load('word_dict.npz',allow_pickle=True) as word_dict_archive:
    # .item() unwraps the single Python object stored under the 'word_dict' key
    # (presumably a dict keyed by word -- later cells use `in` and [] on it).
    # The misspelled name `wrod_dict` is kept because other cells refer to it.
    wrod_dict = word_dict_archive['word_dict'].item()

# Weight tables; later cells look words up as column labels of these frames.
pca_weights_100 = pd.read_csv('pca_weights_100.csv')
pca_weights_300 = pd.read_csv('pca_weights_300.csv')
ldia_weights_100 = pd.read_csv('ldia_weights_100.csv')
ldia_weights_300 = pd.read_csv('ldia_weights_300.csv')

In [3]:
vocab_path = 'pygp_data/data/vocab.pkl'
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load vocab files that were produced by a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(vocab_path, 'rb') as vocab_file:
    vocab_dict = pkl.load(vocab_file)
# Shallow copy of the mapping as loaded, taken before later cells overwrite
# the values stored in vocab_dict.
vocab_dict_1 = vocab_dict.copy()

In [4]:
# Fixed word order shared by every per-word list built in this notebook.
vocabs = list(vocab_dict.keys())

# NOTE(review): allow_pickle=True unpickles arbitrary objects -- trusted files only.
# .item() unwraps the single object stored in the .npy file (used as a
# word -> sentiment score mapping below).
sentiment_dict = np.load('sentiment_dict.npy',allow_pickle=True).item()

# Score assigned to vocabulary words that have no entry in sentiment_dict.
DEFAULT_SENTIMENT = 0.5

# One score per vocabulary word, aligned with `vocabs`.
sentiments = [sentiment_dict.get(word, DEFAULT_SENTIMENT) for word in vocabs]

In [5]:
def get_word_vector(word):
    """Return the vector stored for `word` in `wrod_dict`.

    Words that are not in `wrod_dict` get a fresh zero vector of length 300.

    Side effect: when the word is found, its vector is also written into the
    module-level `vocab_dict` under the same key.
    """
    if word not in wrod_dict:
        return np.zeros(300)
    found = wrod_dict[word]
    # Record the hit in the shared vocabulary mapping as well.
    vocab_dict[word] = found
    return found

def get_pca_vector(word,dim):
    """Return the PCA weight column labelled `word` as a NumPy array.

    Only `dim` values of 100 and 300 have a backing table (`pca_weights_100`
    and `pca_weights_300`). For any other `dim`, or when `word` is not a column
    of the selected table, a zero vector of length `dim` is returned.

    NOTE(review): a found column has as many entries as the table has rows,
    presumably equal to `dim` -- confirm against the CSV files.
    """
    fallback = np.zeros(dim)
    if dim == 100:
        table = pca_weights_100
    elif dim == 300:
        table = pca_weights_300
    else:
        # No table exists for this dimensionality.
        return fallback
    return table[word].to_numpy() if word in table.columns else fallback

def get_ldia_vector(word,dim):
    """Return the LDiA weight column labelled `word` as a NumPy array.

    `dim` selects the table: 100 -> `ldia_weights_100`, 300 -> `ldia_weights_300`.
    If `dim` is anything else, or `word` is not a column of the selected table,
    the result is a zero vector of length `dim`.

    NOTE(review): a found column has as many entries as the table has rows,
    presumably equal to `dim` -- confirm against the CSV files.
    """
    zeros = np.zeros(dim)
    if dim == 100:
        weights = ldia_weights_100
    elif dim == 300:
        weights = ldia_weights_300
    else:
        # Unsupported dimensionality: nothing to look up.
        return zeros
    if word in weights.columns:
        return weights[word].to_numpy()
    return zeros

In [6]:
# Look every vocabulary word up in each embedding source. All five lists are
# aligned with `vocabs`, so index i always refers to the same word.
embeddings = [get_word_vector(token) for token in vocabs]
pca_embeddings_100 = [get_pca_vector(token, 100) for token in vocabs]
pca_embeddings_300 = [get_pca_vector(token, 300) for token in vocabs]
ldia_embeddings_100 = [get_ldia_vector(token, 100) for token in vocabs]
ldia_embeddings_300 = [get_ldia_vector(token, 300) for token in vocabs]

# Replace each word's value in vocab_dict with the vector returned by
# get_word_vector (the zero vector for words without one).
vocab_dict.update(zip(vocabs, embeddings))

In [7]:
# Write each embedding list to its own .npz archive under the 'embeddings' key.
for out_path, vectors in (
    ('pygp_data/data/embedding_cc.zh.300.npz', embeddings),
    ('pygp_data/data/embedding_pca.zh.100.npz', pca_embeddings_100),
    ('pygp_data/data/embedding_pca.zh.300.npz', pca_embeddings_300),
    ('pygp_data/data/embedding_ldia.zh.100.npz', ldia_embeddings_100),
    ('pygp_data/data/embedding_ldia.zh.300.npz', ldia_embeddings_300),
):
    np.savez(out_path, embeddings=vectors)

In [8]:
def _scale_by_sentiment(vectors):
    """Multiply vectors[i] by sentiments[i] for every score and return the products as one array."""
    return np.array([vectors[idx] * score for idx, score in enumerate(sentiments)])

# Sentiment-weighted copy of every embedding table.
fastText_embeddings_300_sentiment = _scale_by_sentiment(embeddings)
pca_embeddings_300_sentiment = _scale_by_sentiment(pca_embeddings_300)
pca_embeddings_100_sentiment = _scale_by_sentiment(pca_embeddings_100)
ldia_embeddings_100_sentiment = _scale_by_sentiment(ldia_embeddings_100)
ldia_embeddings_300_sentiment = _scale_by_sentiment(ldia_embeddings_300)

# Save each weighted table next to its unweighted counterpart.
for out_path, weighted in (
    ('pygp_data/data/embedding_cc.zh.300_sentiment.npz', fastText_embeddings_300_sentiment),
    ('pygp_data/data/embedding_pca.zh.100_sentiment.npz', pca_embeddings_100_sentiment),
    ('pygp_data/data/embedding_pca.zh.300_sentiment.npz', pca_embeddings_300_sentiment),
    ('pygp_data/data/embedding_ldia.zh.100_sentiment.npz', ldia_embeddings_100_sentiment),
    ('pygp_data/data/embedding_ldia.zh.300_sentiment.npz', ldia_embeddings_300_sentiment),
):
    np.savez(out_path, embeddings=weighted)