In [1]:
import os
import codecs
import re

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

from xclib.data import data_utils #https://github.com/kunaldahiya/pyxclib



In [2]:
# Labels that had at least one token missing from the embedding vocabulary.
# One entry is appended per missing token, so a label may appear several times.
labels_nf = []
def format_label_embeddings(embeddings, label_tags_file):
    """Build one embedding row per label by summing the vectors of its tokens.

    params:
    embeddings: object exposing ``vector_size`` and ``embeddings[word]`` lookup
        that raises ``KeyError`` for unknown words (e.g. gensim KeyedVectors).
    label_tags_file: path to a latin1 text file with one label per line.

    Each label is split on spaces, underscores and hyphens. Empty tokens
    (produced by consecutive, leading or trailing separators) are ignored so
    they are not mistaken for out-of-vocabulary words. A token with no
    embedding contributes a small random vector (randn * 0.01) instead, is
    counted, and its label is recorded in the module-level ``labels_nf`` list.

    The random vectors come from NumPy's global RNG; seed it before calling
    if reproducible output is required.

    returns: float array of shape (number of labels, embeddings.vector_size).
    """
    with open(label_tags_file, 'r', encoding='latin1') as f:
        dataset_vocab = [line.rstrip("\n") for line in f]  # one label per line
    print(len(dataset_vocab))

    # Rows start at zero and token vectors are accumulated in place.
    label_embeddings = np.zeros((len(dataset_vocab), embeddings.vector_size))
    not_found_count = 0

    for i, label in enumerate(dataset_vocab):
        words = [word for word in re.split(" |_|-", label) if word]
        if not words:
            # Label is empty or made only of separators: treat the raw label
            # as a single token so the row still gets a (random) vector.
            words = [label]
        for word in words:
            try:
                label_embeddings[i, :] += embeddings[word]
            except KeyError:
                label_embeddings[i, :] += np.random.randn(embeddings.vector_size, )*0.01
                not_found_count += 1
                labels_nf.append(label)
    print("#Words with no word embeddings", not_found_count)
    return label_embeddings

def find(array, neq_val=None, filter_ixs=None):
    """
    Extract the entries of a 2D array that differ from ``neq_val``.

    params:
    array: 2D array
    neq_val: 0 for user features, labels, and None for label/item features
    filter_ixs: accepted for call compatibility; not used here

    returns: (values, row_indices, col_indices, index_pairs) where the
    indices are 1-based and index_pairs holds one (row, col) pair per row.
    """
    keep_mask = array != neq_val
    # argwhere yields 0-based (row, col) pairs; shift them to 1-based.
    coords = np.argwhere(keep_mask) + 1
    selected = np.array(array[keep_mask]).flatten()

    return selected, coords[:, 0], coords[:, 1], coords

def save_labels(tr_points, tr_dims, te_points, te_dims, save_path, save_name, reveal_percent=""):
    """Write train/test label point and dim index arrays as integer CSV files.

    Four files are written into ``save_path``; each name is built from
    ``save_name``, a fixed tag and the optional ``reveal_percent`` suffix.
    """
    targets = [
        (f"{save_name}_trLabelPoint{reveal_percent}.csv", tr_points),
        (f"{save_name}_trLabelDim{reveal_percent}.csv", tr_dims),
        (f"{save_name}_teLabelPoint{reveal_percent}.csv", te_points),
        (f"{save_name}_teLabelDim{reveal_percent}.csv", te_dims),
    ]
    for file_name, values in targets:
        np.savetxt(os.path.join(save_path, file_name), values, delimiter="\n", fmt="%-d")

def format_data(data_path, embeddings, save_path, save_name, seed=2):
    """Convert a dataset into flat one-column CSV files of values and indices.

    Reads ``train.txt``, ``test.txt`` and ``Yf.txt`` from ``data_path``, keeps
    only the rows whose label sum is > 5 and whose feature sum is > 0, and
    writes into ``save_path`` (every file name starts with ``save_name``):

    * ``_trData*/_teData*``   - Value / Point / Dim of the non-zero features
    * ``_trLabel*/_teLabel*`` - Point / Dim of all non-zero labels
    * the same label files with a 0.2 / 0.4 / 0.6 / 0.8 suffix, holding a
      random subset of that fraction of the label entries, stratified by row
    * ``_LF*``                - Value / Point / Dim of every label-embedding entry

    Point/Dim indices are 1-based (``find`` adds 1) and refer to positions in
    the filtered matrices, not in the original files.

    params:
    data_path: directory containing train.txt, test.txt and Yf.txt
    embeddings: word vectors passed on to ``format_label_embeddings``
    save_path: output directory (created if missing)
    save_name: prefix for every output file
    seed: seed for NumPy's global RNG
    """
    # Seed up front: format_label_embeddings draws random vectors for words
    # without an embedding, so seeding only before the splits would leave the
    # LF files non-reproducible.
    np.random.seed(seed)
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    tr_features, tr_labels, _, _, _ = data_utils.read_data(f'{data_path}/train.txt')
    te_features, te_labels, _, _, _ = data_utils.read_data(f'{data_path}/test.txt')
    label_features = format_label_embeddings(embeddings, f'{data_path}/Yf.txt')

    # Keep rows with a label sum above 5 and a non-zero feature sum.
    tr_filter_ixs = np.array((tr_labels.sum(axis=1)>5)&(tr_features.sum(axis=1)>0)).flatten()
    tr_features = tr_features[tr_filter_ixs, :]
    tr_labels = tr_labels[tr_filter_ixs, :]

    te_filter_ixs = np.array((te_labels.sum(axis=1)>5)&(te_features.sum(axis=1)>0)).flatten()
    te_features = te_features[te_filter_ixs, :]
    te_labels = te_labels[te_filter_ixs, :]

    _, tr_y_indeces_ax0, tr_y_indeces_ax1, tr_y_indeces = find(tr_labels, 0)
    _, te_y_indeces_ax0, te_y_indeces_ax1, te_y_indeces = find(te_labels, 0)

    tr_x_values, tr_x_indeces_ax0, tr_x_indeces_ax1, _ = find(tr_features, 0, tr_filter_ixs)
    te_x_values, te_x_indeces_ax0, te_x_indeces_ax1, _ = find(te_features, 0, te_filter_ixs)

    # Every surviving row must appear in both the feature and the label files.
    assert set(tr_x_indeces_ax0)==set(tr_y_indeces_ax0)
    assert set(te_x_indeces_ax0)==set(te_y_indeces_ax0)

    np.savetxt(os.path.join(save_path, f"{save_name}_trDataValue.csv"), tr_x_values, delimiter="\n")
    np.savetxt(os.path.join(save_path, f"{save_name}_trDataPoint.csv"), tr_x_indeces_ax0, delimiter="\n", fmt="%-d")
    np.savetxt(os.path.join(save_path, f"{save_name}_trDataDim.csv"), tr_x_indeces_ax1, delimiter="\n", fmt="%-d")
    np.savetxt(os.path.join(save_path, f"{save_name}_teDataValue.csv"), te_x_values, delimiter="\n")
    np.savetxt(os.path.join(save_path, f"{save_name}_teDataPoint.csv"), te_x_indeces_ax0, delimiter="\n", fmt="%-d")
    np.savetxt(os.path.join(save_path, f"{save_name}_teDataDim.csv"), te_x_indeces_ax1, delimiter="\n", fmt="%-d")
    save_labels(tr_y_indeces_ax0, tr_y_indeces_ax1, te_y_indeces_ax0, te_y_indeces_ax1, save_path, save_name)

    # Re-seed here (as the original did) so the splits start from the same RNG
    # state as before and previously generated reveal files stay identical.
    np.random.seed(seed)
    print(f"TrC-{tr_y_indeces.shape[0]}")
    print(f"TeC-{te_y_indeces.shape[0]}")
    for reveal_percent in [0.2, 0.4, 0.6, 0.8]:
        # Keep `reveal_percent` of the (row, label) pairs, stratified by row;
        # the held-out remainder is not saved.
        tr_yr_indeces, _ = train_test_split(tr_y_indeces, stratify=tr_y_indeces_ax0, test_size=1-reveal_percent)
        tr_yr_indeces = pd.DataFrame(tr_yr_indeces).sort_values([0,1]).values
        tr_yr_indeces_ax0 = tr_yr_indeces[:, 0]
        tr_yr_indeces_ax1 = tr_yr_indeces[:, 1]

        te_yr_indeces, _ = train_test_split(te_y_indeces, stratify=te_y_indeces_ax0, test_size=1-reveal_percent)
        te_yr_indeces = pd.DataFrame(te_yr_indeces).sort_values([0,1]).values
        te_yr_indeces_ax0 = te_yr_indeces[:, 0]
        te_yr_indeces_ax1 = te_yr_indeces[:, 1]

        print(f"train {reveal_percent} count - {tr_yr_indeces_ax0.shape}")
        print(f"test {reveal_percent} count - {te_yr_indeces_ax0.shape}")
        print(len(set(tr_yr_indeces_ax0)))
        print(len(set(tr_y_indeces_ax0)))
        # Every row must keep at least one revealed label.
        assert set(tr_yr_indeces_ax0)==set(tr_y_indeces_ax0)
        assert set(te_yr_indeces_ax0)==set(te_y_indeces_ax0)
        save_labels(tr_yr_indeces_ax0, tr_yr_indeces_ax1, te_yr_indeces_ax0, te_yr_indeces_ax1, save_path, save_name, reveal_percent)

    # neq_val=None matches nothing in a numeric array, so every entry of the
    # label-embedding matrix is exported.
    lf_values, lf_indeces_ax0, lf_indeces_ax1, _ = find(label_features, None)

    np.savetxt(os.path.join(save_path, f"{save_name}_LFValue.csv"), lf_values, delimiter="\n")
    np.savetxt(os.path.join(save_path, f"{save_name}_LFPoint.csv"), lf_indeces_ax0, delimiter="\n", fmt="%-d")
    np.savetxt(os.path.join(save_path, f"{save_name}_LFDim.csv"), lf_indeces_ax1, delimiter="\n", fmt="%-d")




In [3]:
# Load pre-trained word vectors from a word2vec-format text file
# (the file name suggests 500-dimensional English Wikipedia vectors).
word2vec500 = KeyedVectors.load_word2vec_format("word2vec/enwiki_20180420_500d.txt")

In [8]:
# Convert the dataset in AmazonCat-13K.bow and write the CSV files, prefixed
# with "amz13", into formatted_data/amazonCat13/.
format_data("AmazonCat-13K.bow", word2vec500, "formatted_data/amazonCat13/", "amz13")

13330
#Words with no word embeddings 2873
TrC-3074000
TeC-816410
train 0.2 count - (614800,)
test 0.2 count - (163282,)
329225
329225
train 0.4 count - (1229600,)
test 0.4 count - (326564,)
329225
329225
train 0.6 count - (1844400,)
test 0.6 count - (489846,)
329225
329225
train 0.8 count - (2459200,)
test 0.8 count - (653128,)
329225
329225


In [19]:
# Read the Wiki10 training file; only the features and labels are kept,
# the remaining three return values are discarded.
tr_features, tr_labels, _, _, _ = data_utils.read_data(f'Wiki10/train.txt')

In [None]:
# Display the Wiki10 training labels loaded by the read_data call.
tr_labels

In [15]:
# NOTE: despite the name, this is the array returned by
# format_label_embeddings (one row per line of Wiki10/Yf.txt), not a dict.
embeddings_dict = format_label_embeddings(word2vec500, "Wiki10/Yf.txt")

30938
#Words with no word embeddings 8623


In [17]:
# Inspect the raw word vector stored for the token "dvd".
word2vec500["dvd"]

array([-0.12  , -0.0832, -0.5693, -0.4079,  0.0487,  0.3767,  0.1592,
       -0.1209,  0.0719,  0.3957, -0.1751, -0.1508,  0.1907,  0.1112,
       -0.435 ,  0.1184,  0.2914, -0.3504, -0.0166, -0.3228,  0.1756,
        0.1289, -0.0596, -0.3531, -0.0995, -0.0564,  0.0473,  0.1466,
       -0.2419,  0.0984,  0.0175, -0.2212,  0.39  , -0.1515, -0.1755,
       -0.2398, -0.0321,  0.2368,  0.1169, -0.1467, -0.0715,  0.3322,
        0.0921, -0.1857,  0.1427, -0.2311, -0.362 , -0.1397, -0.1115,
       -0.0411, -0.4954, -0.1617,  0.1603,  0.1295, -0.1073, -0.1465,
        0.0239,  0.0419,  0.2875, -0.0089, -0.0543,  0.2763, -0.2139,
        0.1094,  0.2552,  0.2026, -0.2435,  0.3659, -0.345 ,  0.0467,
       -0.1206,  0.1929,  0.2215, -0.0417,  0.013 ,  0.3128, -0.1731,
       -0.1439, -0.2799,  0.0568,  0.0817,  0.03  ,  0.1544, -0.3517,
        0.1678,  0.6397, -0.0846,  0.4675,  0.2243, -0.0734,  0.0351,
       -0.3218,  0.6374, -0.2276,  0.022 ,  0.1767,  0.0961,  0.0982,
        0.0184,  0.2

In [18]:
# Inspect row 9998 (0-based) of the Wiki10 label embeddings; presumably the
# "dvd" label, shown to compare against the raw word2vec500["dvd"] vector.
embeddings_dict[9998]

array([-0.12      , -0.0832    , -0.5693    , -0.40790001,  0.0487    ,
        0.37670001,  0.1592    , -0.1209    ,  0.0719    ,  0.39570001,
       -0.1751    , -0.1508    ,  0.19069999,  0.1112    , -0.435     ,
        0.1184    ,  0.29139999, -0.3504    , -0.0166    , -0.32280001,
        0.17560001,  0.12890001, -0.0596    , -0.3531    , -0.0995    ,
       -0.0564    ,  0.0473    ,  0.14659999, -0.2419    ,  0.0984    ,
        0.0175    , -0.2212    ,  0.38999999, -0.1515    , -0.17550001,
       -0.23980001, -0.0321    ,  0.2368    ,  0.1169    , -0.14669999,
       -0.0715    ,  0.33219999,  0.0921    , -0.1857    ,  0.1427    ,
       -0.23109999, -0.36199999, -0.1397    , -0.1115    , -0.0411    ,
       -0.49540001, -0.1617    ,  0.1603    ,  0.1295    , -0.1073    ,
       -0.14650001,  0.0239    ,  0.0419    ,  0.28749999, -0.0089    ,
       -0.0543    ,  0.27630001, -0.2139    ,  0.1094    ,  0.2552    ,
        0.2026    , -0.24349999,  0.36590001, -0.345     ,  0.04