In [1]:
#import necessary libraries
import sys
sys.path.append("..")
from models.models import InnerProduct
import pandas as pd
import torch
import collections
import numpy as np
import torch.nn as nn
import os
import argparse
from data_processing.articles import Articles
from models.models import InnerProduct
import data_processing.dictionaries as dictionary
import sampling.sampler_util as sampler_util
import training.eval_util as eval_util
from pathlib import Path
from torch.utils.tensorboard import SummaryWriter
import json
from pandas import json_normalize

def expand_path(string):
    return Path(os.path.expandvars(string))
#get arguments for script and parse
parser = argparse.ArgumentParser(description='Train model on article data and test evaluation')
parser.add_argument('--model_path',
                    type=expand_path,
                    help="This is required to load model.")

parser.add_argument('--dict_dir',
                    type=expand_path,
                    help="This is required to load dictionaries")

parser.add_argument('--dataset_path',
                    type=expand_path,
                    required=True,
                    help='Path to data to be ranked.')

args = parser.parse_args()

dict_dir = Path(args.dict_dir)
final_word_ids,final_url_ids, final_publication_ids = dictionary.load_dictionaries(dict_dir)
print("Dictionaries loaded.")

data_path = Path(args.dataset_path)
dataset = Articles(data_path)
print("Data loaded.")

dataset.tokenize()
print("Data tokenized.")
word_counter = collections.Counter()
for example in dataset.examples:
    word_counter.update(example['text'])

unique_words = [word for word in word_counter.keys()]
len(set(unique_words))

abs_model_path = Path(args.model_path)
kwargs = dict(n_publications=len(final_publication_ids),
              n_articles=len(final_url_ids),
              n_attributes=len(final_word_ids),
              emb_size=100,
              sparse=False,
              use_article_emb=False,
              mode='mean')
model = InnerProduct(**kwargs)
model.load_state_dict(torch.load(abs_model_path))
print("Model Loaded.")

publication_emb = model.publication_embeddings.weight.data[0].cpu().numpy()
publication_bias = model.publication_bias.weight.data[0].cpu().numpy()
word_emb = model.attribute_emb_sum.weight.data.cpu().numpy()
word_bias = model.attribute_bias_sum.weight.data.cpu().numpy()

unique_words = list(set(unique_words))
word_emb_and_bias_dict = {}
for word in unique_words:
    if final_word_ids.get(word, 'None') != 'None':
        idx = final_word_ids.get(word, 'None')
        current_emb = list(word_emb[idx].astype(float))
        current_bias = list(word_bias[idx].astype(float))[0]
        current_short_dict = {'embedding':current_emb, 'bias':current_bias}
        word_emb_and_bias_dict[word] = current_short_dict

with open("word_to_emb+bias_dict.json", 'w') as file:
    json.dump(word_emb_and_bias_dict, file, separators=(',', ':'))
print("Model Embeddings and Bias Saved!")

pub_dict = {"embedding": list(publication_emb.astype(float)), "bias": list(publication_bias.astype(float))[0]}
with open("pub_emb+bias.json", 'w') as file:
    json.dump(pub_dict, file, separators=(',', ':'))
print("Publication Embeddings Saved!")

df = json_normalize(dataset)
df.drop(columns=['link', 'model_publication'], inplace=True)
df = df[df.text.apply(lambda x: len(x) > 400)]
df.to_json("select_demo_articles.json", orient='records')
print("Demo Articles Saved!")

Dictionaries loaded.
Data loaded.
Data tokenized.
Model Loaded.
Model Embeddings and Bias Saved!
Publication Embeddings Saved!
Demo Articles Saved!


In [2]:
list(publication_bias.astype(float))[0]

0.9955700039863586

In [7]:
#import necessary libraries
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path
import sys
sys.path.append("..")
from models.models import InnerProduct
import pandas as pd
import torch
import collections
import numpy as np
import torch.nn as nn
import os
import argparse
from data_processing.articles import Articles
from models.models import InnerProduct
import data_processing.dictionaries as dictionary
from pathlib import Path
import json
from pandas import json_normalize
from scipy import sparse

dict_dir = Path("../../data/dictionaries")
final_word_ids,final_url_ids, final_publication_ids = dictionary.load_dictionaries(dict_dir)
print("Dictionaries loaded.")

data_path = Path('../../data/UI_feeds/scrape_data/browser-rss-articles-info.json')
dataset = Articles(data_path)
print("Data loaded.")

abs_model_path = Path("../../data/gridsearch_results/2020-05-25/optimizer_type=RMS_use_all_words=False_emb_size=100_learning_rate=0.0001_word_embedding_type=mean/model/mean-inner-product-model.pt")
kwargs = dict(n_publications=len(final_publication_ids),
              n_articles=len(final_url_ids),
              n_attributes=len(final_word_ids),
              emb_size=100,
              sparse=False,
              use_article_emb=False,
              mode='mean')
model = InnerProduct(**kwargs)
model.load_state_dict(torch.load(abs_model_path))
print("Model Loaded.")

dataset.tokenize()
proper_data = dataset.map_items(final_word_ids,
                    final_url_ids,
                    final_publication_ids,
                    filter=True,
                    min_length=400)

data_path = Path("/users/rohan/news-classification/Data/feed_ranks/data")
if not data_path.is_dir():
    data_path.mkdir()
mapped_data_path = data_path / "mapped-data"
if not mapped_data_path.is_dir():
    mapped_data_path.mkdir()
train_mapped_path = mapped_data_path / "mapped_dataset.json"
with open(train_mapped_path, "w") as file:
    json.dump(proper_data, file)
raw_data = Articles(train_mapped_path)
print("Final: ", len(raw_data))
print(f"Filtered, Mapped Data saved to {mapped_data_path} directory")
print("-------------------")

word_articles = csr_matrix((len(raw_data), len(final_word_ids)), dtype=np.float32).toarray()

for idx, item in enumerate(raw_data.examples):
    item['text'] = list(set(item['text']))
    for entry in item['text']:
        word_articles[idx][entry] = 1

publication_emb = model.publication_embeddings.weight.data[0].cpu().numpy()
publication_bias = model.publication_bias.weight.data[0].cpu().numpy()
word_emb = model.attribute_emb_sum.weight.data.cpu().numpy()
word_bias = model.attribute_bias_sum.weight.data.cpu().numpy()

np.save("word_articles.npy", word_articles)
print("Article-Word Matrix Saved")

np.save("word_emb.npy", word_emb)
print("Word Embeddings Saved")

np.save("word_bias.npy", word_bias)
print("Word Biases Saved")

Dictionaries loaded.
Data loaded.
Model Loaded.
Final:  8234
Filtered, Mapped Data saved to \users\rohan\news-classification\Data\feed_ranks\data\mapped-data directory
-------------------
Article-Word Matrix Saved
Word Embeddings Saved
Word Biases Saved


In [7]:
publication_bias

array([0.99557], dtype=float32)

In [43]:
article_embeddings = (csr_matrix(word_articles) * csr_matrix(word_emb)).toarray()

emb_times_publication = np.dot(article_embeddings, publication_emb.reshape(100,1))

article_bias = np.dot(word_articles, word_bias)

product_with_bias = emb_times_publication + article_bias

word_counts = word_articles.sum(axis=1).reshape(word_articles.shape[0], 1)

final_logits = np.divide(product_with_bias, word_counts) + float(publication_bias)

In [None]:
np.amax(final_logits)

In [1]:
import numpy as np
from scipy.sparse import csr_matrix

publication_emb = np.asarray([1.0440499, 1.0030843, 1.0340449, 0.992087, 1.0509816, 1.0315005, -1.0493797, -1.0198538, 0.9712321, -1.026394, 
            -0.9687971, 1.0592866, -1.0200703, -1.0423145, 0.9929519, 1.0220934, 1.021279, -1.0265925, 0.9601833, 0.9763889, 
            1.0109168, -0.9728226, 0.97199583, -1.0237931, -0.9996001, 0.9932069, 0.97966635, -0.98893607, -0.9876815, -0.98812914, 
            -0.9625895, 0.99879754, 0.9876508, -0.9581506, -0.95436096, -0.9601925, -1.0134513, -0.98763955, 0.98665, -1.0140482, 
            1.004904, 0.9894275, -1.0044671, -0.9839679, -0.97082543, -0.9798079, 0.9926766, -0.97317344, 0.9797, -0.97642475, 
            -0.99420726, -0.9972062, -1.0104703, 1.0575777, 0.9957696, -1.0413874, -1.0056863, -1.0151271, -0.99969465, 0.97463423, 
            -0.98398715, -1.0211866, -1.0128828, -1.0024365, -0.9800189, 1.0457181, 1.0155835, -1.036794, -1.013707, -1.0498024, 
            -1.0252678, -1.0388161, -0.97501564, 0.97687274, 0.97906756, 1.0536852, 1.0590494, -0.96917725, 1.0247189, -0.9818878, 
            -1.0417286, -1.0204054, -1.0285249, -1.0329671, 0.9705739, 0.96375024, 0.9891868, 0.9892464, 1.039075, 1.0042666,
            0.9786834, 1.0199072, 0.98080486, 0.9698635, -0.99322844, -0.95841753, -0.99150276, 0.97394156, 0.9976019, -1.0375009], dtype=np.float32)

publication_bias = 0.99557
word_articles = np.load('word_articles.npy')
word_emb = np.load('word_emb.npy')
word_bias = np.load('word_bias.npy')

print("Data Loaded Successfully!")

Data Loaded Successfully!


In [5]:
import time
time1 = time.time()
article_embeddings = np.dot(word_articles, word_emb)

emb_times_publication = np.dot(article_embeddings, publication_emb.reshape(100,1))

article_bias = np.dot(word_articles, word_bias)

product_with_bias = emb_times_publication + article_bias

word_counts = word_articles.sum(axis=1).reshape(word_articles.shape[0], 1)

final_logits = np.divide(product_with_bias, word_counts) + float(publication_bias)

word_logits = np.dot(word_emb, publication_emb.reshape(100,1)) + word_bias
broadcasted_words_per_article =word_articles * word_logits.T
sorted_word_indices = broadcasted_words_per_article.argsort(axis=1)

time2 = time.time()

print(time2-time1)

MemoryError: Unable to allocate 27.5 GiB for an array with shape (8234, 448718) and data type int64

In [5]:
broadcasted_words_per_article.shape

(8234, 448718)

In [8]:
len(final_word_ids)

448718

In [9]:
reversed_word_ids = {v: k for k, v in final_word_ids.items()}

In [11]:
with open('/users/rohan/news-classification/data/dictionaries/reversed_word_ids.json', "w") as file:
    json.dump(reversed_word_ids, file)