In [1]:
# Import relevant modules
%matplotlib notebook

import sys
sys.path.append("../")

from Models.SGNS import SourceReceiverConcatClassifier, SourceReceiverConcatModel
from Preprocessing.FullContextProcessor import FullContextProcessor

import numpy as np
import pandas as pd
import torch
import pickle

# SRC CV Results

In [None]:
# Read the tab-separated cross-validation results and show each parameter
# setting next to its mean test log-loss score, highest score first.
cv_results = pd.read_csv("src-nicepaths-cv-results.txt", sep="\t")
cv_results[["params", "mean_test_Log-Loss"]].sort_values("mean_test_Log-Loss", ascending=False)

In [None]:
# Print the parameter settings stored under a handful of row labels of the
# CV results table.
# NOTE(review): the labels are hard-coded -- presumably picked by hand from the
# sorted table; confirm they still match after the CV results are regenerated.
for row_label in (16, 34, 5, 29, 23, 11):
    print(cv_results.loc[row_label, "params"])

# Load Trained SRC Model

In [2]:
# Load the original (tab-separated) data set
fcp = FullContextProcessor(
    data_fpath="../Data/OConnor2013/ocon-nicepaths-extracted.txt",
    sep="\t",
)

# Derive monthly time ids from the DATE column
fcp.createMonthTimeIdx(colname="DATE")

# First build the two-way maps for every categorical column, then convert
# each of those columns to its index representation (same order as before:
# all maps are created before any column is converted).
for column_name in ("SOURCE", "RECEIVER", "WORD"):
    fcp.createTwoWayMap(column_name)
for column_name in ("SOURCE", "RECEIVER", "WORD"):
    fcp.convertColToIdx(column_name)

In [8]:
# Distinct values of the `t` column, in order of first appearance.
# NOTE(review): presumably the monthly time ids created by createMonthTimeIdx -- confirm.
fcp.df["t"].unique()

array([ 88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
       101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
       127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
       140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
       153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
       166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
       179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
       192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
       205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217,
       218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
       231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243,
       244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256,
       257, 258, 259, 260, 261, 262, 263,   0,   1,   2,   3,   

In [None]:
# Rebuild the model with the dimensions used for the saved checkpoint, then
# restore the trained weights.
# The three *_cnt arguments are the number of distinct values found in the
# SOURCE, RECEIVER and WORD columns of the processed data.
n_sources, n_receivers, n_words = (
    len(fcp.df[column].unique()) for column in ("SOURCE", "RECEIVER", "WORD")
)

model = SourceReceiverConcatModel(
    s_cnt=n_sources,
    r_cnt=n_receivers,
    w_cnt=n_words,
    K_s=100,
    K_r=100,
    K_w=200,  # equals K_s + K_r
)

# map_location="cpu" lets the checkpoint be loaded on a machine without a GPU
model.load_state_dict(torch.load("src-K200-lr1-wd1e-6-bs32.pt", map_location="cpu"))

In [None]:
# Extract the source, receiver and word embedding weights as numpy arrays
# (detached from the autograd graph first, as .numpy() requires).
s_embeds, r_embeds, w_embeds = (
    embedding.weight.detach().numpy()
    for embedding in (model.s_embeds, model.r_embeds, model.w_embeds)
)

# Generate word and source-receiver (SR) files for the TensorFlow Embedding Projector

In [None]:
# Export the word embeddings for the TensorFlow projector:
#   w_embeds.txt - one tab-separated vector per line (8 decimal places)
#   w_labels.txt - one word label per line
# NOTE(review): assumes the idx_to_col map iterates in the same order as the
# embedding rows, so that line i of the label file names row i -- confirm.
np.savetxt("w_embeds.txt", w_embeds, fmt="%.8f", delimiter="\t")

with open("w_labels.txt", "w") as labels_out:
    labels_out.writelines(
        str(word_label) + "\n"
        for _word_idx, word_label in fcp.twoway_maps["WORD"]["idx_to_col"].items()
    )

In [None]:
# Export the source embeddings for the TensorFlow projector:
#   s_embeds.txt - one tab-separated vector per line (8 decimal places)
#   s_labels.txt - one source label per line
# NOTE(review): assumes the idx_to_col map iterates in the same order as the
# embedding rows, so that line i of the label file names row i -- confirm.
np.savetxt("s_embeds.txt", s_embeds, fmt="%.8f", delimiter="\t")

with open("s_labels.txt", "w") as labels_out:
    labels_out.writelines(
        str(source_label) + "\n"
        for _source_idx, source_label in fcp.twoway_maps["SOURCE"]["idx_to_col"].items()
    )

In [None]:
# Export the receiver embeddings for the TensorFlow projector:
#   r_embeds.txt - one tab-separated vector per line (8 decimal places)
#   r_labels.txt - one receiver label per line
# NOTE(review): assumes the idx_to_col map iterates in the same order as the
# embedding rows, so that line i of the label file names row i -- confirm.
np.savetxt("r_embeds.txt", r_embeds, fmt="%.8f", delimiter="\t")

with open("r_labels.txt", "w") as labels_out:
    labels_out.writelines(
        str(receiver_label) + "\n"
        for _receiver_idx, receiver_label in fcp.twoway_maps["RECEIVER"]["idx_to_col"].items()
    )

In [None]:
# Export one concatenated [source ; receiver] embedding per (SOURCE, RECEIVER)
# pair that occurs in the data, plus a matching "<source>-<receiver>" label.
#   sr_embeds.txt - tab-separated vector values, one pair per line
#   sr_labels.txt - the label for the pair on the same line number
with open("sr_embeds.txt", "w") as vectors_out, open("sr_labels.txt", "w") as names_out:
    # Only the group keys are needed; the grouped rows themselves are unused
    for (s, r), _pair_rows in fcp.df.groupby(["SOURCE", "RECEIVER"]):
        pair_vector = np.concatenate((s_embeds[s, :], r_embeds[r, :]))
        vectors_out.write("\t".join(map(str, pair_vector)) + "\n")

        source_name = fcp.twoway_maps["SOURCE"]["idx_to_col"][s]
        receiver_name = fcp.twoway_maps["RECEIVER"]["idx_to_col"][r]
        names_out.write("-".join((source_name, receiver_name)) + "\n")
        

# Predicate Path Analysis per SR pair

In [None]:
# Rank the predicate paths (WORD) by occurrence count within each (SOURCE, RECEIVER) pair.
# - The pivot table counts rows per (s, r, word); combinations that never occur count as 0.
# - Ranking runs across each row in descending order, so the most frequent
#   predicate path of a pair gets rank 1.
# - method="min": tied predicate paths all share the lowest rank of their group
#   (e.g. three paths tied for the highest count all get rank 1, the next gets rank 4).
sr_w_rankings = (
    fcp.df
    .pivot_table(index=["SOURCE", "RECEIVER"], columns="WORD", aggfunc="size", fill_value=0)
    .rank(axis=1, method="min", ascending=False, pct=False)
)

In [None]:
# Raw occurrence counts: one row per (SOURCE, RECEIVER) pair, one column per
# WORD, with 0 for combinations that never occur in the data.
sr_w_counts = fcp.df.pivot_table(
    index=["SOURCE", "RECEIVER"],
    columns="WORD",
    aggfunc="size",
    fill_value=0,
)

In [None]:
# For each (s, r) pair in the data, compare the predicate paths the model
# scores highest with the predicate paths that simply occur most often.
#
# Printed per pair:
#   <source> <receiver> <number of rows for the pair>
#   <position>\t<count-based rank of the model's word>\t<model word> | <count word>
# Rows are listed in ascending order of score/count, so the last row
# (position 1) is the top word of each list.

top_words = 5

source_names = fcp.twoway_maps["SOURCE"]["idx_to_col"]
receiver_names = fcp.twoway_maps["RECEIVER"]["idx_to_col"]
word_names = fcp.twoway_maps["WORD"]["idx_to_col"]

for (s, r), df in fcp.df.groupby(["SOURCE", "RECEIVER"]):
    # Model view: score every word by the dot product of its embedding with
    # the concatenated [source ; receiver] embedding, keep the best `top_words`.
    sr_embed = np.concatenate((s_embeds[s, :], r_embeds[r, :]))
    sr_word_prod = np.dot(sr_embed, w_embeds.T)
    # Slice before building the labelled list so only `top_words` tuples are
    # created instead of one per word in the vocabulary.
    top_model_words = [(word_idx, word_names[word_idx])
                       for word_idx in np.argsort(sr_word_prod)[-top_words:]]

    # Count view: argsort yields positions within the row, so translate them
    # to WORD ids through the row's labels instead of assuming position == id.
    pair_counts = sr_w_counts.loc[(s, r)]
    top_count_ids = pair_counts.index[np.argsort(pair_counts.values)[-top_words:]]
    top_count_words = [word_names[word_idx] for word_idx in top_count_ids]

    # Look the pair's rank row up once instead of once per printed line
    pair_ranks = sr_w_rankings.loc[(s, r)]

    print(source_names[s], receiver_names[r], df.shape[0])
    # Number the rows from the amount actually shown so the last row is always
    # position 1, even when fewer than `top_words` words are available.
    shown = min(len(top_model_words), len(top_count_words))
    for i, ((word_idx, pred_path), count_word) in enumerate(zip(top_model_words, top_count_words)):
        print("{}\t{:.0f}\t{} | {}".format(shown - i, pair_ranks.loc[word_idx], pred_path, count_word))
    print("-"*80)

One possible interpretation: P(+|s,r,pred_path) answers the question "How likely is it that the event '[s] [pred_path]'d [r]' was reported in the news?".

In [None]:
# # Find generalized model predictions for top pred_paths for (s,r)'s that don't exist (show usefulness of s, r embeddings)
# sources = fcp.df["SOURCE"].unique()
# receivers = fcp.df["RECEIVER"].unique()
# sr_from_data = fcp.df.loc[:, ["SOURCE", "RECEIVER"]].values.tolist()

# for s, r in np.array(np.meshgrid(sources, receivers)).T.reshape(-1,2):
#     if [s, r] not in sr_from_data:
#         # Calc what model would consider as top words for given (s,r) not in data
#         sr_embed = np.concatenate((s_embeds[s, :], r_embeds[r, :]))
#         sr_word_prod = np.dot(sr_embed, w_embeds.T)
#         top_model_words = [(word_idx, fcp.twoway_maps["WORD"]["idx_to_col"][word_idx]) 
#                             for word_idx in np.argsort(sr_word_prod)][-top_words:]
        
#         print(fcp.twoway_maps["SOURCE"]["idx_to_col"][s], fcp.twoway_maps["RECEIVER"]["idx_to_col"][r])
#         for i, (word_idx, pred_path) in enumerate(top_model_words):
#             print("{}\t{}".format(top_words-i, pred_path))
#         print("-"*80)  