In [1]:
# Import relevant modules
%matplotlib notebook

import sys
sys.path.append("../")

from Models.SGNS import SRCTClassifier, SRCTModel
from Preprocessing.FullContextProcessor import FullContextProcessor

import numpy as np
import pandas as pd
import torch
import pickle

# Load Trained SRCT Model

In [2]:
# Read the original tab-separated event data into the project's context processor
fcp = FullContextProcessor(
    data_fpath="../Data/OConnor2013/ocon-nicepaths-extracted.txt",
    sep="\t",
)

# Add a monthly time id (column "TIME") derived from the "DATE" column
fcp.createMonthTimeIdx("DATE", "TIME")

# First build the label <-> index maps for every categorical column
# (read back later through fcp.twoway_maps[col]), then replace the labels in
# each of those columns by their integer index.
categorical_cols = ("SOURCE", "RECEIVER", "PRED")
for col in categorical_cols:
    fcp.createTwoWayMap(col)
for col in categorical_cols:
    fcp.convertColToIdx(col)

In [24]:
# Size the model from the number of distinct sources, receivers, predicates
# and time steps found in the data.
n_unique = {
    col: len(fcp.df[col].unique())
    for col in ("SOURCE", "RECEIVER", "PRED", "TIME")
}

model = SRCTModel(
    s_cnt=n_unique["SOURCE"],
    r_cnt=n_unique["RECEIVER"],
    p_cnt=n_unique["PRED"],
    T=n_unique["TIME"],
    K_s=100,
    K_r=100,
    K_p=200,
)

# Restore the trained parameters from disk, mapping all tensors onto the CPU
state_dict = torch.load("srct-K200-lr1-alpha1e5-lam1e-9.pt", map_location="cpu")
model.load_state_dict(state_dict)

In [35]:
# Sanity check: (rows, columns) of the loaded event dataframe
fcp.df.shape

(365623, 9)

In [25]:
# Extract the embeddings into numpy arrays: each embedding table's weight
# matrix is detached from the autograd graph before conversion.
s_embeds, r_embeds, p_embeds = (
    table.weight.detach().numpy()
    for table in (model.s_embeds, model.r_embeds, model.p_embeds)
)

# Dash PCA data generation

In [None]:
# Key tables for the embeddings.
# sr_df: one row per distinct (SOURCE, RECEIVER, TIME, YEAR, MONTH) combination
#        that actually appears in the data.
# p_df:  one row per predicate, sorted by predicate index so that row i of p_df
#        describes row i of p_embeds (drop_duplicates alone would leave the
#        rows in first-appearance order, which need not match the index order).
sr_df = fcp.df.loc[:, ["SOURCE", "RECEIVER", "TIME", "YEAR", "MONTH"]].drop_duplicates().reset_index(drop=True)
p_df = fcp.df.loc[:, ["PRED"]].drop_duplicates().sort_values(by="PRED").reset_index(drop=True)

# Look up every source / receiver vector in one vectorized indexing step.
# Row (entity + count * time) of an embedding table holds that entity's vector
# at that time step.
time_idx = sr_df["TIME"].values.astype(np.intp)
s_rows = sr_df["SOURCE"].values.astype(np.intp) + model.s_cnt * time_idx
r_rows = sr_df["RECEIVER"].values.astype(np.intp) + model.r_cnt * time_idx

# Source and receiver vectors side by side; float64 keeps the dtype the
# row-by-row version produced.
sr_embeds = np.concatenate((s_embeds[s_rows, :], r_embeds[r_rows, :]), axis=1).astype(np.float64)
K = sr_embeds.shape[1]  # width of a combined source-receiver vector

# Stack the sr vectors on top of the predicate vectors for a joint PCA:
# the first len(sr_df) rows are sr vectors, the remaining rows are predicates.
srp_embeds = np.concatenate((sr_embeds, p_embeds), axis=0)

In [None]:
# Perform PCA on embeddings
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [None]:
# Standardise every embedding dimension to zero mean and unit variance
scaler = StandardScaler()
scaler.fit(srp_embeds)
srp_scaled = scaler.transform(srp_embeds)

# Project the standardised vectors onto two principal components
pca = PCA(n_components=2, svd_solver="full")
srp_reduced = pca.fit_transform(srp_scaled)

In [None]:
# Attach the 2-D PCA coordinates to the key tables. The first n_sr rows of
# srp_reduced belong to the sr vectors, every row after that to the predicates.
n_sr = sr_embeds.shape[0]
for axis_pos, axis_name in enumerate(("X", "Y")):
    sr_df.loc[:, axis_name] = pd.Series(srp_reduced[:n_sr, axis_pos])
    p_df.loc[:, axis_name] = pd.Series(srp_reduced[n_sr:, axis_pos])

In [None]:
# Write both key tables (now carrying their PCA coordinates) to CSV for the
# visualization.
# NOTE(review): "sr_df.csv" is written again by the plot-data section further
# down, which overwrites this file - confirm which version the viz expects.
for frame, csv_path in ((sr_df, "sr_df.csv"), (p_df, "p_df.csv")):
    frame.to_csv(csv_path)

# Dash Plot data generation

In [26]:
# Key table for the sr embeddings: the distinct
# (SOURCE, RECEIVER, TIME, YEAR, MONTH) combinations that occur in the data,
# ordered by time step and re-indexed from 0.
sr_df = (
    fcp.df.loc[:, ["SOURCE", "RECEIVER", "TIME", "YEAR", "MONTH"]]
    .drop_duplicates()
    .sort_values(by="TIME")
    .reset_index(drop=True)
)

In [27]:
# Build the matrix of combined source-receiver vectors, one row per row of
# sr_df, using a single vectorized lookup instead of a Python loop over rows.
# Row (entity + count * time) of an embedding table holds that entity's vector
# at that time step.
time_idx = sr_df["TIME"].values.astype(np.intp)
s_rows = sr_df["SOURCE"].values.astype(np.intp) + model.s_cnt * time_idx
r_rows = sr_df["RECEIVER"].values.astype(np.intp) + model.r_cnt * time_idx

# Source and receiver vectors side by side; float64 keeps the dtype the
# row-by-row version produced.
sr_embeds = np.concatenate((s_embeds[s_rows, :], r_embeds[r_rows, :]), axis=1).astype(np.float64)
K = sr_embeds.shape[1]  # width of a combined source-receiver vector

In [28]:
# Convert source and receiver indices to their string names.
# The columns are rebuilt with assign() rather than written through
# .loc[:, col] = ..., because putting strings into an integer column via .loc
# relies on silent dtype upcasting, which newer pandas versions deprecate.
# NOTE: this must run after the embedding lookup, which needs the integer
# indices, and it cannot be re-run on its own once the names are in place.
source_names = fcp.twoway_maps["SOURCE"]["idx_to_col"]
receiver_names = fcp.twoway_maps["RECEIVER"]["idx_to_col"]
sr_df = sr_df.assign(
    SOURCE=sr_df["SOURCE"].map(lambda idx: source_names[idx]),
    RECEIVER=sr_df["RECEIVER"].map(lambda idx: receiver_names[idx]),
)

In [29]:
# Pr(+ | p, s, r, t) for each srt-p combination: the logistic sigmoid of the
# inner product between every sr vector (rows) and every predicate vector
# (columns).
srt_p_logits = np.dot(sr_embeds, p_embeds.T)
srt_p_sig = 1.0 / (1.0 + np.exp(-srt_p_logits))

# Per row, predicate indices ordered from highest to lowest probability
# (argsort is ascending, so the negated values are sorted).
srt_p_sig_sorted = np.argsort(np.negative(srt_p_sig))

In [30]:
import pickle

# Save the artefacts for use in viz: the sr key table, the sr and predicate
# embedding matrices, and the predicate index -> name map.
sr_df.to_csv("sr_df.csv")
for csv_name, matrix in (("sr_embeds.csv", sr_embeds), ("p_embeds.csv", p_embeds)):
    np.savetxt(csv_name, matrix, delimiter=",")

with open("pred_map.pickle", "wb") as handle:
    pickle.dump(fcp.twoway_maps["PRED"]["idx_to_col"], handle)

# with open('pred_map.pickle', 'rb') as handle:
#     pred_map = pickle.load(handle)


In [33]:
# Rank (source, receiver) pairs by their number of rows in sr_df. Rows of sr_df
# are distinct (SOURCE, RECEIVER, TIME, YEAR, MONTH) combinations, so this is
# presumably the number of months in which each pair appears.
sr_df.groupby(["SOURCE", "RECEIVER"]).size().sort_values(ascending=False)

SOURCE  RECEIVER
ISR     PSE         226
USA     IGOUNO      221
ISR     LBN         206
USA     ISR         204
ISR     USA         204
CHN     USA         203
USA     IRQ         196
JPN     USA         194
IRQ     USA         194
USA     RUS         194
        JPN         193
IGOUNO  IRQ         193
RUS     USA         192
PSE     ISR         191
USA     CHN         190
IGOUNO  USA         190
GBR     USA         189
FRA     USA         188
USA     IRN         185
EGY     ISR         183
ISR     SYR         182
IRN     USA         181
IRQ     IGOUNO      178
CHN     TWN         178
IND     PAK         177
TWN     CHN         175
FRA     IGOUNO      175
USA     PSE         174
CHN     IGOUNO      174
        RUS         173
                   ... 
ESP     IGOUNO       63
USA     HRV          62
IRL     IGOEEC       62
TWN     IGOUNO       61
ITA     IGOUNO       61
DNK     IRQ          61
FRA     CIV          60
BGR     IRQ          58
BIH     SRB          55
ALB     KSV          54

# Tensorboard output

In [None]:
# Look up the integer index assigned to the source label "CHN"
fcp.twoway_maps["SOURCE"]["col_to_idx"]["CHN"]

In [None]:
# Look up the integer index assigned to the receiver label "JPN"
fcp.twoway_maps["RECEIVER"]["col_to_idx"]["JPN"]

In [None]:
def _tsv_row(vector):
    """Render a 1-D vector as one tab-separated line ending in a newline."""
    return "\t".join(map(str, vector)) + "\n"


# Fixed source / receiver pair whose trajectory over time is exported.
# NOTE(review): presumably the indices returned by the CHN / JPN lookups
# above - confirm before relying on the labels.
s = 6
r = 30

with open("sr_embeds.txt", "w") as embed_f, open("sr_labels.txt", "w") as label_f:
    # Header row for the label (metadata) file
    label_f.write("TYPE\tTIME-NAME\n")

    # One embedding row and one label row per predicate
    for p in range(model.p_cnt):
        embed_f.write(_tsv_row(p_embeds[p]))
        label_f.write("PRED\t{}\n".format(fcp.twoway_maps["PRED"]["idx_to_col"][p]))

    # One combined source-receiver row per time step for the chosen pair,
    # labelled "<time>-<source name>-<receiver name>"
    s_name = fcp.twoway_maps["SOURCE"]["idx_to_col"][s]
    r_name = fcp.twoway_maps["RECEIVER"]["idx_to_col"][r]
    for t in range(model.T):
        sr_embed = np.concatenate((s_embeds[s + model.s_cnt*t, :], r_embeds[r + model.r_cnt*t, :]))
        embed_f.write(_tsv_row(sr_embed))
        label_f.write("C1C2\t{}-{}-{}\n".format(t, s_name, r_name))
            
#         for (s, r), _ in fcp.df.groupby(["SOURCE", "RECEIVER"]):
#             sr_embed = np.concatenate((s_embeds[s + model.s_cnt*t, :], r_embeds[r + model.r_cnt*t, :]))
#             embed_f.write("\t".join([str(sr_val) for sr_val in sr_embed]) + "\n")
#             label_f.write("C1C2\t{}\n".format(
#                 str(t) + \
#                 "-" + \
#                 fcp.twoway_maps["SOURCE"]["idx_to_col"][s] + \
#                 "-" + \
#                 fcp.twoway_maps["RECEIVER"]["idx_to_col"][r]))