In [1]:
# Prefix of every CSV written by the training cell (e.g. "<save_index>_training_metrics.csv").
save_index = 'k40_final_scaling'

# Per-modality decoder sizes and per-task encoder sizes; both dicts are passed unchanged to LogoMVAE.
# NOTE(review): presumably hidden-layer widths — confirm against model.py.
DECODER_DIMS = {"text": 400, "bin": 400, "cat": 400, "bp": 400, "indus": 400}
ENCODER_DIMS = {"full": 1000, "logo": 100, "mgr": 1000, "design": 1000}
# First positional argument of LogoMVAE; also the width of the latent sample drawn in RandomBrand.
K = 40

# Number of holdout chunks. The training cell runs FOLDS + 1 times; the extra run has an empty holdout set.
FOLDS = 4
# Length of the outer training loop; data is reshuffled and the annealing factor updated once per batch.
BATCHES = 2500
# Number of svi.step calls per batch.
ITERS = 10

ADAM_LR = 0.00001  # "lr" handed to pyro.optim.Adam
MIN_AF = 1e-6  # first value of the annealing schedule
ANNEALING_BATCHES = 2000  # number of batches over which the annealing factor ramps up to 1
NUM_PARTICLES = 1  # num_particles for Trace_ELBO

# When True, bp columns are standardized (mean subtracted AND divided by the std), not merely centered.
CENTER_BP = True

DISABLE_TQDM = False  # disables the fold and batch progress bars

In [2]:
import sys
sys.path.insert(1, '../../')

import os

import numpy as np
import torch
import torchvision.datasets as dset
import torch.nn as nn
import torchvision.transforms as transforms

import pyro
import pyro.distributions as dist
import pyro.contrib.examples.util  # patches torchvision
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from pyro import poutine

pyro.set_rng_seed(42)

import random
random.seed(42)

import pandas as pd

from skimage import io
from sklearn import metrics
from matplotlib import pyplot as plt

from tqdm.auto import tqdm, trange

from sklearn.metrics import classification_report

from data import SplitData
from model import LogoMVAE

assert pyro.__version__.startswith('1.3.0')

from IPython.display import clear_output
import time
from IPython import display

# Helper functions:

In [3]:
def split(a, n):
    """Lazily cut `a` into `n` contiguous chunks of near-equal length.

    The first ``len(a) % n`` chunks get one extra element. The chunk sizes are
    computed immediately (so ``n == 0`` raises at call time); the slices
    themselves are produced on demand by the returned generator.
    """
    base, extra = divmod(len(a), n)

    def _chunks():
        start = 0
        for idx in range(n):
            # Leading chunks absorb the remainder, one element each.
            stop = start + base + (1 if idx < extra else 0)
            yield a[start:stop]
            start = stop

    return _chunks()

def compute_distance(z):
    """Return the matrix of pairwise Euclidean distances between the rows of `z`.

    `z` is indexed as a 2-D array (rows x features); entry [i, j] of the result
    is the distance between row i and row j.
    """
    n_rows, n_dims = z.shape[0], z.shape[1]
    anchors = z.reshape(n_rows, 1, n_dims)
    # Broadcasts to (n_rows, n_rows, n_dims): deltas[i, j] = z[j] - z[i].
    deltas = z - anchors
    squared = np.einsum('ijk, ijk->ij', deltas, deltas)
    return np.sqrt(squared)

# Data Loading

First, load text data, and apply word filter. Note on notation: `tx` stands for "true x," because the model variables are also called x.

In [4]:
# Website text as a document-term matrix (rows: firms, columns: words).
# NOTE(review): the file name suggests 0/1 entries — confirm.
textdf = pd.read_csv("../../../../data/web_dtfm20_binary.csv", index_col=0)
tx_text = textdf.values
# Word filter: keep columns whose total over all rows exceeds 0.05.
# NOTE(review): if entries are 0/1 this only drops words that never occur; if a
# document-frequency cut-off was intended, the mean (not the sum) is needed — confirm.
seltext = tx_text.sum(0) > 0.05
tx_text = textdf.values[:,seltext]

# Row filter: keep rows whose total over the retained words exceeds 20. The same
# boolean mask is applied positionally to every other feature table in this cell,
# which assumes all CSVs list the same firms in the same order — TODO confirm.
gt20words = tx_text.sum(1) > 20
tx_text = tx_text[gt20words,:]

words = textdf.columns[seltext]
N, V = tx_text.shape

# Binary logo features, restricted to the rows kept above.
binfeats = pd.read_csv("../../../../data/y_bin_all_py2.csv", index_col=0)
tx_b = binfeats.values
tx_b = tx_b[gt20words,:]
M_b = tx_b.shape[1]

# Five categorical features, read from the first five columns of this file.
# For each one: tx_cJ is an (n_rows, 1) column of codes for the kept rows, and
# M_cJ is the number of distinct codes observed among those rows.
catfeats = pd.read_csv("../../../../data/y_mult_ncolors_py2.csv", index_col=0)

tx_c1 = catfeats.values[:,0][gt20words]
M_c1 = len(np.unique(tx_c1))
tx_c1 = np.expand_dims(tx_c1, 1)

tx_c2 = catfeats.values[:,1][gt20words]
M_c2 = len(np.unique(tx_c2))
tx_c2 = np.expand_dims(tx_c2, 1)

tx_c3 = catfeats.values[:,2][gt20words]
M_c3 = len(np.unique(tx_c3))
tx_c3 = np.expand_dims(tx_c3, 1)

tx_c4 = catfeats.values[:,3][gt20words]
M_c4 = len(np.unique(tx_c4))
tx_c4 = np.expand_dims(tx_c4, 1)

tx_c5 = catfeats.values[:,4][gt20words]
M_c5 = len(np.unique(tx_c5))
tx_c5 = np.expand_dims(tx_c5, 1)

# Display names for the categorical codes; `profile` uses each array as the row
# index of the corresponding categorical table.
# NOTE(review): this assumes position k in each array names the k-th smallest
# code in the matching tx_c* column — confirm against the feature-extraction code.
c1_labels = np.array(["black","blue_dark","blue_light","blue_medium","brown","green_dark",
                      "green_light","grey_dark","grey_light","orange","red","red_dark",
                      "yellow"])

c2_labels = np.array(["circle","rect-oval_medium","rect-oval_large","rect-oval_thin",
                      "square","triangle"])

c3_labels = np.array(["bad_letters","bulky_hollow_geometric","circular","dense_simple_geom",
                      "detailed_circle","hollow_circle","detailed_hor","long_hor","no_mark",
                      "simple","square","thin_vert_rect","vert_narrow","detailed","thin",
                      "hor_wispy"])

c4_labels = np.array(["nochars","sans","serif"])

c5_labels = np.array(["one_color","two_colors","three_colors","many_colors"])

bp = pd.read_csv("../../../../data/bp_avg_all_traits.csv", index_col=0)

bp_labels = bp.columns

tx_bp = bp.values
tx_bp = tx_bp[gt20words]
if CENTER_BP:
    # Column-wise standardization using the mean and std of the kept rows only.
    tx_bp = (tx_bp - tx_bp.mean(0)) / tx_bp.std(0)
M_bp = tx_bp.shape[1]

indus = pd.read_csv("../../../../data/industry_codes_updated.csv", index_col=0)
# Keep only rows whose index label also appears in bp, then order rows by index label.
indus = indus.iloc[np.in1d(indus.index, bp.index),:]
indus = indus.sort_index()

tx_indus = indus.values.astype('int')
# Keep industry columns whose total exceeds 9; the total is taken before the row filter.
tx_indus = tx_indus[:, tx_indus.sum(0) > 9]
tx_indus = tx_indus[gt20words,:]
M_indus = tx_indus.shape[1]

# Same > 9 column rule as above (computed here on the uncast values) so that
# the labels line up with the columns of tx_indus.
indus_labels = indus.columns[indus.values.sum(0) > 9]

# Row labels of the kept rows, taken from the binary-feature table's index.
allnames = binfeats.index.values[gt20words]

# Number of columns per data block. "cat1".."cat5" hold the number of distinct
# codes, while the literal 5 in "logo" and "all" counts the five categorical
# features as one column each.
x_sizes = {"text": V, 
           "bin": M_b, 
           "cat1": M_c1, 
           "cat2": M_c2, 
           "cat3": M_c3, 
           "cat4": M_c4, 
           "cat5": M_c5, 
           "bp": M_bp, 
           "indus": M_indus, 
           "logo": M_b + 5, 
           "all": V + M_b + 5 + M_bp + M_indus}

# Combined width per task: "logo" = binary + 5 categorical columns,
# "design" = text + bp + indus, "mgr" = everything except bp.
# NOTE(review): presumably the input width of the encoder with the same key in
# ENCODER_DIMS — confirm against model.py.
task_sizes = {"full": x_sizes["all"], 
              "logo": x_sizes["bin"] + 5, 
              "design": x_sizes["text"] + x_sizes["bp"] + x_sizes["indus"], 
              "mgr": x_sizes["all"] - x_sizes["bp"]}


# Training: Instantiate Model and Run

In [5]:
# One-row table of the run's hyper-parameters; it is passed as `givens=` to every
# summarize / save_features_table call in the training cell. np.concatenate puts
# all values into a single numeric array, so ints and the CENTER_BP flag are
# stored as floats.
given_names = ["K", "text_dec", "bin_dec", "cat_dec", "bp_dec", "indus_dec", "full_enc", "logo_enc", "mgr_enc", "design_enc", "batches", "iters", "adam_lr", "annealing_batches", "num_particles", "center_bp"]
given_values = np.concatenate([
    [K],
    list(DECODER_DIMS.values()),
    list(ENCODER_DIMS.values()),
    [BATCHES],
    [ITERS],
    [ADAM_LR],
    [ANNEALING_BATCHES],
    [NUM_PARTICLES],
    [CENTER_BP],
])
givens = pd.DataFrame(given_values).T
givens.columns = given_names

Create holdout and cross-validation subsets (just the indices):

In [6]:
if FOLDS > 1:
    # FOLDS contiguous, near-equal chunks of row positions; chunk i is the holdout set of run i.
    holdout_indices = list(split(np.arange(N), FOLDS))
    # Extra final entry with nothing held out: run number FOLDS trains on every row.
    holdout_indices.append(np.array([]))
    # Complement of each holdout chunk, followed by the full index range for the final run.
    fold_indices = [np.setdiff1d(np.arange(N), holdout_indices[i]) for i in range(FOLDS)]
    fold_indices.append(np.arange(N))
else:
    # NOTE(review): fold_indices is not defined on this path, and the training
    # cell iterates range(FOLDS+1), which indexes past this one-element list
    # when FOLDS == 1 — confirm how a no-cross-validation run is meant to work.
    holdout_indices = [np.array([])]

Set the KL annealing schedule (same across each fold):

In [7]:
# Linear ramp of the annealing factor from MIN_AF to 1 across the first
# ANNEALING_BATCHES batches; the training loop uses 1 for every batch after that.
schedule = np.linspace(MIN_AF, 1., ANNEALING_BATCHES)
# Disabled alternative: four consecutive ramps, each a quarter of ANNEALING_BATCHES long.
# schedule = np.concatenate([np.linspace(MIN_AF, 1., round(ANNEALING_BATCHES/4.)) for _ in range(4)])

In [8]:
# Held-out metrics appended by the training cell on every 20th batch. The lists
# are created once here and never reset, so values from all folds are
# concatenated in run order.
track_logo_bp_mse = []
track_logo_indus_macf1 = []
track_mgr_bp_mse = []
track_des_bin_macf1 = []
track_des_cat1_macf1 = []

In [None]:
# Run the model across all folds (sequentially):
# FOLDS + 1 runs: runs 0..FOLDS-1 each hold out one chunk, the last run holds out nothing.
for fold in tqdm(range(FOLDS+1), desc="Folds", disable=DISABLE_TQDM):

    # Empty Pyro's global parameter store so this run does not reuse parameters from the previous one.
    pyro.clear_param_store()

    data = SplitData(tx_text, tx_b, tx_c1, tx_c2, tx_c3, tx_c4, tx_c5, tx_bp, tx_indus, 
                     allnames, test_indices = holdout_indices[fold])   

    # NOTE(review): use_cuda=True is hard-coded — presumably this requires a GPU; confirm in model.py.
    lmvae = LogoMVAE(K, ENCODER_DIMS, DECODER_DIMS, x_sizes, task_sizes, use_cuda=True)
    optimizer = Adam({"lr": ADAM_LR}) #, "weight_decay": 0.4})
    svi = SVI(lmvae.model, lmvae.guide, optimizer, loss=Trace_ELBO(num_particles = NUM_PARTICLES))

    # Loss after every optimisation step of this run; rebuilt per fold and not saved in this cell.
    track_loss = []

    for i in tqdm(range(BATCHES), desc="Batches", leave=False, disable=DISABLE_TQDM):

        # Annealing factor follows the precomputed schedule, then stays at 1.
        if i < ANNEALING_BATCHES:
            annealing_factor = schedule[i]
        else:
            annealing_factor = 1.

        data.training.shuffle()

        for j in tqdm(range(ITERS), desc="Iters", leave=False, disable=True):
            svi.step(data.training, annealing_factor)
            # NOTE(review): svi.step already returns a loss estimate; evaluate_loss
            # runs a second pass per iteration to get the post-update loss —
            # confirm that the extra cost is intended.
            track_loss.append(svi.evaluate_loss(data.training, annealing_factor))
        
        # Every 20th batch, record held-out metrics for three networks.
        if (i % 20) == 0:
            # NOTE(review): presumably SplitData only creates `.test` when
            # test_indices is non-empty (the last run has none) — confirm.
            if hasattr(data, 'test'):

                data.test.make_torch()

                lmvae.eval()

                lmvae.predict(data.test, network = "logo")
                track_logo_bp_mse.append(lmvae.pred.metrics.bp_mse.features.mean())
                track_logo_indus_macf1.append(lmvae.pred.metrics.indus_report['macro avg']['f1-score'])

                # NOTE(review): the network is called "des" here but "design" in
                # ENCODER_DIMS and task_sizes — confirm the naming in model.py.
                lmvae.predict(data.test, network = "des")
                track_des_bin_macf1.append(lmvae.pred.metrics.bin_report['macro avg']['f1-score'])
                track_des_cat1_macf1.append(lmvae.pred.metrics.cat1_report['macro avg']['f1-score'])

                lmvae.predict(data.test, network = "mgr")
                track_mgr_bp_mse.append(lmvae.pred.metrics.bp_mse.features.mean())

                # Back to training mode before the next batch.
                lmvae.train()
    
    # Final save of stats
    lmvae.eval()
    
    # Training-set results, written with `fold` as the index and the hyper-parameter row attached.
    lmvae.predict(data.training)
    lmvae.pred.metrics.summarize(path = str(save_index) + "_training_metrics.csv", index = fold, givens = givens)
    lmvae.pred.metrics.save_features_table(path = str(save_index) + "_training_bin_features.csv", names = binfeats.columns, index = fold, givens = givens)
    lmvae.pred.ll.summarize(path = str(save_index) + "_training_ll.csv", index = fold, givens = givens)
        
    # Held-out results: first with predict's default network, then with "logo", "des" and "mgr".
    if hasattr(data, 'test'):
        data.test.make_torch()
        lmvae.predict(data.test)
        lmvae.pred.metrics.summarize(path = str(save_index) + "_test_metrics.csv", index = fold, givens = givens)
        lmvae.pred.metrics.save_features_table(path = str(save_index) + "_test_bin_features.csv", names = binfeats.columns, index = fold, givens = givens)
        lmvae.pred.ll.summarize(path = str(save_index) + "_test_ll.csv", index = fold, givens = givens)
        
        lmvae.predict(data.test, network = "logo")
        lmvae.pred.metrics.summarize(path = str(save_index) + "_logo_metrics.csv", index = fold, givens = givens)
        lmvae.pred.ll.summarize(path = str(save_index) + "_logo_ll.csv", index = fold, givens = givens)
                
        lmvae.predict(data.test, network = "des")
        lmvae.pred.metrics.summarize(path = str(save_index) + "_des_metrics.csv", index = fold, givens = givens)
        lmvae.pred.metrics.save_features_table(path = str(save_index) + "_des_bin_features.csv", names = binfeats.columns, index = fold, givens = givens)
        lmvae.pred.ll.summarize(path = str(save_index) + "_des_ll.csv", index = fold, givens = givens)
                
        lmvae.predict(data.test, network = "mgr")
        lmvae.pred.metrics.summarize(path = str(save_index) + "_mgr_metrics.csv", index = fold, givens = givens)
        lmvae.pred.ll.summarize(path = str(save_index) + "_mgr_ll.csv", index = fold, givens = givens)

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Batches', max=2500.0, style=ProgressStyle(description_wid…

In [None]:
plt.plot(track_logo_bp_mse)

In [None]:
plt.plot([-x for x in track_logo_indus_macf1])

In [None]:
plt.plot([-x for x in track_des_bin_macf1])

In [None]:
plt.plot([-x for x in track_des_cat1_macf1])

In [None]:
plt.plot(track_mgr_bp_mse)

In [None]:
# Nearest neighbours in latent space for a hand-picked list of firms.
lmvae.predict(data.training)

z = lmvae.pred.z.z_loc.cpu().numpy()
end_names = data.training.names
dist_z = compute_distance(z)

test_firms = ['itw','harman-intl','lilly','goldman-sachs','21st-century-fox','facebook','gucci','old-navy','3m','actavis','mcdonalds', 'kfc']

test_neighbors = []
test_dist = []
for firm in test_firms:
    # Distances from this firm (first row whose name matches) to every training firm.
    firm_row = dist_z[np.where(end_names == firm)[0][0], :]
    # Positions 1..4 of the sorted order: position 0 is skipped.
    test_neighbors.append(end_names[firm_row.argsort()][1:5])
    test_dist.append(np.sort(firm_row.round(2))[1:5])
formatted_neighbors = [", ".join(names.tolist()) for names in test_neighbors]

neighbors_df = pd.DataFrame(test_neighbors)
neighbors_df.index = test_firms
neighbors_df.columns = np.arange(1,5)

In [None]:
neighbors_df

In [None]:
pyro.get_param_store().save("lmvae_20_100_lowlr_full.pyro")

In [None]:
torch.save(lmvae.state_dict(), "lmvae_state_dict.pt")

## Generate random brands

In [None]:
from model import Predict

class PredictNoVar():
    """Decode latent codes `Z` with each of the model's decoders.

    Every output is detached and moved to the CPU. Attributes set: ``bp``,
    ``bin``, ``indus``, ``text`` and ``cat1`` .. ``cat5``.

    `Z` is moved once to the device of the model's first parameter, so the
    class works for a model on the GPU or on the CPU. If the model exposes no
    parameters, `Z` is moved with ``.cuda()`` as before.
    """
    def __init__(self, lmvae, Z):
        params = getattr(lmvae, "parameters", None)
        first_param = next(iter(params()), None) if callable(params) else None
        z_dev = Z.to(first_param.device) if first_param is not None else Z.cuda()

        # bp_decoder returns an indexable result; only its first element is kept
        # (presumably the mean, given the class name — confirm in model.py).
        self.bp = lmvae.bp_decoder(z_dev)[0].cpu().detach()

        for field in ("bin", "indus", "text", "cat1", "cat2", "cat3", "cat4", "cat5"):
            decoder = getattr(lmvae, field + "_decoder")
            setattr(self, field, decoder(z_dev).cpu().detach())

In [None]:
class RandomBrand():
    """Draw `N` latent codes of width `K` from a standard normal and decode them.

    Attributes: ``K``, ``N``, ``Z`` (the N x K sample) and ``pred`` (the
    PredictNoVar built from the model and ``Z``).
    """
    def __init__(self, lmvae, K, N = 100):
        self.K, self.N = K, N
        standard_normal = dist.Normal(loc=torch.tensor(0.), scale=torch.tensor(1.))
        self.Z = standard_normal.sample([self.N, self.K])
        self.pred = PredictNoVar(lmvae, self.Z)

In [None]:
def raw_profile(pred, i = 0):
    """Collect entry `i` of every prediction attribute of `pred` into a dict.

    Keys, in order: 'bp', 'bin', 'indus', 'text', 'cat1' .. 'cat5'. Each value
    is ``getattr(pred, key)[i]``.
    """
    fields = ('bp', 'bin', 'indus', 'text', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5')
    return {field: getattr(pred, field)[i] for field in fields}

In [None]:
def _relative_profile(probs, baseline, labels):
    """Build a table with columns "Rel Prob" (probs - baseline) and "Prob", indexed by `labels` and sorted by "Rel Prob" descending."""
    act_probs = pd.DataFrame(probs)
    act_probs.index = labels
    act_probs.columns = ["Prob"]

    rel_probs = pd.DataFrame(probs - baseline)
    rel_probs.index = labels
    rel_probs.columns = ["Rel Prob"]

    table = pd.concat([rel_probs, act_probs], axis=1)
    return table.sort_values(by="Rel Prob", ascending=False)


def _category_shares(codes):
    """Return the share of rows taking each observed code in `codes`, ordered by code value."""
    # NOTE(review): value_counts omits codes that never occur in `codes`, so the
    # result can be shorter than the label list — confirm that every category
    # is present in the data passed in.
    counts = pd.Series(codes.cpu().numpy().flatten()).value_counts().sort_index()
    return counts.to_numpy() / codes.shape[0]


def profile(pred, data, i = 0):
    """Summarise prediction `i` of `pred` as one sorted table per modality.

    For the binary, industry, text and categorical modalities the table has the
    predicted value ("Prob") and its difference from the average of `data`
    ("Rel Prob"), sorted by "Rel Prob" descending. For bp the table has the
    predicted values only ("Rel Values"), sorted descending.

    Args:
        pred: object with the attributes read by `raw_profile`.
        data: dataset with tensor attributes ``bin``, ``indus``, ``text`` and
            ``cat1`` .. ``cat5`` that supply the baselines.
        i: which prediction to profile.

    Returns:
        dict with keys "bp", "text", "indus", "bin", "cat1" .. "cat5".
    """
    raw = raw_profile(pred, i)

    # Modalities whose baseline is the column mean of the data.
    bin_profile = _relative_profile(raw["bin"], data.bin.mean(0).cpu().numpy(), binfeats.columns)
    indus_profile = _relative_profile(raw["indus"], data.indus.mean(0).cpu().numpy(), indus_labels)
    text_profile = _relative_profile(raw["text"], data.text.mean(0).cpu().numpy(), words)

    # Categorical modalities: the baseline is the observed share of each code.
    cat_labels = {"cat1": c1_labels, "cat2": c2_labels, "cat3": c3_labels,
                  "cat4": c4_labels, "cat5": c5_labels}
    cat_profiles = {
        field: _relative_profile(raw[field], _category_shares(getattr(data, field)), labels)
        for field, labels in cat_labels.items()
    }

    # BP has no baseline column; the predicted values are reported as they are.
    bp_profile = pd.DataFrame(raw["bp"])
    bp_profile.index = bp_labels
    bp_profile.columns = ["Rel Values"]
    bp_profile = bp_profile.sort_values(by="Rel Values", ascending=False)

    return {"bp": bp_profile, "text": text_profile, "indus": indus_profile,
            "bin": bin_profile, "cat1": cat_profiles["cat1"], "cat2": cat_profiles["cat2"],
            "cat3": cat_profiles["cat3"], "cat4": cat_profiles["cat4"], "cat5": cat_profiles["cat5"]}

In [None]:
testgen = RandomBrand(lmvae, K, N = 1)

In [None]:
testprof = profile(testgen.pred, data = data.training)

In [None]:
testprof['bp'][0:5]

In [None]:
testprof['bp'][-5:]

In [None]:
testprof['indus'][:5]

# Brand Arithmetic

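*The `get_company` helper and the two example lookups below are not used by the later analyses; only `CompanyData` is reused, as the base class of `NewCompany`:*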

In [None]:
class CompanyData():
    """Plain attribute container for one company's data, one attribute per modality."""
    pass


def get_company(data, index=None, name=None, cuda=False):
    """Extract one company (or a selection of rows) from `data` as float tensors.

    Exactly one of `index` and `name` must be given.

    Args:
        data: object with array attributes ``x_text``, ``x_bin``, ``x_cat1`` ..
            ``x_cat5``, ``x_bp`` and ``x_indus``, plus ``x_names`` when
            selecting by name.
        index: anything accepted as an index by those arrays (an int, a slice
            or an index array).
        name: value compared against ``data.x_names`` to build a row mask.
        cuda: if True, every tensor is moved to the GPU.

    Returns:
        CompanyData with attributes ``text``, ``bin``, ``cat1`` .. ``cat5``,
        ``bp`` and ``indus``.

    Raises:
        ValueError: if neither or both of `index` and `name` are given.
    """
    # Identity checks: `== None` on an array-valued index compares element-wise
    # and makes the condition ambiguous.
    if index is None and name is None:
        raise ValueError("Need either an index or a name")

    if index is not None and name is not None:
        raise ValueError("Can't have both an index and a name")

    selector = index if index is not None else (data.x_names == name)

    fields = ("text", "bin", "cat1", "cat2", "cat3", "cat4", "cat5", "bp", "indus")
    company = CompanyData()
    for field in fields:
        values = torch.tensor(getattr(data, "x_" + field)[selector], dtype = torch.float)
        setattr(company, field, values.cuda() if cuda else values)

    return company

In [None]:
x_mckinsey = get_company(data.training, name = "mckinsey", cuda = True)
x_goldman = get_company(data.training, name = "goldman-sachs", cuda = True)

In [None]:
x_goldman.bp

## Brand Interpolation

In [None]:
lmvae.predict(data.training)

z = lmvae.pred.z.z_loc.cpu().numpy()
end_names = data.training.names

In [None]:
def interp(logic1, logic2, n=10, w1=0.5, w2=0.5, return_z = False):
    """Combine the mean latent codes of two groups of firms.

    Reads the notebook-level `z` (latent codes) and `end_names` (firm names).

    Args:
        logic1, logic2: row selectors into `z` (e.g. boolean masks); the mean
            code of each selection is taken.
        n: number of neighbour names to return.
        w1, w2: weights applied to the two mean codes. They are used as given
            and are not normalised to sum to 1.
        return_z: if True, return the combined latent vector instead of names.

    Returns:
        The combined latent vector when `return_z` is True; otherwise the
        names of the `n` firms closest to it.
    """
    z_mix = w1 * z[logic1].mean(0) + w2 * z[logic2].mean(0)
    if return_z:
        # No distances are needed for this path, so skip the pairwise matrix.
        return z_mix

    # Last row of the matrix: distances from the combined point to every firm
    # and to itself.
    mix_dists = compute_distance(np.vstack([z, z_mix]))[-1, :]
    # The first sorted entry is skipped; it is expected to be the combined
    # point itself at distance 0.
    return end_names[mix_dists.argsort()[1:(n+1)]]

In [None]:
interp(end_names == "goldman-sachs", end_names == "mckinsey", n=10)

In [None]:
interp(end_names == "nike", end_names == "gucci", n=10)

## Health + {Tech, Finance}

In [None]:
# Boolean masks over the training firms: True where at least one of the listed
# industry columns equals 1.
indus_tags = data.training.indus.cpu().numpy()
tech_company = indus_tags[:, np.isin(indus_labels, ["Hardware","Consumer.Electronics","Software"])].max(1) == 1
health_company = indus_tags[:, np.isin(indus_labels, ["Health.Care"])].max(1) == 1
finance_company = indus_tags[:, np.isin(indus_labels, ["Financial.Services"])].max(1) == 1

In [None]:
interp(tech_company, health_company, n=10, w1=3, w2=3)

In [None]:
interp(finance_company, health_company, w1=3, w2=3)

## Shopping + {Data, Payments}

In [None]:
# Boolean masks over the training firms: True where the listed industry column equals 1.
indus_tags = data.training.indus.cpu().numpy()
payments_company = indus_tags[:, np.isin(indus_labels, ["Payments"])].max(1) == 1
shopping_company = indus_tags[:, np.isin(indus_labels, ["Commerce.and.Shopping"])].max(1) == 1
data_company = indus_tags[:, np.isin(indus_labels, ["Data.and.Analytics"])].max(1) == 1

In [None]:
interp(payments_company, shopping_company, w1=3, w2=3)

In [None]:
interp(shopping_company, data_company, w1=3, w2=3)

## Daring Fast Food

In [None]:
# The two masks are built in this cell so that it runs on a fresh kernel; the
# original used them before any cell had defined them.
daring_company = data.training.bp.cpu().numpy()[:, bp.columns == "daring"].flatten() > 2.
fastfood_company = np.isin(end_names, ["mcdonalds","burger-king","kfc"])

# Without return_z=True, interp returns the names of the nearest firms, so
# despite its name this variable holds names, not a latent vector.
z_daring_ff = interp(daring_company, fastfood_company, w2 = 3, w1 = 1)
z_daring_ff

In [None]:
# Firms whose "daring" column exceeds 2.
# NOTE(review): presumably data.training.bp holds the standardized tx_bp when
# CENTER_BP is True, which would make this "more than 2 standard deviations
# above the mean" — confirm.
daring_company = data.training.bp.cpu().numpy()[:, bp.columns == "daring"].flatten() > 2.
fastfood_company = np.isin(end_names, ["mcdonalds","burger-king","kfc"])

# Latent point at 2 x mean(daring firms) + 1 x mean(fast-food firms); the weights are not normalised.
z_daring_ff = interp(daring_company, fastfood_company, w1 = 2, w2 = 1, return_z = True)

# unsqueeze(0) adds a leading dimension of size 1, so the decoders receive a one-row batch.
pred_daring_ff = PredictNoVar(lmvae, torch.tensor(z_daring_ff).unsqueeze(0))
prof_daring_ff = profile(pred_daring_ff, data.training, i=0)

In [None]:
prof_daring_ff['bp']

# Decision support

## McDonald's Analysis

In [None]:
class NewCompany(CompanyData):
    """Features of a company that is not in the training data, read from CSV files.

    All files are looked up as ``read_dir + name + <suffix>``. After
    construction the attributes are numpy arrays; `make_torch` converts them to
    float tensors.
    """
    def __init__(self, name, read_dir = "../../code/extract_features/new_logo_outputs/"):
        prefix = read_dir + name

        # bp and indus files have no header row; their values are transposed after reading.
        self.bp = pd.read_csv(prefix + "_rel_bp.csv", header=None, index_col=0).values.T
        self.indus = pd.read_csv(prefix + "_indus.csv", header=None, index_col=0).values.T

        self.bin = pd.read_csv(prefix + "_y_bin.csv", index_col=0).values

        # The first five columns of the categorical file become cat1 .. cat5,
        # each kept as a single column.
        mult_values = pd.read_csv(prefix + "_y_mult.csv", index_col=0).values
        for col in range(5):
            setattr(self, "cat" + str(col + 1), np.expand_dims(mult_values[:, col], 1))

        self.text = pd.read_csv(prefix + "_newrow_binary.csv", index_col=0).values

    def make_torch(self, cuda = False):
        """Replace every attribute with a float tensor, moved to the GPU when `cuda` is True."""
        fields = ("text", "bin", "cat1", "cat2", "cat3", "cat4", "cat5", "bp", "indus")
        for field in fields:
            tensor = torch.tensor(getattr(self, field), dtype = torch.float)
            setattr(self, field, tensor.cuda() if cuda else tensor)

In [None]:
from model import NewZ, Predict

# Three McDonald's logo variants ("mcdonalds1", "mcdonalds2", "mcdonalds-old").
# For each: read its feature files, convert them to GPU tensors, and build a
# NewZ with network = "mgr". read_dir overrides NewCompany's default path.
# NOTE(review): presumably NewZ infers the latent code with the chosen encoder —
# confirm in model.py.
new_mcds1 = NewCompany(name = "mcdonalds1", read_dir = "../../../extract_features/new_logo_outputs/")
new_mcds1.make_torch(cuda = True)
z_mcds1 = NewZ(lmvae, data = new_mcds1, network = "mgr")

new_mcds2 = NewCompany(name = "mcdonalds2", read_dir = "../../../extract_features/new_logo_outputs/")
new_mcds2.make_torch(cuda = True)
z_mcds2 = NewZ(lmvae, data = new_mcds2, network = "mgr")

new_mcds0 = NewCompany(name = "mcdonalds-old", read_dir = "../../../extract_features/new_logo_outputs/")
new_mcds0.make_torch(cuda = True)
z_mcds0 = NewZ(lmvae, data = new_mcds0, network = "mgr")

In [None]:
pred_mcds1 = Predict(lmvae, z = z_mcds1)
pred_mcds2 = Predict(lmvae, z = z_mcds2)
pred_mcds0 = Predict(lmvae, z = z_mcds0)

In [None]:
pred_mcds1.bp

In [None]:
pred_mcds2.bp

In [None]:
pred_mcds0.bp

In [None]:
out = pd.DataFrame(np.vstack([bp.columns, pred_mcds1.bp, pred_mcds2.bp, pred_mcds0.bp]).T)
out.to_csv("../../../mcdonalds/new_model_mcd_bp.csv")

## Shake Shack / In-n-Out Analysis

In [None]:
class MultiviewZ():
    """Hold one NewZ per network for the same company data.

    Attributes ``full``, ``mgr``, ``des`` and ``logo`` are each the NewZ built
    with the network of the same name.
    """
    def __init__(self, lmvae, data):
        for network in ("full", "mgr", "des", "logo"):
            setattr(self, network, NewZ(lmvae, data, network = network))

In [None]:
data_ss = NewCompany(name = "shake-shack", read_dir = "../../../extract_features/new_logo_outputs/")
data_ss.make_torch(cuda = True)
z_ss = MultiviewZ(lmvae, data_ss)

In [None]:
plt.scatter(z_ss.full.z_loc.cpu().numpy(), z_ss.mgr.z_loc.cpu().numpy())

In [None]:
plt.scatter(z_ss.full.z_loc.cpu().numpy(), z_ss.logo.z_loc.cpu().numpy())

In [None]:
plt.scatter(z_ss.full.z_loc.cpu().numpy(), z_ss.des.z_loc.cpu().numpy())

In [None]:
data_ino = NewCompany(name = "in-n-out", read_dir = "../../../extract_features/new_logo_outputs/")
data_ino.make_torch(cuda = True)
z_ino = MultiviewZ(lmvae, data_ino)

In [None]:
plt.scatter(z_ss.full.z_loc.cpu(), z_ino.full.z_loc.cpu())

In [None]:
pred_ss = Predict(lmvae, z_ss.des)
prof_ss = profile(pred_ss, data.training)

In [None]:
prof_ss["bin"]

In [None]:
pred_ino = Predict(lmvae, z_ino.des)
prof_ino = profile(pred_ino, data.training)

In [None]:
prof_ino["bin"]