In [3]:
import os
import sys
sys.path.insert(1, '../')

# Per-domain sizes handed to LogoMVAE (see the constructor call further down).
# NOTE(review): presumably hidden-layer widths -- confirm against model.py.
DECODER_DIMS = {"text": 400, "bin": 400, "cat": 400, "bp": 400, "indus": 400}
ENCODER_DIMS = {"text": 400, "logo": 200, "bp": 200, "indus": 200}

# Dimension of the latent vector z (RandomBrand samples Z with shape [N, K]).
K = 20

# FOLDS: number of cross-validation holdout chunks built with split().
# BATCHES / ITERS: in the cells visible here they are only recorded in `givens`.
FOLDS = 4
BATCHES = 1600
ITERS = 10

# Optimiser / ELBO settings; in the cells visible here they are only recorded
# in `givens` (MIN_AF is not referenced at all in the visible cells).
ADAM_LR = 0.00001
MIN_AF = 1e-6
ANNEALING_BATCHES = 1100
NUM_PARTICLES = 1

# When True, every brand-personality column is z-scored after row filtering.
CENTER_BP = True

WEIGHT_DECAY = 0.

# Not referenced in the visible cells; presumably silences tqdm progress bars.
DISABLE_TQDM = True

In [4]:
import numpy as np
import torch
import torchvision.datasets as dset
import torch.nn as nn
import torchvision.transforms as transforms

import pyro
import pyro.distributions as dist
import pyro.contrib.examples.util  # patches torchvision
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from pyro import poutine

pyro.set_rng_seed(42)

import random
random.seed(42)

import pandas as pd

from skimage import io
from sklearn import metrics
from matplotlib import pyplot as plt

from tqdm.auto import tqdm, trange

from sklearn.metrics import classification_report

from data import SplitData
from model import LogoMVAE

assert pyro.__version__.startswith('1.3.0')



# Helper functions


def split(a, n):
    """Lazily cut `a` into `n` contiguous chunks of near-equal length.

    The first `len(a) % n` chunks get one extra element. Returns a generator
    of slices of `a`, in order.
    """
    base, extra = divmod(len(a), n)

    def bounds(chunk):
        # Chunks before `extra` are one element longer than the rest.
        start = chunk * base + min(chunk, extra)
        stop = start + base + (1 if chunk < extra else 0)
        return start, stop

    return (a[start:stop] for start, stop in map(bounds, range(n)))

def compute_distance(z):
    """Return the matrix of pairwise Euclidean distances between the rows of `z`.

    `z` is a 2-D array with one point per row; entry [i, j] of the result is
    the distance between row i and row j.
    """
    # offsets[i, j, :] = z[j] - z[i], via broadcasting against a (rows, 1, cols) view
    offsets = z - z[:, np.newaxis, :]
    squared = np.einsum('ijk, ijk->ij', offsets, offsets)
    return np.sqrt(squared)





## Data Loading

# First, load text data, and apply word filter. Note on notation: `tx` stands for "true x," because the model variables are also called x.

# Document-term matrix: one row per firm, one column per word.
textdf = pd.read_csv("../../../data/web_dtfm20_binary.csv", index_col=0)
tx_text = textdf.values
# Word filter: keep columns whose column SUM exceeds 0.05.
# NOTE(review): if the matrix is 0/1 (as the file name suggests) this keeps every
# word that occurs at least once -- confirm a mean/share threshold was not intended.
seltext = tx_text.sum(0) > 0.05
tx_text = textdf.values[:,seltext]

# Firm filter: keep rows with more than 20 retained words. This boolean mask is
# reused below on every other table, which assumes they share textdf's row order.
gt20words = tx_text.sum(1) > 20
tx_text = tx_text[gt20words,:]

# Vocabulary after filtering; N = number of firms kept, V = number of words kept.
words = textdf.columns[seltext]
N, V = tx_text.shape

# Binary logo features, restricted to the firms that passed the text filter.
binfeats = pd.read_csv("../../../data/y_bin_all_py2.csv", index_col=0)
tx_b = binfeats.values
tx_b = tx_b[gt20words,:]
M_b = tx_b.shape[1]

# Five categorical logo features, one per column of this file.
catfeats = pd.read_csv("../../../data/y_mult_ncolors_py2.csv", index_col=0)

# For each categorical feature: filter rows, count the distinct codes that are
# actually observed (M_c*), then reshape the codes to an (n, 1) column.
# NOTE(review): M_c* equals the number of levels only if every level occurs among
# the retained firms -- confirm against the c*_labels arrays.
tx_c1 = catfeats.values[:,0][gt20words]
M_c1 = len(np.unique(tx_c1))
tx_c1 = np.expand_dims(tx_c1, 1)

tx_c2 = catfeats.values[:,1][gt20words]
M_c2 = len(np.unique(tx_c2))
tx_c2 = np.expand_dims(tx_c2, 1)

tx_c3 = catfeats.values[:,2][gt20words]
M_c3 = len(np.unique(tx_c3))
tx_c3 = np.expand_dims(tx_c3, 1)

tx_c4 = catfeats.values[:,3][gt20words]
M_c4 = len(np.unique(tx_c4))
tx_c4 = np.expand_dims(tx_c4, 1)

tx_c5 = catfeats.values[:,4][gt20words]
M_c5 = len(np.unique(tx_c5))
tx_c5 = np.expand_dims(tx_c5, 1)

# Display labels for the five categorical logo features; profile() uses them as
# the row index of the cat1..cat5 tables.
# NOTE(review): presumably position k labels category code k -- confirm against
# the feature-extraction code.
c1_labels = np.array(["black","blue_dark","blue_light","blue_medium","brown","green_dark",
                      "green_light","grey_dark","grey_light","orange","red","red_dark",
                      "yellow"])

c2_labels = np.array(["circle","rect-oval_medium","rect-oval_large","rect-oval_thin",
                      "square","triangle"])

c3_labels = np.array(["bad_letters","bulky_hollow_geometric","circular","dense_simple_geom",
                      "detailed_circle","hollow_circle","detailed_hor","long_hor","no_mark",
                      "simple","square","thin_vert_rect","vert_narrow","detailed","thin",
                      "hor_wispy"])

c4_labels = np.array(["nochars","sans","serif"])

c5_labels = np.array(["one_color","two_colors","three_colors","many_colors"])

# Brand-personality ratings: one row per firm, one column per trait.
bp = pd.read_csv("../../../data/bp_avg_all_traits.csv", index_col=0)

bp_labels = bp.columns

tx_bp = bp.values
tx_bp = tx_bp[gt20words]
if CENTER_BP:
    # z-score every trait using the retained firms only
    tx_bp = (tx_bp - tx_bp.mean(0)) / tx_bp.std(0)
M_bp = tx_bp.shape[1]

# Industry tags: keep only firms that also have brand-personality ratings, then
# sort by firm id.
# NOTE(review): the row mask gt20words is positional, so this assumes the sorted
# industry table lines up row-for-row with the other tables -- confirm.
indus = pd.read_csv("../../../data/industry_codes_b2bc.csv", index_col=0)
indus = indus.iloc[np.isin(indus.index, bp.index),:]  # np.isin supersedes the deprecated np.in1d
indus = indus.sort_index()

tx_indus = indus.values.astype('int')
# Keep tags carried by more than 9 firms (counted before the gt20words row filter).
# The mask is computed once so the label list cannot drift from the kept columns.
frequent_tags = tx_indus.sum(0) > 9
tx_indus = tx_indus[:, frequent_tags]
tx_indus = tx_indus[gt20words,:]
M_indus = tx_indus.shape[1]

indus_labels = indus.columns[frequent_tags]

# Firm identifiers of the retained rows, taken from the binary-feature table's index.
allnames = binfeats.index.values[gt20words]

# Width of every observed block. "logo" is the binary features plus the five
# category counts; "all" is every block together.
x_sizes = {"text": V, 
           "bin": M_b, 
           "cat1": M_c1, 
           "cat2": M_c2, 
           "cat3": M_c3, 
           "cat4": M_c4, 
           "cat5": M_c5, 
           "bp": M_bp, 
           "indus": M_indus, 
           "logo": M_b + M_c1 + M_c2 + M_c3 + M_c4 + M_c5, 
           "all": V + M_b + M_c1 + M_c2 + M_c3 + M_c4 + M_c5 + M_bp + M_indus}

# Combined width of the blocks involved in each task:
#   full = everything, res = logo + industry, design = text + bp + industry,
#   mgr = everything except bp.
task_sizes = {"full": x_sizes["all"], 
              "res": x_sizes["logo"] + x_sizes["indus"], 
              "design": x_sizes["text"] + x_sizes["bp"] + x_sizes["indus"], 
              "mgr": x_sizes["all"] - x_sizes["bp"]}

# Number of observed levels per categorical feature, in cat1..cat5 order.
noptions = np.array([M_c1, M_c2, M_c3, M_c4, M_c5])


## Training: Instantiate Model and Run

# One-row table recording the run configuration. np.concatenate flattens all
# settings into a single numeric vector, so every entry (including the boolean
# CENTER_BP) shares one dtype.
# NOTE(review): the *_enc column names do not match the ENCODER_DIMS keys
# (text, logo, bp, indus); values are assigned by position -- confirm the labels.
_given_values = np.concatenate([
    [K],
    list(DECODER_DIMS.values()),
    list(ENCODER_DIMS.values()),
    [BATCHES],
    [ITERS],
    [ADAM_LR],
    [ANNEALING_BATCHES],
    [NUM_PARTICLES],
    [CENTER_BP],
    [WEIGHT_DECAY],
])
givens = pd.DataFrame(_given_values.reshape(1, -1))
givens.columns = [
    "K",
    "text_dec", "bin_dec", "cat_dec", "bp_dec", "indus_dec",
    "full_enc", "logo_enc", "mgr_enc", "des_enc",
    "batches", "iters", "adam_lr", "annealing_batches", "num_particles",
    "center_bp", "weight_decay",
]


# Create holdout and cross-validation subsets (just the indices):

# holdout_indices[i] / fold_indices[i] are the held-out and training row indices
# of fold i. In both branches the LAST entry is the "train on everything" split:
# an empty holdout paired with all N rows.
if FOLDS > 1:
    holdout_indices = list(split(np.arange(N), FOLDS))
    holdout_indices.append(np.array([]))
    fold_indices = [np.setdiff1d(np.arange(N), holdout_indices[i]) for i in range(FOLDS)]
    fold_indices.append(np.arange(N))
else:
    # No cross-validation: nothing is held out. fold_indices is defined here too,
    # so later cells can use it regardless of which branch ran.
    holdout_indices = [np.array([])]
    fold_indices = [np.arange(N)]


# Make `domain_mask` for holding out domains, one mask for each fold

from itertools import combinations

# Every non-empty subset of the four domains, ordered by subset size and then
# lexicographically; each subset becomes one 0/1 row of the mask table.
one_index = [subset
             for size in np.arange(1,5)
             for subset in combinations(np.arange(4), size)]
domain_mask_single = np.zeros((len(one_index), 4), dtype='int32')
for row, subset in enumerate(one_index):
    domain_mask_single[row, subset] = 1
    
# Note: 
#
# domain_mask_single = 
# array([[1, 0, 0, 0],
#        [0, 1, 0, 0],
#        [0, 0, 1, 0],
#        [0, 0, 0, 1],
#        [1, 1, 0, 0],
#        [1, 0, 1, 0],
#        [1, 0, 0, 1],
#        [0, 1, 1, 0],
#        [0, 1, 0, 1],
#        [0, 0, 1, 1],
#        [1, 1, 1, 0],
#        [1, 1, 0, 1],
#        [1, 0, 1, 1],
#        [0, 1, 1, 1],
#        [1, 1, 1, 1]], dtype=int32)
#
# column 1 = text
# column 2 = logo
# column 3 = bp
# column 4 = tags
    
full_mask_single = domain_mask_single[-1]  # everything
des_mask_single = domain_mask_single[-3]   # everything but logo
mgr_mask_single = domain_mask_single[-4]   # everything but BP
res_mask_single = domain_mask_single[8]    # logo + tags

In [6]:
# Final model: nothing is held out. The empty holdout is always the LAST entry of
# holdout_indices, so index it with -1; holdout_indices[FOLDS] is out of range
# when FOLDS == 1 because the single-fold list has only one element.
data = SplitData(tx_text, tx_b, tx_c1, tx_c2, tx_c3, tx_c4, tx_c5, tx_bp, tx_indus,
                 allnames, noptions, test_indices = holdout_indices[-1])

data.training.make_torch()

In [25]:
lmvae = LogoMVAE(K, ENCODER_DIMS, DECODER_DIMS, x_sizes, use_cuda=True)

In [26]:
pyro.get_param_store().load("x113020-105312/x113020-105312.pt")

In [27]:
# Bind the parameters loaded into the Pyro param store to the model's
# sub-modules. Each module is registered under its attribute name;
# update_module_params=True writes the stored values back into the module.
for _module_name in ["encoder",
                     "text_decoder",
                     "bin_decoder",
                     "cat1_decoder",
                     "cat2_decoder",
                     "cat3_decoder",
                     "cat4_decoder",
                     "cat5_decoder",
                     "bp_decoder",
                     "indus_decoder"]:
    pyro.module(_module_name, getattr(lmvae, _module_name), update_module_params=True)

In [28]:
lmvae.eval();

# Test: Re-create Neighbors Table

In [44]:
full_mask = torch.tensor(np.tile(full_mask_single, (data.training.N,1))).cuda()

In [120]:
# Turn each single-row domain mask into a (1, 4) CUDA tensor.
# as_tensor + reshape(1, -1) makes the cell safe to re-run: an input that is
# already a (1, 4) tensor comes out unchanged, whereas tensor(...).unsqueeze(0)
# would add another leading dimension on every execution.
full_mask_single = torch.as_tensor(full_mask_single).reshape(1, -1).cuda()
mgr_mask_single = torch.as_tensor(mgr_mask_single).reshape(1, -1).cuda()
des_mask_single = torch.as_tensor(des_mask_single).reshape(1, -1).cuda()
res_mask_single = torch.as_tensor(res_mask_single).reshape(1, -1).cuda()

In [29]:
# Encode every training firm with all four domains observed. `full_mask` is the
# (N, 4) mask built in the earlier cell; rebuilding it here from
# `full_mask_single` fails once that name has been converted to a CUDA tensor.
lmvae.predict(data.training, full_mask)

z = lmvae.pred.z.z_loc.cpu().numpy()
end_names = data.training.names

dist_z = compute_distance(z)

test_firms = ['itw','harman-intl','lilly','goldman-sachs','21st-century-fox','facebook','gucci','old-navy','3m','actavis','mcdonalds', 'kfc']

# Row of each probe firm in the training set (IndexError if a firm is absent).
test_rows = [np.where(end_names == firm)[0][0] for firm in test_firms]

# Position 0 of each sorted row is the firm itself (distance 0), so take 1..4.
test_neighbors = [end_names[dist_z[row, :].argsort()][1:5] for row in test_rows]
test_dist = [np.sort(dist_z[row, :].round(2))[1:5] for row in test_rows]
formatted_neighbors = [", ".join(neighbors.tolist()) for neighbors in test_neighbors]

neighbors_df = pd.DataFrame(test_neighbors)
neighbors_df.index = test_firms
neighbors_df.columns = np.arange(1,5)

neighbors_df

Unnamed: 0,1,2,3,4
itw,textron,owens-corning,ppg,jacobs-engineering
harman-intl,harman-international-industries,motorola,ch2m-hill-companies,abbvie-inc
lilly,delta,colgate,wrigleys,carmax
goldman-sachs,mckinsey,us-bank,old-republic-international-corporation,kkr
21st-century-fox,nbc,iheartmedia,windstream,gamestop
facebook,twitter,apple,ebay,uber
gucci,mac,dior,cartier,calvin-klein
old-navy,gap,ross-dress-for-less,qvc,taco-bell
3m,western-digital,panasonic,fedex,ch2m-hill-companies
actavis,navient-corporation,bcg,phillips,coremark-holding-company


## Generate random brands

In [30]:
from model import Predict

class PredictNoVar():
    """Decode latent points into every output domain, keeping point predictions only.

    Parameters
    ----------
    lmvae : model exposing `bp_decoder`, `bin_decoder`, `indus_decoder`,
        `text_decoder` and `cat1_decoder` .. `cat5_decoder` as callables.
    Z : torch.Tensor
        Latent points, one per row.

    Attributes `bp`, `bin`, `indus`, `text`, `cat1` .. `cat5` hold the decoder
    outputs as detached CPU tensors. For `bp` only the first element of the
    decoder's output is kept (the rest is discarded, hence "NoVar").
    """
    def __init__(self, lmvae, Z):
        # Run on whatever device the decoders live on instead of assuming CUDA,
        # and move Z there once rather than once per decoder.
        first_param = next(lmvae.bp_decoder.parameters(), None)
        device = Z.device if first_param is None else first_param.device
        Z = Z.to(device)

        # Outputs are detached anyway, so skip building the autograd graph.
        with torch.no_grad():
            bp = lmvae.bp_decoder(Z)
            self.bp = bp[0].cpu().detach()
            for domain in ("bin", "indus", "text", "cat1", "cat2", "cat3", "cat4", "cat5"):
                decoder = getattr(lmvae, domain + "_decoder")
                setattr(self, domain, decoder(Z).cpu().detach())

In [31]:
class RandomBrand():
    """Sample `N` latent points from a standard normal and decode them.

    Attributes: `K` (latent dimension), `N` (number of samples), `Z` (the
    sampled [N, K] latent tensor) and `pred` (a PredictNoVar built from Z).
    """
    def __init__(self, lmvae, K, N = 100):
        self.K = K
        self.N = N
        standard_normal = dist.Normal(loc=torch.tensor(0.), scale=torch.tensor(1.))
        self.Z = standard_normal.sample([N, self.K])
        self.pred = PredictNoVar(lmvae, self.Z)

In [32]:
def raw_profile(pred, i = 0):
    """Collect row `i` of every per-domain prediction on `pred` into a dict.

    The dict is keyed by domain name ('bp', 'bin', 'indus', 'text',
    'cat1' .. 'cat5'); each value is `pred.<domain>[i]`.
    """
    domains = ('bp', 'bin', 'indus', 'text', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5')
    return {domain: getattr(pred, domain)[i] for domain in domains}

In [33]:
def _relative_profile(values, baseline, labels):
    """Tabulate predicted values next to their difference from a baseline.

    Returns a DataFrame indexed by `labels` with columns "Rel Prob"
    (`values - baseline`) and "Prob" (`values`), sorted by "Rel Prob" descending.
    """
    act_probs = pd.DataFrame(values)
    act_probs.index = labels
    act_probs.columns = ["Prob"]

    rel_probs = pd.DataFrame(values - baseline)
    rel_probs.index = labels
    rel_probs.columns = ["Rel Prob"]

    table = pd.concat([rel_probs, act_probs], axis=1)
    return table.sort_values(by="Rel Prob", ascending=False)


def _category_shares(codes):
    """Share of rows carrying each category code, ordered by code value.

    Only codes that actually occur in `codes` get an entry.
    NOTE(review): the result therefore lines up with the label arrays only if
    every level is present in the data -- confirm.
    """
    counts = pd.Series(codes.cpu().numpy().flatten()).value_counts().sort_index().to_numpy()
    return counts / codes.shape[0]


def profile(pred, data, i = 0):
    """Build readable per-domain profiles for prediction row `i` of `pred`.

    Parameters
    ----------
    pred : object with per-domain predictions (see `raw_profile`).
    data : reference data with tensor attributes `bin`, `indus`, `text` and
        `cat1` .. `cat5`; its column means / category shares are the baselines.
    i : int
        Row of `pred` to profile.

    Returns
    -------
    dict keyed by "bp", "text", "indus", "bin", "cat1" .. "cat5". Every entry
    except "bp" is a DataFrame with "Rel Prob" (prediction minus baseline) and
    "Prob" columns sorted by "Rel Prob"; "bp" has a single "Rel Values" column
    sorted descending.

    Row labels come from the notebook-level names `binfeats`, `words`,
    `indus_labels`, `bp_labels` and `c1_labels` .. `c5_labels`.
    """
    raw = raw_profile(pred, i)

    def column_means(block):
        return block.mean(0).cpu().numpy()

    # Binary logo features, industry tags, text: baseline is the column mean.
    bin_profile = _relative_profile(raw["bin"], column_means(data.bin), binfeats.columns)
    indus_profile = _relative_profile(raw["indus"], column_means(data.indus), indus_labels)
    text_profile = _relative_profile(raw["text"], column_means(data.text), words)

    # Categorical logo features: baseline is the empirical share of each level.
    cat1_profile = _relative_profile(raw["cat1"], _category_shares(data.cat1), c1_labels)
    cat2_profile = _relative_profile(raw["cat2"], _category_shares(data.cat2), c2_labels)
    cat3_profile = _relative_profile(raw["cat3"], _category_shares(data.cat3), c3_labels)
    cat4_profile = _relative_profile(raw["cat4"], _category_shares(data.cat4), c4_labels)
    cat5_profile = _relative_profile(raw["cat5"], _category_shares(data.cat5), c5_labels)

    # Brand personality: reported as-is, no baseline is subtracted here.
    bp_profile = pd.DataFrame(raw["bp"])
    bp_profile.index = bp_labels
    bp_profile.columns = ["Rel Values"]
    bp_profile = bp_profile.sort_values(by="Rel Values", ascending=False)

    return {"bp": bp_profile, "text": text_profile, "indus": indus_profile,
            "bin": bin_profile, "cat1": cat1_profile, "cat2": cat2_profile,
            "cat3": cat3_profile, "cat4": cat4_profile, "cat5": cat5_profile}

In [34]:
testgen = RandomBrand(lmvae, K, N = 1)

In [35]:
testprof = profile(testgen.pred, data = data.training)

In [36]:
testprof['bp'][0:5]

Unnamed: 0,Rel Values
technical,0.797391
corporate,0.643041
intelligent,0.34276
secure,0.154824
masculine,-0.051236


In [37]:
testprof['bp'][-5:]

Unnamed: 0,Rel Values
wholesome,-0.921379
friendly,-0.957724
sentimental,-0.997361
charming,-1.053445
cheerful,-1.130129


In [38]:
testprof['indus'][:5]

Unnamed: 0,Rel Prob,Prob
B2B,0.341685,0.845934
Government.and.Military,0.002559,0.047885
Data.and.Analytics,-5.5e-05,0.021191
Privacy.and.Security,-0.00193,0.0179
Telecommunications,-0.004925,0.036151


# Brand Arithmetic

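*The `get_company` helper below is not needed for the analyses that follow; it is kept only for inspecting a firm's raw training rows:*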

In [39]:
class CompanyData():
    """Plain attribute container for one company's per-domain data."""
    pass

def get_company(data, index=None, name=None, cuda=False):
    """Pull one company's rows out of `data` as float tensors.

    Parameters
    ----------
    data : object with array attributes `x_text`, `x_bin`, `x_cat1` .. `x_cat5`,
        `x_bp`, `x_indus`, and `x_names` (used for lookup by name).
    index : row selector applied directly to each array (int, slice or array).
    name : company name; rows where `data.x_names == name` are selected.
    cuda : if True, move every tensor to the GPU.

    Exactly one of `index` / `name` must be given, otherwise an Exception is raised.

    Returns
    -------
    CompanyData with attributes text, bin, cat1 .. cat5, bp, indus.
    """
    # `is None` rather than `== None`: index 0 and array-valued indices must both
    # count as "given" (`array == None` is elementwise and has no truth value).
    if (index is None) and (name is None):
        raise Exception("Need either an index or a name")

    if (index is not None) and (name is not None):
        raise Exception("Can't have both an index and a name")

    # Build the row selector once instead of once per field.
    selector = index if index is not None else (data.x_names == name)

    company = CompanyData()
    for field in ("text", "bin", "cat1", "cat2", "cat3", "cat4", "cat5", "bp", "indus"):
        values = torch.tensor(getattr(data, "x_" + field)[selector], dtype = torch.float)
        setattr(company, field, values.cuda() if cuda else values)

    return company

In [40]:
x_mckinsey = get_company(data.training, name = "mckinsey", cuda = True)
x_goldman = get_company(data.training, name = "goldman-sachs", cuda = True)

In [41]:
x_goldman.bp

tensor([[-0.3897, -1.6536,  1.0376, -1.3042, -1.2885,  2.0831, -0.9798, -1.9506,
         -1.0082, -1.5210, -0.9744, -2.1583,  0.7038, -0.5330, -0.6251, -2.6404,
         -1.6058, -0.2057, -0.0750, -0.1845,  0.9328, -1.6500, -1.2597, -1.5817,
         -1.1877, -0.9718, -1.1507, -1.1697, -1.8489, -0.2959,  0.1408, -1.6526,
          0.9227, -0.5152, -0.4351, -1.4868, -1.4525, -0.3161,  2.3189, -0.4404,
         -1.4936, -0.7948]], device='cuda:0')

## Brand Interpolation

In [45]:
lmvae.predict(data.training, full_mask)

z = lmvae.pred.z.z_loc.cpu().numpy()
end_names = data.training.names

In [46]:
avg_norm = np.sqrt((z**2).sum(1)).mean()
avg_norm

5.121642

In [47]:
def interp(logic1, logic2, n=10, w1=0.5, w2=0.5, return_z=False, norm=6.17):
    """Blend two groups of firms in latent space and list the nearest real firms.

    Reads the notebook-level latent matrix `z` and name array `end_names`.

    Parameters
    ----------
    logic1, logic2 : boolean masks over the rows of `z` selecting each group.
    n : number of neighbours to return.
    w1, w2 : weights of the two (rescaled) group centroids.
    return_z : if True, return the blended latent vector instead of names.
    norm : length each group centroid is rescaled to before blending.
        NOTE(review): the default 6.17 is hard-coded, while the `avg_norm` cell
        in this notebook prints 5.12 -- confirm which value is intended.

    Returns
    -------
    The blended latent vector if `return_z`, otherwise the names of the `n`
    training firms closest to it, nearest first.

    Raises
    ------
    ValueError if either mask selects no rows (the centroid would be NaN).
    """
    group1 = z[logic1]
    group2 = z[logic2]
    if len(group1) == 0 or len(group2) == 0:
        raise ValueError("interp: each selection must contain at least one firm")

    z1_unscaled = group1.mean(0)
    z2_unscaled = group2.mean(0)

    # Rescale both centroids to a common length so neither group dominates
    # merely because its mean lies further from the origin.
    z1_scaling = norm / np.sqrt(np.sum(np.power(z1_unscaled,2)))
    z2_scaling = norm / np.sqrt(np.sum(np.power(z2_unscaled,2)))

    z1 = z1_scaling * z1_unscaled
    z2 = z2_scaling * z2_unscaled

    blended = w1 * z1 + w2 * z2
    if return_z:
        return blended

    # Only distances from the blended point to the real firms are needed, so
    # compute that single row instead of the full pairwise matrix. This also
    # keeps the synthetic point out of the ranking, so it can never be used to
    # index `end_names`.
    dist_to_firms = np.sqrt(np.sum(np.power(z - blended, 2), axis=1))
    return end_names[dist_to_firms.argsort()[:n]]

In [48]:
interp(end_names == "goldman-sachs", end_names == "mckinsey", n=10)

array(['mckinsey', 'goldman-sachs', 'emcor-group', 'workday',
       'the-bank-of-new-york-mellon', 'state-street', 'cbre', 'kkr',
       'siemens', 'jones-lang-lasalle-incorporated'], dtype=object)

In [49]:
interp(end_names == "nike", end_names == "louis-vuitton", n=10)

array(['adidas', 'audi', 'disney', 'calvin-klein', 'mac', 'coach',
       'under-armour', 'bmw', 'nordstrom', 'norwegian-cruise'],
      dtype=object)

## Health + {Tech, Finance}

In [50]:
# Boolean firm masks: True where the firm carries at least one of the listed tags.
indus_matrix = data.training.indus.cpu().numpy()
tech_company = indus_matrix[:, np.isin(indus_labels, ["Hardware","Consumer.Electronics","Software"])].max(1) == 1
health_company = indus_matrix[:, np.isin(indus_labels, ["Health.Care"])].max(1) == 1
finance_company = indus_matrix[:, np.isin(indus_labels, ["Financial.Services"])].max(1) == 1

In [51]:
interp(tech_company, health_company)

array(['micron-technology', 'becton-dickinson-and-company', 'cerner-corp',
       'cisco', 'baxter-international', 'emerson',
       'thermo-fisher-scientific', 'biogen-idec-inc', 'celgene',
       'baxalta-incorporated'], dtype=object)

In [52]:
interp(finance_company, health_company)

array(['anthem', 'unum-group',
       'teachers-insurance-and-annuity-association', 'davita-healthcar',
       'centene-corporation', 'cigna', 'fidelity-national',
       'envision-healthcare-holdings', 'pnc', 'citizens'], dtype=object)

## Shopping + {Data, Payments}

In [53]:
# Boolean firm masks: True where the firm carries the listed industry tag.
indus_matrix = data.training.indus.cpu().numpy()
payments_company = indus_matrix[:, np.isin(indus_labels, ["Payments"])].max(1) == 1
shopping_company = indus_matrix[:, np.isin(indus_labels, ["Commerce.and.Shopping"])].max(1) == 1
data_company = indus_matrix[:, np.isin(indus_labels, ["Data.and.Analytics"])].max(1) == 1

In [54]:
interp(payments_company, shopping_company)

array(['sams-club', 'staples', 'walmart', 'maytag', 'kohls',
       'wells-fargo', 'marriott', 'frontier', 'capital-one', 'qvc'],
      dtype=object)

In [55]:
interp(shopping_company, data_company)

array(['sams-club', '21st-century-fox', 'alliance-data-systems-corp',
       'kelly-services', 'expedia', 'frontier', 'gamestop',
       'cablevision-systems-corporation', 'coremark-holding-company',
       'navient-corporation'], dtype=object)

## Daring Fast Food

In [56]:
daring_company = data.training.bp.cpu().numpy()[:, bp.columns == "daring"].flatten() > 2.
fastfood_company = np.isin(end_names, ["mcdonalds","burger-king","kfc"])

z_daring_ff = interp(daring_company, fastfood_company, w1 = 0.5, w2 = 0.5, return_z = True)

pred_daring_ff = PredictNoVar(lmvae, torch.tensor(z_daring_ff).unsqueeze(0))
prof_daring_ff = profile(pred_daring_ff, data.training, i=0)

In [57]:
prof_daring_ff['bp']

Unnamed: 0,Rel Values
trendy,1.647097
cool,1.56303
young,1.422094
cheerful,1.415841
charming,1.40915
good.looking,1.389537
exciting,1.368251
spirited,1.363425
original,1.25083
imaginative,1.12044


# Decision support

## McDonald's Analysis

In [94]:
class NewCompany(CompanyData):
    """Feature bundle for one new (out-of-sample) company, read from CSV files.

    Files are looked up as `read_dir + name + <suffix>`:
    `_rel_bp.csv`, `_indus.csv`, `_y_bin.csv`, `_y_mult.csv`, `_newrow_binary.csv`.

    Attributes after construction (numpy arrays): `bp`, `indus`, `bin`,
    `cat1` .. `cat5` (each an (n, 1) column of category codes), `text`, and
    `noptions` (number of levels per categorical feature, cat1..cat5 order).
    """
    def __init__(self, name, noptions, read_dir = "../../code/extract_features/new_logo_outputs/"):
        prefix = read_dir + name

        # bp and industry files have no header row and are stored as columns,
        # hence header=None and the transpose.
        self.bp = pd.read_csv(prefix + "_rel_bp.csv", header=None, index_col=0).values.T
        self.indus = pd.read_csv(prefix + "_indus.csv", header=None, index_col=0).values.T

        self.bin = pd.read_csv(prefix + "_y_bin.csv", index_col=0).values

        # One column per categorical feature; keep each as an (n, 1) column.
        new_mult = pd.read_csv(prefix + "_y_mult.csv", index_col=0).values
        for col in range(5):
            setattr(self, "cat%d" % (col + 1), np.expand_dims(new_mult[:, col], 1))

        self.text = pd.read_csv(prefix + "_newrow_binary.csv", index_col=0).values

        self.noptions = noptions

    def make_torch(self, cuda = False):
        """Convert every feature block to a float tensor and add one-hot codes.

        For each categorical feature `catK` a `catK_hot` attribute is added: the
        codes are one-hot encoded with `noptions[K-1]` classes and a leading
        singleton dimension is squeezed away, giving (1, n_levels) for a
        single-row company.

        Both the CPU and the CUDA path now share this one code path. The former
        CPU branch squeezed the codes before encoding and so returned
        (n_levels,), unlike the CUDA branch and unlike every other block, which
        keeps its leading row dimension.

        Parameters
        ----------
        cuda : if True, move all tensors to the GPU.
        """
        def place(tensor):
            return tensor.cuda() if cuda else tensor

        # One-hot encodings first, while cat1..cat5 still hold the raw codes.
        for k, cat_name in enumerate(["cat1", "cat2", "cat3", "cat4", "cat5"]):
            codes = torch.as_tensor(getattr(self, cat_name), dtype = torch.long)
            hot = torch.nn.functional.one_hot(codes, int(self.noptions[k])).float().squeeze(0)
            setattr(self, cat_name + "_hot", place(hot))

        for field in ["text", "bin", "cat1", "cat2", "cat3", "cat4", "cat5", "bp", "indus"]:
            values = torch.as_tensor(getattr(self, field), dtype = torch.float)
            setattr(self, field, place(values))
            


In [96]:
new_mcds1 = NewCompany(name = "mcdonalds1", noptions = noptions, read_dir = "../../extract_features/new_logo_outputs/")
new_mcds1.make_torch(cuda = True)

In [121]:
from model import NewZ, Predict

def _load_mgr_view(name):
    """Load a new company's features onto the GPU and encode it with the manager mask."""
    company = NewCompany(name = name, noptions = noptions, read_dir = "../../extract_features/new_logo_outputs/")
    company.make_torch(cuda = True)
    return company, NewZ(lmvae, company, mgr_mask_single)

# Two candidate logos and the old logo.
new_mcds1, z_mcds1 = _load_mgr_view("mcdonalds1")
new_mcds2, z_mcds2 = _load_mgr_view("mcdonalds2")
new_mcds0, z_mcds0 = _load_mgr_view("mcdonalds-old")

In [122]:
pred_mcds1 = Predict(lmvae, z = z_mcds1)
pred_mcds2 = Predict(lmvae, z = z_mcds2)
pred_mcds0 = Predict(lmvae, z = z_mcds0)

In [123]:
pred_mcds1.bp

array([[ 0.5934951 ,  1.0574461 , -0.2493328 , -0.30346575, -0.08721163,
        -0.67140776, -0.4628132 ,  1.3885912 , -0.35564035,  1.2248441 ,
         0.14356457,  1.197489  , -0.46056926,  0.00658364,  0.24414298,
         0.85758626,  0.00558295,  0.2509566 , -0.7386305 ,  0.19384232,
        -0.1694466 ,  0.43672863,  0.51414436,  0.6283392 ,  0.49808994,
         0.37862647, -0.05007655,  0.9564573 ,  0.73421293,  1.0728681 ,
        -0.07749314,  0.2665683 ,  0.45671365, -1.0317416 ,  0.05531666,
        -0.13865617, -0.21728939, -0.54711014, -1.0252931 ,  0.62427944,
         1.3706301 , -0.02232823]], dtype=float32)

In [124]:
pred_mcds2.bp

array([[ 0.8016872 ,  1.2099961 , -0.16119954, -0.02822491,  0.03198169,
        -0.5410377 , -0.3464926 ,  1.2648119 , -0.17086679,  1.3393182 ,
         0.29464975,  1.264971  , -0.2824357 ,  0.17275557,  0.09398585,
         0.8586496 ,  0.18187933,  0.17806382, -0.5685192 ,  0.23511949,
        -0.42377058,  0.4774607 ,  0.26879358,  0.59223795,  0.5058294 ,
         0.07062679,  0.01557026,  1.0807729 ,  0.80899495,  0.9002725 ,
         0.08394565,  0.40786234,  0.5084161 , -0.94709665, -0.20901935,
         0.02875178, -0.06286352, -0.31563312, -0.8368906 ,  0.4360091 ,
         1.4240582 ,  0.09100404]], dtype=float32)

In [125]:
pred_mcds0.bp

array([[ 0.6343989 ,  1.1358556 , -0.42143   , -0.53972185, -0.19472156,
        -0.8877269 , -0.5821122 ,  1.5319855 , -0.48639324,  1.3675879 ,
         0.07746692,  1.3263475 , -0.59917146, -0.11616736,  0.16887814,
         0.839994  , -0.11585759,  0.21568929, -0.94633853,  0.02876899,
        -0.19217685,  0.36135414,  0.6159008 ,  0.619541  ,  0.4012607 ,
         0.42210615, -0.17689584,  1.0622754 ,  0.7470898 ,  1.319301  ,
        -0.14317754,  0.25644478,  0.27995044, -1.180827  ,  0.04462747,
        -0.26683888, -0.32284263, -0.81985104, -1.2249033 ,  0.6695991 ,
         1.5081626 , -0.09563659]], dtype=float32)

In [126]:
out = pd.DataFrame(np.vstack([bp.columns, pred_mcds1.bp, pred_mcds2.bp, pred_mcds0.bp]).T)
out.to_csv("../../mcdonalds/new_model_mcd_bp.csv")

## Shake Shack / In-n-Out Analysis

In [129]:
class MultiviewZ():
    """Latent encodings of one company under each of the four domain masks.

    Attributes `full`, `mgr`, `des` and `logo` are NewZ results computed with the
    notebook-level masks `full_mask_single`, `mgr_mask_single`,
    `des_mask_single` and `res_mask_single` respectively.
    """
    def __init__(self, lmvae, data):
        view_masks = (("full", full_mask_single),
                      ("mgr", mgr_mask_single),
                      ("des", des_mask_single),
                      ("logo", res_mask_single))
        for view, mask in view_masks:
            setattr(self, view, NewZ(lmvae, data, mask))

In [131]:
data_ss = NewCompany(name = "shake-shack", noptions = noptions, read_dir = "../../extract_features/new_logo_outputs/")
data_ss.make_torch(cuda = True)
z_ss = MultiviewZ(lmvae, data_ss)

In [132]:
data_ino = NewCompany(name = "in-n-out", noptions = noptions, read_dir = "../../extract_features/new_logo_outputs/")
data_ino.make_torch(cuda = True)
z_ino = MultiviewZ(lmvae, data_ino)

In [133]:
pred_ss = Predict(lmvae, z_ss.des)
prof_ss = profile(pred_ss, data.training)

In [134]:
prof_ss["bin"]

Unnamed: 0,Rel Prob,Prob
down_diag-high,0.149530,0.400238
color.red,0.138071,0.400111
mean_light-high,0.124570,0.376694
h_sym-high,0.122882,0.375007
sd_light-low,0.112630,0.363339
...,...,...
sd_light-high,-0.077849,0.170027
many_chars,-0.093416,0.127547
up_diag-low,-0.099155,0.151554
down_diag-low,-0.120571,0.127304


In [135]:
prof_ss["cat1"]

Unnamed: 0,Rel Prob,Prob
red,0.105095,0.25382
green_light,0.041478,0.062724
blue_dark,0.026672,0.104576
orange,0.021229,0.039643
grey_light,0.007659,0.023239
brown,0.004555,0.017303
red_dark,0.00372,0.026383
grey_dark,0.002916,0.051075
yellow,0.000717,0.013464
green_dark,-0.001582,0.038078


In [136]:
pred_ino = Predict(lmvae, z_ino.des)
prof_ino = profile(pred_ino, data.training)

In [137]:
prof_ino["bin"]

Unnamed: 0,Rel Prob,Prob
color.red,0.293439,0.555478
high_sat,0.257376,0.488254
h_sym-high,0.217791,0.469915
perc_white-low,0.186905,0.437613
sd_light-low,0.147814,0.398523
...,...,...
font.weight_has_orig,-0.117107,0.422553
low_sat_sd,-0.118317,0.106895
color.grey_light,-0.120404,0.178462
many_chars,-0.123571,0.097393


In [138]:
prof_ino["cat1"]

Unnamed: 0,Rel Prob,Prob
red,0.314429,0.463154
red_dark,0.025812,0.048475
green_light,0.023881,0.045127
green_dark,0.020133,0.059793
orange,0.007429,0.025843
yellow,0.007376,0.020124
brown,-0.007117,0.005631
blue_dark,-0.0084,0.069504
grey_light,-0.011331,0.00425
grey_dark,-0.0313,0.016858
