In [1]:
import os
import sys
sys.path.insert(1, '../')

# Per-modality dimension settings passed to LogoMVAE as its decoder / encoder
# dimension dicts (their exact meaning is defined in model.py).
DECODER_DIMS = {"text": 400, "bin": 400, "cat": 400, "bp": 400, "indus": 400}
ENCODER_DIMS = {"full": 400, "res": 200, "mgr": 200, "design": 200}

# Latent size: first argument to LogoMVAE and the width of the Z sampled in RandomBrand.
K = 20

# Number of cross-validation folds used when the holdout index sets are built.
FOLDS = 4
# Training-run settings; in the cells shown here they are only recorded in `givens`.
BATCHES = 1600
ITERS = 10

ADAM_LR = 0.00001
# Not referenced in the cells shown here.
MIN_AF = 1e-6
ANNEALING_BATCHES = 1100
NUM_PARTICLES = 1

# When True, the bp matrix is standardized column-wise right after loading.
CENTER_BP = True

WEIGHT_DECAY = 0.

# Not referenced in the cells shown here.
DISABLE_TQDM = True

In [2]:
import numpy as np
import torch
import torchvision.datasets as dset
import torch.nn as nn
import torchvision.transforms as transforms

import pyro
import pyro.distributions as dist
import pyro.contrib.examples.util  # patches torchvision
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from pyro import poutine

# Fix the random seeds (through Pyro, then Python's `random` module) so that
# sampled values are repeatable across runs.
pyro.set_rng_seed(42)

import random
random.seed(42)

import pandas as pd

from skimage import io
from sklearn import metrics
from matplotlib import pyplot as plt

from tqdm.auto import tqdm, trange

from sklearn.metrics import classification_report

from data import SplitData
from model import LogoMVAE

# Stop immediately unless the installed Pyro reports a 1.3.0 version string.
assert pyro.__version__.startswith('1.3.0')



## Helper functions


def split(a, n):
    """Lazily cut ``a`` into ``n`` consecutive chunks of near-equal length.

    The first ``len(a) % n`` chunks receive one extra element. The chunk
    sizes are computed when the function is called; the chunks themselves
    are sliced only as the returned generator is consumed.
    """
    size, remainder = divmod(len(a), n)

    def bounds(part):
        # Chunk `part` starts after `part` full chunks plus one extra element
        # for every earlier chunk that absorbed part of the remainder.
        lower = part * size + min(part, remainder)
        upper = (part + 1) * size + min(part + 1, remainder)
        return lower, upper

    return (a[slice(*bounds(part))] for part in range(n))

def compute_distance(z):
    """Return the square matrix of Euclidean distances between the rows of ``z``.

    ``z`` is a 2-D array with one point per row; entry ``[i, j]`` of the
    result is the distance between row ``i`` and row ``j``.
    """
    # offsets[i, j, :] = z[j] - z[i], via broadcasting against a column of rows.
    offsets = z - z.reshape(z.shape[0], 1, z.shape[1])
    squared = np.einsum('ijk, ijk->ij', offsets, offsets)
    return np.sqrt(squared)





## Data Loading

# First, load text data, and apply word filter. Note on notation: `tx` stands for "true x," because the model variables are also called x.

textdf = pd.read_csv("../../../data/web_dtfm20_binary.csv", index_col=0)
tx_text = textdf.values
# Word filter: keep the columns whose column total exceeds 0.05.
# NOTE(review): the file name suggests 0/1 entries; if so, this only drops words
# that never occur -- confirm that a sum (rather than a mean) threshold is intended.
seltext = tx_text.sum(0) > 0.05
tx_text = textdf.values[:,seltext]

# Row filter: keep the rows whose total over the retained words exceeds 20.
# The same positional mask is applied to every other feature table below, which
# assumes all CSVs list their rows in the same order -- TODO confirm.
gt20words = tx_text.sum(1) > 20
tx_text = tx_text[gt20words,:]

# Column labels of the retained words, and the final (rows, words) dimensions.
words = textdf.columns[seltext]
N, V = tx_text.shape

# Binary logo features, restricted to the rows kept by the text filter.
binfeats = pd.read_csv("../../../data/y_bin_all_py2.csv", index_col=0)
tx_b = binfeats.values[gt20words,:]
M_b = tx_b.shape[1]

catfeats = pd.read_csv("../../../data/y_mult_ncolors_py2.csv", index_col=0)


def _cat_feature(col):
    """Return categorical column ``col`` for the kept rows, plus its level count.

    The values come back as an (N, 1) column; the count is the number of
    distinct values observed among the kept rows.
    """
    codes = catfeats.values[:,col][gt20words]
    return np.expand_dims(codes, 1), len(np.unique(codes))


tx_c1, M_c1 = _cat_feature(0)
tx_c2, M_c2 = _cat_feature(1)
tx_c3, M_c3 = _cat_feature(2)
tx_c4, M_c4 = _cat_feature(3)
tx_c5, M_c5 = _cat_feature(4)

# Display labels for the levels of the five categorical logo features.
# profile() assigns each array as the index of the matching predicted
# probability vector, so the order is assumed to follow the integer level
# codes in the data -- TODO confirm against the feature-extraction code.
c1_labels = np.array(["black","blue_dark","blue_light","blue_medium","brown","green_dark",
                      "green_light","grey_dark","grey_light","orange","red","red_dark",
                      "yellow"])

c2_labels = np.array(["circle","rect-oval_medium","rect-oval_large","rect-oval_thin",
                      "square","triangle"])

c3_labels = np.array(["bad_letters","bulky_hollow_geometric","circular","dense_simple_geom",
                      "detailed_circle","hollow_circle","detailed_hor","long_hor","no_mark",
                      "simple","square","thin_vert_rect","vert_narrow","detailed","thin",
                      "hor_wispy"])

c4_labels = np.array(["nochars","sans","serif"])

c5_labels = np.array(["one_color","two_colors","three_colors","many_colors"])

bp = pd.read_csv("../../../data/bp_avg_all_traits.csv", index_col=0)

# Column names of the bp table; used later as row labels of the bp profile.
bp_labels = bp.columns

tx_bp = bp.values
tx_bp = tx_bp[gt20words]
if CENTER_BP:
    # Standardize every column to mean 0 and standard deviation 1, using all
    # retained rows (ndarray.std defaults to the population form, ddof=0).
    tx_bp = (tx_bp - tx_bp.mean(0)) / tx_bp.std(0)
M_bp = tx_bp.shape[1]

indus = pd.read_csv("../../../data/industry_codes_b2bc.csv", index_col=0)
# Keep only the firms that also appear in `bp`, ordered by firm key.
# np.isin replaces the deprecated np.in1d and matches its use elsewhere in the notebook.
# NOTE(review): the positional mask `gt20words` applied below assumes this row
# order matches the other feature tables -- confirm.
indus = indus.iloc[np.isin(indus.index, bp.index),:]
indus = indus.sort_index()

tx_indus = indus.values.astype('int')
# Keep the tags carried by more than nine firms. The mask is computed once and
# shared by the data columns and their labels, so the two cannot drift apart.
_frequent_tag = tx_indus.sum(0) > 9
tx_indus = tx_indus[:, _frequent_tag]
tx_indus = tx_indus[gt20words,:]
M_indus = tx_indus.shape[1]

indus_labels = indus.columns[_frequent_tag]

# Firm identifiers (index of the binary-feature table) for the retained rows.
allnames = binfeats.index.values[gt20words]

# Combined width of the logo modalities (binary block plus the five categoricals).
_logo_size = M_b + M_c1 + M_c2 + M_c3 + M_c4 + M_c5

# Width of every input block, plus two aggregate widths.
x_sizes = dict(
    text=V,
    bin=M_b,
    cat1=M_c1,
    cat2=M_c2,
    cat3=M_c3,
    cat4=M_c4,
    cat5=M_c5,
    bp=M_bp,
    indus=M_indus,
    logo=_logo_size,
    all=V + _logo_size + M_bp + M_indus,
)

# Total input width seen by each task-specific encoder.
task_sizes = {
    "full": x_sizes["all"],
    "res": x_sizes["logo"] + x_sizes["indus"],
    "design": x_sizes["text"] + x_sizes["bp"] + x_sizes["indus"],
    "mgr": x_sizes["all"] - x_sizes["bp"],
}

# Number of levels of each categorical feature, in cat1..cat5 order.
noptions = np.array([x_sizes[key] for key in ("cat1", "cat2", "cat3", "cat4", "cat5")])


## Training: Instantiate Model and Run

# One-row table recording the run configuration; np.concatenate coerces every
# entry to a common numeric dtype (so CENTER_BP is stored as a number).
_given_values = [
    [K],
    list(DECODER_DIMS.values()),
    list(ENCODER_DIMS.values()),
    [BATCHES],
    [ITERS],
    [ADAM_LR],
    [ANNEALING_BATCHES],
    [NUM_PARTICLES],
    [CENTER_BP],
    [WEIGHT_DECAY],
]
givens = pd.DataFrame(np.concatenate(_given_values)).T
givens.columns = [
    "K",
    "text_dec", "bin_dec", "cat_dec", "bp_dec", "indus_dec",
    "full_enc", "logo_enc", "mgr_enc", "des_enc",
    "batches", "iters", "adam_lr", "annealing_batches", "num_particles",
    "center_bp", "weight_decay",
]


# Create holdout and cross-validation subsets (just the indices):

# For i < FOLDS, holdout_indices[i] / fold_indices[i] are the held-out / training
# row indices of fold i. The extra final entry (index FOLDS) holds out nothing and
# trains on every row; it is the entry later selected with holdout_indices[FOLDS].
# The empty holdout is integer-typed so it is always a valid index array.
_no_holdout = np.array([], dtype=int)

if FOLDS > 1:
    holdout_indices = list(split(np.arange(N), FOLDS))
    holdout_indices.append(_no_holdout)
    fold_indices = [np.setdiff1d(np.arange(N), holdout_indices[i]) for i in range(FOLDS)]
    fold_indices.append(np.arange(N))
else:
    # No cross-validation: every slot up to index FOLDS means "hold out nothing".
    # This keeps holdout_indices[FOLDS] valid for FOLDS == 1 (previously an
    # IndexError) and makes sure fold_indices is always defined.
    holdout_indices = [_no_holdout for _ in range(max(FOLDS, 0) + 1)]
    fold_indices = [np.arange(N) for _ in holdout_indices]

In [3]:
# Bundle all modalities; test_indices is the final (empty) holdout set, so
# presumably data.training then contains every firm -- confirm in data.py.
data = SplitData(tx_text, tx_b, tx_c1, tx_c2, tx_c3, tx_c4, tx_c5, tx_bp, tx_indus, 
                     allnames, noptions, test_indices = holdout_indices[FOLDS]) 

# Convert the training split's arrays to torch tensors (see SplitData in data.py).
data.training.make_torch()

In [4]:
# Instantiate the model with latent size K and the dimension dicts defined above.
# use_cuda=True: the rest of the notebook also moves inputs to the GPU.
lmvae = LogoMVAE(K, ENCODER_DIMS, DECODER_DIMS, x_sizes, task_sizes, use_cuda=True)

In [5]:
# Load previously saved parameters into Pyro's global parameter store
# (path is relative to the notebook's working directory).
pyro.get_param_store().load("x112520-212918/x112520-212918.pt")

In [6]:
# Register every sub-network with Pyro under its attribute name;
# update_module_params=True copies the stored parameters into the modules.
for _module_name in (
    "encoder",
    "text_decoder",
    "bin_decoder",
    "cat1_decoder",
    "cat2_decoder",
    "cat3_decoder",
    "cat4_decoder",
    "cat5_decoder",
    "bp_decoder",
    "indus_decoder",
):
    pyro.module(_module_name, getattr(lmvae, _module_name), update_module_params=True)

In [7]:
# Put the model in evaluation mode (presumably torch's nn.Module.eval -- confirm
# in model.py); the trailing ';' suppresses the cell's output.
lmvae.eval();

# Test: Re-create Neighbors Table

In [8]:
lmvae.predict(data.training)

# Latent means of the training firms and their names.
z = lmvae.pred.z.z_loc.cpu().numpy()
end_names = data.training.names

dist_z = compute_distance(z)

test_firms = ['itw','harman-intl','lilly','goldman-sachs','21st-century-fox','facebook','gucci','old-navy','3m','actavis','mcdonalds', 'kfc']

# One row of the distance matrix per probe firm (first match on the firm name).
_firm_rows = [dist_z[np.where(end_names == firm)[0][0],:] for firm in test_firms]

# Position 0 of each sorted row is skipped (the firm's zero distance to itself),
# leaving the four nearest other firms.
test_neighbors = [end_names[row.argsort()][1:5] for row in _firm_rows]
test_dist = [np.sort(row.round(2))[1:5] for row in _firm_rows]
formatted_neighbors = [", ".join(neighbors.tolist()) for neighbors in test_neighbors]

neighbors_df = pd.DataFrame(test_neighbors, index=test_firms)
neighbors_df.columns = np.arange(1,5)

neighbors_df

Unnamed: 0,1,2,3,4
itw,textron,csc,ppg,owens-corning
harman-intl,ch2m-hill-companies,thermo-fisher-scientific,wipro,danaher
lilly,state-farm-insurance-cos,delta,johnsons,eli-lilly-and-company
goldman-sachs,kkr,cbre,mckinsey,jones-lang-lasalle-incorporated
21st-century-fox,gamestop,iheartmedia,nbc,essendant
facebook,twitter,uber,salesforce,qualcomm
gucci,mac,wynn-resorts,cartier,dior
old-navy,gap,taco-bell,subway,sprite
3m,hp,western-digital,xerox,carrier
actavis,aes,yahoo,conagra-foods,ingredion-incorporated


## Generate random brands

In [9]:
from model import Predict

class PredictNoVar():
    """Decode latent points into every output modality, keeping point outputs only.

    Parameters
    ----------
    lmvae : model exposing ``bp_decoder``, ``bin_decoder``, ``indus_decoder``,
        ``text_decoder`` and ``cat1_decoder`` .. ``cat5_decoder`` callables.
    Z : tensor of latent points; it is moved to the GPU before decoding.

    Attributes ``bp``, ``bin``, ``indus``, ``text`` and ``cat1`` .. ``cat5`` hold
    the decoder outputs as detached CPU tensors. For ``bp`` only the first
    element of the decoder's output is kept (presumably the mean, with the
    variance dropped as the class name suggests -- confirm in model.py).
    """

    def __init__(self, lmvae, Z):
        # Transfer the latent points once instead of once per decoder.
        Z = Z.cuda()

        # Pure inference: skip building autograd graphs that would only be detached.
        with torch.no_grad():
            bp = lmvae.bp_decoder(Z)
            self.bp = bp[0].cpu().detach()
            self.bin = lmvae.bin_decoder(Z).cpu().detach()
            self.indus = lmvae.indus_decoder(Z).cpu().detach()
            self.text = lmvae.text_decoder(Z).cpu().detach()
            self.cat1 = lmvae.cat1_decoder(Z).cpu().detach()
            self.cat2 = lmvae.cat2_decoder(Z).cpu().detach()
            self.cat3 = lmvae.cat3_decoder(Z).cpu().detach()
            self.cat4 = lmvae.cat4_decoder(Z).cpu().detach()
            self.cat5 = lmvae.cat5_decoder(Z).cpu().detach()

In [10]:
class RandomBrand():
    """Draw ``N`` latent points from a standard normal and decode them.

    ``Z`` holds the sampled (N, K) latent points and ``pred`` the
    ``PredictNoVar`` decoding of those points through ``lmvae``.
    """

    def __init__(self, lmvae, K, N = 100):
        self.K, self.N = K, N
        standard_normal = dist.Normal(loc=torch.tensor(0.), scale=torch.tensor(1.))
        self.Z = standard_normal.sample([N,self.K])
        self.pred = PredictNoVar(lmvae, self.Z)

In [11]:
def raw_profile(pred, i = 0):
    """Collect entry ``i`` of every modality of ``pred`` into a dict.

    The dict is keyed by modality name, in the order
    bp, bin, indus, text, cat1, ..., cat5.
    """
    modalities = ('bp', 'bin', 'indus', 'text',
                  'cat1', 'cat2', 'cat3', 'cat4', 'cat5')
    return {modality: getattr(pred, modality)[i] for modality in modalities}

In [12]:
def _relative_profile(probs, base_rates, labels):
    """Tabulate predicted probabilities beside their deviation from base rates.

    Returns a DataFrame indexed by ``labels`` with columns "Rel Prob"
    (``probs - base_rates``) and "Prob", sorted by "Rel Prob" descending.
    """
    act_probs = pd.DataFrame(probs)
    act_probs.index = labels
    act_probs.columns = ["Prob"]

    rel_probs = pd.DataFrame(probs - base_rates)
    rel_probs.index = labels
    rel_probs.columns = ["Rel Prob"]

    table = pd.concat([rel_probs, act_probs], axis=1)
    return table.sort_values(by="Rel Prob", ascending=False)


def _level_shares(codes, n_levels):
    """Share of rows of ``codes`` at each level ``0 .. n_levels - 1``.

    Levels that never occur get a share of 0 instead of being dropped, so the
    result always lines up with a label array of length ``n_levels``.
    """
    flat_codes = codes.cpu().numpy().flatten().astype(int)
    return np.bincount(flat_codes, minlength=n_levels) / codes.shape[0]


def profile(pred, data, i = 0):
    """Summarize prediction ``i`` of ``pred`` relative to the averages in ``data``.

    Parameters
    ----------
    pred : object with ``bp``, ``bin``, ``indus``, ``text`` and ``cat1`` ..
        ``cat5`` outputs (as read by ``raw_profile``).
    data : dataset whose ``bin``, ``indus``, ``text`` and ``cat1`` .. ``cat5``
        tensors provide the base rates.
    i : which prediction to profile.

    Returns
    -------
    dict of DataFrames keyed "bp", "text", "indus", "bin", "cat1" .. "cat5".
    The "bp" table has a single "Rel Values" column; every other table has
    "Rel Prob" (prediction minus base rate) and "Prob". All are sorted
    descending by their first column. Row labels come from the module-level
    ``bp_labels``, ``words``, ``indus_labels``, ``binfeats`` and ``c*_labels``.
    """
    raw = raw_profile(pred, i)
    profiles = {}

    # BP: the predicted values are reported as they are.
    bp_profile = pd.DataFrame(raw["bp"])
    bp_profile.index = bp_labels
    bp_profile.columns = ["Rel Values"]
    profiles["bp"] = bp_profile.sort_values(by="Rel Values", ascending=False)

    # Multi-label modalities: the base rate is the column mean over the data.
    for key, labels in (("text", words),
                        ("indus", indus_labels),
                        ("bin", binfeats.columns)):
        base_rates = getattr(data, key).mean(0).cpu().numpy()
        profiles[key] = _relative_profile(raw[key], base_rates, labels)

    # Categorical modalities: the base rate is the share of rows at each level.
    for key, labels in (("cat1", c1_labels),
                        ("cat2", c2_labels),
                        ("cat3", c3_labels),
                        ("cat4", c4_labels),
                        ("cat5", c5_labels)):
        base_rates = _level_shares(getattr(data, key), len(labels))
        profiles[key] = _relative_profile(raw[key], base_rates, labels)

    return profiles

In [13]:
# Sample a single random latent point and decode it.
testgen = RandomBrand(lmvae, K, N = 1)

In [14]:
# Profile the generated brand (default i = 0) against the training-set averages.
testprof = profile(testgen.pred, data = data.training)

In [15]:
# Five highest bp values of the generated brand (the table is sorted descending).
testprof['bp'].head(5)

Unnamed: 0,Rel Values
secure,0.879659
reliable,0.755486
up.to.date,0.747348
successful,0.715364
honest,0.704584


In [16]:
# Five lowest bp values of the generated brand.
testprof['bp'].tail(5)

Unnamed: 0,Rel Values
small.town,-0.251441
masculine,-0.560919
tough,-0.566793
rugged,-0.593435
outdoorsy,-0.600185


In [17]:
# Five industry tags most over-represented relative to the training base rates.
testprof['indus'].head(5)

Unnamed: 0,Rel Prob,Prob
B2C,0.243907,0.936541
Travel.and.Tourism,0.08882,0.14831
Internet.Services,0.07164,0.153793
Information.Technology,0.061268,0.133506
Financial.Services,0.054592,0.180654


# Brand Arithmetic

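*We don't actually need `get_company` for the analyses below (here it is only used to inspect raw inputs); the `CompanyData` class defined with it is reused later as the base class of `NewCompany`:*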

In [18]:
class CompanyData():
    """Plain attribute container for the feature tensors of one or more companies."""
    pass


# Modalities copied by get_company: attribute `f` is read from `data.x_<f>`.
_COMPANY_FIELDS = ("text", "bin", "cat1", "cat2", "cat3", "cat4", "cat5", "bp", "indus")


def get_company(data, index=None, name=None, cuda=False):
    """Extract the features of selected rows of ``data`` as float tensors.

    Exactly one of ``index`` and ``name`` must be given.

    Parameters
    ----------
    data : object exposing ``x_text``, ``x_bin``, ``x_cat1`` .. ``x_cat5``,
        ``x_bp``, ``x_indus`` and ``x_names``.
    index : row selector applied directly to each ``x_*`` array (an integer or
        an index array).
    name : company name; rows where ``data.x_names == name`` are selected.
    cuda : when True, every tensor is moved to the GPU.

    Returns
    -------
    CompanyData with attributes ``text``, ``bin``, ``cat1`` .. ``cat5``, ``bp``
    and ``indus``.

    Raises
    ------
    Exception
        If neither or both of ``index`` and ``name`` are supplied.
    """
    # Identity checks: `== None` is evaluated element-wise for array-valued
    # indices and then fails when used as a truth value.
    if index is None and name is None:
        raise Exception("Need either an index or a name")

    if index is not None and name is not None:
        raise Exception("Can't have both an index and a name")

    # Either the caller's index or a boolean mask over the company names.
    selector = index if index is not None else (data.x_names == name)

    company = CompanyData()
    for field in _COMPANY_FIELDS:
        values = torch.tensor(getattr(data, "x_" + field)[selector], dtype = torch.float)
        setattr(company, field, values.cuda() if cuda else values)

    return company

In [19]:
# Pull the raw inputs of two named firms from the training split, on the GPU.
x_mckinsey = get_company(data.training, name = "mckinsey", cuda = True)
x_goldman = get_company(data.training, name = "goldman-sachs", cuda = True)

In [20]:
# Display the bp input row selected for "goldman-sachs".
x_goldman.bp

tensor([[-0.3897, -1.6536,  1.0376, -1.3042, -1.2885,  2.0831, -0.9798, -1.9506,
         -1.0082, -1.5210, -0.9744, -2.1583,  0.7038, -0.5330, -0.6251, -2.6404,
         -1.6058, -0.2057, -0.0750, -0.1845,  0.9328, -1.6500, -1.2597, -1.5817,
         -1.1877, -0.9718, -1.1507, -1.1697, -1.8489, -0.2959,  0.1408, -1.6526,
          0.9227, -0.5152, -0.4351, -1.4868, -1.4525, -0.3161,  2.3189, -0.4404,
         -1.4936, -0.7948]], device='cuda:0')

## Brand Interpolation

In [21]:
# Recompute the latent means and firm names (this repeats the start of the
# neighbors-table cell); interp() below reads `z` and `end_names` as globals.
lmvae.predict(data.training)

z = lmvae.pred.z.z_loc.cpu().numpy()
end_names = data.training.names

In [22]:
# Mean Euclidean norm of the latent means across firms.
# NOTE(review): interp() defaults to norm=6.17 rather than this value -- confirm
# where 6.17 comes from.
avg_norm = np.sqrt((z**2).sum(1)).mean()
avg_norm

3.7366588

In [23]:
def interp(logic1, logic2, n=10, w1=0.5, w2=0.5, return_z=False, norm=6.17):
    """Blend two groups of firms in latent space and find the nearest real firms.

    Reads the module-level ``z`` (latent means, one row per firm) and
    ``end_names`` (firm names in the same row order).

    Parameters
    ----------
    logic1, logic2 : row selectors (e.g. boolean masks) into ``z``; each
        group's mean latent vector is rescaled to Euclidean length ``norm``.
    n : number of nearest firms to return.
    w1, w2 : weights applied to the two rescaled group means.
    return_z : when True, return the blended latent vector itself.
    norm : length each group mean is rescaled to before blending.

    Returns
    -------
    The blended latent vector when ``return_z`` is True; otherwise the names
    of the ``n`` firms whose latent means lie closest to it, nearest first.
    """
    def rescaled_centroid(selector):
        centroid = z[selector].mean(0)
        return (norm / np.sqrt(np.sum(np.power(centroid,2)))) * centroid

    blend = w1 * rescaled_centroid(logic1) + w2 * rescaled_centroid(logic2)
    if return_z:
        return blend

    # Distance from the blend to every firm, computed directly. This avoids
    # building the full pairwise matrix and avoids relying on the blend itself
    # sorting first, which could index past `end_names` on an exact tie.
    offsets = z - blend
    dists = np.sqrt(np.einsum('ij,ij->i', offsets, offsets))
    return end_names[dists.argsort()[:n]]

In [24]:
# Ten firms nearest the equal-weight blend of "goldman-sachs" and "mckinsey".
interp(end_names == "goldman-sachs", end_names == "mckinsey", n=10)

array(['goldman-sachs', 'mckinsey', 'emcor-group', 'kkr',
       'jones-lang-lasalle-incorporated', 'cbre', 'health-net-inc',
       'the-bank-of-new-york-mellon', 'emc-corporation',
       'energy-future-holdings-corp'], dtype=object)

In [25]:
# Ten firms nearest the equal-weight blend of "nike" and "louis-vuitton".
interp(end_names == "nike", end_names == "louis-vuitton", n=10)

array(['adidas', 'mac', 'audi', 'under-armour', 'tiffany-co', 'zara',
       'louis-vuitton', 'bmw', 'heineken', 'porsche'], dtype=object)

## Health + {Tech, Finance}

In [26]:
# Boolean masks over training firms: True where any of the listed industry
# tag columns equals 1.
_indus_matrix = data.training.indus.cpu().numpy()

_tech_tags = np.isin(indus_labels, ["Hardware","Consumer.Electronics","Software"])
tech_company = _indus_matrix[:,_tech_tags].max(1) == 1

_health_tags = np.isin(indus_labels, ["Health.Care"])
health_company = _indus_matrix[:,_health_tags].max(1) == 1

_finance_tags = np.isin(indus_labels, ["Financial.Services"])
finance_company = _indus_matrix[:,_finance_tags].max(1) == 1

In [27]:
# Firms nearest the blend of the tech and health-care group centroids (defaults: n=10, equal weights).
interp(tech_company, health_company)

array(['cerner-corp', 'biogen-idec-inc', 'becton-dickinson-and-company',
       'cisco', 'celgene', 'baxter-international',
       'thermo-fisher-scientific', 'baxalta-incorporated', 'emerson',
       'mckesson'], dtype=object)

In [28]:
# Firms nearest the blend of the finance and health-care group centroids.
interp(finance_company, health_company)

array(['pnc', 'envision-healthcare-holdings', 'celgene', 'mckesson',
       'davita-healthcar', 'unum-group', 'centene-corporation',
       'citizens', 'first-american-financial-corporation',
       'teachers-insurance-and-annuity-association'], dtype=object)

## Shopping + {Data, Payments}

In [29]:
def _has_tag(tag):
    """Boolean mask of training firms whose ``tag`` industry column equals 1."""
    return data.training.indus.cpu().numpy()[:,np.isin(indus_labels, [tag])].max(1) == 1


payments_company = _has_tag("Payments")
shopping_company = _has_tag("Commerce.and.Shopping")
data_company = _has_tag("Data.and.Analytics")

In [30]:
# Firms nearest the blend of the payments and shopping group centroids.
interp(payments_company, shopping_company)

array(['sams-club', 'jc-penney', 'qvc', 'tracfone', 'walmart', 'staples',
       'kohls', 'burlington-stores', 'holiday-inn', 'ross-dress-for-less'],
      dtype=object)

In [31]:
# Firms nearest the blend of the shopping and data/analytics group centroids.
interp(shopping_company, data_company)

array(['sams-club', 'cablevision-systems-corporation',
       'coremark-holding-company', 'windstream', 'suntrust',
       'news-corporation', 'jetblue-airways-corporation',
       'alliance-data-systems-corp', 'expedia', '21st-century-fox'],
      dtype=object)

## Daring Fast Food

In [32]:
# Firms whose "daring" bp value exceeds 2 (standardized units when CENTER_BP is set).
_bp_matrix = data.training.bp.cpu().numpy()
daring_company = _bp_matrix[:, bp.columns == "daring"].flatten() > 2.
fastfood_company = np.isin(end_names, ["mcdonalds","burger-king","kfc"])

# Blend the two group centroids and keep the latent vector itself.
z_daring_ff = interp(daring_company, fastfood_company, w1 = 0.5, w2 = 0.5, return_z = True)

# Decode the blended point (as a batch of one) and profile it.
_z_daring_row = torch.tensor(z_daring_ff).unsqueeze(0)
pred_daring_ff = PredictNoVar(lmvae, _z_daring_row)
prof_daring_ff = profile(pred_daring_ff, data.training, i=0)

In [33]:
# Decoded bp values of the daring / fast-food blend, highest first.
prof_daring_ff['bp']

Unnamed: 0,Rel Values
cool,1.672018
trendy,1.642678
young,1.49802
exciting,1.443451
good.looking,1.407185
cheerful,1.40658
spirited,1.335416
charming,1.299812
original,1.292367
unique,1.094205


# Decision support

## McDonald's Analysis

In [36]:
class NewCompany(CompanyData):
    """Features of a company outside the training set, read from per-company CSVs.

    Parameters
    ----------
    name : file-name prefix of the company's CSV files inside ``read_dir``.
    noptions : number of levels of each categorical feature, in cat1..cat5 order.
    read_dir : directory prefix; it is concatenated directly with ``name``, so
        it must end with a path separator.
    """

    # Categorical attributes, in the same order as the entries of `noptions`.
    _CAT_FIELDS = ("cat1", "cat2", "cat3", "cat4", "cat5")
    # Every attribute that make_torch converts to a float tensor.
    _TENSOR_FIELDS = ("text", "bin") + _CAT_FIELDS + ("bp", "indus")

    def __init__(self, name, noptions, read_dir = "../../code/extract_features/new_logo_outputs/"):

        # The bp and industry files have no header row and are stored as one
        # column, so they are transposed into a single row.
        self.bp = pd.read_csv(read_dir + name + "_rel_bp.csv", header=None, index_col=0).values.T

        indus_df = pd.read_csv(read_dir + name + "_indus.csv", header=None, index_col=0)
        self.indus = indus_df.values.T

        new_bin = pd.read_csv(read_dir + name + "_y_bin.csv", index_col=0)
        self.bin = new_bin.values

        new_mult = pd.read_csv(read_dir + name + "_y_mult.csv", index_col=0)

        # One (rows, 1) column per categorical feature.
        for position, field in enumerate(self._CAT_FIELDS):
            setattr(self, field, np.expand_dims(new_mult.values[:,position], 1))

        new_text_df = pd.read_csv(read_dir + name + "_newrow_binary.csv", index_col=0)
        self.text = new_text_df.values

        self.noptions = noptions

    def make_torch(self, cuda = False):
        """Convert every feature to a float tensor and add one-hot categoricals.

        Adds ``cat1_hot`` .. ``cat5_hot`` and replaces ``text``, ``bin``,
        ``cat1`` .. ``cat5``, ``bp`` and ``indus`` with float tensors. With
        ``cuda=True`` every tensor is moved to the GPU.

        Both device paths share one code path. For a single-row company the
        one-hot tensors therefore have shape (1, levels) on CPU as well as on
        GPU; the CPU branch used to return a 1-D (levels,) tensor instead.
        """
        def place(tensor):
            return tensor.cuda() if cuda else tensor

        # One-hot encodings are built first, while the codes are still arrays.
        for position, field in enumerate(self._CAT_FIELDS):
            codes = torch.tensor(getattr(self, field), dtype = torch.int).long()
            hot = torch.nn.functional.one_hot(codes, int(self.noptions[position]))
            # Drop the leading axis of the (1, 1, levels) result for a single row.
            setattr(self, field + "_hot", place(hot.float().squeeze(0)))

        for field in self._TENSOR_FIELDS:
            setattr(self, field, place(torch.tensor(getattr(self, field), dtype = torch.float)))

In [38]:
from model import NewZ, Predict

def _encode_mcdonalds(logo_name):
    """Load one logo variant, move it to the GPU and encode it with the "mgr" network.

    Returns the ``NewCompany`` object and its ``NewZ`` encoding.
    """
    company = NewCompany(name = logo_name, noptions = noptions, read_dir = "../../extract_features/new_logo_outputs/")
    company.make_torch(cuda = True)
    return company, NewZ(lmvae, company, network = "mgr")


new_mcds1, z_mcds1 = _encode_mcdonalds("mcdonalds1")
new_mcds2, z_mcds2 = _encode_mcdonalds("mcdonalds2")
new_mcds0, z_mcds0 = _encode_mcdonalds("mcdonalds-old")

In [39]:
# Decode each logo variant's latent encoding, in the order 1, 2, old.
pred_mcds1, pred_mcds2, pred_mcds0 = (
    Predict(lmvae, z = latent) for latent in (z_mcds1, z_mcds2, z_mcds0)
)

In [40]:
# Predicted bp row for the "mcdonalds1" logo variant.
pred_mcds1.bp

array([[ 6.77089870e-01,  1.12842476e+00, -3.54679465e-01,
        -3.66932750e-01, -8.55696201e-02, -8.80889893e-01,
        -5.19719839e-01,  1.50119996e+00, -3.89508098e-01,
         1.41581368e+00,  2.50559956e-01,  1.29775846e+00,
        -4.71003413e-01,  6.99140411e-03,  2.76372820e-01,
         1.12295556e+00, -1.62236005e-01,  2.02023521e-01,
        -7.10482776e-01,  1.08048506e-01, -2.06103742e-01,
         4.16002631e-01,  5.18226981e-01,  7.54897177e-01,
         5.99876881e-01,  2.83316344e-01,  1.00432429e-03,
         1.07887793e+00,  9.61780310e-01,  1.22091949e+00,
        -1.54261038e-01,  3.26619774e-01,  3.76595289e-01,
        -1.15711081e+00, -5.14448388e-04, -1.76815376e-01,
        -1.85626417e-01, -6.23500168e-01, -1.01317203e+00,
         5.58635294e-01,  1.55228686e+00, -3.67451906e-02]], dtype=float32)

In [41]:
# Predicted bp row for the "mcdonalds2" logo variant.
pred_mcds2.bp

array([[ 1.0464348e+00,  1.4220246e+00, -4.0158965e-02,  1.1414982e-01,
         2.5533637e-01, -7.0920867e-01, -1.8241528e-01,  1.4660299e+00,
        -7.5984746e-04,  1.5487605e+00,  5.6187922e-01,  1.5350263e+00,
        -1.7790696e-01,  3.7190190e-01,  2.4560709e-01,  1.2354792e+00,
         2.3099519e-01,  3.2880977e-01, -4.5333362e-01,  3.5360804e-01,
        -3.9261603e-01,  6.7094690e-01,  3.8914201e-01,  8.7187845e-01,
         7.1042186e-01,  5.9002779e-02,  1.5230599e-01,  1.2949346e+00,
         1.1854568e+00,  9.9878800e-01,  1.7898516e-01,  6.8439043e-01,
         5.9773397e-01, -1.1017486e+00, -1.7030293e-01,  1.9258064e-01,
         1.6564436e-01, -2.4781516e-01, -7.2007847e-01,  4.4780701e-01,
         1.6812104e+00,  2.5870755e-01]], dtype=float32)

In [42]:
# Predicted bp row for the "mcdonalds-old" logo variant.
pred_mcds0.bp

array([[ 0.79224986,  1.327562  , -0.45407316, -0.44474575, -0.12397147,
        -1.0206999 , -0.6031766 ,  1.7190996 , -0.45620337,  1.6882973 ,
         0.30721906,  1.5141703 , -0.56377035, -0.02683702,  0.30001107,
         1.2416471 , -0.18469736,  0.19538213, -0.82685846,  0.06843727,
        -0.2844928 ,  0.4199068 ,  0.56854606,  0.83606863,  0.6227213 ,
         0.2689222 , -0.06299068,  1.2857987 ,  1.1004801 ,  1.4436342 ,
        -0.1920882 ,  0.37801376,  0.35834724, -1.3495125 , -0.06839258,
        -0.23756778, -0.22309238, -0.7567977 , -1.1837695 ,  0.6180643 ,
         1.8246703 , -0.04704958]], dtype=float32)

In [43]:
# Stack the bp column names with the three predicted rows and transpose, giving
# one row per bp column. The frame's columns are left unnamed, so the CSV header
# is 0..3, in the order: label, mcdonalds1, mcdonalds2, mcdonalds-old.
out = pd.DataFrame(np.vstack([bp.columns, pred_mcds1.bp, pred_mcds2.bp, pred_mcds0.bp]).T)
out.to_csv("../../mcdonalds/new_model_mcd_bp.csv")

## Shake Shack / In-n-Out Analysis

In [45]:
class MultiviewZ():
    """Encode one company with each of the four encoder networks.

    Attributes ``full``, ``mgr``, ``des`` and ``logo`` hold the ``NewZ`` results
    for the networks named "full", "mgr", "des" and "res" respectively.
    """

    # (attribute name, `network` argument handed to NewZ), in evaluation order.
    # NOTE(review): ENCODER_DIMS uses the key "design" while "des" is passed
    # here -- confirm that NewZ accepts "des".
    _VIEWS = (("full", "full"), ("mgr", "mgr"), ("des", "des"), ("logo", "res"))

    def __init__(self, lmvae, data):
        for attribute, network in self._VIEWS:
            setattr(self, attribute, NewZ(lmvae, data, network = network))

In [46]:
# Load the "shake-shack" feature files, convert them to GPU tensors (this must
# happen before encoding), then encode with every network.
data_ss = NewCompany(name = "shake-shack", noptions = noptions, read_dir = "../../extract_features/new_logo_outputs/")
data_ss.make_torch(cuda = True)
z_ss = MultiviewZ(lmvae, data_ss)

In [47]:
# Load the "in-n-out" feature files, convert them to GPU tensors (this must
# happen before encoding), then encode with every network.
data_ino = NewCompany(name = "in-n-out", noptions = noptions, read_dir = "../../extract_features/new_logo_outputs/")
data_ino.make_torch(cuda = True)
z_ino = MultiviewZ(lmvae, data_ino)

In [48]:
# Decode from the "des"-network encoding and profile it against the training averages.
pred_ss = Predict(lmvae, z_ss.des)
prof_ss = profile(pred_ss, data.training)

In [49]:
# Binary logo features for shake-shack, most over-predicted relative to base rate first.
prof_ss["bin"]

Unnamed: 0,Rel Prob,Prob
down_diag-high,0.175886,0.426594
perc_white-high,0.151304,0.400596
hor-low,0.122515,0.374640
color.black,0.113478,0.493081
gpc-high,0.100840,0.351548
...,...,...
high_sat_sd,-0.104077,0.128218
down_diag-low,-0.114388,0.133488
high_sat,-0.135243,0.095635
color.blue_medium,-0.140800,0.141070


In [50]:
# First categorical feature for shake-shack (rows labelled by c1_labels).
prof_ss["cat1"]

Unnamed: 0,Rel Prob,Prob
black,0.0595,0.328622
grey_dark,0.045875,0.094034
green_light,0.025205,0.046451
blue_dark,0.020318,0.098222
orange,0.015891,0.034305
grey_light,0.015779,0.03136
brown,0.011154,0.023901
yellow,-0.001234,0.011514
red_dark,-0.00127,0.021392
green_dark,-0.012492,0.027168


In [51]:
# Decode from the "des"-network encoding and profile it against the training averages.
pred_ino = Predict(lmvae, z_ino.des)
prof_ino = profile(pred_ino, data.training)

In [52]:
# Binary logo features for in-n-out, most over-predicted relative to base rate first.
prof_ino["bin"]

Unnamed: 0,Rel Prob,Prob
color.red,0.275955,0.537995
high_sat,0.241151,0.472030
perc_white-low,0.185646,0.436354
h_sym-high,0.182997,0.435122
color.yellow,0.133252,0.208322
...,...,...
many_chars,-0.109861,0.111102
font.width_has_orig,-0.112349,0.686518
low_sat_sd,-0.123577,0.101636
font.weight_has_orig,-0.155995,0.383665


In [53]:
# First categorical feature for in-n-out (rows labelled by c1_labels).
prof_ino["cat1"]

Unnamed: 0,Rel Prob,Prob
red,0.24647,0.395196
orange,0.030525,0.048938
red_dark,0.03038,0.053042
green_dark,0.027659,0.067319
yellow,0.023068,0.035816
green_light,0.019717,0.040963
brown,-0.002601,0.010147
blue_dark,-0.004081,0.073822
grey_light,-0.005945,0.009636
grey_dark,-0.027438,0.020721
