## importing required libraries

In [18]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
import struct
import numpy as np
import numpy
import random
import tensorflow as tf
import os
import pickle as pickle
import matplotlib.pyplot as plt
import sys
from datetime import datetime
import time
%matplotlib inline

def format_time(t):
    """Render a date/time object as ``YYYY-MM-DD HH:MM:SS`` text."""
    timestamp_layout = "%Y-%m-%d %H:%M:%S"
    return t.strftime(timestamp_layout)

#this function helps to visualize the dict
from itertools import islice

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [*head]
# take(1, prod_desc.values())

# load data after creating features
def _coerce_item_record(record):
    """Convert the raw string fields of one item record to typed values, in place."""
    for c in ['price_delta_calc1', 'price_delta_calc2', 'price_delta_l4avg']:
        if c in record:
            record[c] = float(record[c])
    if 'price' in record:
        # A blank price is stored as 0 rather than rejected.
        record['price'] = 0 if record['price'] == '' else float(record['price'])
    if 'polarity' in record:
        record['polarity'] = round(float(record['polarity']), 2)
    if 'feature_vector' in record:
        raw_vector = record['feature_vector']
        if len(raw_vector) == 0:
            record['feature_vector'] = list(np.zeros(4524))
        else:
            # Drop the first and last character, then read one digit per character.
            record['feature_vector'] = [int(el) for el in raw_vector[1:-1]]
    for c in ['top_categories', 'rating', 'percentile_hotcoded', 'season', 'level4', 'sentiment']:
        if c in record:
            # NOTE(review): the first character and the last TWO characters are
            # dropped here (feature_vector drops only one) -- presumably this
            # matches how these columns are serialised; confirm against the data.
            record[c] = [int(el) for el in record[c][1:-2]]
    return record


def load_data_hybrid(data_path, min_items=2, min_users=2, sampling= True, sample_size = 0.5):
    """
    Parse the comma-separated feature file into id look-ups, rating sets and per-item features.

    The first line is the header. In every following line the first two fields
    are the external user id and item id, the third field is ignored, and the
    remaining fields are stored (keyed by header name) the first time an item
    is seen. Users and items are numbered 1, 2, ... in order of first appearance.

    data_path: path of the text file to read
    min_items: a user is kept in the filtered ratings only with MORE than this many items
    min_users: an item is kept in the filtered ratings only with MORE than this many users
    sampling, sample_size: accepted for compatibility but currently ignored
        (the sampling step is disabled)

    Returns (max_u_id, max_i_id, users, items, user_ratings_filtered,
    item_ratings_filtered, features) where ``features`` maps
    feature name -> {item id -> value}. The max ids are -1 and ``features`` is
    empty when the file holds no usable data rows.

    Side effects: re-seeds the global ``random`` generator with 0 and prints
    summary counts.
    """
    user_ratings = defaultdict(set)
    item_ratings = defaultdict(set)
    max_u_id = -1
    max_i_id = -1
    user_count = 0
    item_count = 0
    reviews = 0
    users = {}  # aid to id LUT
    items = {}  # asid to id LUT
    records = {}  # item id -> typed feature record
    random.seed(0)  # kept: later random draws in the process depend on this seed
    columns = None
    offset_to_features = 3
    with open(data_path, 'r') as f:
        # Stream the file instead of materialising every line with readlines().
        for line in f:
            split_line = line.split(",")
            if columns is None:
                columns = [e.rstrip() for e in split_line]
                continue
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no review.
                continue
            reviews += 1

            # Rows with more fields than the header, or too few fields to hold
            # the user/item ids, are malformed and skipped. The short case
            # previously crashed on tuple unpacking.
            if len(split_line) > len(columns) or len(split_line) < offset_to_features:
                continue
            auid, asid, _ = split_line[0:offset_to_features]

            if auid in users:
                u = users[auid]
            else:
                user_count += 1  # new user so increment
                users[auid] = user_count
                u = user_count

            if asid in items:
                i = items[asid]
            else:
                item_count += 1  # new item so increment
                items[asid] = item_count
                i = item_count
                # Features are taken from the first row that mentions the item.
                record = {columns[n]: split_line[n].rstrip()
                          for n in range(offset_to_features, len(split_line))}
                records[i] = _coerce_item_record(record)

            user_ratings[u].add(i)
            item_ratings[i].add(u)
            max_u_id = max(u, max_u_id)
            max_i_id = max(i, max_i_id)

    print ("max_u_id: ", max_u_id)
    print ("max_i_id: ", max_i_id)
    print ("reviews : ", reviews)

    # keep only users with more than min_items rated items
    num_u_id = 0
    num_i_id = 0
    num_reviews = 0
    user_ratings_filtered = defaultdict(set)
    for u, ids in user_ratings.items():
        if len(ids) > min_items:
            user_ratings_filtered[u] = ids
            num_u_id += 1
            num_reviews += len(ids)

    # keep only items with more than min_users raters
    item_ratings_filtered = defaultdict(set)
    for item_id, raters in item_ratings.items():
        if len(raters) > min_users:
            item_ratings_filtered[item_id] = raters
            num_i_id += 1

    if records:
        feature_keys = records[1].keys()  # should be same as columns[offset:]
        features = {k: {i: records[i][k] for i in range(1, len(records) + 1)} for k in feature_keys}
    else:
        # No data rows: there is no first record to take the feature names from.
        features = {}

    print ("u_id: ", num_u_id)
    print ("i_id: ", num_i_id)
    print ("reviews : ", num_reviews)
    return max_u_id, max_i_id, users, items, user_ratings_filtered,item_ratings_filtered, features

#load image features for the given asin collection into dictionary
def load_image_features(path, items):
    """
    Read fixed-size (asin, feature vector) records from a binary file.

    Each record is a 10-byte ASIN followed by 16384 bytes that are read as
    float32 values (4096 of them). Only records whose ASIN is a key of
    ``items`` are decoded.

    path: binary feature file
    items: mapping of ASIN -> internal item id (``str`` keys as produced by
        load_data_hybrid; ``bytes`` keys are accepted too)

    Returns {item id -> numpy array of features divided by 58.388599}.
    NOTE(review): the divisor is presumably a normalisation constant of the
    feature file -- confirm its origin.
    """
    asin_size = 10
    vector_size = 16384  # 4 * 4096 = 16KB, fast read, don't unpack
    image_features = {}
    with open(path, 'rb') as f:
        while True:
            asin = f.read(asin_size)
            # A binary read returns b'' at end of file; comparing against the
            # str '' never matched in Python 3, so the loop never terminated.
            if not asin:
                break
            features_bytes = f.read(vector_size)

            # ASINs are read as bytes but the item table is keyed by str.
            if asin in items:
                key = asin
            else:
                key = asin.decode('utf-8', errors='replace')
                if key not in items:
                    continue  # only unpack the vector if we need it -- big speed up

            # frombuffer replaces the deprecated binary mode of np.fromstring;
            # the division produces a fresh, writable array.
            features = np.frombuffer(features_bytes, dtype=np.float32) / 58.388599
            iid = items[key]
            if len(features) == 0:
                image_features[iid] = np.zeros(4096)
            else:
                image_features[iid] = features

    return image_features

def uniform_sample_batch(train_ratings, test_ratings, item_count, advanced_features):
    """
    Yield one training batch per user as (triples, positive features, negative features).

    For every item ``i`` a user has in ``train_ratings`` (except the item held
    out for that user in ``test_ratings``) one item ``j`` the user has not
    rated is drawn uniformly from 1..item_count. Pairs for which either item
    has no entry in ``advanced_features`` are dropped. A user is yielded only
    when more than one triple was collected.
    """
    neg_items = 2
    for u in train_ratings.keys():
        triples = []
        pos_rows = []
        neg_rows = []
        has_held_out = u in test_ratings.keys()
        for i in train_ratings[u]:
            # make sure the held-out test item is not used for training
            if has_held_out and i == test_ratings[u]:
                continue
            for _ in range(1, neg_items):
                j = random.randint(1, item_count)
                while j in train_ratings[u]:
                    j = random.randint(1, item_count)
                # sometimes there will not be features for a given product
                try:
                    pos_vec = advanced_features[i]
                    neg_vec = advanced_features[j]
                except KeyError:
                    continue
                pos_rows.append(pos_vec)
                neg_rows.append(neg_vec)
                triples.append([u, i, j])

        if len(pos_rows) > 1:
            yield numpy.asarray(triples), numpy.vstack(tuple(pos_rows)), numpy.vstack(tuple(neg_rows))

def test_batch_generator_by_user(train_ratings, test_ratings, item_ratings, item_count, advanced_features, cold_start = False, cold_start_thresh = 5, num_users=4000, num_candidates=100):
    """
    Yield one evaluation batch per sampled user (leave-one-out evaluation).

    For each sampled user the held-out item ``i = test_ratings[u]`` is paired
    with randomly drawn items ``j`` that are neither the held-out item nor in
    the user's training set. Items without an ``advanced_features`` entry are
    skipped; users with no usable pair are skipped.

    cold_start: when true, only users whose held-out item has fewer than
        ``cold_start_thresh`` raters in ``item_ratings`` are evaluated
    num_users: maximum number of users to sample (capped at the number available)
    num_candidates: maximum number of candidate items drawn per user

    Yields (triples, held-out item features, candidate item features).
    """
    # random.sample needs a sequence (dict views are rejected on Python 3.11+)
    # and raises if asked for more elements than exist, so cap the sample size.
    candidate_users = list(test_ratings.keys())
    sampled_users = random.sample(candidate_users, min(num_users, len(candidate_users)))

    # Item ids run from 1 to item_count (see uniform_sample_batch), so the
    # candidate pool must not start at 0 and must include item_count.
    item_ids = range(1, item_count + 1)
    n_candidates = min(num_candidates, len(item_ids))

    for u in sampled_users:
        i = test_ratings[u]
        if (cold_start and len(item_ratings[i]) > cold_start_thresh-1):
            continue
        try:
            pos_vec = advanced_features[i]
        except KeyError:
            # no features for the held-out item: no pair can be built
            continue

        t = []
        ilist = []
        jlist = []
        for j in random.sample(item_ids, n_candidates):
            # find item not in test[u] and train[u]
            if j == i or j in train_ratings[u]:
                continue
            try:
                neg_vec = advanced_features[j]
            except KeyError:
                continue
            t.append([u, i, j])
            ilist.append(pos_vec)
            jlist.append(neg_vec)

        if not ilist:
            continue
        yield numpy.asarray(t), numpy.vstack(tuple(ilist)), numpy.vstack(tuple(jlist))

def generate_test(user_ratings):
    '''
    For each user, randomly select one rated item as that user's test item.

    user_ratings: mapping of user id -> collection of item ids
    Returns a dict of user id -> selected item id. The selected item is NOT
    removed from ``user_ratings``.
    Raises ValueError if a user has no items.
    '''
    user_test = dict()
    for u, i_list in user_ratings.items():
        # random.sample only accepts sequences (sets raise TypeError on
        # Python 3.11+), so materialise the item collection first.
        user_test[u] = random.sample(list(i_list), 1)[0]
    return user_test

#user_count, item_count, users, items, user_ratings, item_ratings, brands, prices, prod_desc = load_data_hybrid(data_path, min_items=4, min_users=0, sampling= True, sample_size = 0.8)
def _one_hot_scaled_buckets(values, quant_levels):
    """One-hot encode each value into ``quant_levels + 1`` buckets scaled by the largest value."""
    encoded = {}
    if not values:
        return encoded
    top = float(max(values.values()))
    for key, value in values.items():
        vec = numpy.zeros(quant_levels + 1)
        if top == 0:
            # Every value is zero: dividing by the bucket width would divide by zero.
            idx = 0
        else:
            idx = int(numpy.ceil(float(value) / (top / quant_levels)))
            # Float rounding can put the maximum just above quant_levels;
            # clamp so the index always addresses a real bucket.
            idx = max(0, min(idx, quant_levels))
        vec[idx] = 1
        encoded[key] = vec
    return encoded


def transform_features (features):
    """
    Replace raw price, price-delta and brand features with one-hot vectors.

    features: mapping of feature name -> {item id -> raw value}; it is
        modified in place and also returned.

    * 'price' -> log(1 + price), bucketed into 11 slots
    * 'price_delta_calc1' / 'price_delta_calc2' / 'price_delta_l4avg' ->
      shifted by 1 + |minimum| before the log, bucketed into 6 slots
    * 'brand' -> one slot per distinct brand, numbered in order of first appearance

    Other features are left untouched.
    """
    if 'price' in features:
        prices = features['price']
        prices_log = {k: np.log(1 + v) for k, v in prices.items()}
        features['price'] = _one_hot_scaled_buckets(prices_log, 10)

    for calc_feature in ['price_delta_calc1','price_delta_calc2','price_delta_l4avg']:
        if calc_feature in features:
            d_prices = features[calc_feature]
            d_price_min = min(d_prices.values(), default=0)
            # Shift the values so the argument of the log is at least 1.
            d_prices_log = {k: np.log(1 + abs(d_price_min) + v) for k, v in d_prices.items()}
            features[calc_feature] = _one_hot_scaled_buckets(d_prices_log, 5)

    if 'brand' in features:
        # Read from the argument; the previous code reached for the
        # module-level `loaded_features` instead.
        brands = features['brand']
        # Dict look-up instead of list.index keeps the encoding linear-time,
        # and first-appearance order makes slot numbers reproducible.
        brand_slot = {b: n for n, b in enumerate(dict.fromkeys(brands.values()))}
        brands_features = {}
        for key, value in brands.items():
            brands_vec = numpy.zeros(len(brand_slot))
            brands_vec[brand_slot[value]] = 1
            brands_features[key] = brands_vec
        features['brand'] = brands_features

    return features

# A list of feature dicts can be passed and they are combined per item; if none is passed, a dummy all-zero feature is created for every item

def feature_set(feature_dicts=None, n_items=None):
    """
    Combine several per-item feature dicts into one feature vector per item.

    feature_dicts: list of {item id -> sequence of values}; for every item the
        sequences are concatenated in list order. An item missing from one of
        the dicts simply gets a shorter vector.
    n_items: number of items for the dummy baseline features; only used when
        ``feature_dicts`` is None. Defaults to the module-level ``item_count``.

    Returns a plain dict of item id -> list. With ``feature_dicts=None`` every
    item 1..n_items gets the dummy vector ``[0]`` (baseline BPR).
    """
    if feature_dicts is not None:
        combined_features = defaultdict(list)
        for d in feature_dicts:
            for k, v in d.items():
                combined_features[k].extend(v)
        return dict(combined_features)

    if n_items is None:
        # Backward compatible: fall back to the global set by the data-loading cell.
        n_items = item_count
    return {n: [0] for n in range(1, n_items + 1)}

def abpr(user_count, item_count, advanced_features, hidden_dim=10, hidden_adv_dim=10,
         l2_regulization=0.1,
         bias_regulization=0.01,
         embed_regulization = 0,
         adv_feature_regulization =0.1,
         adv_feature_bias_regulization = 0.01):
    """
    Build the TensorFlow (1.x graph mode) ops for BPR with an extra item-feature term.

    user_count: total number of users
    item_count: total number of items
    advanced_features: mapping of item id -> feature vector; only the vector
        length is used here. It can be one or many features combined
        (image features, non-image features, or both).
    hidden_dim: hidden feature size of MF
    hidden_adv_dim: hidden visual/non-visual feature size of MF
    *_regulization: weights of the individual L2 penalty terms

    Returns (xuij, u, i, j, iv, jv, loss, auc, train_op): the score difference,
    the five input placeholders (user ids, item ids i and j, feature rows for
    i and j), the regularised loss, the fraction of pairs with xuij > 0, and
    the Adam training op.
    """
    # Every vector is assumed to have the same length. Item 1 is used when
    # present; otherwise any item will do (item 1 may lack features).
    if 1 in advanced_features:
        sample_vector = advanced_features[1]
    else:
        sample_vector = next(iter(advanced_features.values()))
    advanced_feat_dim = len(sample_vector)

    iv = tf.placeholder(tf.float32, [None, advanced_feat_dim])
    jv = tf.placeholder(tf.float32, [None, advanced_feat_dim])
    u = tf.placeholder(tf.int32, [None])
    i = tf.placeholder(tf.int32, [None])
    j = tf.placeholder(tf.int32, [None])

    # model parameters -- LEARN THESE
    # latent factors (row 0 is unused because ids start at 1)
    user_emb_w = tf.get_variable("user_emb_w", [user_count + 1, hidden_dim],
                                 initializer=tf.random_normal_initializer(0, 0.1))
    item_emb_w = tf.get_variable("item_emb_w", [item_count + 1, hidden_dim],
                                 initializer=tf.random_normal_initializer(0, 0.1))
    # biases
    item_b = tf.get_variable("item_b", [item_count + 1, 1], initializer=tf.constant_initializer(0.0))

    # pull out the respective latent factor vectors for a given user u and items i & j
    u_emb = tf.nn.embedding_lookup(user_emb_w, u)
    i_emb = tf.nn.embedding_lookup(item_emb_w, i)
    j_emb = tf.nn.embedding_lookup(item_emb_w, j)

    # get the respective biases for items i & j
    i_b = tf.nn.embedding_lookup(item_b, i)
    j_b = tf.nn.embedding_lookup(item_b, j)

    # MF predict: u_i > u_j

    # UxD Advanced feature latent factors for users
    user_adv_w = tf.get_variable("user_adv_w", [user_count + 1, hidden_adv_dim],
                             initializer=tf.random_normal_initializer(0, 0.1))
    # this is E, the embedding matrix
    item_adv_w = tf.get_variable("item_adv_w", [hidden_adv_dim, advanced_feat_dim],
                            initializer=tf.random_normal_initializer(0, 0.1))

    theta_i = tf.matmul(iv, item_adv_w, transpose_b=True)  # (f_i * E), eq. 3
    theta_j = tf.matmul(jv, item_adv_w, transpose_b=True)  # (f_j * E), eq. 3

    adv_feature_bias = tf.get_variable("adv_feature_bias", [1, advanced_feat_dim], initializer=tf.random_normal_initializer(0, 0.1))

    # pull out the advanced-feature factor for user u
    u_img = tf.nn.embedding_lookup(user_adv_w, u)

    # score = item bias + MF term + projected-feature term + feature-bias term
    xui = i_b + tf.reduce_sum(tf.multiply(u_emb, i_emb), 1, keep_dims=True) + tf.reduce_sum(tf.multiply(u_img, theta_i), 1, keep_dims=True) \
                                                                        + tf.reduce_sum(tf.multiply(adv_feature_bias, iv), 1, keep_dims=True)
    xuj = j_b + tf.reduce_sum(tf.multiply(u_emb, j_emb), 1, keep_dims=True) + tf.reduce_sum(tf.multiply(u_img, theta_j), 1, keep_dims=True) \
                                                                        + tf.reduce_sum(tf.multiply(adv_feature_bias, jv), 1, keep_dims=True)
    l2_norm = tf.add_n([
        l2_regulization * tf.reduce_sum(tf.multiply(u_emb, u_emb)),
        adv_feature_regulization * tf.reduce_sum(tf.multiply(u_img, u_img)),
        l2_regulization * tf.reduce_sum(tf.multiply(i_emb, i_emb)),
        l2_regulization * tf.reduce_sum(tf.multiply(j_emb, j_emb)),
        embed_regulization * tf.reduce_sum(tf.multiply(item_adv_w, item_adv_w)),
        bias_regulization * tf.reduce_sum(tf.multiply(i_b, i_b)),
        bias_regulization * tf.reduce_sum(tf.multiply(j_b, j_b)),
        adv_feature_bias_regulization * tf.reduce_sum(tf.multiply(adv_feature_bias, adv_feature_bias))
    ])

    xuij = xui - xuj

    auc = tf.reduce_mean(tf.to_float(xuij > 0))

    # log_sigmoid is the numerically stable form of log(sigmoid(x)): the naive
    # composition underflows to log(0) = -inf for strongly negative scores.
    loss = l2_norm - tf.reduce_mean(tf.log_sigmoid(xuij))

    train_op = tf.train.AdamOptimizer().minimize(loss)

    return xuij,u, i, j, iv, jv, loss, auc, train_op

def session_run(num_iter, user_count, item_count, users, items, 
                user_ratings, item_ratings, advanced_features):
    """
    Train the abpr model for ``num_iter`` epochs and evaluate after every epoch.

    One item per user is held out with generate_test. Each epoch trains on the
    batches from uniform_sample_batch, then evaluates on
    test_batch_generator_by_user twice: on all sampled users and on the
    cold-start subset (cold_start_thresh = 10).

    users, items: accepted for call compatibility; not used here.

    Returns (user_items_test, auc_train, auc_test, auc_test_cs):
    ``user_items_test`` holds one (triples, xuij scores) pair per cold-start
    batch of the LAST epoch; the other three are per-epoch mean AUC lists.
    """
    auc_train = []
    auc_test = []
    auc_test_cs = []
    # Bound up front so the return below works even when num_iter is 0.
    user_items_test = []
    user_ratings_test = generate_test(user_ratings)

    def mean_loss(total, batches):
        # A generator may yield nothing (e.g. no cold-start users); avoid 0/0.
        return total / max(batches, 1)

    with tf.Graph().as_default(), tf.Session() as session:
        with tf.variable_scope('abpr'):
            xuij,u, i, j, iv, jv, loss, auc, train_op = abpr(user_count, item_count, advanced_features)

        session.run(tf.global_variables_initializer())

        for epoch in range(1, num_iter+1):
            print ("epoch ", epoch)

            # --- training pass ---
            _loss_train = 0.0
            n_batches = 0  # separate counter; do not clobber the user_count argument
            auc_train_values = []
            for d, _iv, _jv in uniform_sample_batch(user_ratings, user_ratings_test, item_count, advanced_features):
                n_batches += 1
                _loss, _auc, _ = session.run([loss, auc, train_op], feed_dict={u:d[:,0], i:d[:,1], j:d[:,2], iv:_iv, jv:_jv})
                _loss_train += _loss
                auc_train_values.append(_auc)
            print ("train_loss:", mean_loss(_loss_train, n_batches), "train auc: ", numpy.mean(auc_train_values))
            auc_train.append(numpy.mean(auc_train_values))

            # --- evaluation on all sampled users ---
            auc_values = []
            _loss_test = 0.0
            n_batches = 0
            for d, _iv, _jv in test_batch_generator_by_user(user_ratings, user_ratings_test, item_ratings, item_count, advanced_features, cold_start = False):
                n_batches += 1
                _loss, _auc = session.run([loss, auc], feed_dict={u: d[:, 0], i: d[:, 1], j: d[:, 2], iv: _iv, jv: _jv})
                _loss_test += _loss
                auc_values.append(_auc)
            print ("test_loss: ", mean_loss(_loss_test, n_batches), "test auc: ", numpy.mean(auc_values))
            auc_test.append(numpy.mean(auc_values))

            # --- evaluation on cold-start items only ---
            auc_values_cs = []
            _loss_test_cs = 0.0
            n_batches = 0
            for d, _iv, _jv in test_batch_generator_by_user(user_ratings, user_ratings_test, item_ratings, item_count, advanced_features, cold_start = True, cold_start_thresh = 10):
                n_batches += 1
                _xuij,_loss, _auc = session.run([xuij,loss, auc], feed_dict={u: d[:, 0], i: d[:, 1], j: d[:, 2], iv: _iv, jv: _jv})
                _loss_test_cs += _loss
                auc_values_cs.append(_auc)
                # keep the raw scores of the final epoch for building recommendations
                if epoch==num_iter:
                    user_items_test.append((d,_xuij))
            print ("cold start test_loss: ", mean_loss(_loss_test_cs, n_batches), "cold start auc: ", numpy.mean(auc_values_cs))
            auc_test_cs.append(numpy.mean(auc_values_cs))
        return user_items_test,auc_train, auc_test, auc_test_cs

def run(num_sessions, user_count, item_count, users, items, 
            user_ratings, item_ratings, advanced_features):
    """
    Run session_run for ``num_sessions`` epochs and wrap its output with run metadata.

    Returns a dict with the epoch count, platform and Python version, the
    last-epoch cold-start scores ('user_items_test'), the per-epoch AUC lists
    ('auc_train', 'auc_test', 'auc_cold_test'), the formatted start/end times
    and the elapsed time in seconds ('delta_sec').
    """
    t1 = datetime.now()
    user_items_test, auc_train, auc_test, auc_test_cold = session_run(num_sessions, user_count, item_count, 
                                                     users, items, user_ratings, item_ratings, 
                                                     advanced_features)
    t2 = datetime.now()
    # Report the epochs actually requested (not the NUM_SESSIONS global) and
    # the real end time (previously t1 was formatted twice).
    return {'num_sessions':num_sessions, 'sys.platform':str(sys.platform), 
            'sys.version':str(sys.version), 
            'user_items_test': user_items_test,
            'auc_train': auc_train, 'auc_test': auc_test, 
            'auc_cold_test': auc_test_cold,
            'start':format_time(t1),'end':format_time(t2),
            'delta_sec':(t2-t1).total_seconds()}

## Load and make transformations; reuse across sessions
#### 1. Load data
#### 2. Make transformations
#### 3. Run Session

In [19]:
from pathlib import Path
home = str(Path.home())
# Build the path from relative components: os.path.join throws away everything
# before an absolute component, so the former '/Users/...' argument silently
# ignored `home` and tied the notebook to one machine.
bpr_data_file = os.path.join(home, 'Downloads', 'out_model_features.20180512.csv')

In [6]:
# os.path.join with a single argument returns that path unchanged.
data_path = os.path.join(bpr_data_file)
# load_data_hybrid returns (max_u_id, max_i_id, ...), so user_count and item_count
# hold the highest assigned ids. The sampling/sample_size arguments are
# currently ignored by load_data_hybrid (its sampling step is commented out).
user_count, item_count, users, items, user_ratings, item_ratings, loaded_features   = load_data_hybrid(data_path, min_items=4, min_users=0, sampling= True, sample_size = 0.8)
# len(take(1, loaded_features['brand'].values())[0])

max_u_id:  34102
max_i_id:  10303
reviews :  131979
u_id:  10218
i_id:  10303
reviews :  69211


In [7]:
# Convert the raw price / price-delta / brand entries into one-hot vectors.
# transform_features updates the dict in place and returns the same object.
loaded_features = transform_features(loaded_features)

## Running the BPR variants (BASELINE BPR with no advanced features at all is the `'BPR': None` entry)

In [20]:
# Number of training epochs per variant (passed to run() as num_sessions).
NUM_SESSIONS = 3
# Collects one run() result dict per variant description.
results = {}
# Maps a variant description to the list of loaded_features keys to combine.
# A value of None selects the baseline with dummy all-zero features.
variants  = {
#     'Category-Tags':['top_categories']#,
#     'BPR':None,
    #,
    'Subcat-L4':['level4']
    #'Brand':['brand'],
    #'Price-L4-User':['price_delta_calc1'],
    #'Price-4L-Avg':['price_delta_l4avg']
}
"""
variants  = {
    'Price$':['price'],
    'Category-Tags, Subcat-L4, $Price':['top_categories','level4','price'],
    'Subcat-L4, $Price':['level4','price'],
    'Category-Tags, $Price':['top_categories','price'],
    'Category-Tags, Subcat-L4, Brand, $Price':['price','top_categories', 'level4', 'brand']
}
"""

"\nvariants  = {\n    'Price$':['price'],\n    'Category-Tags, Subcat-L4, $Price':['top_categories','level4','price'],\n    'Subcat-L4, $Price':['level4','price'],\n    'Category-Tags, $Price':['top_categories','price'],\n    'Category-Tags, Subcat-L4, Brand, $Price':['price','top_categories', 'level4', 'brand']\n}\n"

In [22]:
# Train and evaluate one model per variant. A variant mapped to None uses
# feature_set() without arguments, i.e. the dummy baseline features.
for desc, features_to_use in variants.items():
    print (desc + str(features_to_use))
    if features_to_use is not None:
        features_list = feature_set([loaded_features[c] for c in features_to_use])
    else:
        features_list = feature_set()
    results[desc] = run(NUM_SESSIONS, user_count, item_count, 
                         users, items, user_ratings, 
                         item_ratings,features_list)

Subcat-L4['level4']
epoch  1
train_loss: 0.79657267741 train auc:  0.670829
test_loss:  2.16772876462 test auc:  0.580342
cold start test_loss:  2.24769064637 cold start auc:  0.454155
epoch  2
train_loss: 0.691830861867 train auc:  0.719088
test_loss:  1.68797983697 test auc:  0.609001
cold start test_loss:  1.7293540574 cold start auc:  0.438135
epoch  3
train_loss: 0.631612273264 train auc:  0.754326
test_loss:  1.56671794461 test auc:  0.641389
cold start test_loss:  1.57222863976 cold start auc:  0.444649


In [None]:
import pickle
# Persist all results under a timestamped file name. The context manager
# closes (and flushes) the file even if pickling fails part-way.
results_file = "results." + datetime.now().strftime("%Y%m%d.%H%M") + ".pickle"
with open(results_file, "wb") as results_fh:
    pickle.dump(results, results_fh)

In [None]:
# List the variant descriptions that currently have an entry in results.
results.keys()

In [None]:
# Disabled cell: the code below is kept inside a string literal so it does not run.
"""
# Top 3 performers: 'Category-Tags, Subcat-L4, Brand'

features_list = feature_set([loaded_features[c] for c in ['top_categories', 'level4', 'brand']])
results['Category-Tags, Subcat-L4, Brand'] = run(NUM_SESSIONS, user_count, item_count, 
                         users, items, user_ratings, 
                         item_ratings,features_list)
"""

In [None]:
# Disabled cell: the code below is kept inside a string literal so it does not run.
"""
features_list = feature_set([loaded_features[c] for c in ['price']])
results['Price$'] = run(NUM_SESSIONS, user_count, item_count, 
                         users, items, user_ratings, 
                         item_ratings,features_list)
"""

## comparing Test AUC vs. number of iterations for different models

In [None]:
import pickle
# Persist all results under a timestamped file name. The context manager
# closes (and flushes) the file even if pickling fails part-way.
results_file = "results." + datetime.now().strftime("%Y%m%d.%H%M") + ".pickle"
with open(results_file, "wb") as results_fh:
    pickle.dump(results, results_fh)

In [None]:
#!pip install seaborn

In [None]:
import matplotlib as mpl
import seaborn as sns
# Newer matplotlib releases ship the seaborn look as 'seaborn-v0_8' and no
# longer know the plain 'seaborn' name, so fall back when it is rejected.
try:
    mpl.style.use('seaborn')
except OSError:
    mpl.style.use('seaborn-v0_8')


def _plot_auc_series(results_to_graph, title, highlight, metric_key, y_label):
    """Draw one line per variant for the per-epoch values stored under ``metric_key``."""
    sns.set_context("talk")
    plt.figure(figsize=(20,10))
    plt.tick_params(axis='both', which='major', labelsize=20)
    plt.title(title,fontsize=30)
    for calc_desc, calc_results in results_to_graph.items():
        ls='solid'
        lw=3
        ms=7
        # The baseline is drawn thicker and dashed.
        if calc_desc == 'BPR':
            lw=5
            ms=11
            ls='dashed'
        # The highlighted variant is drawn thicker and dash-dotted; this
        # takes precedence when the baseline itself is highlighted.
        if calc_desc == highlight:
            lw=5
            ms=11
            ls='-.'
        plt.plot(calc_results[metric_key],
            label=calc_desc,
            linewidth=lw,
            linestyle=ls,
            markersize=ms,
            marker='o')
    plt.legend()
    plt.ylabel(y_label,fontsize=20)
    plt.xlabel("Number of Iterations",fontsize=20)


def plot_auc_curve(results_to_graph, title, highlight):
    """
    Plot the per-epoch test AUC ('auc_test') of every variant in one figure.

    results_to_graph: mapping of variant description -> run() result dict
    title: figure title
    highlight: description of the variant to emphasise, or None
    """
    _plot_auc_series(results_to_graph, title, highlight, 'auc_test', "Test AUC")
    #savefig('auc_curve.' + dt_str + '.png')
    #show()

def plot_auc_cold_start_curve(results_to_graph, title,highlight): 
    """
    Plot the per-epoch cold-start test AUC ('auc_cold_test') of every variant in one figure.

    results_to_graph: mapping of variant description -> run() result dict
    title: figure title
    highlight: description of the variant to emphasise, or None
    """
    _plot_auc_series(results_to_graph, title, highlight, 'auc_cold_test', "Cold Start Test AUC")
    #plt.savefig('auc_cold_start_curve.' + dt_str + '.png')
    #plt.show()


In [None]:
visualize_list = ['BPR', 'Category-Tags','Subcat-L4']
# Only variants that were actually run have an entry in results; skip the
# others instead of failing with KeyError.
results_to_graph = {v: results[v] for v in visualize_list if v in results}

In [None]:
# Timestamp used in the plot titles (also read by the cold-start plot cell).
rpt_date = datetime.now().strftime("%Y%m%d.%H%M")
# Plot the test AUC of every variant in results; no variant is highlighted.
plot_auc_curve(results,'BPR Features Test AUC - ' + rpt_date ,None)

In [None]:
# Cold-start counterpart of the plot above; needs rpt_date from the previous cell.
plot_auc_cold_start_curve(results, 'BPR AFeatures Cold Start AUC - ' + rpt_date,None)

In [None]:
single_feature = ['BPR', 'Category-Tags', 'Subcat-L4', 'Brand', 'Price$']
# Only variants that were actually run have an entry in results; skip the
# others instead of failing with KeyError.
results_to_graph = {v: results[v] for v in single_feature if v in results}
plot_auc_curve(results_to_graph,'BPR Features Test AUC','Category-Tags')

In [None]:
# Cold-start AUC for the variants selected into results_to_graph by the previous cell.
plot_auc_cold_start_curve(results_to_graph,
                          'BPR Features Cold Start Test AUC',
                          'Category-Tags')

In [None]:
combos = ['Category-Tags, Subcat-L4, $Price', 'Subcat-L4, $Price', 'Category-Tags, $Price', 'Category-Tags, Subcat-L4, Brand, $Price', 'BPR', 'Category-Tags']
# Only variants that were actually run have an entry in results; skip the
# others instead of failing with KeyError.
results_to_graph = {v: results[v] for v in combos if v in results}
plot_auc_curve(results_to_graph,
               'BPR Test AUC\nCategory Tags vs Feature Combinations',
               'Category-Tags')

In [None]:
# Cold-start AUC for the variants selected into results_to_graph by the previous cell.
plot_auc_cold_start_curve(results_to_graph,
                          'BPR Cold Start Test AUC\nCategory Tags vs Feature Combinations',
                          'Category-Tags')

In [None]:
# IPython-only syntax: `!ls` runs a shell command and captures its output lines,
# so this cell does not run as plain Python. It rebinds bpr_data_file.
bpr_data_file = !ls ~/data/out_model_features.csv
# First captured output line.
bpr_data_file[0]

In [None]:
import pickle
# Reload a previously saved results dict; the context manager closes the file.
# NOTE: unpickling executes code embedded in the file -- only load pickles
# produced by this notebook or another trusted source.
with open("results.20180511.0141.pickle", "rb") as saved_results_fh:
    results_0141 = pickle.load(saved_results_fh)

In [None]:
# Elapsed seconds of every run, in the order the variants were stored.
z = [run_info['delta_sec'] for run_info in results.values()]

In [None]:
sns.set_context("talk")
plt.figure(figsize=(20,10))
plt.tick_params(axis='both', which='major', labelsize=20)
#plt.title(title,fontsize=30)
# scatter() requires both x and y; the previous call passed only z and raised
# TypeError. Plot each run's duration against its position in results.
plt.scatter(range(len(z)), z, marker='o')
# No legend: the scatter has no labelled series to list.
#plt.ylabel("Cold Start Test AUC",fontsize=20)
#plt.xlabel("Number of Iterations",fontsize=20)

In [None]:
# Print the last-epoch test AUC and cold-start AUC of every variant.
for variant, outcome in results.items():
    test_line = 'AUC Test -' + variant + str(outcome['auc_test'][-1])
    print(test_line)
    cold_line = 'Cold Start AUC Test - ' + variant + str(outcome['auc_cold_test'][-1])
    print(cold_line)

## FOR NOLAN TO ADD
# Build a per-user recommendation list from the final-epoch cold-start scores
# and write it to 'reclist_json'.
# NOTE(review): `user_items_test` is not assigned at module level anywhere in
# this notebook; presumably it should be taken from
# results[<variant>]['user_items_test'] -- confirm before running.

# (user, i, j) -> score difference xuij of that triple
f = {}
for triples, scores in user_items_test:
    for triple, score in zip(triples, scores):
        f[(triple[0], triple[1], triple[2])] = score[0]

user_item_rec_score = {}
user_item_pur_score = {}

for (user_id, pos_item, neg_item), score in f.items():
    if score < 0:
        # score below zero: the user preferred j over i, so j is a candidate
        user_item_rec_score.setdefault(user_id, []).append((neg_item, score))
    else:
        # score of zero or more: the user preferred i over j
        user_item_pur_score.setdefault(user_id, []).append((pos_item, score))

# reverse dicts for look-up of the external ids
users_lookup = {v: k for k, v in users.items()}
items_lookup = {v: k for k, v in items.items()}

array = []
for user_id, scored_items in user_item_rec_score.items():
    # most negative score first, i.e. strongest preference for j first
    ranked = sorted(scored_items, key=lambda pair: pair[1])
    # recommend up to 10 items; slicing copes with users that have fewer
    # candidates (indexing 0..9 raised IndexError for them)
    for item_id, _ in ranked[:10]:
        array.append((users_lookup[user_id], items_lookup[item_id]))


# create dataframe of users and their REC LIST - up to 10 items in preference order, so ORDER matters
df = pd.DataFrame(array, columns=['User', 'Recommended Items'])
# Aggregating with a renaming dict is rejected by pandas >= 1.0, so aggregate
# first and rename the resulting column afterwards.
df_Rec = (df.groupby('User', as_index=False)['Recommended Items']
            .agg(lambda s: list(s))
            .rename(columns={'Recommended Items': 'Recommendation list'}))
df_Rec.head()

c = df_Rec.set_index('User').T.to_json()

import json as json
# to_json() already returns JSON text; passing it through json.dumps again
# wrote a quoted, escaped string instead of a JSON object. A separate name is
# used for the file handle so the score dict `f` is not overwritten.
with open('reclist_json', 'w+') as reclist_fh:
    reclist_fh.write(c)