In [100]:
import pandas as pd
import tensorflow as tf
from pandas import DataFrame
import numpy as np
import math
import heapq
from tqdm import tqdm
import random
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
#!pip install --upgrade tensorflow
#!pip install --upgrade pandas
#!pip install --upgrade numpy
#tf.debugging.set_log_device_placement(True)


Num GPUs Available:  2


In [134]:
df = pd.read_csv('users-feeds.csv')
print(df['feed_id'].nunique())
print(len(df))
# remove sparse feeds
df = df[df.groupby('feed_id').feed_id.transform(len) > 1]
print('updated without sparse feeds')
print(df['feed_id'].nunique())
print(len(df))
full_items = df['feed_id'].nunique()

def shrink_users_df(df,user_id):
    userIds = np.random.choice(df[user_id].unique(),
                                    size=int(len(df[user_id].unique())*0.12),
                                    replace=False)
    return df.loc[df[user_id].isin(userIds)]
def add_negative_samples(df, item_tag, user_tag,label_tag):

    updated_df = pd.DataFrame(columns=[user_tag,item_tag,label_tag])
    all_feeds = df[item_tag].unique()
    users, items, labels = [], [], []

    user_item_set = set(zip(df[user_tag], df[item_tag]))
    num_negatives = 6

    for (u, i) in user_item_set:
        users.append(u)
        items.append(i)
        labels.append(1) # items that the user has interacted with are positive
        for _ in range(num_negatives):
            # randomly select an item
            negative_item = np.random.choice(all_feeds)
            # check that the user has not interacted with this item
            while (u, negative_item) in user_item_set:
                negative_item = np.random.choice(all_feeds)
            users.append(u)
            items.append(negative_item)
            labels.append(0) # items not interacted with are negative
    updated_df[user_tag] = users
    updated_df[item_tag] = items
    updated_df[label_tag] = labels
    del df
    return updated_df

3200721
8787564
updated without sparse feeds
713621
6300464


In [135]:
def mask_first(x):
    """
    Return a list of 0 for the first item and 1 for all others
    """
    result = np.ones_like(x)
    result[0] = 0

    return result

# needs to add validate in the future
def train_test_split(full_df):
    df_test = df.copy(deep=True)
    df_test = df_test.groupby(['user']).first()

    df_test['user'] = df_test.index
    df_test = df_test[['user', 'feed_id','is_following_feed']]
    df_test = df_test.rename_axis(None, axis=1)

    df_train = df.copy(deep=True)
    mask = df.groupby(['user'])['user'].transform(mask_first).astype(bool)

    df_train = df.loc[mask]
    return df_train, df_test

In [136]:
df = shrink_users_df(df, 'user')


df.loc[:, 'is_following_feed'] = 1

# from google.colab import files
# df.to_csv('update.csv')
# files.download( "update.csv" )

df = add_negative_samples(df,'feed_id','user','is_following_feed')

print(df.sample(10))

df_train, df_test = train_test_split(df)


# test_user_item_set = set(zip(df_test['user'], df_test['feed_id']))

# for (u,i) in test_user_item_set:
#   print(u)
#   print([u]*100)

           user  feed_id  is_following_feed
1158242   58894   122008                  0
4849229  125602     9128                  1
215805   130872  1732091                  0
1383668   44981    11711                  0
5062612  145664   616282                  0
115113   111791   787757                  0
3157093  168575   254317                  0
1892770  301534  3413260                  0
4128143   30725    77140                  0
752802    74777    43496                  0


In [153]:
def _get_user_embedding_layers(inputs, emb_dim):  
    """ create user embeddings """  
    user_gmf_emb = tf.keras.layers.Dense(emb_dim, activation='relu',kernel_initializer=tf.keras.initializers.GlorotUniform())(inputs)  
    user_mlp_emb = tf.keras.layers.Dense(emb_dim, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform())(inputs)  

    return user_gmf_emb, user_mlp_emb  
  
def _get_item_embedding_layers(inputs, emb_dim):  
    """ create item embeddings """  
    item_gmf_emb = tf.keras.layers.Dense(emb_dim, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform())(inputs)  
    item_mlp_emb = tf.keras.layers.Dense(emb_dim, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform())(inputs)  
    return item_gmf_emb, item_mlp_emb  

In [154]:
def _gmf(user_emb, item_emb):  
    """ general matrix factorization branch """  
    gmf_mat = tf.keras.layers.Multiply()([user_emb, item_emb])  
    return gmf_mat  

In [155]:
def _mlp(user_emb, item_emb, dropout_rate):  
    """ multi-layer perceptron branch """  
    def add_layer(dim, input_layer, dropout_rate):  
        hidden_layer = tf.keras.layers.Dense(dim, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform())(input_layer)  
        if dropout_rate:  
            dropout_layer = tf.keras.layers.Dropout(dropout_rate)(hidden_layer)  
            return dropout_layer  
        return hidden_layer  
  
    concat_layer = tf.keras.layers.Concatenate()([user_emb, item_emb])
    dropout_l1 = tf.keras.layers.Dropout(dropout_rate)(concat_layer)  
    dense_layer_1 = add_layer(64, dropout_l1, dropout_rate)  
    dense_layer_2 = add_layer(32, dense_layer_1, dropout_rate)  
    dense_layer_3 = add_layer(16, dense_layer_2, dropout_rate)  
    dense_layer_4 = add_layer(8, dense_layer_3, None)  
    return dense_layer_4  

In [156]:
def _neuCF(gmf, mlp, dropout_rate):  
    """ final output layer """  
    concat_layer = tf.keras.layers.Concatenate()([gmf, mlp])  
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.lecun_uniform())(concat_layer)  
    return output_layer  

In [157]:
def batch_generator(x, y, batch_size, n_batch, shuffle, user_dim, item_dim):
    """ batch generator to supply data for training and testing """
    user_df, item_df = x
#     print('user_df: ' )
#     print(user_df)
#     print('item_df: ')
#     print(item_df)

    counter = 0
    training_index = np.arange(user_df.shape[0])
#     print('training_index: ')
#     print(training_index)

    if shuffle:
        np.random.shuffle(training_index)

    while True:
        batch_index = training_index[batch_size*counter:batch_size*(counter+1)]
#         print('batch_index')
#         print(batch_index)
#         print('input to one_hot')
        #print(user_df[batch_index].shape)
        user_batch = tf.one_hot(user_df[batch_index], depth=user_dim)
        item_batch = tf.one_hot(item_df[batch_index], depth=item_dim)
#         print(user_batch)
#         print(item_batch)
#         break
        y_batch = y[batch_index]
        counter += 1
        yield [user_batch, item_batch], y_batch

        if counter == n_batch:
            if shuffle:
                np.random.shuffle(training_index)
            counter = 0

In [158]:
def build_graph(user_dim, item_dim, dropout_rate=0.25):
    """ neural collaborative filtering model """

    user_input = tf.keras.Input(shape=(user_dim), name='user_input')
    item_input = tf.keras.Input(shape=(item_dim), name='item_input')

    # create embedding layers
    user_gmf_emb, user_mlp_emb = _get_user_embedding_layers(user_input, 32)
    item_gmf_emb, item_mlp_emb = _get_item_embedding_layers(item_input, 32)

    # general matrix factorization
    gmf = _gmf(user_gmf_emb, item_gmf_emb)

    # multi layer perceptron
    mlp = _mlp(user_mlp_emb, item_mlp_emb, dropout_rate)

    # output
    output = _neuCF(gmf, mlp, dropout_rate)

    # create the model
    model = tf.keras.Model(inputs=[user_input, item_input], outputs=output)

    return model

In [159]:
def eval_hit_rate(test_df, full_df, model, n_user):
  print('num unique for users: ' + str(full_df['user'].nunique()))  
  print('num unique for items: ' + str(full_df['feed_id'].nunique()))  

  test_user_item_set = set(zip(test_df['user'], test_df['feed_id']))

  user_interacted_items = full_df.groupby('user')['feed_id'].apply(list).to_dict()
  hits = []
  counter = 0
  for (u,i) in test_user_item_set:
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(full_df['feed_id'].unique()) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    #new code logic
    results = np.empty((0, 3), int)
    with tf.device('/GPU:0'):
        
        for item in test_items:
            user_embedding = tf.one_hot(np.asarray([u], dtype=np.int64), depth=n_user)
            item_embedding = tf.one_hot(np.asarray([item], dtype=np.int64), depth=480154)
            print(user_embedding)
            print(item_embedding)
            predicted_label = model.predict([np.asarray(user_embedding, dtype=np.int64), np.asarray(item_embedding, dtype=np.int64)])
            results = np.append(results, np.array([[u,predicted_label[0][0],item]]), axis=0)
            print('our label')
            print(predicted_label[0][0])
            print(type(predicted_label))
            break
        break
        counter = counter + 1
        if counter % 500 == 0:
            print('we are at step: ' + str(counter))
    #print(results)
    #break
            
    #print(predicted_labels)
    #top5_items = [test_items[i[0]] for i in np.argsort(predicted_label)[::-1][0:5].tolist()]
    top5_items =results[results[:,1].argsort()[::-1]][0:8]
    #print('top5_items')
    #print(top5_items)
    if i in top5_items:
        hits.append(1)
        #print('we hit for feed: ' + str(u))
    else:
        hits.append(0)
        #print('we missed for feed: ' + str(u))
        
  print("The Hit Ratio @ 5 is {:.2f}".format(np.average(hits)))

In [162]:
def model(x_train, y_train, n_user, n_item, num_epoch, batch_size):
    print('n_user')
    print(n_user)
    num_batch = np.ceil(x_train[0].shape[0]/(batch_size*2))


    # build graph
    model = build_graph(n_user, n_item)

    # compile and train
    #optimizer = tf.keras.optimizers.Adam(learning_rate=.00222)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.00382059, beta_1=0.783529, beta_2=0.909003, epsilon=1.45439e-07)
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=['accuracy', 'top_k_categorical_accuracy']
                  )

    model.fit_generator(
        generator=batch_generator(
            x=x_train, y=y_train,
            batch_size=batch_size, n_batch=num_batch,
            shuffle=True, user_dim=n_user, item_dim=n_item),
        epochs=num_epoch,
        steps_per_epoch=num_batch,
        verbose=1
    )

    return model

In [None]:
print('init size')
print(df['user'].nunique())
print(df['feed_id'].nunique())
print(df_train['user'].nunique())
print(df_train['feed_id'].nunique())
with tf.device('/GPU:0'):
    ncf_model = model(
            x_train=[df_train['user'].to_numpy(), df_train['feed_id'].to_numpy()],
            y_train=df_train['is_following_feed'].to_numpy(),
            n_user=df['user'].nunique(),
            n_item=full_items,
            num_epoch=10,
            batch_size=512
            
        )

init size
21292
278160
21292
278160
n_user
21292
Epoch 1/10

In [None]:
ncf_model.save('ncf-model-new.keras')
# import keras
# ncf_model = keras.models.load_model('ncf-model-34-loss-6-negs.keras')
# print(ncf_model.summary())

#print(df_test['user'].nunique())
#eval_hit_rate(df_test, df, ncf_model, n_user=df['user'].nunique())


In [None]:
# tf.keras.utils.plot_model(ncf_model, to_file="neural_collaborative_filtering_model.png")  
# files.download('neural_collaborative_filtering_model.png')

# df_test.head()


In [None]:
# really might have to update n_user and n_item to uniques in whole df

In [None]:
updated_eval_hit_rate(df_test, df, ncf_model, df['user'].nunique(), full_items)

In [None]:
def updated_eval_hit_rate(test_df, full_df, model, n_user, n_items):
  print('num unique for users: ' + str(full_df['user'].nunique()))  
  print('num unique for items: ' + str(full_df['feed_id'].nunique()))  

  test_user_item_set = set(zip(test_df['user'], test_df['feed_id']))

  user_interacted_items = full_df.groupby('user')['feed_id'].apply(list).to_dict()
  hits = []
  counter = 0
  for (u,i) in test_user_item_set:
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(full_df['feed_id'].unique()) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    #new code logic
    with tf.device('/GPU:0'):
        inputrr = [tf.one_hot(np.asarray(u, dtype=np.int64), depth=n_user)]*100
        itemrr = [tf.one_hot(np.asarray(i, dtype=np.int64), depth=n_items) for i in test_items]
        predicted_label = [i[0] for i in model.predict([np.asarray(inputrr, dtype=np.int64), np.asarray(itemrr, dtype=np.int64)]).tolist()]
        counter = counter + 1
        results = sorted(dict(zip(test_items, predicted_label)).items(),  key=lambda x: x[1], reverse=True)
        if counter % 500 == 0:
            print('we are at step: ' + str(counter))
            print("the hit ration at this step is {:.2f}".format(np.average(hits)))
    top8_items = [i[0] for i in results[0:8]]
    #print(top8_items)
    if i in top8_items:
        hits.append(1)
#         print('we hit for feed: ' + str(u))
    else:
        hits.append(0)
#         print('we missed for feed: ' + str(u))
        
  print("The Hit Ratio @ 5 is {:.2f}".format(np.average(hits)))

In [None]:
updated_eval_hit_rate(df_test, df, ncf_model, df['user'].nunique(), full_items)