In [52]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM, xDeepFM
import deepctr.feature_column
from deepctr.feature_column import SparseFeat, DenseFeat
import ast
from pickle import dump
from keras.callbacks import ModelCheckpoint, CSVLogger, LearningRateScheduler, ReduceLROnPlateau

In [53]:
scores = pd.read_csv('scores.csv')
scores['scores'] = scores['scores'].apply(lambda x: ast.literal_eval(x))
df2 = pd.json_normalize(scores['scores'])
df2['feed_id'] = scores['feed_id']

In [54]:
#data = pd.read_csv("./movielens_sample.txt")
df = pd.read_csv('shrunk.csv')
# true or false is_premium for user
is_premium_user = pd.read_csv('is_premium.csv')

is_active = pd.read_csv('new-input.csv')
premiums = pd.read_csv('premium_subs.csv')
actives = pd.read_csv('active-subs.csv')
num = pd.read_csv('num-subs.csv')
avg_stories = pd.read_csv('avg_strs_pr_mnth.csv')
data = df.merge(premiums[['premium_subs', 'feed_id']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(actives[['active_subs', 'feed_id']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(num[['num_subs', 'feed_id']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(df2[['read_pct', 'feed_id', 'reader_count', 'reach_score', 'story_count', 'share_count']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(avg_stories[['average_stories_per_month', 'feed_id']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

# active refers to feed, from Feed class

data = data.merge(is_active[['active', 'feed_id', 'active_premium_subscribers']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(is_premium_user[['is_premium', 'user',]], how = 'left',
                    left_on = 'user', right_on = 'user')

In [55]:
data['active'] = data['active'].apply(lambda x: int(x == True))
data['is_premium'] = data['is_premium'].apply(lambda x: int(x == True))

In [56]:
sparse_features = ["feed_id", "user", "active", "is_premium"]

dense_features = ["premium_subs", "active_subs", "num_subs", "read_pct", "reader_count", "reach_score", "story_count", "share_count", "average_stories_per_month", 'active_premium_subscribers']
target = ['is_following_feed']

In [57]:
for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
        dump(lbe, open(feat + '-' + 'lbe.pkl', 'wb'))
        
mms = MinMaxScaler(feature_range=(0,1))
data[dense_features] = mms.fit_transform(data[dense_features])
# shouldn't need to save a ranged model
#dump(mms, open('minmax.pkl', 'wb'))


In [58]:
# write our vocab sizes to file
file = open("vocab-sizes.txt", "w")
for feat in sparse_features:
    file.write(str(data[feat].max()+1) + "\n")
file.close()

In [59]:
# For sparse features, we transform them into dense vectors by embedding techniques. For dense numerical features, 
# we concatenate them to the input tensors of fully connected layer.
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,embedding_dim=16)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

In [60]:
print(type(fixlen_feature_columns))
print(fixlen_feature_columns[2])

<class 'list'>
SparseFeat(name='active', vocabulary_size=2, embedding_dim=16, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fd0ca3f34d0>, embedding_name='active', group_name='default_group', trainable=True)


In [61]:
# if we add variable length features we need this
# linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
# dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

In [62]:
feature_names = deepctr.feature_column.get_feature_names(linear_feature_columns + dnn_feature_columns)


In [63]:
train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [39]:
import tensorflow as tf
strategy = tf.distribute.MirroredStrategy(["GPU:0", "GPU:1", "GPU:2", "GPU:3"])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [40]:
def schedule(epoch):
    if epoch < 30:
        return 0.001
    elif epoch < 37:
        return 0.0008
    else:
        return 0.0005
lr_scheduler = LearningRateScheduler(schedule)

In [41]:
with strategy.scope():
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', dnn_hidden_units=(128,128,128,128))
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    checkpointer = ModelCheckpoint(monitor='val_binary_crossentropy',filepath='best_model_fm_emb16_4layers.keras', verbose=1, save_best_only=True)
    model.compile(optimizer, "binary_crossentropy", metrics=['binary_crossentropy'], )

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=35, verbose=2, validation_split=0.2, callbacks = [checkpointer, lr_scheduler])
    pred_ans = model.predict(test_model_input, batch_size=256)

Epoch 1/35
INFO:tensorflow:batch_all_reduce: 19 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 19 all-reduces with algorithm = nccl, num_packs = 1
12938/12938 - 322s - loss: 0.3348 - binary_crossentropy: 0.3191 - val_loss: 0.3286 - val_binary_crossentropy: 0.3099

Epoch 00001: val_binary_crossentropy improved from inf to 0.30988, saving model to best_model_fm_emb16_4layers.keras
Epoch 2/35
12938/12938 - 310s - loss: 0.3253 - binary_crossentropy: 0.3027 - val_loss: 0.3226 - val_binary_crossentropy: 0.2997

Epoch 00002: val_binary_crossentropy improved from 0.30988 to 0.29968, saving model to best_model_fm_emb16_4layers.keras
Epoch 3/35
12938/12938 - 309s - loss: 0.3186 - binary_crossentropy: 0.2924 - val_loss: 0.3224 - val_binary_crossentropy: 0.2967

Epoch 00003: val_binary_crossentropy improved from 0.29968 to 0.29668, saving model to best_model_fm_emb16_4layers.keras
Epoch 4/35
12938/12938 - 310s - loss: 0.3138 - binary_crossentropy: 0.2848 - val_l

In [43]:
plt.plot(history.history['val_binary_crossentropy'])
        plt.title('model validation loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train'], loc='upper left')
        plt.show()
        plt.savefig('model-loss.png')

IndentationError: unexpected indent (<ipython-input-43-e2c3fefbbfae>, line 2)

In [None]:
from sklearn.metrics import log_loss, roc_auc_score
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

In [None]:
model.save('deepfm-8-embed-3-layers.keras')

In [None]:
# need to impl split for test/train 
# need to add evaluation function
# 

In [64]:
def selected_not_interacted_list(masked_df, columns, items):
    """ create better not interacted list
    masked_df: full dataset df with user mask already removed
    columns: columns needed for our model prediction
    """

    df = masked_df.drop(masked_df[masked_df.reader_count == 0.0].index)[columns]
    not_interacted_items = set(df['feed_id'].unique()) - set(items)
    return not_interacted_items
    
def evaluation(test, model, full_df, features):
    df = test.drop(test[test.is_following_feed != 1].index)[features]

    
    hits = []
    counter = 0
    input_dict = {}
    for index, test_row in df.iterrows():
        # get rows from full df 
        user_id = test_row['user']
        mask = full_df['user'] == user_id
        # full_df[mask]

        items = list(full_df[mask]['feed_id'])
        
        #selected_not_interacted_list(full_df[full_df['user'] != user_id], features)
        
        not_interacted_items = set(full_df['feed_id'].unique()) - set(items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
        
        # might not be sparse enough, might add 15% back from reader_count != 0.0 list to add more variation
        # not sure why I still need to subract items, I guess they should pass a mask with feeds
        
        if bool(len({*items} & {*selected_not_interacted})):
            raise ValueError
        input_df = pd.DataFrame(columns=features)
        # need to grab the extra data needed for 
        for feed in selected_not_interacted:
            rows = full_df.loc[full_df['feed_id'] == feed]
            
            first = rows.iloc[0]

            input_df = input_df.append(first)

        
        # add our final correct one on the end
        
        input_df.loc[:, 'is_following_feed'] = 0
        input_df.loc[:, 'user'] = user_id
        test_row['is_following_feed'] = 1
        input_df = input_df.append(test_row)
        
        
        input_df = input_df.drop(['is_following_feed', 'Unnamed: 0'], axis=1)
        # predict with correct input format
        pred_ans = model.predict({name:input_df[name] for name in features})
  

        # convert predictions to a little bit better format
        predictions = [i[0] for i in pred_ans]
        
        feeds = input_df['feed_id'].tolist()
        
        results = sorted(dict(zip(feeds, predictions)).items(),  key=lambda x: x[1], reverse=True)
        counter = counter + 1
        if counter % 100 == 0:
            print('we are at step: ' + str(counter))
            print("the hit ratio at this step is {:.2f}".format(np.average(hits)))
        
        top10_items = [i[0] for i in results[0:10]]
        if test_row['feed_id'] in top10_items:
            hits.append(1)
#         print('we hit for feed: ' + str(u))
        else:
            hits.append(0)
#         print('we missed for feed: ' + str(u))

    print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))
        
        
        
    
            
    

In [3]:
# ! pip install pydot
# ! pip install pydotplus
# ! sudo apt-get install graphviz
#! pip install install keras-visualizer

Reading package lists... Done
Building dependency tree       
Reading state information... Done
graphviz is already the newest version (2.40.1-6).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


In [4]:
import keras
import numpy as np
from deepctr.layers import custom_objects
from keras_visualizer import visualizer

model = keras.models.load_model('fm_emb16_4layers_.63HR.keras', custom_objects)


# visualizer(model, format='png', view=True)
#files.download('current-model.png')
evaluation(test, model, data, sparse_features + dense_features)


ModuleNotFoundError: No module named 'graphviz'

In [None]:
pred_ans = model.predict(test_model_input, batch_size=256)