In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
import deepctr.feature_column
from deepctr.feature_column import SparseFeat, DenseFeat
import ast

In [6]:
scores = pd.read_csv('scores.csv')
scores['scores'] = scores['scores'].apply(lambda x: ast.literal_eval(x))
df2 = pd.json_normalize(scores['scores'])
df2['feed_id'] = scores['feed_id']

In [7]:
df2

Unnamed: 0,read_pct,reader_count,reach_score,story_count,share_count,feed_id
0,0.0,0,0.0,0,0,5846640
1,0.0,0,0.0,0,0,1649173
2,0.0,0,0.0,0,0,3629790
3,0.0,0,0.0,0,0,1679956
4,0.0,0,0.0,0,0,745763
...,...,...,...,...,...,...
275044,0.0,0,0.0,0,0,1555162
275045,0.0,0,0.0,0,0,1484131
275046,0.0,0,0.0,0,0,3136267
275047,0.0,0,0.0,0,0,1320141


In [8]:
#data = pd.read_csv("./movielens_sample.txt")
df = pd.read_csv('shrunk.csv')

premiums = pd.read_csv('premium_subs.csv')
actives = pd.read_csv('active-subs.csv')
num = pd.read_csv('num-subs.csv')

data = df.merge(premiums[['premium_subs', 'feed_id']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(actives[['active_subs', 'feed_id']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(num[['num_subs', 'feed_id']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

data = data.merge(df2[['read_pct', 'feed_id', 'reader_count', 'reach_score', 'story_count', 'share_count']], how = 'left',
                    left_on = 'feed_id', right_on = 'feed_id')

In [9]:
data

Unnamed: 0.1,Unnamed: 0,user,feed_id,is_following_feed,premium_subs,active_subs,num_subs,read_pct,reader_count,reach_score,story_count,share_count
0,0,497634,5846640,1,1,1,3,0.0,0,0.0,0,0
1,1,497634,1649173,0,1,1,5,0.0,0,0.0,0,0
2,2,497634,3629790,0,0,0,2,0.0,0,0.0,0,0
3,3,497634,1679956,0,0,0,3,0.0,0,0.0,0,0
4,4,497634,745763,0,0,0,10,0.0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5174969,5174969,207353,2492452,0,0,0,2,0.0,0,0.0,0,0
5174970,5174970,207353,3241729,0,0,0,2,0.0,0,0.0,0,0
5174971,5174971,207353,2109322,0,0,0,2,0.0,0,0.0,0,0
5174972,5174972,207353,3957346,0,0,0,2,0.0,0,0.0,0,0


In [10]:
sparse_features = ["feed_id", "user"]

dense_features = ["premium_subs", "active_subs", "num_subs", "read_pct", "reader_count", "reach_score", "story_count", "share_count"]
target = ['is_following_feed']

In [11]:
for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
        
mms = MinMaxScaler(feature_range=(0,1))
data[dense_features] = mms.fit_transform(data[dense_features])


In [12]:
data

Unnamed: 0.1,Unnamed: 0,user,feed_id,is_following_feed,premium_subs,active_subs,num_subs,read_pct,reader_count,reach_score,story_count,share_count
0,0,14602,271635,1,0.001781,0.001749,0.000087,0.0,0.0,0.0,0.0,0.0
1,1,14602,198477,0,0.001781,0.001749,0.000131,0.0,0.0,0.0,0.0,0.0
2,2,14602,262775,0,0.001526,0.001499,0.000065,0.0,0.0,0.0,0.0,0.0
3,3,14602,200608,0,0.001526,0.001499,0.000087,0.0,0.0,0.0,0.0,0.0
4,4,14602,124915,0,0.001526,0.001499,0.000240,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5174969,5174969,10913,240040,0,0.001526,0.001499,0.000065,0.0,0.0,0.0,0.0,0.0
5174970,5174970,10913,258092,0,0.001526,0.001499,0.000065,0.0,0.0,0.0,0.0,0.0
5174971,5174971,10913,224841,0,0.001526,0.001499,0.000065,0.0,0.0,0.0,0.0,0.0
5174972,5174972,10913,266344,0,0.001526,0.001499,0.000065,0.0,0.0,0.0,0.0,0.0


In [13]:
# For sparse features, we transform them into dense vectors by embedding techniques. For dense numerical features, 
# we concatenate them to the input tensors of fully connected layer.
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

In [14]:
print(type(fixlen_feature_columns))
print(fixlen_feature_columns[2])

<class 'list'>
DenseFeat(name='premium_subs', dimension=1, dtype='float32', transform_fn=None)


In [15]:
# if we add variable length features we need this
# linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
# dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

In [16]:
feature_names = deepctr.feature_column.get_feature_names(linear_feature_columns + dnn_feature_columns)


In [17]:
train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [18]:
import tensorflow as tf
strategy = tf.distribute.MirroredStrategy(["GPU:0", "GPU:1", "GPU:2", "GPU:3"])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [19]:
with strategy.scope():
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0012)

    model.compile(optimizer, "binary_crossentropy", metrics=['binary_crossentropy'], )

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=30, verbose=2, validation_split=0.2, )
    pred_ans = model.predict(test_model_input, batch_size=256)


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import log_loss, roc_auc_score
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

In [None]:
model.save('deepfm-best-val-loss.keras')

In [None]:
# need to impl split for test/train 
# need to add evaluation function
# 

In [None]:
# Top-k evaluation for hitrate
def updated_eval_hit_rate(test_df, full_df, model, n_user, n_items):
  print('num unique for users: ' + str(full_df['user'].nunique()))  
  print('num unique for items: ' + str(full_df['feed_id'].nunique()))  

  test_user_item_set = set(zip(test_df['user'], test_df['feed_id']))

  user_interacted_items = full_df.groupby('user')['feed_id'].apply(list).to_dict()
  hits = []
  counter = 0
  for (u,i) in test_user_item_set:
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(full_df['feed_id'].unique()) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    #new code logic
    with tf.device('/GPU:0'):
        inputrr = [tf.one_hot(np.asarray(u, dtype=np.int64), depth=n_user)]*100
        itemrr = [tf.one_hot(np.asarray(i, dtype=np.int64), depth=n_items) for i in test_items]
        predicted_label = [i[0] for i in model.predict([np.asarray(inputrr, dtype=np.int64), np.asarray(itemrr, dtype=np.int64)]).tolist()]
#         print('debug info------')
#         print('our user: ' + str(u) + ' our item: ' + str(i))
#         print('our test_items: ')
#         print(test_items)
#         print('   ')
#         print('predicted_label')
#         print(predicted_label)
#         return
        counter = counter + 1
        results = sorted(dict(zip(test_items, predicted_label)).items(),  key=lambda x: x[1], reverse=True)
        del predicted_label
        del inputrr
        del itemrr
        #print(str(counter))
        if counter % 100 == 0:
            print('we are at step: ' + str(counter))
            print("the hit ratio at this step is {:.2f}".format(np.average(hits)))
    top8_items = [i[0] for i in results[0:10]]
    #print(top8_items)
    if i in top8_items:
        hits.append(1)
#         print('we hit for feed: ' + str(u))
    else:
        hits.append(0)
#         print('we missed for feed: ' + str(u))
        
  print("The Hit Ratio @ 5 is {:.2f}".format(np.average(hits)))

def evaluation(test, model, full_df, features):
    df = test.drop(test[test.is_following_feed != 1].index)[features]
    
    
    hits = []
    counter = 0
    input_dict = {}
    for index, test_row in df.iterrows():
        # get rows from full df 
        user_id = test_row['user']
        mask = full_df['user'] == user_id
        # full_df[mask]
        items = list(full_df[mask]['feed_id'])
        
        not_interacted_items = set(full_df['feed_id'].unique()) - set(items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
        
        if not bool(len({*items} & {*selected_not_interacted})):
            raise ValueError
        
        for feed in selected_not_interacted:
            rows = full_df.loc[full_df['feed_id'] == feed]
            
            first = rows.iloc[0]
            print(first)
        
        
        
        
        print('here')
        
        #not_interacted_items = set(full_df['feed_id'].unique()) - set(full_df[mask]['feed_id'].unique())
        
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
        
        
        
        
    
            
    

In [None]:
assert 258713 in list(data[mask]['feed_id'])

In [None]:
assert 14575 not in list(not_interacted_items['feed_id'])

In [4]:
import keras
import numpy as np
model = keras.models.load_model('deepfm-best-val-loss.keras')
evaluation(test, model, data, sparse_features + dense_features)


ValueError: Unknown layer: NoMask

In [None]:
pred_ans = model.predict(test_model_input, batch_size=256)