In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Reading data files for ratings, users and books.
# DATA_DIR is the only place that needs editing when the CSV dump lives elsewhere.
DATA_DIR = 'C:/Users/sharm/Downloads/BX-CSV-Dump (1)'


def _read_bx_csv(file_name):
    """Load one ';'-separated, latin-1 encoded CSV from DATA_DIR.

    Malformed rows are skipped with a warning instead of aborting the load.
    `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, so the
    modern `on_bad_lines` keyword is tried first; older pandas versions reject
    it with a TypeError, in which case the legacy keyword is used.
    """
    path = DATA_DIR + '/' + file_name
    try:
        return pd.read_csv(path, sep=';', on_bad_lines='warn', encoding="latin-1")
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`.
        return pd.read_csv(path, sep=';', error_bad_lines=False, encoding="latin-1")


book_rating_data_set = _read_bx_csv('BX-Book-Ratings.csv')
user_data_set = _read_bx_csv('BX-Users.csv')
book_data_set = _read_bx_csv('BX-Books.csv')


b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Data cleaning
# Join every rating to its book record on ISBN, then discard the book
# metadata columns that the later cells never read.
cols = ['Year-Of-Publication', 'Publisher', 'Book-Author', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
book_rating_file = (
    book_rating_data_set
    .merge(book_data_set, on='ISBN')
    .drop(columns=cols)
)
print(book_rating_file.head(10))

   User-ID        ISBN  Book-Rating            Book-Title
0   276725  034545104X            0  Flesh Tones: A Novel
1     2313  034545104X            5  Flesh Tones: A Novel
2     6543  034545104X            0  Flesh Tones: A Novel
3     8680  034545104X            5  Flesh Tones: A Novel
4    10314  034545104X            9  Flesh Tones: A Novel
5    23768  034545104X            0  Flesh Tones: A Novel
6    28266  034545104X            0  Flesh Tones: A Novel
7    28523  034545104X            0  Flesh Tones: A Novel
8    39002  034545104X            0  Flesh Tones: A Novel
9    50403  034545104X            9  Flesh Tones: A Novel


In [4]:
# Count how many ratings each book title received.
# Result: one row per title with columns 'Book-Title' and 'RatingCount_book'.
ratings_per_title = book_rating_file.groupby('Book-Title')['Book-Rating'].count()
rating_counting = ratings_per_title.rename('RatingCount_book').reset_index()
print(rating_counting.head(10))

                                          Book-Title  RatingCount_book
0   A Light in the Storm: The Civil War Diary of ...                 4
1                              Always Have Popsicles                 1
2               Apple Magic (The Collector's series)                 1
3   Ask Lily (Young Women of Faith: Lily Series, ...                 1
4   Beyond IBM: Leadership Marketing and Finance ...                 1
5   Clifford Visita El Hospital (Clifford El Gran...                 1
6                                       Dark Justice                 1
7                                           Deceived                 2
8   Earth Prayers From around the World: 365 Pray...                10
9   Final Fantasy Anthology: Official Strategy Gu...                 4


In [5]:
# Keep only the titles that received at least `threshold` ratings.
threshold = 25
has_enough_ratings = rating_counting['RatingCount_book'] >= threshold
rating_counting = rating_counting[has_enough_ratings]
print(rating_counting.head(10))


                          Book-Title  RatingCount_book
75                      'Salem's Lot                47
203                   10 Lb. Penalty                61
422                   101 Dalmatians                37
673  14,000 Things to Be Happy About                28
697               16 Lighthouse Road                65
764                             1984               284
818              1st to Die: A Novel               509
913            2001: A Space Odyssey                25
946                2010: Odyssey Two                90
955                204 Rosewood Lane                71


In [6]:
# Attach the individual ratings back onto the titles that passed the threshold
# (left join on the title, so every retained title keeps all of its ratings).
user_rating = rating_counting.merge(book_rating_file, on='Book-Title', how='left')
print(user_rating.head(10))


     Book-Title  RatingCount_book  User-ID        ISBN  Book-Rating
0  'Salem's Lot                47     8936  067103975X            0
1  'Salem's Lot                47   172245  067103975X            0
2  'Salem's Lot                47   189835  067103975X            5
3  'Salem's Lot                47     9226  0451168089            0
4  'Salem's Lot                47    33283  0451168089           10
5  'Salem's Lot                47    37950  0451168089            0
6  'Salem's Lot                47    55734  0451168089            0
7  'Salem's Lot                47    56044  0451168089            8
8  'Salem's Lot                47    59727  0451168089            0
9  'Salem's Lot                47    60263  0451168089           10


In [7]:
# Count how many ratings each user contributed among the retained titles.
# Result: one row per user with columns 'User-ID' and 'RatingCount_user'.
ratings_per_user = user_rating.groupby('User-ID')['Book-Rating'].count()
user_counting = ratings_per_user.rename('RatingCount_user').reset_index()
print(user_counting.head(10))

   User-ID  RatingCount_user
0        8                 2
1        9                 2
2       10                 1
3       14                 1
4       16                 2
5       17                 4
6       19                 1
7       23                 1
8       26                 2
9       32                 2


In [8]:
# Add each user's rating count to every one of that user's rating rows.
combined_results = pd.merge(user_rating, user_counting, on='User-ID', how='inner')
print(combined_results.head(10))

                                          Book-Title  RatingCount_book  \
0                                       'Salem's Lot                47   
1                                1st to Die: A Novel               509   
2                                     A Case of Need               236   
3                                 A Perfect Stranger                54   
4                                           Accident               126   
5                                  All I Need Is You                60   
6  All That Remains (Kay Scarpetta Mysteries (Pap...               184   
7                                          BODY FARM                50   
8                                       Bag of Bones               195   
9                                    Best Of Enemies                37   

   User-ID        ISBN  Book-Rating  RatingCount_user  
0     8936  067103975X            0               177  
1     8936  0446610038            0               177  
2     8936  04512

In [9]:
# Scale the book ratings into the [0, 1] range with a min-max scaler.
scaler = MinMaxScaler()
# fit_transform expects a 2-D array, hence the single-column reshape.
rating_values = combined_results['Book-Rating'].values.astype(float).reshape(-1, 1)
# Build the scaled frame on combined_results' own index: assigning a frame with
# a fresh RangeIndex only lines up when the target index is also 0..n-1, and
# silently yields NaN / misaligned ratings otherwise.
rating_scaled = pd.DataFrame(scaler.fit_transform(rating_values), index=combined_results.index)
combined_results['Book-Rating'] = rating_scaled[0]

In [10]:
# Keep one rating per (user, title) pair, then spread the ratings into a
# user x title matrix; pairs without a rating are filled with 0.
combined_results = combined_results.drop_duplicates(['User-ID', 'Book-Title'])
user_book_matrix = (
    combined_results
    .pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
    .fillna(0)
)
# Row / column labels, used later to translate positions back to ids and titles.
users = user_book_matrix.index.tolist()
books = user_book_matrix.columns.tolist()

In [11]:
# Now implementing tensorflow on the above cleaned combined data results.
# The following cells use the TF1 graph/session API (tf.placeholder, tf.Session),
# so `tf` is rebound to the v1 compatibility module and v2 behaviour is disabled
# before any graph element is created.
# NOTE(review): this shadows the `import tensorflow as tf` from the first cell.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [12]:
# Network dimensions: one input unit per distinct book title, squeezed through
# two hidden layers (10 -> 5) and mirrored back out by the decoder.
num_input = combined_results['Book-Title'].nunique()
num_hidden_1 = 10
num_hidden_2 = 5

# Batch of user rating rows; the batch dimension is left open.
X = tf.placeholder(tf.float64, [None, num_input])

# (name, rows, cols) of every weight matrix, in creation order.
weight_shapes = (
    ('encoder_h1', num_input, num_hidden_1),
    ('encoder_h2', num_hidden_1, num_hidden_2),
    ('decoder_h1', num_hidden_2, num_hidden_1),
    ('decoder_h2', num_hidden_1, num_input),
)
# (name, length) of every bias vector, in creation order.
bias_shapes = (
    ('encoder_b1', num_hidden_1),
    ('encoder_b2', num_hidden_2),
    ('decoder_b1', num_hidden_1),
    ('decoder_b2', num_input),
)

# All parameters start from random-normal float64 values.
weights = {
    name: tf.Variable(tf.random_normal([rows, cols], dtype=tf.float64))
    for name, rows, cols in weight_shapes
}

biases = {
    name: tf.Variable(tf.random_normal([length], dtype=tf.float64))
    for name, length in bias_shapes
}

In [13]:
# defining encoders and decoders
def encoder(x):
    """Map `x` through the two sigmoid encoder layers and return the result.

    Uses the 'encoder_h1'/'encoder_b1' and 'encoder_h2'/'encoder_b2' entries of
    the module-level `weights` and `biases` dictionaries.
    """
    first_linear = tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1'])
    first_hidden = tf.nn.sigmoid(first_linear)
    second_linear = tf.add(tf.matmul(first_hidden, weights['encoder_h2']), biases['encoder_b2'])
    return tf.nn.sigmoid(second_linear)

def decoder(x):
    """Map `x` through the two sigmoid decoder layers and return the result.

    Uses the 'decoder_h1'/'decoder_b1' and 'decoder_h2'/'decoder_b2' entries of
    the module-level `weights` and `biases` dictionaries.
    """
    first_linear = tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1'])
    first_hidden = tf.nn.sigmoid(first_linear)
    second_linear = tf.add(tf.matmul(first_hidden, weights['decoder_h2']), biases['decoder_b2'])
    return tf.nn.sigmoid(second_linear)

In [14]:
# invoking encoders and decoders functions
# Wire the graph: placeholder X -> encoder -> decoder. `decoder_output` has one
# value per input column and is what the later cells train and predict with.
encoder_output = encoder(X)
decoder_output = decoder(encoder_output)

In [15]:
# Predicted values come from the decoder output; the target is the input
# itself, since the network is trained to reconstruct X.
y_true, y_pred = X, decoder_output

In [None]:
# Train the autoencoder, reconstruct every user's rating row and print the
# ten highest-scored unrated titles per user.

# --- tunable settings -------------------------------------------------------
learning_rate = 0.03
epochs = 100
batch_size = 35
sample_user_id = 180187  # user inspected by the spot check at the end

# --- graph: loss, optimiser and initialisers --------------------------------
loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# NOTE(review): the precision metric is built but never evaluated in this cell;
# it is kept so that `pre`, `pre_op` and `local_init` stay defined.
eval_x = tf.placeholder(tf.int32)
eval_y = tf.placeholder(tf.int32)
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

# --- training data ----------------------------------------------------------
# Work on a separate dense array instead of rebinding `user_book_matrix`, so an
# interrupted run cannot leave that name pointing at a list of batches and the
# cell can always be re-run.
rating_matrix = np.asarray(user_book_matrix, dtype=np.float64)
if rating_matrix.shape[0] == 0:
    raise ValueError("user_book_matrix has no rows; nothing to train on")
# At least one batch, even when there are fewer users than `batch_size`
# (np.array_split rejects a section count of 0).
num_batches = max(1, rating_matrix.shape[0] // batch_size)
batches = np.array_split(rating_matrix, num_batches)

# --- training and prediction (the only part that needs the session) ---------
with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    for i in range(epochs):
        avg_cost = 0
        for batch in batches:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    predictions = session.run(decoder_output, feed_dict={X: rating_matrix})

# --- long-format predictions ------------------------------------------------
# Label rows/columns with the real user ids and titles up front; this replaces
# the removed DataFrame.append call and the per-row positional lookups.
predicted_data = (
    pd.DataFrame(predictions, index=users, columns=books)
    .stack()
    .reset_index(name='Book-Rating')
)
predicted_data.columns = ['User-ID', 'Book-Title', 'Book-Rating']

# --- top-10 recommendations per user ----------------------------------------
keys = ['User-ID', 'Book-Title']
index_first_column = predicted_data.set_index(keys).index
index_second_column = combined_results.set_index(keys).index
# Drop every (user, title) pair that already appears in combined_results,
# then keep the ten highest predicted ratings per user.
top_ranked = predicted_data[~index_first_column.isin(index_second_column)]
top_ranked = top_ranked.sort_values(['User-ID', 'Book-Rating'], ascending=[True, False])
top_ranked = top_ranked.groupby('User-ID').head(10)
print(top_ranked)

# Spot check: recommendations versus the original ratings of one user.
print(top_ranked.loc[top_ranked['User-ID'] == sample_user_id])
print(book_rating_data_set.loc[book_rating_data_set['User-ID'] == sample_user_id].sort_values(by=['Book-Rating'], ascending=False))
    

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
epoch: 1 Loss: 0.039228938362915265
epoch: 2 Loss: 0.00048308506106253596
epoch: 3 Loss: 0.0004738396160153606
epoch: 4 Loss: 0.00044496947321824657
epoch: 5 Loss: 0.00029771864527384763
epoch: 6 Loss: 0.00029651056739370975
epoch: 7 Loss: 0.00029569363205244405
epoch: 8 Loss: 0.00029510227303784206
epoch: 9 Loss: 0.00029465452004896016
epoch: 10 Loss: 0.0002943039915354508
epoch: 11 Loss: 0.0002940223132510237
epoch: 12 Loss: 0.000293791152537725
epoch: 13 Loss: 0.00029359817474370155
epoch: 14 Loss: 0.0002934347379617547
epoch: 15 Loss: 0.000293294551096658
epoch: 16 Loss: 0.0002931731208834618
epoch: 17 Loss: 0.00029306689731467984
epoch: 18 Loss: 0.0002929732449644564
epoch: 19 Loss: 0.0002928901389230875
epoch: 20 Loss: 0.0002928159677933353
epoch: 21 Loss: 0.00029274929174700413
epoch: 2