# Module 8–Association Rules Mining and Recommendation Systems

### Case Study I

In [35]:
import pandas as pd
import numpy as np

In [43]:
with open('BX-Book-Ratings.csv') as f:
    print(f)
df = pd.read_csv('BX-Book-Ratings.csv', encoding = 'UTF-8', nrows=10000)
df.head()
df_books = pd.read_csv('BX-Books.csv', encoding='latin-1')
df = pd.merge(df,df_books,on='isbn')
df.head()

<_io.TextIOWrapper name='BX-Book-Ratings.csv' mode='r' encoding='UTF-8'>


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


In [44]:
isbn_list = df.isbn.unique()
print(" Length of isbn List:", len(isbn_list))
def get_isbn_numeric_id(isbn):
    #print ("  isbn is:" , isbn)
    itemindex = np.where(isbn_list==isbn)
    return itemindex[0][0]

 Length of isbn List: 8051


In [45]:
userid_list = df.user_id.unique()
print(" Length of user_id List:", len(userid_list))
def get_user_id_numeric_id(user_id):
    #print ("  isbn is:" , isbn)
    itemindex = np.where(userid_list==user_id)
    return itemindex[0][0]

 Length of user_id List: 828


In [46]:
df['user_id_order'] = df['user_id'].apply(get_user_id_numeric_id)
df['isbn_id'] = df['isbn'].apply(get_isbn_numeric_id)
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,user_id_order,isbn_id
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


In [47]:
new_col_order = ['user_id_order', 'isbn_id', 'rating', 'book_title', 'book_author','year_of_publication','publisher','isbn','user_id']
df = df.reindex(columns= new_col_order)
df.head()

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,034545104X,276725
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle,155061224,276726
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,276727
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,278418
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,052165615X,276729


In [40]:
#df = pd.read_csv('BX-Book-Ratings.csv',names=['user_id', 'isbn', 'rating'])
df = pd.read_csv('BX-Book-Ratings.csv', encoding = "ISO-8859-1").head(10000)
df.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [41]:
from sklearn.model_selection import train_test_split
n_users = df.user_id.unique().shape[0] 
n_books = df.isbn.unique().shape[0]
print("n_users {}, n_books {}".format(n_users, n_books))
train_data, test_data = train_test_split(df, test_size=0.25)

n_users 941, n_books 9335


In [28]:
users_list = df.user_id.unique().tolist()
books_list = df.isbn.unique().tolist()

In [29]:
train_data_matrix = np.zeros((n_users, n_books))
for line in train_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    train_data_matrix[users_list.index(line[1]), books_list.index(line[2])] = line[3] 
train_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 9., 0.]])

In [30]:
test_data_matrix = np.zeros((n_users, n_books))
for line in test_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    test_data_matrix[users_list.index(line[1]), books_list.index(line[2])] = line[3]
test_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
#from sklearn.neighbors import KNeighborsClassifier
#classifier = KNeighborsClassifier(n_neighbors=15)
#classifier.fit(train_data_matrix)

In [32]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
book_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
mean_user_rating = train_data_matrix.mean(axis=1)[:, np.newaxis] 
ratings_diff = (train_data_matrix - mean_user_rating) 
user_pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
user_pred

array([[-0.00165278,  0.00366637, -0.00165278, ..., -0.00165278,
         0.00792169, -0.00165278],
       [-0.00111659, -0.00111659, -0.00111659, ..., -0.00111659,
         0.00845787, -0.00111659],
       [-0.00165278,  0.00366637, -0.00165278, ..., -0.00165278,
         0.00792169, -0.00165278],
       ...,
       [-0.00165278,  0.00366637, -0.00165278, ..., -0.00165278,
         0.00792169, -0.00165278],
       [ 0.02151056,  0.02682971,  0.02151056, ...,  0.02151056,
         0.03108503,  0.02151056],
       [ 0.00489424,  0.01021458,  0.00489424, ...,  0.00489424,
         0.00489424,  0.00489424]])

In [33]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten() 
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, test))

In [34]:
rmse(user_pred, test_data_matrix)

7.711759361812985