# In [None]:
import math
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#function to load the data 
def load_data(input_path):
    """Load the BX-* CSV files found under *input_path*.

    Args:
        input_path: Directory containing ``BX-NewBooksUsers.csv``,
            ``BX-Ratings.csv``, ``BX-Books.csv`` and ``BX-NewBooks.csv``.

    Returns:
        Tuple ``(users, ratings, books, books_old, new_books)``. ``books`` is
        ``books_old`` followed by ``new_books`` with repeated ``Book-Title``
        values removed (first occurrence kept) and a fresh 0..n-1 index.
    """
    # ISBNs are identifiers, not numbers: read them as text so leading zeros
    # survive and comparisons against string ISBNs behave consistently.
    isbn_as_text = {"ISBN": str}

    users = pd.read_csv(os.path.join(input_path, 'BX-NewBooksUsers.csv'))
    ratings = pd.read_csv(os.path.join(input_path, 'BX-Ratings.csv'), dtype=isbn_as_text)
    books_old = pd.read_csv(os.path.join(input_path, 'BX-Books.csv'), dtype=isbn_as_text)
    new_books = pd.read_csv(os.path.join(input_path, 'BX-NewBooks.csv'), dtype=isbn_as_text)

    # concat keeps each frame's own labels, so without a reset the combined
    # index has duplicates/gaps and labels no longer equal row positions.
    books = (
        pd.concat([books_old, new_books])
        .drop_duplicates(subset=["Book-Title"], keep='first')
        .reset_index(drop=True)
    )
    return users, ratings, books, books_old, new_books

#function to create tags and vectorize the data
def prepare_data(books):
    """Build a ``tags`` text column on *books* and return its bag-of-words matrix.

    The tag of a book is its author, title, publisher and publication year
    joined by single spaces. Missing author/title/publisher values are treated
    as empty text; otherwise one NaN would turn the whole tag into NaN, which
    CountVectorizer refuses to process.

    Args:
        books: DataFrame with ``Book-Author``, ``Book-Title``,
            ``Book-Publisher`` and ``Year-Of-Publication`` columns. It is
            modified in place: a ``tags`` column is added (or overwritten).

    Returns:
        Dense array of token counts with one row per row of *books* and at
        most 500 columns (``max_features=500``).
    """
    author = books['Book-Author'].fillna("").astype(str)
    title = books['Book-Title'].fillna("").astype(str)
    publisher = books['Book-Publisher'].fillna("").astype(str)
    year = books['Year-Of-Publication'].astype(str)

    books['tags'] = author + " " + title + " " + publisher + " " + year
    cv = CountVectorizer(max_features=500, stop_words="english")
    vectors = cv.fit_transform(books["tags"]).toarray()
    return vectors

#function to run cosine similarity and return first 20 similar books
def find_similar_books(book_id, books, new_books, vectors, top_n=20):
    """Return the books whose tag vectors are most similar to *book_id*'s.

    Args:
        book_id: ISBN to look up in ``books['ISBN']``.
        books: DataFrame whose rows correspond one-to-one, in order, to the
            rows of *vectors*.
        new_books: Unused; kept so existing callers keep working.
        vectors: Feature matrix with one row per row of *books*.
        top_n: Maximum number of similar books to return.

    Returns:
        List of ``(row_position, similarity)`` tuples sorted by decreasing
        similarity, never containing the queried book itself. The positions
        are suitable for ``books.iloc``. An empty list is returned (after
        printing a message) when *book_id* is not present in *books*.
    """
    # Work with row positions rather than index labels: the similarity scores
    # are positional, and the labels of `books` need not be 0..n-1.
    matches = np.flatnonzero((books['ISBN'] == book_id).to_numpy())
    if matches.size == 0:
        print("Book ID not found in dataset.")
        return []
    book_pos = int(matches[0])

    # Only the queried book's row is needed; the full pairwise matrix grows
    # quadratically with the number of books.
    scores = cosine_similarity(vectors[book_pos:book_pos + 1], vectors)[0]

    # Stable sort keeps the original row order among equal scores.
    order = np.argsort(-scores, kind="stable")
    # Drop the queried book explicitly: with ties (or an all-zero vector) it is
    # not guaranteed to be ranked first.
    order = order[order != book_pos][:top_n]
    return [(int(pos), float(scores[pos])) for pos in order]

#function to predict the rating of the given book 
def predict_rating(user_id, book_id, ratings, books, books_old, similar_books):
    """Predict a rating for *book_id* as a similarity-weighted mean rating.

    For every similar book that is listed in *books_old* and has at least one
    row in *ratings*, its mean ``Book-Rating`` is multiplied by its similarity
    score. The prediction is the average of those products divided by the
    average similarity, which brings the value back onto the rating scale.
    The first five contributing books are printed along the way.

    Args:
        user_id: Unused; kept so existing callers keep working.
        book_id: ISBN of the book to predict a rating for.
        ratings: DataFrame with ``ISBN`` and ``Book-Rating`` columns.
        books: DataFrame with ``ISBN``, ``Book-Title`` and ``Book-Author``
            columns; ``similar_books`` positions refer to ``books.iloc``.
        books_old: DataFrame whose ``ISBN`` column lists the books that may
            have ratings.
        similar_books: Iterable of ``(row_position, similarity)`` pairs.

    Returns:
        The predicted rating as a float, or ``None`` when no similar book has
        ratings or every contributing similarity is zero.
    """
    target_rows = books[books['ISBN'] == book_id]
    # Fall back to the raw ISBN for display if the book is not in `books`.
    target_title = target_rows["Book-Title"].values[0] if not target_rows.empty else book_id
    print(f"recommended books for people who have read {target_title}:\n")

    # Build the lookup once instead of scanning books_old for every candidate.
    rated_isbns = set(books_old['ISBN'].values)

    rating_sims = []  # mean rating * similarity, one entry per usable book
    mean_sims = []  # the matching similarity scores
    printed_books = 0

    for idx, sim_score in similar_books:
        candidate = books.iloc[idx]
        similar_isbn = candidate['ISBN']
        # Ratings only exist for books that are part of the old dataset.
        if similar_isbn not in rated_isbns:
            continue
        isbn_ratings = ratings[ratings["ISBN"] == similar_isbn]
        if isbn_ratings.empty:
            continue

        mean_rating = isbn_ratings['Book-Rating'].mean()
        adjusted_score = mean_rating * sim_score
        rating_sims.append(float(adjusted_score))
        mean_sims.append(float(sim_score))

        if printed_books < 5:  # only print the first five similar books
            printed_books += 1
            print(f"\n{candidate['Book-Title']} by {candidate['Book-Author']} (Adjusted Score: {adjusted_score:.2f}) Sim Score = {sim_score:.4f}")

    if not rating_sims:
        # No rated neighbour: there is nothing to base a prediction on.
        return None

    avg_score = sum(rating_sims) / len(rating_sims)
    mean_sim = sum(mean_sims) / len(mean_sims)
    if mean_sim == 0:
        # All neighbours have zero similarity, so the weights carry no signal.
        return None

    average_adjusted_score = avg_score / mean_sim  # back onto the rating scale
    print(f"\nPredicted Score for {target_title}: {average_adjusted_score:.2f}")
    return average_adjusted_score

#function to find similar books and run predict on them
def predict(user_id, book_id, input_path):
    """Predict a rating for *book_id* using the datasets under *input_path*.

    Args:
        user_id: Forwarded to ``predict_rating``.
        book_id: ISBN of the book to predict a rating for.
        input_path: Directory passed to ``load_data``.

    Returns:
        Whatever ``predict_rating`` returns, or ``None`` when
        ``find_similar_books`` yields no similar books.
    """
    ratings, books, books_old, new_books, vectors = _load_prepared(input_path)
    similar_books = find_similar_books(book_id, books, new_books, vectors)
    if not similar_books:
        return None
    return predict_rating(user_id, book_id, ratings, books, books_old, similar_books)


# Loaded frames and feature vectors, keyed by input_path. predict() is called
# once per book, and re-reading the CSVs and re-vectorizing each time is by far
# the most expensive part of a call.
_PREPARED_DATA_CACHE = {}


def _load_prepared(input_path):
    """Return (ratings, books, books_old, new_books, vectors), loading them once per path."""
    if input_path not in _PREPARED_DATA_CACHE:
        users, ratings, books, books_old, new_books = load_data(input_path)
        vectors = prepare_data(books)
        _PREPARED_DATA_CACHE[input_path] = (ratings, books, books_old, new_books, vectors)
    return _PREPARED_DATA_CACHE[input_path]
    
def main(input_path):
    """Run a single demo prediction against the data in *input_path*.

    Args:
        input_path: Directory holding the CSV files, e.g.
            '/kaggle/input/booksusers'.
    """
    demo_user_id = 276762
    demo_isbn = "0836218655"
    predict(demo_user_id, demo_isbn, input_path)

main('/kaggle/input/booksusers') #change it with your input path


#change all of this with your input path for the pre-processed data
users = pd.read_csv('/kaggle/input/booksusers/BX-NewBooksUsers.csv')
ratings = pd.read_csv('/kaggle/input/booksusers/BX-Ratings.csv')
books_old = pd.read_csv('/kaggle/input/booksusers/BX-Books.csv')
new_books = pd.read_csv('/kaggle/input/booksusers/BX-NewBooks.csv')
# Read ISBNs as text so str(i) below cannot drop leading zeros.
new_ratings = pd.read_csv('/kaggle/input/booksusers/BX-NewBooksRatings.csv', dtype={'ISBN': str})

# Predict for a small sample only, since every predict() call is expensive.
# The sample gets its own name so new_ratings keeps all of its rows; the RMSE
# step further down takes its own head(...) slice and needs more than 10 rows.
sample_ratings = new_ratings.head(10)
isbn_vals = sample_ratings["ISBN"].values
input_path = '/kaggle/input/booksusers'
rated_vals = []

actual_rates = sample_ratings["Book-Rating"].values #store actual values in array
# run the function to predict the rating on the new books using the ISBNs of the sampled new books
for i in isbn_vals:
    pred_rating = predict(276762, str(i), input_path)
    rated_vals.append(pred_rating)
print(rated_vals)

#replaces every None in the array with the mean of the available predictions
# We ran the program on the first 100 entries in the dataset and saved the predicted values in the array:
pred_vals =[6.780299296868589, 6.883247181439865, 6.491441234512691, 5.092070939010894, 4.825337676616067, 5.8903670359152205, 6.670018546736091, 6.797327773128877, 6.215137741899781, 7.31106536571452, 6.558625729282679, 6.765252514524346, 7.056554333195935, 6.489950526422479, 6.11304471064381, 6.90733517567666, 5.854211718175123, None, 5.71726227903091, 5.555935030769504, 5.947359844298425, 5.5693953559734535, 6.298941973637561, 6.569553759170713, 5.855623511015049, 5.589480884780647, 5.735125250401022, 6.814945942344286, 6.656405412048171, 5.751456260313828, 6.448911664036413, 5.538093664195579, 5.364355158664125, None, 6.362038264211947, 5.690925830637674, None, None, 6.57754404581198, 6.729824359998354, None, 5.516965715774633, 5.495337900117775, 5.377176963447635, 6.696485825415812, 5.9986595516771, 6.55900460775128, 5.8173472607226, None, 5.7606629197717965, 5.20461164534942, 5.817852828551997, 6.527926153214571, 6.087540539181012, 5.371760452991041, 6.797797595201976, None, 6.680991920934416, 5.920927699221332, 6.9678605136471115, 6.848365566172391, 5.008352686552237, 6.42738775445907, 4.0941123938064905, None, 6.643643687111696, 6.8091837363328205, 6.377581414109571, 7.090732744920675, 6.744665255271885, None, 6.1956046768719615, 6.93372600781848, 6.401923654948151, 5.6985439155188065, 5.687613695343721, 6.089168203811976, 5.6270839936187365, 7.164204005342974, 5.584881826489261, 6.109526555145154, 6.018047386376358, 6.3826210822445155, 6.729495719702318, 5.954620442798883, 7.292383333929558, 6.666142063485205, 6.855348094937719, None, 6.242415158020659, 6.5930687637163095, None, 6.417388392386845, 6.748462021489844, 6.4167945708357035, 6.651397253021553, None, 6.817983103236857, 6.314321711121775, 6.535561447372781]
# Average over the predictions that actually exist. Dividing by the full
# length would count every missing prediction as 0 and pull the mean down.
valid_vals = [x for x in pred_vals if x is not None]
total = sum(valid_vals)
mean_vals = total / len(valid_vals)
new_vals = [mean_vals if x is None else x for x in pred_vals]

# Take exactly as many actual ratings as there are predictions.
new_ratings = new_ratings.head(len(new_vals))
actual_rates = new_ratings["Book-Rating"].values #actual ratings for the compared entries
if len(actual_rates) != len(new_vals):
    # Fail with a clear message instead of an opaque broadcasting error.
    raise ValueError(
        f"Expected {len(new_vals)} actual ratings to compare against, "
        f"but new_ratings only has {len(actual_rates)} rows."
    )
y_actual = actual_rates
y_predicted = new_vals # predicted ratings for the compared entries

#Find and print RMSE
MSE = np.square(np.subtract(actual_rates,new_vals)).mean()
RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)

#A first attempt to recommend books
'''
#recommends books to user 276688 based on their readings  ##ignore
user_id=276688 # selected this user as they have a lot of ratings
user_rating = ratings[ratings['User-ID']==user_id]
sorted_df = user_rating.sort_values(by='Book-Rating', ascending=False) # get their highest rated books
sorted_df = sorted_df.head(3)
best_books = sorted_df["ISBN"].values
vectors =prepare_data(books)

for i in range(0,len(best_books)):
    sim_books = find_similar_books (best_books[i],books,new_books,vectors)
    sim_books = sim_books[0:1]
    for idx,score in sim_books:
        isbn_val = books.iloc[idx]['ISBN']
        print(isbn_val in books_old['ISBN'].values)

    

#run predict 
#for i in best_books:
    #if (i in books_old['ISBN'].values):
        #input_path = '/kaggle/input/booksusers'
        #predict(276762, str(i), input_path)
'''