In [1]:
# Source Domain :Reading "Ratings" CSV data files as dataframes

import pandas as pd
rating_dataset = pd.read_csv("ratings.csv")
rating_dataset = rating_dataset.drop(columns =['timestamp'])
rating_dataset.sample(5)

Unnamed: 0,userId,movieId,rating
9327547,64520,3949,4.5
10025471,69335,6951,3.0
4484210,30652,608,4.0
17635512,121980,31878,2.5
1285404,8758,2502,4.0


In [2]:
# Reading "Movies" CSV data files as dataframes

movie_dataset = pd.read_csv("movies.csv")
movie_dataset.sample(5)

Unnamed: 0,movieId,title,genres
13739,68833,How to Cook Your Life (2007),Documentary
24643,116539,Small Time (2014),Comedy|Drama
15879,80568,Come Blow Your Horn (1963),Comedy
4293,4388,Scary Movie 2 (2001),Comedy
4083,4177,"Mirror Crack'd, The (1980)",Crime|Mystery|Thriller


In [3]:
# Merging rating_dataset and movies_dataset w.r.t movieId

data = pd.merge(rating_dataset,movie_dataset, left_on='movieId',right_on ='movieId',how='left')
data= data[:5]
data.head(5)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [4]:
data.isnull().values.any()  # check if there is any data in any column missing or not available

False

In [5]:
# get only top5 rated movies per user,threshold is 4.5

maxRating = data.loc[data['rating'].idxmax()]['rating'] 
toprated_user_movies = data.loc[data['rating'] >= maxRating/2 + 1]            # find threshold value of ratings by averging the movie ratings

toprated_user_movies = toprated_user_movies.sort_values(['userId','rating'],ascending=[True,False])   # sort the dataset in descending order of available ratings
toprated_user_movies.reset_index(inplace=True, drop=True)
toprated_user_movies.head(5)

# checking if user has rated a movie multiple times
# toprated_user_movies.duplicated(subset=['userId','movieId'])
# actual data is 20000263 after this step it is 646297

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
import numpy as np

toprated_user_movies['genres'] = toprated_user_movies['genres'].str.split('|')             # Break up the big genre string 
                                                                                                      # into a string array
toprated_user_movies['genres'] = np.array(toprated_user_movies['genres'])                  # Convert genres to string value
toprated_user_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),"[Adventure, Children, Fantasy]"
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...","[Adventure, Drama, Fantasy, Mystery, Sci-Fi]"
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),"[Mystery, Sci-Fi, Thriller]"
3,1,47,3.5,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,3.5,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"


In [7]:
# Target domain : Reading "Books" csv as dataframe

books = pd.read_csv('book_data.csv')
books = books[['book_title','genres','book_rating']]        # Dataset with required columns, discarding rest

books = books.dropna(how='any', axis=0)                     #Dropping any null valued rows

books['genres'] = books['genres'].str.split('|')            # Break up the big genre string into a string array
books['genres'] = np.array(books['genres'])                 
books['length'] = books['genres'].str.len()                 # Convert genres to string value

books = books.sort_values(['length'])                       #sorting the dataset w.r.t to length of genres string such that
                                                                        #highest match is available inorder
books = books.drop(columns = ['length'])
books.reset_index(inplace=True, drop=True)
books.head(5)

Unnamed: 0,book_title,genres,book_rating
0,The Caine Batter,[Literature],3.2
1,الأيام,[Biography],3.53
2,Biological Big Bang: Panspermia and the Origin...,[Nonfiction],4.5
3,Mugglenet.Com's What Will Happen in Harry Pott...,[Nonfiction],4.2
4,"Crónica del rey cautivo (El último Qassatar, #1)",[Fantasy],4.13


In [8]:
# Finding similarity between movies and books using Jacard Similarity
import statistics as s

def book_basedon_movie(genre):
    book_titles = []
    scores = []
    ratings = []
    
    for i,row in books.iterrows():
        movies_set = set(genre)
        books_set = set(row.genres)
        
        intersection = movies_set.intersection(books_set)
        scores.append(float(len(intersection))/(len(movies_set)+len(books_set)))
        
        book_titles.append(row.book_title)
        ratings.append(row.book_rating)
        
    final = pd.DataFrame(columns = ['book_title','score','ratings'])
    final['book_title'] = book_titles
    final['score'] = scores
    final['ratings'] = ratings
    return final

# Books recommendation system

books_array = []
books_ratings = []

for i, row in toprated_user_movies.iterrows():
    book_df = book_basedon_movie(row.genres)
    maxRating_books = book_df.loc[book_df['ratings'].idxmax()]['ratings'] 

    book_df = book_df.loc[book_df['ratings'] >= maxRating_books/2 + 1]       
    book_df = book_df.sort_values(['score','ratings'],ascending=[False,False]).head(5)
    books_ratings=(book_df['ratings'].values).mean()

    toprated_user_movies.loc[i, 'books_rating'] = books_ratings
    books_array.append(','.join([str(elem) for elem in book_df['book_title']]))

toprated_user_movies['books'] = books_array
toprated_user_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres,books_rating,books
0,1,2,3.5,Jumanji (1995),"[Adventure, Children, Fantasy]",4.008,Selections from Harry Potter and the Order of ...
1,1,29,3.5,"City of Lost Children, The (Cité des enfants p...","[Adventure, Drama, Fantasy, Mystery, Sci-Fi]",4.278,"The 39 Clues Complete Collection,This Book Is ..."
2,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),"[Mystery, Sci-Fi, Thriller]",4.258,"Shannon's Land,Rain of Terror,Repercussions,Do..."
3,1,47,3.5,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]",4.258,"Shannon's Land,Rain of Terror,Repercussions,Do..."
4,1,50,3.5,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]",4.334,"Dead Souls,Blood Lines,Deadly Game,Der Augenjä..."


In [9]:
Actual_Rating = (toprated_user_movies['rating']).values
Predicted_Rating = (toprated_user_movies['books_rating']).values
print("X",Actual_Rating)
print("Y",Predicted_Rating)

X [3.5 3.5 3.5 3.5 3.5]
Y [4.008 4.278 4.258 4.258 4.334]


In [10]:
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error",mean_absolute_error(Predicted_Rating,Actual_Rating))

from sklearn.metrics import r2_score                #proportion of variation in the outcome that is explained by the predictor variables.
print("R2 Score",r2_score(Predicted_Rating,Actual_Rating))

from sklearn.metrics import mean_squared_error          #which measures the average error performed by the model in predicting the outcome for an observation.
print("Mean Squared Error",mean_squared_error(Predicted_Rating,Actual_Rating))

Mean Absolute Error 0.7272000000000002
R2 Score -41.3574753491165
Mean Squared Error 0.5416064000000003
